From 9de5300c7fc0ff944e02d5d1a1ae5742234930e0 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Wed, 3 May 2023 18:00:14 +0200 Subject: [PATCH 01/25] Optimize check_filesystem() and process_error_log() Integrate check_clones() into check_filesystem() and implement a list instead of iterating recursively over the clones, thus eliminating the risk of a stack overflow. Also use kmem_zalloc() to allocate large structures in process_error_log() reducing its stack size from ~700 to ~128 bytes. Reviewed-by: Tino Reichardt Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #14744 --- module/zfs/spa_errlog.c | 138 ++++++++++++++++++++++++---------------- 1 file changed, 84 insertions(+), 54 deletions(-) diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index 3bc8619b51a8..e0604c4a84af 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -72,6 +72,11 @@ #define NAME_MAX_LEN 64 +typedef struct clones { + uint64_t clone_ds; + list_node_t node; +} clones_t; + /* * spa_upgrade_errlog_limit : A zfs module parameter that controls the number * of on-disk error log entries that will be converted to the new @@ -135,10 +140,6 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb) } #ifdef _KERNEL -static int check_clones(spa_t *spa, uint64_t zap_clone, uint64_t snap_count, - uint64_t *snap_obj_array, zbookmark_err_phys_t *zep, void* uaddr, - uint64_t *count); - static void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) { @@ -291,7 +292,7 @@ copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count) */ static int check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, - void *uaddr, uint64_t *count) + void *uaddr, uint64_t *count, list_t *clones_list) { dsl_dataset_t *ds; dsl_pool_t *dp = spa->spa_dsl_pool; @@ -412,24 +413,10 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, dsl_dataset_rele(ds, FTAG); } - if (zap_clone != 0 && aff_snap_count > 0) { - error = check_clones(spa, zap_clone, snap_count, snap_obj_array, - zep, uaddr, count); - } + if (zap_clone == 0 || aff_snap_count == 0) + return (0); -out: - kmem_free(snap_obj_array, sizeof (*snap_obj_array)); - return (error); -} - -/* - * Clone checking. - */ -static int check_clones(spa_t *spa, uint64_t zap_clone, uint64_t snap_count, - uint64_t *snap_obj_array, zbookmark_err_phys_t *zep, void* uaddr, - uint64_t *count) -{ - int error = 0; + /* Check clones. */ zap_cursor_t *zc; zap_attribute_t *za; @@ -440,7 +427,6 @@ static int check_clones(spa_t *spa, uint64_t zap_clone, uint64_t snap_count, zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { - dsl_pool_t *dp = spa->spa_dsl_pool; dsl_dataset_t *clone; error = dsl_dataset_hold_obj(dp, za->za_first_integer, FTAG, &clone); @@ -463,17 +449,17 @@ static int check_clones(spa_t *spa, uint64_t zap_clone, uint64_t snap_count, if (!found) continue; - error = check_filesystem(spa, za->za_first_integer, zep, - uaddr, count); - - if (error != 0) - break; + clones_t *ct = kmem_zalloc(sizeof (*ct), KM_SLEEP); + ct->clone_ds = za->za_first_integer; + list_insert_tail(clones_list, ct); } zap_cursor_fini(zc); kmem_free(za, sizeof (*za)); kmem_free(zc, sizeof (*zc)); +out: + kmem_free(snap_obj_array, sizeof (*snap_obj_array)); return (error); } @@ -523,8 +509,30 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, uint64_t top_affected_fs; int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs); if (error == 0) { + clones_t *ct; + list_t clones_list; + + list_create(&clones_list, sizeof (clones_t), + offsetof(clones_t, node)); + error = check_filesystem(spa, top_affected_fs, zep, - uaddr, count); + uaddr, count, &clones_list); + + while ((ct = list_remove_head(&clones_list)) != NULL) { + error = check_filesystem(spa, ct->clone_ds, zep, + uaddr, count, &clones_list); + kmem_free(ct, sizeof (*ct)); + + if (error) { + while (!list_is_empty(&clones_list)) { + ct = list_remove_head(&clones_list); + kmem_free(ct, sizeof (*ct)); + } + break; + } + } + + list_destroy(&clones_list); } return (error); @@ -827,62 +835,84 @@ spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx) static int process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count) { - zap_cursor_t zc; - zap_attribute_t za; - if (obj == 0) return (0); + zap_cursor_t *zc; + zap_attribute_t *za; + + zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { - for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { + for (zap_cursor_init(zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { if (*count == 0) { - zap_cursor_fini(&zc); + zap_cursor_fini(zc); + kmem_free(zc, sizeof (*zc)); + kmem_free(za, sizeof (*za)); return (SET_ERROR(ENOMEM)); } zbookmark_phys_t zb; - name_to_bookmark(za.za_name, &zb); + name_to_bookmark(za->za_name, &zb); int error = copyout_entry(&zb, uaddr, count); if (error != 0) { - zap_cursor_fini(&zc); + zap_cursor_fini(zc); + kmem_free(zc, sizeof (*zc)); + kmem_free(za, sizeof (*za)); return (error); } } - zap_cursor_fini(&zc); + zap_cursor_fini(zc); + kmem_free(zc, sizeof (*zc)); + kmem_free(za, sizeof (*za)); return (0); } - for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { + for (zap_cursor_init(zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { - zap_cursor_t head_ds_cursor; - zap_attribute_t head_ds_attr; + zap_cursor_t *head_ds_cursor; + zap_attribute_t *head_ds_attr; - uint64_t head_ds_err_obj = za.za_first_integer; + head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + head_ds_attr = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + + uint64_t head_ds_err_obj = za->za_first_integer; uint64_t head_ds; - name_to_object(za.za_name, &head_ds); - for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset, - head_ds_err_obj); zap_cursor_retrieve(&head_ds_cursor, - &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) { + name_to_object(za->za_name, &head_ds); + for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset, + head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor, + head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) { zbookmark_err_phys_t head_ds_block; - name_to_errphys(head_ds_attr.za_name, &head_ds_block); + name_to_errphys(head_ds_attr->za_name, &head_ds_block); int error = process_error_block(spa, head_ds, &head_ds_block, uaddr, count); if (error != 0) { - zap_cursor_fini(&head_ds_cursor); - zap_cursor_fini(&zc); + zap_cursor_fini(head_ds_cursor); + kmem_free(head_ds_cursor, + sizeof (*head_ds_cursor)); + kmem_free(head_ds_attr, sizeof (*head_ds_attr)); + + zap_cursor_fini(zc); + kmem_free(za, sizeof (*za)); + kmem_free(zc, sizeof (*zc)); return (error); } } - zap_cursor_fini(&head_ds_cursor); + zap_cursor_fini(head_ds_cursor); + kmem_free(head_ds_cursor, sizeof (*head_ds_cursor)); + kmem_free(head_ds_attr, sizeof (*head_ds_attr)); } - zap_cursor_fini(&zc); + zap_cursor_fini(zc); + kmem_free(za, sizeof (*za)); + kmem_free(zc, sizeof (*zc)); return (0); } From a46001adb9b143eebf43cd7ca4b508c044f80f00 Mon Sep 17 00:00:00 2001 From: buzzingwires <131118055+buzzingwires@users.noreply.github.com> Date: Wed, 3 May 2023 12:03:57 -0400 Subject: [PATCH 02/25] Allow zhack label repair to restore detached devices. This commit expands on the zhack label repair command in d04b5c9 by adding the -u option to undetach a device by regenerating uberblocks, in addition to the existing functionality of fixing checksums, now represented by -c. Previous behavior is retained in the case of no options. The changes are heavily inspired by Jeff Bonwick's labelfix utility, as archived at: https://gist.github.com/jjwhitney/baaa63144da89726e482 Additionally, it is now capable of properly determining the size of block devices and other media, as well as handling sizes which are not divisible by 2^18. This should make it viable for use on physical devices and partitions, in addition to files. These changes should make it possible to import zpools that have had their uberblocks erased, such as in the case of pools rendered inaccessible by erroneous detach commands. Reviewed-by: Brian Behlendorf Signed-off-by: buzzingwires Closes #14773 --- cmd/zhack.c | 516 ++++++++++++++---- man/man1/zhack.1 | 23 +- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 6 +- .../functional/cli_root/zhack/library.kshlib | 361 ++++++++++++ .../cli_root/zhack/zhack_label_checksum.ksh | 64 --- .../cli_root/zhack/zhack_label_repair_001.ksh | 30 + .../cli_root/zhack/zhack_label_repair_002.ksh | 31 ++ .../cli_root/zhack/zhack_label_repair_003.ksh | 33 ++ .../cli_root/zhack/zhack_label_repair_004.ksh | 30 + 10 files changed, 932 insertions(+), 165 deletions(-) create mode 100644 tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib delete mode 100755 tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_checksum.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_002.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_003.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_004.ksh diff --git a/cmd/zhack.c b/cmd/zhack.c index 0b6da31ec573..44611887dd25 100644 --- a/cmd/zhack.c +++ b/cmd/zhack.c @@ -58,6 +58,12 @@ static importargs_t g_importargs; static char *g_pool; static boolean_t g_readonly; +typedef enum { + ZHACK_REPAIR_OP_UNKNOWN = 0, + ZHACK_REPAIR_OP_CKSUM = (1 << 0), + ZHACK_REPAIR_OP_UNDETACH = (1 << 1) +} zhack_repair_op_t; + static __attribute__((noreturn)) void usage(void) { @@ -81,7 +87,10 @@ usage(void) " : should be a feature guid\n" "\n" " label repair \n" - " repair corrupted label checksums\n" + " repair labels of a specified device according to options\n" + " which may be combined to do their functions in one call\n" + " -c repair corrupted label checksums\n" + " -u restore the label on a detached device\n" "\n" " : path to vdev\n"); exit(1); @@ -485,23 +494,374 @@ zhack_do_feature(int argc, char **argv) return (0); } +#define ASHIFT_UBERBLOCK_SHIFT(ashift) \ + MIN(MAX(ashift, UBERBLOCK_SHIFT), \ + MAX_UBERBLOCK_SHIFT) +#define ASHIFT_UBERBLOCK_SIZE(ashift) \ + (1ULL << ASHIFT_UBERBLOCK_SHIFT(ashift)) + +#define REPAIR_LABEL_STATUS_CKSUM (1 << 0) +#define REPAIR_LABEL_STATUS_UB (1 << 1) + static int -zhack_repair_label_cksum(int argc, char **argv) +zhack_repair_read_label(const int fd, vdev_label_t *vl, + const uint64_t label_offset, const int l) { - zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL]; + const int err = pread64(fd, vl, sizeof (vdev_label_t), label_offset); + + if (err == -1) { + (void) fprintf(stderr, + "error: cannot read label %d: %s\n", + l, strerror(errno)); + return (err); + } else if (err != sizeof (vdev_label_t)) { + (void) fprintf(stderr, + "error: bad label %d read size\n", l); + return (err); + } + + return (0); +} + +static void +zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset, + const uint64_t abdsize, zio_eck_t *eck, zio_cksum_t *cksum) +{ + zio_cksum_t verifier; + zio_cksum_t current_cksum; + zio_checksum_info_t *ci; + abd_t *abd; + + ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0); + + if (byteswap) + byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); + + current_cksum = eck->zec_cksum; + eck->zec_cksum = verifier; + + ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL]; + abd = abd_get_from_buf(data, abdsize); + ci->ci_func[byteswap](abd, abdsize, NULL, cksum); + abd_free(abd); + + eck->zec_cksum = current_cksum; +} + +static int +zhack_repair_check_label(uberblock_t *ub, const int l, const char **cfg_keys, + const size_t cfg_keys_len, nvlist_t *cfg, nvlist_t *vdev_tree_cfg, + uint64_t *ashift) +{ + int err; + + if (ub->ub_txg != 0) { + (void) fprintf(stderr, + "error: label %d: UB TXG of 0 expected, but got %" + PRIu64 "\n", + l, ub->ub_txg); + (void) fprintf(stderr, "It would appear the device was not " + "properly removed.\n"); + return (1); + } + + for (int i = 0; i < cfg_keys_len; i++) { + uint64_t val; + err = nvlist_lookup_uint64(cfg, cfg_keys[i], &val); + if (err) { + (void) fprintf(stderr, + "error: label %d, %d: " + "cannot find nvlist key %s\n", + l, i, cfg_keys[i]); + return (err); + } + } + + err = nvlist_lookup_nvlist(cfg, + ZPOOL_CONFIG_VDEV_TREE, &vdev_tree_cfg); + if (err) { + (void) fprintf(stderr, + "error: label %d: cannot find nvlist key %s\n", + l, ZPOOL_CONFIG_VDEV_TREE); + return (err); + } + + err = nvlist_lookup_uint64(vdev_tree_cfg, + ZPOOL_CONFIG_ASHIFT, ashift); + if (err) { + (void) fprintf(stderr, + "error: label %d: cannot find nvlist key %s\n", + l, ZPOOL_CONFIG_ASHIFT); + return (err); + } + + if (*ashift == 0) { + (void) fprintf(stderr, + "error: label %d: nvlist key %s is zero\n", + l, ZPOOL_CONFIG_ASHIFT); + return (err); + } + + return (0); +} + +static int +zhack_repair_undetach(uberblock_t *ub, nvlist_t *cfg, const int l) +{ + /* + * Uberblock root block pointer has valid birth TXG. + * Copying it to the label NVlist + */ + if (ub->ub_rootbp.blk_birth != 0) { + const uint64_t txg = ub->ub_rootbp.blk_birth; + ub->ub_txg = txg; + + if (nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG) != 0) { + (void) fprintf(stderr, + "error: label %d: " + "Failed to remove pool creation TXG\n", + l); + return (1); + } + + if (nvlist_remove_all(cfg, ZPOOL_CONFIG_POOL_TXG) != 0) { + (void) fprintf(stderr, + "error: label %d: Failed to remove pool TXG to " + "be replaced.\n", + l); + return (1); + } + + if (nvlist_add_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, txg) != 0) { + (void) fprintf(stderr, + "error: label %d: " + "Failed to add pool TXG of %" PRIu64 "\n", + l, txg); + return (1); + } + } + + return (0); +} + +static boolean_t +zhack_repair_write_label(const int l, const int fd, const int byteswap, + void *data, zio_eck_t *eck, const uint64_t offset, const uint64_t abdsize) +{ + zio_cksum_t actual_cksum; + zhack_repair_calc_cksum(byteswap, data, offset, abdsize, eck, + &actual_cksum); + zio_cksum_t expected_cksum = eck->zec_cksum; + ssize_t err; + + if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) + return (B_FALSE); + + eck->zec_cksum = actual_cksum; + + err = pwrite64(fd, data, abdsize, offset); + if (err == -1) { + (void) fprintf(stderr, "error: cannot write label %d: %s\n", + l, strerror(errno)); + return (B_FALSE); + } else if (err != abdsize) { + (void) fprintf(stderr, "error: bad write size label %d\n", l); + return (B_FALSE); + } else { + (void) fprintf(stderr, + "label %d: wrote %" PRIu64 " bytes at offset %" PRIu64 "\n", + l, abdsize, offset); + } + + return (B_TRUE); +} + +static void +zhack_repair_write_uberblock(vdev_label_t *vl, const int l, + const uint64_t ashift, const int fd, const int byteswap, + const uint64_t label_offset, uint32_t *labels_repaired) +{ + void *ub_data = + (char *)vl + offsetof(vdev_label_t, vl_uberblock); + zio_eck_t *ub_eck = + (zio_eck_t *) + ((char *)(ub_data) + (ASHIFT_UBERBLOCK_SIZE(ashift))) - 1; + + if (ub_eck->zec_magic != 0) { + (void) fprintf(stderr, + "error: label %d: " + "Expected Uberblock checksum magic number to " + "be 0, but got %" PRIu64 "\n", + l, ub_eck->zec_magic); + (void) fprintf(stderr, "It would appear there's already " + "a checksum for the uberblock.\n"); + return; + } + + + ub_eck->zec_magic = byteswap ? BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC; + + if (zhack_repair_write_label(l, fd, byteswap, + ub_data, ub_eck, + label_offset + offsetof(vdev_label_t, vl_uberblock), + ASHIFT_UBERBLOCK_SIZE(ashift))) + labels_repaired[l] |= REPAIR_LABEL_STATUS_UB; +} + +static void +zhack_repair_print_cksum(FILE *stream, const zio_cksum_t *cksum) +{ + (void) fprintf(stream, + "%016llx:%016llx:%016llx:%016llx", + (u_longlong_t)cksum->zc_word[0], + (u_longlong_t)cksum->zc_word[1], + (u_longlong_t)cksum->zc_word[2], + (u_longlong_t)cksum->zc_word[3]); +} + +static int +zhack_repair_test_cksum(const int byteswap, void *vdev_data, + zio_eck_t *vdev_eck, const uint64_t vdev_phys_offset, const int l) +{ + const zio_cksum_t expected_cksum = vdev_eck->zec_cksum; + zio_cksum_t actual_cksum; + zhack_repair_calc_cksum(byteswap, vdev_data, vdev_phys_offset, + VDEV_PHYS_SIZE, vdev_eck, &actual_cksum); + const uint64_t expected_magic = byteswap ? + BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC; + const uint64_t actual_magic = vdev_eck->zec_magic; + int err = 0; + if (actual_magic != expected_magic) { + (void) fprintf(stderr, "error: label %d: " + "Expected " + "the nvlist checksum magic number to not be %" + PRIu64 " not %" PRIu64 "\n", + l, expected_magic, actual_magic); + err = ECKSUM; + } + if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { + (void) fprintf(stderr, "error: label %d: " + "Expected the nvlist checksum to be ", l); + (void) zhack_repair_print_cksum(stderr, + &expected_cksum); + (void) fprintf(stderr, " not "); + zhack_repair_print_cksum(stderr, &actual_cksum); + (void) fprintf(stderr, "\n"); + err = ECKSUM; + } + return (err); +} + +static void +zhack_repair_one_label(const zhack_repair_op_t op, const int fd, + vdev_label_t *vl, const uint64_t label_offset, const int l, + uint32_t *labels_repaired) +{ + ssize_t err; + uberblock_t *ub = (uberblock_t *)vl->vl_uberblock; + void *vdev_data = + (char *)vl + offsetof(vdev_label_t, vl_vdev_phys); + zio_eck_t *vdev_eck = + (zio_eck_t *)((char *)(vdev_data) + VDEV_PHYS_SIZE) - 1; + const uint64_t vdev_phys_offset = + label_offset + offsetof(vdev_label_t, vl_vdev_phys); const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION, ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID }; - boolean_t labels_repaired[VDEV_LABELS] = {0}; - boolean_t repaired = B_FALSE; + nvlist_t *cfg; + nvlist_t *vdev_tree_cfg = NULL; + uint64_t ashift; + int byteswap; + + err = zhack_repair_read_label(fd, vl, label_offset, l); + if (err) + return; + + if (vdev_eck->zec_magic == 0) { + (void) fprintf(stderr, "error: label %d: " + "Expected the nvlist checksum magic number to not be zero" + "\n", + l); + (void) fprintf(stderr, "There should already be a checksum " + "for the label.\n"); + return; + } + + byteswap = + (vdev_eck->zec_magic == BSWAP_64((uint64_t)ZEC_MAGIC)); + + if (byteswap) { + byteswap_uint64_array(&vdev_eck->zec_cksum, + sizeof (zio_cksum_t)); + vdev_eck->zec_magic = BSWAP_64(vdev_eck->zec_magic); + } + + if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 && + zhack_repair_test_cksum(byteswap, vdev_data, vdev_eck, + vdev_phys_offset, l) != 0) { + (void) fprintf(stderr, "It would appear checksums are " + "corrupted. Try zhack repair label -c \n"); + return; + } + + err = nvlist_unpack(vl->vl_vdev_phys.vp_nvlist, + VDEV_PHYS_SIZE - sizeof (zio_eck_t), &cfg, 0); + if (err) { + (void) fprintf(stderr, + "error: cannot unpack nvlist label %d\n", l); + return; + } + + err = zhack_repair_check_label(ub, + l, cfg_keys, ARRAY_SIZE(cfg_keys), cfg, vdev_tree_cfg, &ashift); + if (err) + return; + + if ((op & ZHACK_REPAIR_OP_UNDETACH) != 0) { + char *buf; + size_t buflen; + + err = zhack_repair_undetach(ub, cfg, l); + if (err) + return; + + buf = vl->vl_vdev_phys.vp_nvlist; + buflen = VDEV_PHYS_SIZE - sizeof (zio_eck_t); + if (nvlist_pack(cfg, &buf, &buflen, NV_ENCODE_XDR, 0) != 0) { + (void) fprintf(stderr, + "error: label %d: Failed to pack nvlist\n", l); + return; + } + + zhack_repair_write_uberblock(vl, + l, ashift, fd, byteswap, label_offset, labels_repaired); + } + + if (zhack_repair_write_label(l, fd, byteswap, vdev_data, vdev_eck, + vdev_phys_offset, VDEV_PHYS_SIZE)) + labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM; + + fsync(fd); +} + +static const char * +zhack_repair_label_status(const uint32_t label_status, + const uint32_t to_check) +{ + return ((label_status & to_check) != 0 ? "repaired" : "skipped"); +} + +static int +zhack_label_repair(const zhack_repair_op_t op, const int argc, char **argv) +{ + uint32_t labels_repaired[VDEV_LABELS] = {0}; vdev_label_t labels[VDEV_LABELS] = {{{0}}}; - struct stat st; + struct stat64 st; int fd; + off_t filesize; + uint32_t repaired = 0; abd_init(); - argc -= 1; - argv += 1; - if (argc < 1) { (void) fprintf(stderr, "error: missing device\n"); usage(); @@ -511,93 +871,21 @@ zhack_repair_label_cksum(int argc, char **argv) fatal(NULL, FTAG, "cannot open '%s': %s", argv[0], strerror(errno)); - if (stat(argv[0], &st) != 0) + if (fstat64_blk(fd, &st) != 0) fatal(NULL, FTAG, "cannot stat '%s': %s", argv[0], strerror(errno)); + filesize = st.st_size; + (void) fprintf(stderr, "Calculated filesize to be %jd\n", + (intmax_t)filesize); + + if (filesize % sizeof (vdev_label_t) != 0) + filesize = + (filesize / sizeof (vdev_label_t)) * sizeof (vdev_label_t); + for (int l = 0; l < VDEV_LABELS; l++) { - uint64_t label_offset, offset; - zio_cksum_t expected_cksum; - zio_cksum_t actual_cksum; - zio_cksum_t verifier; - zio_eck_t *eck; - nvlist_t *cfg; - int byteswap; - uint64_t val; - ssize_t err; - - vdev_label_t *vl = &labels[l]; - - label_offset = vdev_label_offset(st.st_size, l, 0); - err = pread64(fd, vl, sizeof (vdev_label_t), label_offset); - if (err == -1) { - (void) fprintf(stderr, "error: cannot read " - "label %d: %s\n", l, strerror(errno)); - continue; - } else if (err != sizeof (vdev_label_t)) { - (void) fprintf(stderr, "error: bad label %d read size " - "\n", l); - continue; - } - - err = nvlist_unpack(vl->vl_vdev_phys.vp_nvlist, - VDEV_PHYS_SIZE - sizeof (zio_eck_t), &cfg, 0); - if (err) { - (void) fprintf(stderr, "error: cannot unpack nvlist " - "label %d\n", l); - continue; - } - - for (int i = 0; i < ARRAY_SIZE(cfg_keys); i++) { - err = nvlist_lookup_uint64(cfg, cfg_keys[i], &val); - if (err) { - (void) fprintf(stderr, "error: label %d: " - "cannot find nvlist key %s\n", - l, cfg_keys[i]); - continue; - } - } - - void *data = (char *)vl + offsetof(vdev_label_t, vl_vdev_phys); - eck = (zio_eck_t *)((char *)(data) + VDEV_PHYS_SIZE) - 1; - - offset = label_offset + offsetof(vdev_label_t, vl_vdev_phys); - ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0); - - byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); - if (byteswap) - byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); - - expected_cksum = eck->zec_cksum; - eck->zec_cksum = verifier; - - abd_t *abd = abd_get_from_buf(data, VDEV_PHYS_SIZE); - ci->ci_func[byteswap](abd, VDEV_PHYS_SIZE, NULL, &actual_cksum); - abd_free(abd); - - if (byteswap) - byteswap_uint64_array(&expected_cksum, - sizeof (zio_cksum_t)); - - if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) - continue; - - eck->zec_cksum = actual_cksum; - - err = pwrite64(fd, data, VDEV_PHYS_SIZE, offset); - if (err == -1) { - (void) fprintf(stderr, "error: cannot write " - "label %d: %s\n", l, strerror(errno)); - continue; - } else if (err != VDEV_PHYS_SIZE) { - (void) fprintf(stderr, "error: bad write size " - "label %d\n", l); - continue; - } - - fsync(fd); - - labels_repaired[l] = B_TRUE; + zhack_repair_one_label(op, fd, &labels[l], + vdev_label_offset(filesize, l, 0), l, labels_repaired); } close(fd); @@ -605,17 +893,51 @@ zhack_repair_label_cksum(int argc, char **argv) abd_fini(); for (int l = 0; l < VDEV_LABELS; l++) { - (void) printf("label %d: %s\n", l, - labels_repaired[l] ? "repaired" : "skipped"); - repaired |= labels_repaired[l]; + const uint32_t lr = labels_repaired[l]; + (void) printf("label %d: ", l); + (void) printf("uberblock: %s ", + zhack_repair_label_status(lr, REPAIR_LABEL_STATUS_UB)); + (void) printf("checksum: %s\n", + zhack_repair_label_status(lr, REPAIR_LABEL_STATUS_CKSUM)); + repaired |= lr; } - if (repaired) + if (repaired > 0) return (0); return (1); } +static int +zhack_do_label_repair(int argc, char **argv) +{ + zhack_repair_op_t op = ZHACK_REPAIR_OP_UNKNOWN; + int c; + + optind = 1; + while ((c = getopt(argc, argv, "+cu")) != -1) { + switch (c) { + case 'c': + op |= ZHACK_REPAIR_OP_CKSUM; + break; + case 'u': + op |= ZHACK_REPAIR_OP_UNDETACH; + break; + default: + usage(); + break; + } + } + + argc -= optind; + argv += optind; + + if (op == ZHACK_REPAIR_OP_UNKNOWN) + op = ZHACK_REPAIR_OP_CKSUM; + + return (zhack_label_repair(op, argc, argv)); +} + static int zhack_do_label(int argc, char **argv) { @@ -632,7 +954,7 @@ zhack_do_label(int argc, char **argv) subcommand = argv[0]; if (strcmp(subcommand, "repair") == 0) { - err = zhack_repair_label_cksum(argc, argv); + err = zhack_do_label_repair(argc, argv); } else { (void) fprintf(stderr, "error: unknown subcommand: %s\n", subcommand); diff --git a/man/man1/zhack.1 b/man/man1/zhack.1 index 26b8156b4008..937f1e9168c2 100644 --- a/man/man1/zhack.1 +++ b/man/man1/zhack.1 @@ -98,10 +98,29 @@ feature is now required to read the pool MOS. .It Xo .Nm zhack .Cm label repair +.Op Fl cu .Ar device .Xc -Repair corrupted labels by rewriting the checksum using the presumed valid -contents of the label. +Repair labels of a specified +.Ar device +according to options. +.Pp +Flags may be combined to do their functions simultaneously. +. +.Pp +The +.Fl c +flag repairs corrupted label checksums +. +.Pp +The +.Fl u +flag restores the label on a detached device +.Pp +Example: +.Nm zhack Cm label repair Fl cu Ar device + Fix checksums and undetach a device +. .El . .Sh GLOBAL OPTIONS diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 55991cfeaf78..3730f2b27038 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -325,7 +325,8 @@ tests = ['zfs_wait_deleteq', 'zfs_wait_getsubopt'] tags = ['functional', 'cli_root', 'zfs_wait'] [tests/functional/cli_root/zhack] -tests = ['zhack_label_checksum'] +tests = ['zhack_label_repair_001', 'zhack_label_repair_002', + 'zhack_label_repair_003', 'zhack_label_repair_004'] pre = post = tags = ['functional', 'cli_root', 'zhack'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 74295b86ddc2..0112d28d0c19 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -250,6 +250,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/cli_root/zpool_upgrade/zpool_upgrade.cfg \ functional/cli_root/zpool_upgrade/zpool_upgrade.kshlib \ functional/cli_root/zpool_wait/zpool_wait.kshlib \ + functional/cli_root/zhack/library.kshlib \ functional/cli_user/misc/misc.cfg \ functional/cli_user/zfs_list/zfs_list.cfg \ functional/cli_user/zfs_list/zfs_list.kshlib \ @@ -932,7 +933,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zfs/zfs_001_neg.ksh \ functional/cli_root/zfs/zfs_002_pos.ksh \ functional/cli_root/zfs/zfs_003_neg.ksh \ - functional/cli_root/zhack/zhack_label_checksum.ksh \ + functional/cli_root/zhack/zhack_label_repair_001.ksh \ + functional/cli_root/zhack/zhack_label_repair_002.ksh \ + functional/cli_root/zhack/zhack_label_repair_003.ksh \ + functional/cli_root/zhack/zhack_label_repair_004.ksh \ functional/cli_root/zpool_add/add_nested_replacing_spare.ksh \ functional/cli_root/zpool_add/add-o_ashift.ksh \ functional/cli_root/zpool_add/add_prop_ashift.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib b/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib new file mode 100644 index 000000000000..880a78861630 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib @@ -0,0 +1,361 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by vStack. All rights reserved. +# + +. "$STF_SUITE"/include/libtest.shlib +. "$STF_SUITE"/include/blkdev.shlib + +# +# Description: +# +# Test whether zhack label repair commands can recover detached devices +# and corrupted checksums with a variety of sizes, and ensure +# the purposes of either command is cleanly separated from the others. +# +# Strategy: +# +# Tests are done on loopback devices with sizes divisible by label size and sizes that are not. +# +# Test one: +# +# 1. Create pool on a loopback device with some test data +# 2. Export the pool. +# 3. Corrupt all label checksums in the pool +# 4. Check that pool cannot be imported +# 5. Verify that it cannot be imported after using zhack label repair -u +# to ensure that the -u option will quit on corrupted checksums. +# 6. Use zhack label repair -c on device +# 7. Check that pool can be imported and that data is intact +# +# Test two: +# +# 1. Create pool on a loopback device with some test data +# 2. Detach either device from the mirror +# 3. Export the pool +# 4. Remove the non-detached device and its backing file +# 5. Verify that the remaining detached device cannot be imported +# 6. Verify that it cannot be imported after using zhack label repair -c +# to ensure that the -c option will not undetach a device. +# 7. Use zhack label repair -u on device +# 8. Verify that the detached device can be imported and that data is intact +# +# Test three: +# +# 1. Create pool on a loopback device with some test data +# 2. Detach either device from the mirror +# 3. Export the pool +# 4. Remove the non-detached device and its backing file +# 5. Corrupt all label checksums on the remaining device +# 6. Verify that the remaining detached device cannot be imported +# 7. Verify that it cannot be imported after using zhack label repair -u +# to ensure that the -u option will quit on corrupted checksums. +# 8. Verify that it cannot be imported after using zhack label repair -c +# -c should repair the checksums, but not undetach a device. +# 9. Use zhack label repair -u on device +# 10. Verify that the detached device can be imported and that data is intact +# +# Test four: +# +# 1. Create pool on a loopback device with some test data +# 2. Detach either device from the mirror +# 3. Export the pool +# 4. Remove the non-detached device and its backing file +# 5. Corrupt all label checksums on the remaining device +# 6. Verify that the remaining detached device cannot be imported +# 7. Use zhack label repair -cu on device to attempt to fix checksums and +# undetach the device in a single operation. +# 8. Verify that the detached device can be imported and that data is intact +# + +log_assert "Verify zhack label repair will repair label checksums and uberblocks" +log_onexit cleanup + +LABEL_SIZE="$((2**18))" +LABEL_NVLIST_END="$((LABEL_SIZE / 2))" +LABEL_CKSUM_SIZE="32" +LABEL_CKSUM_START="$(( LABEL_NVLIST_END - LABEL_CKSUM_SIZE ))" + +VIRTUAL_DISK=$TEST_BASE_DIR/disk +VIRTUAL_MIRROR_DISK=$TEST_BASE_DIR/mirrordisk + +VIRTUAL_DEVICE= +VIRTUAL_MIRROR_DEVICE= + +function cleanup_lo +{ + L_DEVICE="$1" + + if [[ -e $L_DEVICE ]]; then + if is_linux; then + log_must losetup -d "$L_DEVICE" + elif is_freebsd; then + log_must mdconfig -d -u "$L_DEVICE" + else + log_must lofiadm -d "$L_DEVICE" + fi + fi +} + +function cleanup +{ + poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL" + cleanup_lo "$VIRTUAL_DEVICE" + cleanup_lo "$VIRTUAL_MIRROR_DEVICE" + VIRTUAL_DEVICE= + VIRTUAL_MIRROR_DEVICE= + [[ -f "$VIRTUAL_DISK" ]] && log_must rm "$VIRTUAL_DISK" + [[ -f "$VIRTUAL_MIRROR_DISK" ]] && log_must rm "$VIRTUAL_MIRROR_DISK" +} + +RAND_MAX="$((2**15 - 1))" +function get_devsize +{ + if [ "$RANDOM" -gt "$(( RAND_MAX / 2 ))" ]; then + echo "$(( MINVDEVSIZE + RANDOM ))" + else + echo "$MINVDEVSIZE" + fi +} + +function pick_logop +{ + L_SHOULD_SUCCEED="$1" + + l_logop="log_mustnot" + if [ "$L_SHOULD_SUCCEED" == true ]; then + l_logop="log_must" + fi + + echo "$l_logop" +} + +function check_dataset +{ + L_SHOULD_SUCCEED="$1" + L_LOGOP="$(pick_logop "$L_SHOULD_SUCCEED")" + + "$L_LOGOP" mounted "$TESTPOOL"/"$TESTFS" + + "$L_LOGOP" test -f "$TESTDIR"/"test" +} + +function setup_dataset +{ + log_must zfs create "$TESTPOOL"/"$TESTFS" + + log_must mkdir -p "$TESTDIR" + log_must zfs set mountpoint="$TESTDIR" "$TESTPOOL"/"$TESTFS" + + log_must mounted "$TESTPOOL"/"$TESTFS" + + log_must touch "$TESTDIR"/"test" + log_must test -f "$TESTDIR"/"test" + + log_must zpool sync "$TESTPOOL" + + check_dataset true +} + +function get_practical_size +{ + L_SIZE="$1" + + if [ "$((L_SIZE % LABEL_SIZE))" -ne 0 ]; then + echo "$(((L_SIZE / LABEL_SIZE) * LABEL_SIZE))" + else + echo "$L_SIZE" + fi +} + +function corrupt_sized_label_checksum +{ + L_SIZE="$1" + L_LABEL="$2" + L_DEVICE="$3" + + L_PRACTICAL_SIZE="$(get_practical_size "$L_SIZE")" + + typeset -a L_OFFSETS=("$LABEL_CKSUM_START" \ + "$((LABEL_SIZE + LABEL_CKSUM_START))" \ + "$(((L_PRACTICAL_SIZE - LABEL_SIZE*2) + LABEL_CKSUM_START))" \ + "$(((L_PRACTICAL_SIZE - LABEL_SIZE) + LABEL_CKSUM_START))") + + dd if=/dev/urandom of="$L_DEVICE" \ + seek="${L_OFFSETS["$L_LABEL"]}" bs=1 count="$LABEL_CKSUM_SIZE" \ + conv=notrunc +} + +function corrupt_labels +{ + L_SIZE="$1" + L_DISK="$2" + + corrupt_sized_label_checksum "$L_SIZE" 0 "$L_DISK" + corrupt_sized_label_checksum "$L_SIZE" 1 "$L_DISK" + corrupt_sized_label_checksum "$L_SIZE" 2 "$L_DISK" + corrupt_sized_label_checksum "$L_SIZE" 3 "$L_DISK" +} + +function try_import_and_repair +{ + L_REPAIR_SHOULD_SUCCEED="$1" + L_IMPORT_SHOULD_SUCCEED="$2" + L_OP="$3" + L_POOLDISK="$4" + L_REPAIR_LOGOP="$(pick_logop "$L_REPAIR_SHOULD_SUCCEED")" + L_IMPORT_LOGOP="$(pick_logop "$L_IMPORT_SHOULD_SUCCEED")" + + log_mustnot zpool import "$TESTPOOL" -d "$L_POOLDISK" + + "$L_REPAIR_LOGOP" zhack label repair "$L_OP" "$L_POOLDISK" + + "$L_IMPORT_LOGOP" zpool import "$TESTPOOL" -d "$L_POOLDISK" + + check_dataset "$L_IMPORT_SHOULD_SUCCEED" +} + +function prepare_vdev +{ + L_SIZE="$1" + L_BACKFILE="$2" + + l_devname= + if truncate -s "$L_SIZE" "$L_BACKFILE"; then + if is_linux; then + l_devname="$(losetup -f "$L_BACKFILE" --show)" + elif is_freebsd; then + l_devname=/dev/"$(mdconfig -a -t vnode -f "$L_BACKFILE")" + else + l_devname="$(lofiadm -a "$L_BACKFILE")" + fi + fi + echo "$l_devname" +} + +function run_test_one +{ + L_SIZE="$1" + + VIRTUAL_DEVICE="$(prepare_vdev "$L_SIZE" "$VIRTUAL_DISK")" + log_must test -e "$VIRTUAL_DEVICE" + + log_must zpool create "$TESTPOOL" "$VIRTUAL_DEVICE" + + setup_dataset + + log_must zpool export "$TESTPOOL" + + corrupt_labels "$L_SIZE" "$VIRTUAL_DISK" + + try_import_and_repair false false "-u" "$VIRTUAL_DEVICE" + + try_import_and_repair true true "-c" "$VIRTUAL_DEVICE" + + cleanup + + log_pass "zhack label repair corruption test passed with a randomized size of $L_SIZE" +} + +function make_mirrored_pool +{ + L_SIZE="$1" + + VIRTUAL_DEVICE="$(prepare_vdev "$L_SIZE" "$VIRTUAL_DISK")" + log_must test -e "$VIRTUAL_DEVICE" + VIRTUAL_MIRROR_DEVICE="$(prepare_vdev "$L_SIZE" "$VIRTUAL_MIRROR_DISK")" + log_must test -e "$VIRTUAL_MIRROR_DEVICE" + + log_must zpool create "$TESTPOOL" "$VIRTUAL_DEVICE" + log_must zpool attach "$TESTPOOL" "$VIRTUAL_DEVICE" "$VIRTUAL_MIRROR_DEVICE" +} + +function export_and_cleanup_vdisk +{ + log_must zpool export "$TESTPOOL" + + cleanup_lo "$VIRTUAL_DEVICE" + + VIRTUAL_DEVICE= + + log_must rm "$VIRTUAL_DISK" +} + +function run_test_two +{ + L_SIZE="$1" + + make_mirrored_pool "$L_SIZE" + + setup_dataset + + log_must zpool detach "$TESTPOOL" "$VIRTUAL_MIRROR_DEVICE" + + export_and_cleanup_vdisk + + try_import_and_repair false false "-c" "$VIRTUAL_MIRROR_DEVICE" + + try_import_and_repair true true "-u" "$VIRTUAL_MIRROR_DEVICE" + + cleanup + + log_pass "zhack label repair detached test passed with a randomized size of $L_SIZE" +} + +function run_test_three +{ + L_SIZE="$1" + + make_mirrored_pool "$L_SIZE" + + setup_dataset + + log_must zpool detach "$TESTPOOL" "$VIRTUAL_MIRROR_DEVICE" + + export_and_cleanup_vdisk + + corrupt_labels "$L_SIZE" "$VIRTUAL_MIRROR_DISK" + + try_import_and_repair false false "-u" "$VIRTUAL_MIRROR_DEVICE" + + try_import_and_repair true false "-c" "$VIRTUAL_MIRROR_DEVICE" + + try_import_and_repair true true "-u" "$VIRTUAL_MIRROR_DEVICE" + + cleanup + + log_pass "zhack label repair corruption and detached test passed with a randomized size of $L_SIZE" +} + +function run_test_four +{ + L_SIZE="$1" + + make_mirrored_pool "$L_SIZE" + + setup_dataset + + log_must zpool detach "$TESTPOOL" "$VIRTUAL_MIRROR_DEVICE" + + export_and_cleanup_vdisk + + corrupt_labels "$L_SIZE" "$VIRTUAL_MIRROR_DISK" + + try_import_and_repair true true "-cu" "$VIRTUAL_MIRROR_DEVICE" + + cleanup + + log_pass "zhack label repair corruption and detached single-command test passed with a randomized size of $L_SIZE." +} diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_checksum.ksh b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_checksum.ksh deleted file mode 100755 index 67c7e7c4487d..000000000000 --- a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_checksum.ksh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/ksh - -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright (c) 2021 by vStack. All rights reserved. -# - -. $STF_SUITE/include/libtest.shlib -. $STF_SUITE/include/blkdev.shlib - -# -# Description: -# zhack label repair will calculate and rewrite label checksum if invalid -# -# Strategy: -# 1. Create pool with some number of vdevs and export it -# 2. Corrupt all labels checksums -# 3. Check that pool cannot be imported -# 4. Use zhack to repair labels checksums -# 5. Check that pool can be imported -# - -log_assert "Verify zhack label repair will repair labels checksums" -log_onexit cleanup - -VIRTUAL_DISK=$TEST_BASE_DIR/disk - -function cleanup -{ - poolexists $TESTPOOL && destroy_pool $TESTPOOL - [[ -f $VIRTUAL_DISK ]] && log_must rm $VIRTUAL_DISK -} - -log_must truncate -s $(($MINVDEVSIZE * 8)) $VIRTUAL_DISK - -log_must zpool create $TESTPOOL $VIRTUAL_DISK -log_must zpool export $TESTPOOL - -log_mustnot zhack label repair $VIRTUAL_DISK - -corrupt_label_checksum 0 $VIRTUAL_DISK -corrupt_label_checksum 1 $VIRTUAL_DISK -corrupt_label_checksum 2 $VIRTUAL_DISK -corrupt_label_checksum 3 $VIRTUAL_DISK - -log_mustnot zpool import $TESTPOOL -d $TEST_BASE_DIR - -log_must zhack label repair $VIRTUAL_DISK - -log_must zpool import $TESTPOOL -d $TEST_BASE_DIR - -cleanup - -log_pass "zhack label repair works correctly." diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh new file mode 100755 index 000000000000..2a511e9efcb6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# + +# +# Description: +# +# Test whether zhack label repair can recover +# corrupted checksums on devices of varied size, +# but not undetached devices. +# +# Strategy: +# +# 1. Create pool on a loopback device with some test data +# 2. Export the pool. +# 3. Corrupt all label checksums in the pool +# 4. Check that pool cannot be imported +# 5. Verify that it cannot be imported after using zhack label repair -u +# to ensure that the -u option will quit on corrupted checksums. +# 6. Use zhack label repair -c on device +# 7. Check that pool can be imported and that data is intact + +. "$STF_SUITE"/tests/functional/cli_root/zhack/library.kshlib + +run_test_one "$(get_devsize)" diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_002.ksh b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_002.ksh new file mode 100755 index 000000000000..4f1e61a39857 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_002.ksh @@ -0,0 +1,31 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# + +# +# Description: +# +# Test whether zhack label repair can recover +# detached drives on devices of varied size, but not +# repair corrupted checksums. +# +# Strategy: +# +# 1. Create pool on a loopback device with some test data +# 2. Detach either device from the mirror +# 3. Export the pool +# 4. Remove the non-detached device and its backing file +# 5. Verify that the remaining detached device cannot be imported +# 6. Verify that it cannot be imported after using zhack label repair -c +# to ensure that the -c option will not undetach a device. +# 7. Use zhack label repair -u on device +# 8. Verify that the detached device can be imported and that data is intact + +. "$STF_SUITE"/tests/functional/cli_root/zhack/library.kshlib + +run_test_two "$(get_devsize)" diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_003.ksh b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_003.ksh new file mode 100755 index 000000000000..7e82363d2f46 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_003.ksh @@ -0,0 +1,33 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# + +# +# Description: +# +# Test whether zhack label repair can recover a device of varied size with +# corrupted checksums and which has been detached. +# +# Strategy: +# +# 1. Create pool on a loopback device with some test data +# 2. Detach either device from the mirror +# 3. Export the pool +# 4. Remove the non-detached device and its backing file +# 5. Corrupt all label checksums on the remaining device +# 6. Verify that the remaining detached device cannot be imported +# 7. Verify that it cannot be imported after using zhack label repair -u +# to ensure that the -u option will quit on corrupted checksums. +# 8. Verify that it cannot be imported after using zhack label repair -c +# -c should repair the checksums, but not undetach a device. +# 9. Use zhack label repair -u on device +# 10. Verify that the detached device can be imported and that data is intact + +. "$STF_SUITE"/tests/functional/cli_root/zhack/library.kshlib + +run_test_three "$(get_devsize)" diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_004.ksh b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_004.ksh new file mode 100755 index 000000000000..0b739402b199 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_004.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# + +# +# Description: +# +# Test whether zhack label repair can recover a device of varied size with +# corrupted checksums and which has been detached (in one command). +# +# Strategy: +# +# 1. Create pool on a loopback device with some test data +# 2. Detach either device from the mirror +# 3. Export the pool +# 4. Remove the non-detached device and its backing file +# 5. Corrupt all label checksums on the remaining device +# 6. Verify that the remaining detached device cannot be imported +# 7. Use zhack label repair -cu on device to attempt to fix checksums and +# undetach the device in a single operation. +# 8. Verify that the detached device can be imported and that data is intact + +. "$STF_SUITE"/tests/functional/cli_root/zhack/library.kshlib + +run_test_four "$(get_devsize)" From 82ac409acc77935ae366b800ee7cefb14939bbae Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Thu, 4 May 2023 03:10:32 +0500 Subject: [PATCH 03/25] zpool import -m also removing spare and cache when log device is missing spa_import() relies on a pool config fetched by spa_try_import() for spare/cache devices. Import flags are not passed to spa_tryimport(), which makes it return early due to a missing log device and missing retrieving the cache device and spare eventually. Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch the correct configuration regardless of the missing log device. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Ameer Hamza Closes #14794 --- module/zfs/spa.c | 10 +++ tests/runfiles/common.run | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../zpool_import/import_log_missing.ksh | 75 +++++++++++++++++++ 4 files changed, 87 insertions(+), 1 deletion(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/import_log_missing.ksh diff --git a/module/zfs/spa.c b/module/zfs/spa.c index dd4a442d97a1..c2a67fbc7c55 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -6378,6 +6378,16 @@ spa_tryimport(nvlist_t *tryconfig) spa->spa_config_source = SPA_CONFIG_SRC_SCAN; } + /* + * spa_import() relies on a pool config fetched by spa_try_import() + * for spare/cache devices. Import flags are not passed to + * spa_tryimport(), which makes it return early due to a missing log + * device and missing retrieving the cache device and spare eventually. + * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch + * the correct configuration regardless of the missing log device. + */ + spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG; + error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); /* diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 3730f2b27038..e2137ac596d9 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -422,7 +422,7 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'import_cachefile_mirror_detached', 'import_cachefile_paths_changed', 'import_cachefile_shared_device', - 'import_devices_missing', + 'import_devices_missing', 'import_log_missing', 'import_paths_changed', 'import_rewind_config_changed', 'import_rewind_device_replaced'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 0112d28d0c19..9299a4ca9b47 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1056,6 +1056,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_import/import_cachefile_paths_changed.ksh \ functional/cli_root/zpool_import/import_cachefile_shared_device.ksh \ functional/cli_root/zpool_import/import_devices_missing.ksh \ + functional/cli_root/zpool_import/import_log_missing.ksh \ functional/cli_root/zpool_import/import_paths_changed.ksh \ functional/cli_root/zpool_import/import_rewind_config_changed.ksh \ functional/cli_root/zpool_import/import_rewind_device_replaced.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_log_missing.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_log_missing.ksh new file mode 100755 index 000000000000..f12cac78540f --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_log_missing.ksh @@ -0,0 +1,75 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# Import with missing log device should not remove spare/cache. +# +# STRATEGY: +# 1. Create a pool. +# 2. Add spare, cache and log devices to the pool. +# 3. Export the pool. +# 4. Remove the log device. +# 5. Import the pool with -m flag. +# 6. Verify that spare and cache are still present in the pool. +# + +verify_runnable "global" + +log_onexit cleanup + +function test_missing_log +{ + typeset poolcreate="$1" + typeset cachevdev="$2" + typeset sparevdev="$3" + typeset logvdev="$4" + typeset missingvdev="$4" + + log_note "$0: pool '$poolcreate', adding $cachevdev, $sparevdev," \ + "$logvdev then moving away $missingvdev." + + log_must zpool create $TESTPOOL1 $poolcreate + + log_must zpool add $TESTPOOL1 cache $cachevdev spare $sparevdev \ + log $logvdev + + log_must_busy zpool export $TESTPOOL1 + + log_must mv $missingvdev $BACKUP_DEVICE_DIR + + log_must zpool import -m -d $DEVICE_DIR $TESTPOOL1 + + CACHE_PRESENT=$(zpool status -v $TESTPOOL1 | grep $cachevdev) + + SPARE_PRESENT=$(zpool status -v $TESTPOOL1 | grep $sparevdev) + + if [ -z "$CACHE_PRESENT"] || [ -z "SPARE_PRESENT"] + then + log_fail "cache/spare vdev missing after importing with missing" \ + "log device" + fi + + # Cleanup + log_must zpool destroy $TESTPOOL1 + + log_note "" +} + +log_must mkdir -p $BACKUP_DEVICE_DIR + +test_missing_log "$VDEV0" "$VDEV1" "$VDEV2" "$VDEV3" + +log_pass "zpool import succeeded with missing log device" From 599df8204962036c7b1039a8613c13bdb69c2e61 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Sat, 6 May 2023 00:51:41 +0900 Subject: [PATCH 04/25] Plug memory leak in zfsdev_state. On kernel module unload, free all zfsdev state structures, except for zfsdev_state_listhead, which is statically allocated. Reviewed-by: Richard Yao Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #14824 --- module/zfs/zfs_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 22e644f75f95..3b1e2ae5fb5d 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -7862,6 +7862,8 @@ zfs_kmod_fini(void) zfs_onexit_destroy(zs->zs_onexit); if (zs->zs_zevent) zfs_zevent_destroy(zs->zs_zevent); + if (zs != &zfsdev_state_listhead) + kmem_free(zs, sizeof (zfsdev_state_t)); } zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */ From 6fa6bb051c2b83f90dc12a64e63d1cb2b0d12c96 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Sat, 6 May 2023 01:09:12 +0900 Subject: [PATCH 05/25] Simplify and optimize random_int_between(). Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #14805 --- tests/zfs-tests/include/math.shlib | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tests/zfs-tests/include/math.shlib b/tests/zfs-tests/include/math.shlib index 38d9fecea7cf..da1e77e5fb97 100644 --- a/tests/zfs-tests/include/math.shlib +++ b/tests/zfs-tests/include/math.shlib @@ -118,9 +118,7 @@ function verify_ne # # A simple function to get a random number between two bounds (inclusive) # -# Probably not the most efficient for large ranges, but it's okay. -# -# Note since we're using $RANDOM, 32767 is the largest number we +# Note since we're using $RANDOM, $min+32767 is the largest number we # can accept as the upper bound. # # $1 lower bound @@ -129,11 +127,6 @@ function random_int_between { typeset -i min=$1 typeset -i max=$2 - typeset -i rand=0 - while [[ $rand -lt $min ]] ; do - rand=$(( $RANDOM % $max + 1)) - done - - echo $rand + echo $(( (RANDOM % (max - min + 1)) + min )) } From 190290a9ac3f2f0dd0021646f2fd787ea51b08bd Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 5 May 2023 12:17:55 -0400 Subject: [PATCH 06/25] Fix two abd_gang_add_gang() issues. - There is no reason to assert that added gang is not empty. It may be weird to add an empty gang, but it is legal. - When moving chain list from the added gang clear its size, or it will trigger assertion in abd_verify() when that gang is freed. Reviewed-by: Brian Atkinson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14816 --- module/zfs/abd.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 26222d2efe3f..745ee8f02ed4 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -370,7 +370,20 @@ abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) * will retain all the free_on_free settings after being * added to the parents list. */ +#ifdef ZFS_DEBUG + /* + * If cabd had abd_parent, we have to drop it here. We can't + * transfer it to pabd, nor we can clear abd_size leaving it. + */ + if (cabd->abd_parent != NULL) { + (void) zfs_refcount_remove_many( + &cabd->abd_parent->abd_children, + cabd->abd_size, cabd); + cabd->abd_parent = NULL; + } +#endif pabd->abd_size += cabd->abd_size; + cabd->abd_size = 0; list_move_tail(&ABD_GANG(pabd).abd_gang_chain, &ABD_GANG(cabd).abd_gang_chain); ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); @@ -408,7 +421,6 @@ abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) */ if (abd_is_gang(cabd)) { ASSERT(!list_link_active(&cabd->abd_gang_link)); - ASSERT(!list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); return (abd_gang_add_gang(pabd, cabd, free_on_free)); } ASSERT(!abd_is_gang(cabd)); From 245f4a346779e29fe995f69d8eb2b724cddf5277 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 8 May 2023 10:09:30 -0700 Subject: [PATCH 07/25] ZTS: add snapshot/snapshot_002_pos exception Add snapshot_002_pos to the known list of occasional failures for FreeBSD until it can be made entirely reliable. Reviewed-by: Tino Reichardt Signed-off-by: Brian Behlendorf Issue #14831 Closes #14832 --- tests/test-runner/bin/zts-report.py.in | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index f3cfca912a57..63470bc041c6 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -257,6 +257,7 @@ if sys.platform.startswith('freebsd'): 'resilver/resilver_restart_001': ['FAIL', known_reason], 'pool_checkpoint/checkpoint_big_rewind': ['FAIL', 12622], 'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623], + 'snapshot/snapshot_002_pos': ['FAIL', '14831'], }) elif sys.platform.startswith('linux'): maybe.update({ From dd19821149cb7e3785249eb9be75dd9864c88d56 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 8 May 2023 11:17:41 -0700 Subject: [PATCH 08/25] zdb: consistent xattr output When using zdb to output the value of an xattr only interpret it as printable characters if the entire byte array is printable. Additionally, if the --parseable option is set always output the buffer contents as octal for easy parsing. Reviewed-by: Olaf Faaland Signed-off-by: Brian Behlendorf Closes #14830 --- cmd/zdb/zdb.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index ec5d1acacf85..cea80b690841 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -3322,13 +3322,22 @@ dump_znode_sa_xattr(sa_handle_t *hdl) (void) printf("\tSA xattrs: %d bytes, %d entries\n\n", sa_xattr_size, sa_xattr_entries); while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) { + boolean_t can_print = !dump_opt['P']; uchar_t *value; uint_t cnt, idx; (void) printf("\t\t%s = ", nvpair_name(elem)); nvpair_value_byte_array(elem, &value, &cnt); + for (idx = 0; idx < cnt; ++idx) { - if (isprint(value[idx])) + if (!isprint(value[idx])) { + can_print = B_FALSE; + break; + } + } + + for (idx = 0; idx < cnt; ++idx) { + if (can_print) (void) putchar(value[idx]); else (void) printf("\\%3.3o", value[idx]); From 3095ca91c261756c509d0afb4422027753e68c90 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Mon, 8 May 2023 11:20:23 -0700 Subject: [PATCH 09/25] Verify block pointers before writing them out If a block pointer is corrupted (but the block containing it checksums correctly, e.g. due to a bug that overwrites random memory), we can often detect it before the block is read, with the `zfs_blkptr_verify()` function, which is used in `arc_read()`, `zio_free()`, etc. However, such corruption is not typically recoverable. To recover from it we would need to detect the memory error before the block pointer is written to disk. This PR verifies BP's that are contained in indirect blocks and dnodes before they are written to disk, in `dbuf_write_ready()`. This way, we'll get a panic before the on-disk data is corrupted. This will help us to diagnose what's causing the corruption, as well as being much easier to recover from. To minimize performance impact, only checks that can be done without holding the spa_config_lock are performed. Additionally, when corruption is detected, the raw words of the block pointer are logged. (Note that `dprintf_bp()` is a no-op by default, but if enabled it is not safe to use with invalid block pointers.) Reviewed-by: Rich Ercolani Reviewed-by: Brian Behlendorf Reviewed-by: Paul Zuchowski Reviewed-by: Alexander Motin Signed-off-by: Matthew Ahrens Closes #14817 --- cmd/zdb/zdb.c | 8 ++-- include/sys/zio.h | 8 +++- module/zfs/arc.c | 4 +- module/zfs/dbuf.c | 16 ++++++++ module/zfs/dsl_scan.c | 3 +- module/zfs/spa.c | 2 +- module/zfs/zio.c | 92 +++++++++++++++++++++++++++++++------------ 7 files changed, 98 insertions(+), 35 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index cea80b690841..5ab13b470dc0 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -8499,8 +8499,8 @@ zdb_read_block(char *thing, spa_t *spa) !(flags & ZDB_FLAG_DECOMPRESS)) { const blkptr_t *b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); - if (zfs_blkptr_verify(spa, b, B_FALSE, BLK_VERIFY_ONLY) == - B_FALSE) { + if (zfs_blkptr_verify(spa, b, + BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) { abd_return_buf_copy(pabd, buf, lsize); borrowed = B_FALSE; buf = lbuf; @@ -8508,8 +8508,8 @@ zdb_read_block(char *thing, spa_t *spa) lbuf, lsize, psize, flags); b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); - if (failed || zfs_blkptr_verify(spa, b, B_FALSE, - BLK_VERIFY_LOG) == B_FALSE) { + if (failed || zfs_blkptr_verify(spa, b, + BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) { printf("invalid block pointer at this DVA\n"); goto out; } diff --git a/include/sys/zio.h b/include/sys/zio.h index 3463682a1065..695bc09e6cb7 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -531,6 +531,12 @@ enum blk_verify_flag { BLK_VERIFY_HALT }; +enum blk_config_flag { + BLK_CONFIG_HELD, // SCL_VDEV held for writer + BLK_CONFIG_NEEDED, // SCL_VDEV should be obtained for reader + BLK_CONFIG_SKIP, // skip checks which require SCL_VDEV +}; + extern int zio_bookmark_compare(const void *, const void *); extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, @@ -646,7 +652,7 @@ extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, - boolean_t config_held, enum blk_verify_flag blk_verify); + enum blk_config_flag blk_config, enum blk_verify_flag blk_verify); /* * Initial setup and teardown. diff --git a/module/zfs/arc.c b/module/zfs/arc.c index c50228a2682f..bf8d99f94c39 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5696,8 +5696,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, * and treat it as a checksum error. This allows an alternate blkptr * to be tried when one is available (e.g. ditto blocks). */ - if (!zfs_blkptr_verify(spa, bp, zio_flags & ZIO_FLAG_CONFIG_WRITER, - BLK_VERIFY_LOG)) { + if (!zfs_blkptr_verify(spa, bp, (zio_flags & ZIO_FLAG_CONFIG_WRITER) ? + BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { rc = SET_ERROR(ECKSUM); goto done; } diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 8193fb244079..6a50f1927add 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -4636,6 +4636,20 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) i += DNODE_MIN_SIZE; if (dnp->dn_type != DMU_OT_NONE) { fill++; + for (int j = 0; j < dnp->dn_nblkptr; + j++) { + (void) zfs_blkptr_verify(spa, + &dnp->dn_blkptr[j], + BLK_CONFIG_SKIP, + BLK_VERIFY_HALT); + } + if (dnp->dn_flags & + DNODE_FLAG_SPILL_BLKPTR) { + (void) zfs_blkptr_verify(spa, + DN_SPILL_BLKPTR(dnp), + BLK_CONFIG_SKIP, + BLK_VERIFY_HALT); + } i += dnp->dn_extra_slots * DNODE_MIN_SIZE; } @@ -4653,6 +4667,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; + (void) zfs_blkptr_verify(spa, ibp, + BLK_CONFIG_SKIP, BLK_VERIFY_HALT); fill += BP_GET_FILL(ibp); } } diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index d6a9365df120..d398b6705551 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -1970,7 +1970,8 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, DMU_USERUSED_OBJECT, tx); } arc_buf_destroy(buf, &buf); - } else if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) { + } else if (!zfs_blkptr_verify(spa, bp, + BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { /* * Sanity check the block pointer contents, this is handled * by arc_read() for the cases above. diff --git a/module/zfs/spa.c b/module/zfs/spa.c index c2a67fbc7c55..16396170273c 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -2387,7 +2387,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, * When damaged consider it to be a metadata error since we cannot * trust the BP_GET_TYPE and BP_GET_LEVEL values. */ - if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) { + if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { atomic_inc_64(&sle->sle_meta_count); return (0); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 0924fb6f40bc..365d34832c3a 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -935,9 +935,35 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); + zfs_dbgmsg("bad blkptr at %px: " + "DVA[0]=%#llx/%#llx " + "DVA[1]=%#llx/%#llx " + "DVA[2]=%#llx/%#llx " + "prop=%#llx " + "pad=%#llx,%#llx " + "phys_birth=%#llx " + "birth=%#llx " + "fill=%#llx " + "cksum=%#llx/%#llx/%#llx/%#llx", + bp, + (long long)bp->blk_dva[0].dva_word[0], + (long long)bp->blk_dva[0].dva_word[1], + (long long)bp->blk_dva[1].dva_word[0], + (long long)bp->blk_dva[1].dva_word[1], + (long long)bp->blk_dva[2].dva_word[0], + (long long)bp->blk_dva[2].dva_word[1], + (long long)bp->blk_prop, + (long long)bp->blk_pad[0], + (long long)bp->blk_pad[1], + (long long)bp->blk_phys_birth, + (long long)bp->blk_birth, + (long long)bp->blk_fill, + (long long)bp->blk_cksum.zc_word[0], + (long long)bp->blk_cksum.zc_word[1], + (long long)bp->blk_cksum.zc_word[2], + (long long)bp->blk_cksum.zc_word[3]); switch (blk_verify) { case BLK_VERIFY_HALT: - dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp); zfs_panic_recover("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_LOG: @@ -958,47 +984,54 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, * If everything checks out B_TRUE is returned. The zfs_blkptr_verify * argument controls the behavior when an invalid field is detected. * - * Modes for zfs_blkptr_verify: - * 1) BLK_VERIFY_ONLY (evaluate the block) - * 2) BLK_VERIFY_LOG (evaluate the block and log problems) - * 3) BLK_VERIFY_HALT (call zfs_panic_recover on error) + * Values for blk_verify_flag: + * BLK_VERIFY_ONLY: evaluate the block + * BLK_VERIFY_LOG: evaluate the block and log problems + * BLK_VERIFY_HALT: call zfs_panic_recover on error + * + * Values for blk_config_flag: + * BLK_CONFIG_HELD: caller holds SCL_VDEV for writer + * BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be + * obtained for reader + * BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better + * performance */ boolean_t -zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, - enum blk_verify_flag blk_verify) +zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, + enum blk_config_flag blk_config, enum blk_verify_flag blk_verify) { int errors = 0; if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid TYPE %llu", + "blkptr at %px has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid CHECKSUM %llu", + "blkptr at %px has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid COMPRESS %llu", + "blkptr at %px has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid LSIZE %llu", + "blkptr at %px has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid PSIZE %llu", + "blkptr at %px has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid ETYPE %llu", + "blkptr at %px has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } @@ -1010,10 +1043,19 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, if (!spa->spa_trust_config) return (errors == 0); - if (!config_held) - spa_config_enter(spa, SCL_VDEV, bp, RW_READER); - else + switch (blk_config) { + case BLK_CONFIG_HELD: ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER)); + break; + case BLK_CONFIG_NEEDED: + spa_config_enter(spa, SCL_VDEV, bp, RW_READER); + break; + case BLK_CONFIG_SKIP: + return (errors == 0); + default: + panic("invalid blk_config %u", blk_config); + } + /* * Pool-specific checks. * @@ -1028,20 +1070,20 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, if (vdevid >= spa->spa_root_vdev->vdev_children) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has invalid VDEV %llu", + "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has invalid VDEV %llu", + "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_hole_ops) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has hole VDEV %llu", + "blkptr at %px DVA %u has hole VDEV %llu", bp, i, (longlong_t)vdevid); continue; } @@ -1059,13 +1101,11 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, asize = vdev_gang_header_asize(vd); if (offset + asize > vd->vdev_asize) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has invalid OFFSET %llu", + "blkptr at %px DVA %u has invalid OFFSET %llu", bp, i, (longlong_t)offset); } } - if (errors > 0) - dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp); - if (!config_held) + if (blk_config == BLK_CONFIG_NEEDED) spa_config_exit(spa, SCL_VDEV, bp); return (errors == 0); @@ -1203,7 +1243,7 @@ void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { - (void) zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_HALT); + (void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); /* * The check for EMBEDDED is a performance optimization. We @@ -1282,8 +1322,8 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, { zio_t *zio; - (void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER, - BLK_VERIFY_HALT); + (void) zfs_blkptr_verify(spa, bp, (flags & ZIO_FLAG_CONFIG_WRITER) ? + BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); From 4eca03faaf6a1c05d739c738e3d5c0df2931da98 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Mon, 8 May 2023 22:35:03 +0200 Subject: [PATCH 10/25] Fixes in head_errlog feature with encryption For the head_errlog feature use dsl_dataset_hold_obj_flags() instead of dsl_dataset_hold_obj() in order to enable access to the encryption keys (if loaded). This enables reporting of errors in encrypted filesystems which are not mounted but have their keys loaded. Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #14837 --- man/man7/zpool-features.7 | 7 +- module/zfs/spa_errlog.c | 76 ++++++++----------- .../zfs_receive/zfs_receive_corrective.ksh | 19 ++++- .../zpool_status/zpool_status_005_pos.ksh | 6 +- 4 files changed, 56 insertions(+), 52 deletions(-) diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index efe9e833996a..2b7dcb63829c 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -562,13 +562,12 @@ This feature enables the upgraded version of errlog, which required an on-disk error log format change. Now the error log of each head dataset is stored separately in the zap object and keyed by the head id. -In case of encrypted filesystems with unloaded keys or unmounted encrypted -filesystems we are unable to check their snapshots or clones for errors and -these will not be reported. -In this case no filenames will be reported either. With this feature enabled, every dataset affected by an error block is listed in the output of .Nm zpool Cm status . +In case of encrypted filesystems with unloaded keys we are unable to check +their snapshots or clones for errors and these will not be reported. +An "access denied" error will be reported. .Pp \*[instant-never] . diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index e0604c4a84af..44950a769d3b 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -163,15 +163,15 @@ name_to_object(char *buf, uint64_t *obj) static int get_head_ds(spa_t *spa, uint64_t dsobj, uint64_t *head_ds) { dsl_dataset_t *ds; - int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, - dsobj, FTAG, &ds); + int error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, + dsobj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); ASSERT(head_ds); *head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } @@ -297,7 +297,8 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, dsl_dataset_t *ds; dsl_pool_t *dp = spa->spa_dsl_pool; - int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds); + int error = dsl_dataset_hold_obj_flags(dp, head_ds, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); @@ -306,23 +307,6 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, boolean_t check_snapshot = B_TRUE; error = find_birth_txg(ds, zep, &latest_txg); - /* - * If the filesystem is encrypted and the key is not loaded - * or the encrypted filesystem is not mounted the error will be EACCES. - * In that case report an error in the head filesystem and return. - */ - if (error == EACCES) { - dsl_dataset_rele(ds, FTAG); - zbookmark_phys_t zb; - zep_to_zb(head_ds, zep, &zb); - error = copyout_entry(&zb, uaddr, count); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - return (0); - } - /* * If find_birth_txg() errors out otherwise, let txg_to_consider be * equal to the spa's syncing txg: if check_filesystem() errors out @@ -334,7 +318,7 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, zep_to_zb(head_ds, zep, &zb); error = copyout_entry(&zb, uaddr, count); if (error != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } check_snapshot = B_FALSE; @@ -352,14 +336,14 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); if (error != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } } if (snap_count == 0) { /* Filesystem without snapshots. */ - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (0); } @@ -371,20 +355,21 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; uint64_t zap_clone = dsl_dir_phys(ds->ds_dir)->dd_clones; - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); /* Check only snapshots created from this file system. */ while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && snap_obj_txg <= txg_to_consider) { - error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds); + error = dsl_dataset_hold_obj_flags(dp, snap_obj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) goto out; if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds) { snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); continue; } @@ -404,13 +389,14 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, zep_to_zb(snap_obj, zep, &zb); error = copyout_entry(&zb, uaddr, count); if (error != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, + FTAG); goto out; } } snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); } if (zap_clone == 0 || aff_snap_count == 0) @@ -428,8 +414,8 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, zap_cursor_advance(zc)) { dsl_dataset_t *clone; - error = dsl_dataset_hold_obj(dp, za->za_first_integer, - FTAG, &clone); + error = dsl_dataset_hold_obj_flags(dp, za->za_first_integer, + DS_HOLD_FLAG_DECRYPT, FTAG, &clone); if (error != 0) break; @@ -444,7 +430,7 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, == snap_obj_array[i]) found = B_TRUE; } - dsl_dataset_rele(clone, FTAG); + dsl_dataset_rele_flags(clone, DS_HOLD_FLAG_DECRYPT, FTAG); if (!found) continue; @@ -474,14 +460,14 @@ find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, return (error); dsl_dataset_t *ds; - error = dsl_dataset_hold_obj(spa->spa_dsl_pool, oldest_dsobj, - FTAG, &ds); + error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, oldest_dsobj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); *top_affected_fs = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (0); } @@ -744,7 +730,8 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, dsl_dataset_t *ds; objset_t *os; - int error = dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds); + int error = dsl_dataset_hold_obj_flags(dp, zb.zb_objset, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) continue; @@ -759,7 +746,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, * truly persistent, it should re-appear after a scan. */ if (dmu_objset_from_ds(ds, &os) != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); continue; } @@ -767,7 +754,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, blkptr_t bp; if (dnode_hold(os, zep.zb_object, FTAG, &dn) != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); continue; } @@ -781,7 +768,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); if (error != 0 || BP_IS_HOLE(&bp)) continue; @@ -1259,7 +1246,8 @@ find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head, dsl_dataset_t *ds; dsl_pool_t *dp = spa->spa_dsl_pool; - int error = dsl_dataset_hold_obj(dp, old_head, FTAG, &ds); + int error = dsl_dataset_hold_obj_flags(dp, old_head, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); @@ -1267,9 +1255,9 @@ find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head, uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; while (prev_obj != 0) { - dsl_dataset_rele(ds, FTAG); - if ((error = dsl_dataset_hold_obj(dp, prev_obj, - FTAG, &ds)) == 0 && + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + if ((error = dsl_dataset_hold_obj_flags(dp, prev_obj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds)) == 0 && dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj == new_head) break; @@ -1279,7 +1267,7 @@ find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head, prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; } - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); ASSERT(prev_obj != 0); *txg = prev_obj_txg; return (0); diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh index 9ebde1cd9d32..261fc5eed8cb 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh @@ -163,7 +163,24 @@ corrupt_blocks_at_level "/$TESTPOOL/testfs5/$TESTFILE0" 0 log_must zfs unmount $TESTPOOL/testfs5 log_must zfs unload-key $TESTPOOL/testfs5 # test healing recv (on an encrypted dataset) using a raw send file -test_corrective_recv "$TESTPOOL/testfs5@snap1" $raw_backup +# This is a special case since with unloaded keys we cannot report errors +# in the filesystem. +log_must zpool scrub -w $TESTPOOL +log_must zpool status -v $TESTPOOL +log_mustnot eval "zpool status -v $TESTPOOL | \ + grep \"permission denied\"" +# make sure we will read the corruption from disk by flushing the ARC +log_must zinject -a +log_must eval "zfs recv -c $TESTPOOL/testfs5@snap1 < $raw_backup" + +log_must zpool scrub -w $TESTPOOL +log_must zpool status -v $TESTPOOL +log_mustnot eval "zpool status -v $TESTPOOL | \ + grep \"Permanent errors have been detected\"" +typeset cksum=$(md5digest $file) +[[ "$cksum" == "$checksum" ]] || \ + log_fail "Checksums differ ($cksum != $checksum)" + # non raw send file healing an encrypted dataset with an unloaded key will fail log_mustnot eval "zfs recv -c $TESTPOOL/testfs5@snap1 < $backup" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_005_pos.ksh index 04cd1892380d..ec4c67fb42f5 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_005_pos.ksh @@ -29,7 +29,7 @@ # Verify correct output with 'zpool status -v' after corrupting a file # # STRATEGY: -# 1. Create a pool, an ancrypted filesystem and a file +# 1. Create a pool, an encrypted filesystem and a file # 2. zinject checksum errors # 3. Unmount the filesystem and unload the key # 4. Scrub the pool @@ -76,8 +76,8 @@ log_must zpool sync $TESTPOOL2 log_must zpool scrub $TESTPOOL2 log_must zpool wait -t scrub $TESTPOOL2 log_must zpool status -v $TESTPOOL2 -log_must eval "zpool status -v $TESTPOOL2 | \ - grep \"Permanent errors have been detected\"" +log_mustnot eval "zpool status -v $TESTPOOL2 | \ + grep \"permission denied\"" log_mustnot eval "zpool status -v $TESTPOOL2 | grep '$file'" log_must eval "cat /$TESTPOOL2/pwd | zfs load-key $TESTPOOL2/$TESTFS1" From 6839ec6f1098c28ff7b772f1b31b832d05e6b567 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Tue, 9 May 2023 17:53:27 +0200 Subject: [PATCH 11/25] Enable the head_errlog feature to remove errors In case check_filesystem() does not error out and does not report an error, remove that error block from error lists and logs without requiring a scrub. This can happen when the original file and all snapshots/clones referencing it have been removed. Otherwise zpool status will still report that "Permanent errors have been detected..." without actually reporting any of them. To implement this change the functions introduced in corrective receive were modified to take into account the head_errlog feature. Before this change: ============================= pool: test state: ONLINE status: One or more devices has experienced an error resulting in data corruption. Applications may be affected. action: Restore the file in question if possible. Otherwise restore the entire pool from backup. see: https://openzfs.github.io/openzfs-docs/msg/ZFS-8000-8A config: NAME STATE READ WRITE CKSUM test ONLINE 0 0 0 /home/user/vdev_a ONLINE 0 0 2 errors: Permanent errors have been detected in the following files: ============================= After this change: ============================= pool: test state: ONLINE status: One or more devices has experienced an unrecoverable error. An attempt was made to correct the error. Applications are unaffected. action: Determine if the device needs to be replaced, and clear the errors using 'zpool clear' or replace the device with 'zpool replace'. see: https://openzfs.github.io/openzfs-docs/msg/ZFS-8000-9P config: NAME STATE READ WRITE CKSUM test ONLINE 0 0 0 /home/user/vdev_a ONLINE 0 0 2 errors: No known data errors ============================= Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: George Amanakis Closes #14813 --- include/sys/spa.h | 3 +- man/man8/zpool-status.8 | 3 + module/zfs/dmu_recv.c | 2 +- module/zfs/spa_errlog.c | 175 ++++++++++++++---- .../zpool_status/zpool_status_007_pos.ksh | 13 ++ 5 files changed, 159 insertions(+), 37 deletions(-) diff --git a/include/sys/spa.h b/include/sys/spa.h index b96a9ef1d42f..460ea2bfee4e 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1140,7 +1140,8 @@ extern const char *spa_state_to_name(spa_t *spa); struct zbookmark_phys; extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth); -extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb); +extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, + const uint64_t *birth); extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t state); extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd, diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index ed572e29f51f..8f9580cf086e 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -119,6 +119,9 @@ See .It Fl v Displays verbose data error information, printing out a complete list of all data errors since the last complete pool scrub. +If the head_errlog feature is enabled and files containing errors have been +removed then the respective filenames will not be reported in subsequent runs +of this command. .It Fl x Only display status for pools that are exhibiting errors or are otherwise unavailable. diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index c2ce5ce000ac..c22a95f8647f 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -1353,7 +1353,7 @@ corrective_read_done(zio_t *zio) cr_cb_data_t *data = zio->io_private; /* Corruption corrected; update error log if needed */ if (zio->io_error == 0) - spa_remove_error(data->spa, &data->zb); + spa_remove_error(data->spa, &data->zb, &zio->io_bp->blk_birth); kmem_free(data, sizeof (cr_cb_data_t)); abd_free(zio->io_abd); } diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index 44950a769d3b..31719063a227 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -493,6 +493,7 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, } uint64_t top_affected_fs; + uint64_t init_count = *count; int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs); if (error == 0) { clones_t *ct; @@ -520,6 +521,16 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, list_destroy(&clones_list); } + if (error == 0 && init_count == *count) { + /* + * If we reach this point, no errors have been detected + * in the checked filesystems/snapshots. Before returning mark + * the error block to be removed from the error lists and logs. + */ + zbookmark_phys_t zb; + zep_to_zb(head_ds, zep, &zb); + spa_remove_error(spa, &zb, &zep->zb_birth); + } return (error); } @@ -530,37 +541,111 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, * so that we can later remove the related log entries in sync context. */ static void -spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb) +spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb, + const uint64_t *birth) { char name[NAME_MAX_LEN]; if (obj == 0) return; - bookmark_to_name(healed_zb, name, sizeof (name)); - mutex_enter(&spa->spa_errlog_lock); - if (zap_contains(spa->spa_meta_objset, obj, name) == 0) { - /* - * Found an error matching healed zb, add zb to our - * tree of healed errors - */ - avl_tree_t *tree = &spa->spa_errlist_healed; - spa_error_entry_t search; - spa_error_entry_t *new; - avl_index_t where; - search.se_bookmark = *healed_zb; - mutex_enter(&spa->spa_errlist_lock); - if (avl_find(tree, &search, &where) != NULL) { - mutex_exit(&spa->spa_errlist_lock); - mutex_exit(&spa->spa_errlog_lock); - return; + boolean_t held_list = B_FALSE; + boolean_t held_log = B_FALSE; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + bookmark_to_name(healed_zb, name, sizeof (name)); + + if (zap_contains(spa->spa_meta_objset, healed_zb->zb_objset, + name) == 0) { + if (!MUTEX_HELD(&spa->spa_errlog_lock)) { + mutex_enter(&spa->spa_errlog_lock); + held_log = B_TRUE; + } + + /* + * Found an error matching healed zb, add zb to our + * tree of healed errors + */ + avl_tree_t *tree = &spa->spa_errlist_healed; + spa_error_entry_t search; + spa_error_entry_t *new; + avl_index_t where; + search.se_bookmark = *healed_zb; + if (!MUTEX_HELD(&spa->spa_errlist_lock)) { + mutex_enter(&spa->spa_errlist_lock); + held_list = B_TRUE; + } + if (avl_find(tree, &search, &where) != NULL) { + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); + return; + } + new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); + new->se_bookmark = *healed_zb; + avl_insert(tree, new, where); + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); } - new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); - new->se_bookmark = *healed_zb; - avl_insert(tree, new, where); - mutex_exit(&spa->spa_errlist_lock); + return; } - mutex_exit(&spa->spa_errlog_lock); + + zbookmark_err_phys_t healed_zep; + healed_zep.zb_object = healed_zb->zb_object; + healed_zep.zb_level = healed_zb->zb_level; + healed_zep.zb_blkid = healed_zb->zb_blkid; + + if (birth != NULL) + healed_zep.zb_birth = *birth; + else + healed_zep.zb_birth = 0; + + errphys_to_name(&healed_zep, name, sizeof (name)); + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_errlog_last); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + if (zap_contains(spa->spa_meta_objset, za.za_first_integer, + name) == 0) { + if (!MUTEX_HELD(&spa->spa_errlog_lock)) { + mutex_enter(&spa->spa_errlog_lock); + held_log = B_TRUE; + } + + avl_tree_t *tree = &spa->spa_errlist_healed; + spa_error_entry_t search; + spa_error_entry_t *new; + avl_index_t where; + search.se_bookmark = *healed_zb; + + if (!MUTEX_HELD(&spa->spa_errlist_lock)) { + mutex_enter(&spa->spa_errlist_lock); + held_list = B_TRUE; + } + + if (avl_find(tree, &search, &where) != NULL) { + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); + continue; + } + new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); + new->se_bookmark = *healed_zb; + new->se_zep = healed_zep; + avl_insert(tree, new, where); + + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); + } + } + zap_cursor_fini(&zc); } /* @@ -598,12 +683,36 @@ spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx) &cookie)) != NULL) { remove_error_from_list(spa, s, &se->se_bookmark); remove_error_from_list(spa, l, &se->se_bookmark); - bookmark_to_name(&se->se_bookmark, name, sizeof (name)); kmem_free(se, sizeof (spa_error_entry_t)); - (void) zap_remove(spa->spa_meta_objset, - spa->spa_errlog_last, name, tx); - (void) zap_remove(spa->spa_meta_objset, - spa->spa_errlog_scrub, name, tx); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + bookmark_to_name(&se->se_bookmark, name, sizeof (name)); + (void) zap_remove(spa->spa_meta_objset, + spa->spa_errlog_last, name, tx); + (void) zap_remove(spa->spa_meta_objset, + spa->spa_errlog_scrub, name, tx); + } else { + errphys_to_name(&se->se_zep, name, sizeof (name)); + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_errlog_last); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + zap_remove(spa->spa_meta_objset, + za.za_first_integer, name, tx); + } + zap_cursor_fini(&zc); + + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_errlog_scrub); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + zap_remove(spa->spa_meta_objset, + za.za_first_integer, name, tx); + } + zap_cursor_fini(&zc); + } } } @@ -612,14 +721,10 @@ spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx) * later in spa_remove_healed_errors(). */ void -spa_remove_error(spa_t *spa, zbookmark_phys_t *zb) +spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, const uint64_t *birth) { - char name[NAME_MAX_LEN]; - - bookmark_to_name(zb, name, sizeof (name)); - - spa_add_healed_error(spa, spa->spa_errlog_last, zb); - spa_add_healed_error(spa, spa->spa_errlog_scrub, zb); + spa_add_healed_error(spa, spa->spa_errlog_last, zb, birth); + spa_add_healed_error(spa, spa->spa_errlog_scrub, zb, birth); } static uint64_t diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_007_pos.ksh index c9849379f779..666ac9bfc9dd 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_007_pos.ksh @@ -39,6 +39,9 @@ # 7. Verify we report errors in the pool in 'zpool status -v' # 8. Promote clone1 # 9. Verify we report errors in the pool in 'zpool status -v' +# 10. Delete the corrupted file and origin snapshots. +# 11. Verify we do not report data errors anymore, without requiring +# a scrub. . $STF_SUITE/include/libtest.shlib @@ -95,4 +98,14 @@ log_mustnot eval "zpool status -v | grep '$TESTPOOL2/clonexx/$TESTFILE0'" log_must eval "zpool status -v | grep '$TESTPOOL2/clone2@snap3:/$TESTFILE0'" log_must eval "zpool status -v | grep '$TESTPOOL2/clone3/$TESTFILE0'" +log_must rm /$TESTPOOL2/clone1/$TESTFILE0 +log_must zfs destroy -R $TESTPOOL2/clone1@snap1 +log_must zfs destroy -R $TESTPOOL2/clone1@snap2 +log_must zfs list -r $TESTPOOL2 +log_must zpool status -v $TESTPOOL2 +log_must zpool sync +log_must zpool status -v $TESTPOOL2 +log_must eval "zpool status -v $TESTPOOL2 | \ + grep \"No known data errors\"" + log_pass "Verify reporting errors when deleting corrupted files after scrub" From b035f2b2cb9b88b1330c4b48641b8793d6460c9b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 9 May 2023 11:54:01 -0400 Subject: [PATCH 12/25] Remove single parent assertion from zio_nowait(). We only need to know if ZIO has any parent there. We do not care if it has more than one, but use of zio_unique_parent() == NULL asserts that. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14823 --- module/zfs/zio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 365d34832c3a..c17ca5e1d651 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2341,7 +2341,7 @@ zio_nowait(zio_t *zio) ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && - zio_unique_parent(zio) == NULL) { + list_is_empty(&zio->io_parent_list)) { zio_t *pio; /* From d38c815fe27c033564d1f7cc769e74eba11cfb83 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Tue, 9 May 2023 17:54:41 +0200 Subject: [PATCH 13/25] Remove duplicate code in l2arc_evict() l2arc_evict() performs the adjustment of the size of buffers to be written on L2ARC unnecessarily. l2arc_write_size() is called right before l2arc_evict() and performs those adjustments. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: George Amanakis Closes #14828 --- module/zfs/arc.c | 22 +++++++------------ .../tests/functional/trim/trim_l2arc.ksh | 2 +- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index bf8d99f94c39..a78f664c4fe8 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8198,10 +8198,17 @@ l2arc_write_size(l2arc_dev_t *dev) * iteration can occur. */ dev_size = dev->l2ad_end - dev->l2ad_start; + + /* We need to add in the worst case scenario of log block overhead. */ tsize = size + l2arc_log_blk_overhead(size, dev); - if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) + if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { + /* + * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) + * times the writesize, whichever is greater. + */ tsize += MAX(64 * 1024 * 1024, (tsize * l2arc_trim_ahead) / 100); + } if (tsize >= dev_size) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " @@ -8836,19 +8843,6 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) buflist = &dev->l2ad_buflist; - /* - * We need to add in the worst case scenario of log block overhead. - */ - distance += l2arc_log_blk_overhead(distance, dev); - if (vd->vdev_has_trim && l2arc_trim_ahead > 0) { - /* - * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) - * times the write size, whichever is greater. - */ - distance += MAX(64 * 1024 * 1024, - (distance * l2arc_trim_ahead) / 100); - } - top: rerun = B_FALSE; if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { diff --git a/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh b/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh index 0bbd08acdd3f..a93d0b3cc803 100755 --- a/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh +++ b/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh @@ -67,7 +67,7 @@ typeset VDEV_MIN_MB=$((MINVDEVSIZE * 0.30 / 1024 / 1024)) log_must zpool create -f $TESTPOOL $TRIM_VDEV1 cache $TRIM_VDEV2 verify_vdevs "-le" "$VDEV_MIN_MB" $TRIM_VDEV2 -typeset fill_mb=$(( floor(2 * MINVDEVSIZE) )) +typeset fill_mb=$(( floor(3 * MINVDEVSIZE) )) export DIRECTORY=/$TESTPOOL export NUMJOBS=1 export FILE_SIZE=${fill_mb} From c8b3dda18638fca8b0cc580ad7cecf410606e646 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 9 May 2023 08:57:02 -0700 Subject: [PATCH 14/25] Debug auto_replace_001_pos failures Reduced the timeout to 60 seconds which should be more than sufficient and allow the test to be marked as FAILED rather than KILLED. Also dump the pool status on cleanup. Reviewed-by: Brian Atkinson Signed-off-by: Brian Behlendorf Closes #14829 --- tests/zfs-tests/include/libtest.shlib | 14 +++++++++++--- .../functional/fault/auto_replace_001_pos.ksh | 5 +++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 02e6a500a71a..8521f271be54 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -1951,6 +1951,7 @@ function check_pool_status # pool token keyword # is_pool_removing - to check if the pool removing is a vdev # is_pool_removed - to check if the pool remove is completed # is_pool_discarding - to check if the pool checkpoint is being discarded +# is_pool_replacing - to check if the pool is performing a replacement # function is_pool_resilvering #pool { @@ -1997,6 +1998,10 @@ function is_pool_discarding #pool { check_pool_status "$1" "checkpoint" "discarding" } +function is_pool_replacing #pool +{ + zpool status "$1" | grep -qE 'replacing-[0-9]+' +} function wait_for_degraded { @@ -2983,12 +2988,15 @@ function wait_freeing #pool # Wait for every device replace operation to complete # # $1 pool name +# $2 timeout # -function wait_replacing #pool +function wait_replacing #pool timeout { + typeset timeout=${2:-300} typeset pool=${1:-$TESTPOOL} - while zpool status $pool | grep -qE 'replacing-[0-9]+'; do - log_must sleep 1 + for (( timer = 0; timer < $timeout; timer++ )); do + is_pool_replacing $pool || break; + sleep 1; done } diff --git a/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh index 2846192d08eb..081e6c18430d 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh @@ -54,6 +54,7 @@ fi function cleanup { + zpool status $TESTPOOL destroy_pool $TESTPOOL sed -i '/alias scsidebug/d' $VDEVID_CONF unload_scsi_debug @@ -99,8 +100,8 @@ block_device_wait insert_disk $SD $SD_HOST # Wait for the new disk to be online and replaced -log_must wait_vdev_state $TESTPOOL "scsidebug" "ONLINE" $MAXTIMEOUT -log_must wait_replacing $TESTPOOL +log_must wait_vdev_state $TESTPOOL "scsidebug" "ONLINE" 60 +log_must wait_replacing $TESTPOOL 60 # Validate auto-replace was successful log_must check_state $TESTPOOL "" "ONLINE" From 903c3613d490d1321d587982abb5e4dda4a43308 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 9 May 2023 09:03:10 -0700 Subject: [PATCH 15/25] Add dmu_tx_hold_append() interface Provides an interface which callers can use to declare a write when the exact starting offset in not yet known. Since the full range being updated is not available only the first L0 block at the provided offset will be prefetched. Reviewed-by: Olaf Faaland Signed-off-by: Brian Behlendorf Closes #14819 --- include/sys/dmu.h | 3 ++ include/sys/dmu_tx.h | 1 + module/zfs/dmu_tx.c | 105 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 1b82ff620f27..a5a5c378279a 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -782,6 +782,9 @@ dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); +void dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); +void dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, + int len); void dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h index ca8514e5d2d0..aa55da626149 100644 --- a/include/sys/dmu_tx.h +++ b/include/sys/dmu_tx.h @@ -91,6 +91,7 @@ enum dmu_tx_hold_type { THT_SPACE, THT_SPILL, THT_CLONE, + THT_APPEND, THT_NUMTYPES }; diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 1c5608c4541b..c4e274bd4c42 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -294,6 +294,53 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) } } +static void +dmu_tx_count_append(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ + dnode_t *dn = txh->txh_dnode; + int err = 0; + + if (len == 0) + return; + + (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); + + if (dn == NULL) + return; + + /* + * For i/o error checking, read the blocks that will be needed + * to perform the append; first level-0 block (if not aligned, i.e. + * if they are partial-block writes), no additional blocks are read. + */ + if (dn->dn_maxblkid == 0) { + if (off < dn->dn_datablksz && + (off > 0 || len < dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + } else { + zio_t *zio = zio_root(dn->dn_objset->os_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + + /* first level-0 block */ + uint64_t start = off >> dn->dn_datablkshift; + if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { + err = dmu_tx_check_ioerr(zio, dn, 0, start); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + + err = zio_wait(zio); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } +} + static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { @@ -334,6 +381,42 @@ dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) } } +/* + * Should be used when appending to an object and the exact offset is unknown. + * The write must occur at or beyond the specified offset. Only the L0 block + * at provided offset will be prefetched. + */ +void +dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT3U(len, <=, DMU_MAX_ACCESS); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_APPEND, off, DMU_OBJECT_END); + if (txh != NULL) { + dmu_tx_count_append(txh, off, len); + dmu_tx_count_dnode(txh); + } +} + +void +dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT3U(len, <=, DMU_MAX_ACCESS); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_APPEND, off, DMU_OBJECT_END); + if (txh != NULL) { + dmu_tx_count_append(txh, off, len); + dmu_tx_count_dnode(txh); + } +} + /* * This function marks the transaction as being a "net free". The end * result is that refquotas will be disabled for this transaction, and @@ -668,6 +751,26 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) if (blkid == 0) match_offset = TRUE; break; + case THT_APPEND: + if (blkid >= beginblk && (blkid <= endblk || + txh->txh_arg2 == DMU_OBJECT_END)) + match_offset = TRUE; + + /* + * THT_WRITE used for bonus and spill blocks. + */ + ASSERT(blkid != DMU_BONUS_BLKID && + blkid != DMU_SPILL_BLKID); + + /* + * They might have to increase nlevels, + * thus dirtying the new TLIBs. Or the + * might have to change the block size, + * thus dirying the new lvl=0 blk=0. + */ + if (blkid == 0) + match_offset = TRUE; + break; case THT_FREE: /* * We will dirty all the level 1 blocks in @@ -1454,6 +1557,8 @@ dmu_tx_fini(void) EXPORT_SYMBOL(dmu_tx_create); EXPORT_SYMBOL(dmu_tx_hold_write); EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode); +EXPORT_SYMBOL(dmu_tx_hold_append); +EXPORT_SYMBOL(dmu_tx_hold_append_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_free); EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_zap); From d3db900a4e457c3a75e6cef8e9bac8d278ddc929 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 9 May 2023 17:55:19 -0700 Subject: [PATCH 16/25] pam: Fix "buffer overflow" in pam ZTS tests on F38 The pam ZTS tests were reporting a buffer overflow on F38, possibly due to F38 now setting _FORTIFY_SOURCE=3 by default. gdb and valgrind narrowed this down to a snprintf() buffer overflow in zfs_key_config_modify_session_counter(). I'm not clear why this particular snprintf() was being flagged as an overflow, but when I replaced it with an asprintf(), the test passed reliably. Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #14802 Closes #14842 --- contrib/pam_zfs_key/pam_zfs_key.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c index 27c7d63781c5..979546ab3090 100644 --- a/contrib/pam_zfs_key/pam_zfs_key.c +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -587,16 +587,11 @@ zfs_key_config_modify_session_counter(pam_handle_t *pamh, errno); return (-1); } - size_t runtime_path_len = strlen(runtime_path); - size_t counter_path_len = runtime_path_len + 1 + 10; - char *counter_path = malloc(counter_path_len + 1); - if (!counter_path) { + + char *counter_path; + if (asprintf(&counter_path, "%s/%u", runtime_path, config->uid) == -1) return (-1); - } - counter_path[0] = 0; - strcat(counter_path, runtime_path); - snprintf(counter_path + runtime_path_len, counter_path_len, "/%d", - config->uid); + const int fd = open(counter_path, O_RDWR | O_CLOEXEC | O_CREAT | O_NOFOLLOW, S_IRUSR | S_IWUSR); From 14ba8ab97ddb3674351861ecf373125ac4e1dc63 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Wed, 10 May 2023 05:56:35 +0500 Subject: [PATCH 17/25] Prevent panic during concurrent snapshot rollback and zvol read Protect zvol_cdev_read with zv_suspend_lock to prevent concurrent release of the dnode, avoiding panic when a snapshot is rolled back in parallel during ongoing zvol read operation. Reviewed-by: Chunwei Chen Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #14839 --- module/os/freebsd/zfs/zvol_os.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index 26578491fd67..2520507b98aa 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -832,6 +832,7 @@ zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag) (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize)) return (SET_ERROR(EIO)); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); ssize_t start_resid = zfs_uio_resid(&uio); lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio), zfs_uio_resid(&uio), RL_READER); @@ -853,6 +854,7 @@ zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag) zfs_rangelock_exit(lr); int64_t nread = start_resid - zfs_uio_resid(&uio); dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); + rw_exit(&zv->zv_suspend_lock); return (error); } From 469019fb0b2b7ca4bd6c3de5c2f1056a4446f0e3 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 11 May 2023 17:27:12 -0400 Subject: [PATCH 18/25] zil: Don't expect zio_shrink() to succeed. At least for RAIDZ zio_shrink() does not reduce zio size, but reduced wsz in that case likely results in writing uninitialized memory. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14853 --- module/zfs/zil.c | 1 + 1 file changed, 1 insertion(+) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index ec9da706a806..c37da89dd438 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1866,6 +1866,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); ASSERT3U(wsz, <=, lwb->lwb_sz); zio_shrink(lwb->lwb_write_zio, wsz); + wsz = lwb->lwb_write_zio->io_size; } else { wsz = lwb->lwb_sz; From 555ef90c5c1db5dcd1b47c02134c85b5a03dc6bc Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Sun, 30 Apr 2023 02:47:09 -0700 Subject: [PATCH 19/25] Additional block cloning fixes. Reimplement some of the block cloning vs dbuf logic, mostly to fix situation where we clone a block and in the same transaction group we want to partially overwrite the clone. Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #14825 --- include/sys/dbuf.h | 23 +++++----- module/zfs/dbuf.c | 103 +++++++++++++++++++++++++++++++++++---------- module/zfs/dmu.c | 14 +----- 3 files changed, 95 insertions(+), 45 deletions(-) diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index fb26a83b1844..1800a7e31da0 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -61,16 +61,18 @@ extern "C" { /* * The simplified state transition diagram for dbufs looks like: * - * +----> READ ----+ - * | | - * | V - * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) - * | ^ ^ - * | | | - * +----> FILL ----+ | - * | | - * | | - * +--------> NOFILL -------+ + * +--> READ --+ + * | | + * | V + * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) + * ^ | ^ ^ + * | | | | + * | +--> FILL --+ | + * | | | + * | | | + * | +------> NOFILL -----+ + * | | + * +---------------+ * * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range * to find all dbufs in a range of a dnode and must be less than any other @@ -375,6 +377,7 @@ dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, uint64_t blkid, uint64_t *hash_out); int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); +void dmu_buf_will_clone(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 6a50f1927add..049a62c1c171 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1573,24 +1573,22 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, bpp = &bp; } } else { - struct dirty_leaf *dl; dbuf_dirty_record_t *dr; ASSERT3S(db->db_state, ==, DB_NOFILL); + /* + * Block cloning: If we have a pending block clone, + * we don't want to read the underlying block, but the content + * of the block being cloned, so we have the most recent data. + */ dr = list_head(&db->db_dirty_records); - if (dr == NULL) { + if (dr == NULL || !dr->dt.dl.dr_brtwrite) { err = EIO; goto early_unlock; - } else { - dl = &dr->dt.dl; - if (!dl->dr_brtwrite) { - err = EIO; - goto early_unlock; - } - bp = dl->dr_overridden_by; - bpp = &bp; } + bp = dr->dt.dl.dr_overridden_by; + bpp = &bp; } err = dbuf_read_hole(db, dn, bpp); @@ -1906,6 +1904,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *bp = &dr->dt.dl.dr_overridden_by; uint64_t txg = dr->dr_txg; + boolean_t release; ASSERT(MUTEX_HELD(&db->db_mtx)); /* @@ -1926,8 +1925,10 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) zio_free(db->db_objset->os_spa, txg, bp); + release = !dr->dt.dl.dr_brtwrite; dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; + dr->dt.dl.dr_brtwrite = B_FALSE; dr->dt.dl.dr_has_raw_params = B_FALSE; /* @@ -1938,7 +1939,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) * the buf thawed to save the effort of freezing & * immediately re-thawing it. */ - if (!dr->dt.dl.dr_brtwrite) + if (release) arc_release(dr->dt.dl.dr_data, db); } @@ -2022,11 +2023,6 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, db->db_blkid > dn->dn_maxblkid) dn->dn_maxblkid = db->db_blkid; dbuf_unoverride(dr); - if (dr->dt.dl.dr_brtwrite) { - ASSERT(db->db.db_data == NULL); - mutex_exit(&db->db_mtx); - continue; - } } else { /* * This dbuf is not dirty in the open context. @@ -2613,6 +2609,7 @@ static void dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + boolean_t undirty = B_FALSE; ASSERT(tx->tx_txg != 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); @@ -2625,7 +2622,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) */ mutex_enter(&db->db_mtx); - if (db->db_state == DB_CACHED) { + if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) { dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); /* * It's possible that it is already dirty but not cached, @@ -2633,10 +2630,21 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) * go through dmu_buf_will_dirty(). */ if (dr != NULL) { - /* This dbuf is already dirty and cached. */ - dbuf_redirty(dr); - mutex_exit(&db->db_mtx); - return; + if (dr->dt.dl.dr_brtwrite) { + /* + * Block cloning: If we are dirtying a cloned + * block, we cannot simply redirty it, because + * this dr has no data associated with it. + * We will go through a full undirtying below, + * before dirtying it again. + */ + undirty = B_TRUE; + } else { + /* This dbuf is already dirty and cached. */ + dbuf_redirty(dr); + mutex_exit(&db->db_mtx); + return; + } } } mutex_exit(&db->db_mtx); @@ -2645,7 +2653,20 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) flags |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); + + /* + * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we + * want to make sure dbuf_read() will read the pending cloned block and + * not the uderlying block that is being replaced. dbuf_undirty() will + * do dbuf_unoverride(), so we will end up with cloned block content, + * without overridden BP. + */ (void) dbuf_read(db, NULL, flags); + if (undirty) { + mutex_enter(&db->db_mtx); + VERIFY(!dbuf_undirty(db, tx)); + mutex_exit(&db->db_mtx); + } (void) dbuf_dirty(db, tx); } @@ -2668,6 +2689,28 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) return (dr != NULL); } +void +dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + /* + * Block cloning: We are going to clone into this block, so undirty + * modifications done to this block so far in this txg. This includes + * writes and clones into this block. + */ + mutex_enter(&db->db_mtx); + VERIFY(!dbuf_undirty(db, tx)); + ASSERT(list_head(&db->db_dirty_records) == NULL); + if (db->db_buf != NULL) { + arc_buf_destroy(db->db_buf, db); + db->db_buf = NULL; + } + mutex_exit(&db->db_mtx); + + dmu_buf_will_not_fill(db_fake, tx); +} + void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { @@ -2675,7 +2718,9 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) db->db_state = DB_NOFILL; DTRACE_SET_STATE(db, "allocating NOFILL buffer"); - dmu_buf_will_fill(db_fake, tx); + + dbuf_noread(db); + (void) dbuf_dirty(db, tx); } void @@ -2691,6 +2736,19 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); + if (db->db_state == DB_NOFILL) { + /* + * Block cloning: We will be completely overwriting a block + * cloned in this transaction group, so let's undirty the + * pending clone and mark the block as uncached. This will be + * as if the clone was never done. + */ + mutex_enter(&db->db_mtx); + VERIFY(!dbuf_undirty(db, tx)); + mutex_exit(&db->db_mtx); + db->db_state = DB_UNCACHED; + } + dbuf_noread(db); (void) dbuf_dirty(db, tx); } @@ -5155,6 +5213,7 @@ EXPORT_SYMBOL(dbuf_dirty); EXPORT_SYMBOL(dmu_buf_set_crypt_params); EXPORT_SYMBOL(dmu_buf_will_dirty); EXPORT_SYMBOL(dmu_buf_is_dirty); +EXPORT_SYMBOL(dmu_buf_will_clone); EXPORT_SYMBOL(dmu_buf_will_not_fill); EXPORT_SYMBOL(dmu_buf_will_fill); EXPORT_SYMBOL(dmu_buf_fill_done); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index cda1472a77aa..f8accafd6d12 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2284,18 +2284,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp)); - mutex_enter(&db->db_mtx); - - VERIFY(!dbuf_undirty(db, tx)); - ASSERT(list_head(&db->db_dirty_records) == NULL); - if (db->db_buf != NULL) { - arc_buf_destroy(db->db_buf, db); - db->db_buf = NULL; - } - - mutex_exit(&db->db_mtx); - - dmu_buf_will_not_fill(dbuf, tx); + dmu_buf_will_clone(dbuf, tx); mutex_enter(&db->db_mtx); @@ -2305,7 +2294,6 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, dl = &dr->dt.dl; dl->dr_overridden_by = *bp; dl->dr_brtwrite = B_TRUE; - dl->dr_override_state = DR_OVERRIDDEN; if (BP_IS_HOLE(bp)) { dl->dr_overridden_by.blk_birth = 0; From bd8c6bd66f9dde7534ae2f52237a1b208721cbf7 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Tue, 2 May 2023 14:24:43 -0700 Subject: [PATCH 20/25] Deny block cloning is dbuf size doesn't match BP size. I don't know an easy way to shrink down dbuf size, so just deny block cloning into dbufs that don't match our BP's size. This fixes the following situation: 1. Create a small file, eg. 1kB of random bytes. Its dbuf will be 1kB. 2. Create a larger file, eg. 2kB of random bytes. Its dbuf will be 2kB. 3. Truncate the large file to 0. Its dbuf will remain 2kB. 4. Clone the small file into the large file. Small file's BP lsize is 1kB, but the large file's dbuf is 2kB. Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #14825 --- include/sys/dmu.h | 2 +- module/zfs/dmu.c | 29 +++++++++++++++++++++++++---- module/zfs/zfs_vnops.c | 8 ++++++-- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index a5a5c378279a..6a5fb5530498 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -1066,7 +1066,7 @@ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, dmu_tx_t *tx, struct blkptr *bps, size_t *nbpsp); -void dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, +int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps, boolean_t replay); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index f8accafd6d12..c1f9d02f0dd8 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2257,7 +2257,7 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, return (error); } -void +int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, dmu_tx_t *tx, const blkptr_t *bps, size_t nbps, boolean_t replay) { @@ -2267,7 +2267,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, struct dirty_leaf *dl; dbuf_dirty_record_t *dr; const blkptr_t *bp; - int numbufs; + int error = 0, i, numbufs; spa = os->os_spa; @@ -2275,7 +2275,26 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, &numbufs, &dbp)); ASSERT3U(nbps, ==, numbufs); - for (int i = 0; i < numbufs; i++) { + /* + * Before we start cloning make sure that the dbufs sizes much new BPs + * sizes. If they don't, that's a no-go, as we are not able to shrink + * dbufs. + */ + for (i = 0; i < numbufs; i++) { + dbuf = dbp[i]; + db = (dmu_buf_impl_t *)dbuf; + bp = &bps[i]; + + ASSERT0(db->db_level); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + + if (!BP_IS_HOLE(bp) && BP_GET_LSIZE(bp) != dbuf->db_size) { + error = SET_ERROR(EXDEV); + goto out; + } + } + + for (i = 0; i < numbufs; i++) { dbuf = dbp[i]; db = (dmu_buf_impl_t *)dbuf; bp = &bps[i]; @@ -2319,8 +2338,10 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, brt_pending_add(spa, bp, tx); } } - +out: dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (error); } void diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index a6a27222bf4c..71955f90db03 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1309,8 +1309,12 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, ((len - 1) / inblksz + 1) * inblksz); } - dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, bps, nbps, - B_FALSE); + error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, + bps, nbps, B_FALSE); + if (error != 0) { + dmu_tx_commit(tx); + break; + } zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr, &clear_setid_bits_txg, tx); From d0d91f185efd9149d8faceb89a9a0e5e54093fc8 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Tue, 2 May 2023 15:46:14 -0700 Subject: [PATCH 21/25] Don't use dmu_buf_is_dirty() for unassigned transaction. The dmu_buf_is_dirty() call doesn't make sense here for two reasons: 1. txg is 0 for unassigned tx, so it was a no-op. 2. It is equivalent of checking if we have dirty records and we are doing this few lines earlier. Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #14825 --- include/sys/dmu.h | 2 +- module/zfs/dmu.c | 6 +----- module/zfs/zfs_vnops.c | 13 +++++-------- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 6a5fb5530498..5ee6704668a4 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -1065,7 +1065,7 @@ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off); int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, dmu_tx_t *tx, struct blkptr *bps, size_t *nbpsp); + uint64_t length, struct blkptr *bps, size_t *nbpsp); int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps, boolean_t replay); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index c1f9d02f0dd8..4e42bb3ef90c 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2173,7 +2173,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, - dmu_tx_t *tx, blkptr_t *bps, size_t *nbpsp) + blkptr_t *bps, size_t *nbpsp) { dmu_buf_t **dbp, *dbuf; dmu_buf_impl_t *db; @@ -2235,10 +2235,6 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, error = SET_ERROR(EAGAIN); goto out; } - if (dmu_buf_is_dirty(dbuf, tx)) { - error = SET_ERROR(EAGAIN); - goto out; - } /* * Make sure we clone only data blocks. */ diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 71955f90db03..dca76227a4ac 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1246,16 +1246,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, break; } - /* - * Start a transaction. - */ - tx = dmu_tx_create(outos); - nbps = maxblocks; - error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, tx, bps, + error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps, &nbps); if (error != 0) { - dmu_tx_abort(tx); /* * If we are tyring to clone a block that was created * in the current transaction group. Return an error, @@ -1276,12 +1270,15 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, */ if (BP_IS_PROTECTED(&bps[0])) { if (inzfsvfs != outzfsvfs) { - dmu_tx_abort(tx); error = SET_ERROR(EXDEV); break; } } + /* + * Start a transaction. + */ + tx = dmu_tx_create(outos); dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE); db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl); DB_DNODE_ENTER(db); From b6d7370b9de5ebc7aae8ada702c3d05b81d28d77 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Wed, 3 May 2023 00:24:47 -0700 Subject: [PATCH 22/25] Don't call zfs_exit_two() before zfs_enter_two(). Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #14825 --- module/zfs/zfs_vnops.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index dca76227a4ac..86706469acee 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1072,6 +1072,15 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, inzfsvfs = ZTOZSB(inzp); outzfsvfs = ZTOZSB(outzp); + + /* + * We need to call zfs_enter() potentially on two different datasets, + * so we need a dedicated function for that. + */ + error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG); + if (error != 0) + return (error); + inos = inzfsvfs->z_os; outos = outzfsvfs->z_os; @@ -1083,14 +1092,6 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, return (SET_ERROR(EXDEV)); } - /* - * We need to call zfs_enter() potentially on two different datasets, - * so we need a dedicated function for that. - */ - error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG); - if (error != 0) - return (error); - ASSERT(!outzfsvfs->z_replay); error = zfs_verify_zp(inzp); From 9879930f7a42350a7f5cc0a0edc611c77ae1281e Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Wed, 3 May 2023 23:25:22 -0700 Subject: [PATCH 23/25] Remove badly placed comment. Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #14825 --- module/zfs/dmu.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 4e42bb3ef90c..072076ffe91d 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2197,10 +2197,6 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, mutex_enter(&db->db_mtx); - /* - * If the block is not on the disk yet, it has no BP assigned. - * There is not much we can do... - */ if (!list_is_empty(&db->db_dirty_records)) { dbuf_dirty_record_t *dr; From fbbe5e96eff9afadf9def323a246d4dd876eb0bd Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Thu, 4 May 2023 16:14:19 -0700 Subject: [PATCH 24/25] Correct comment. Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #14825 --- module/zfs/dmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 072076ffe91d..97379dfc1cd5 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2268,7 +2268,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, ASSERT3U(nbps, ==, numbufs); /* - * Before we start cloning make sure that the dbufs sizes much new BPs + * Before we start cloning make sure that the dbufs sizes match new BPs * sizes. If they don't, that's a no-go, as we are not able to shrink * dbufs. */ From e6107668385044718b0a73330ed6423650806473 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Tue, 9 May 2023 22:32:30 -0700 Subject: [PATCH 25/25] Make sure we are not trying to clone a spill block. Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #14825 --- module/zfs/dmu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 97379dfc1cd5..8a13b8f410a1 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2279,6 +2279,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_SPILL_BLKID); if (!BP_IS_HOLE(bp) && BP_GET_LSIZE(bp) != dbuf->db_size) { error = SET_ERROR(EXDEV); @@ -2293,6 +2294,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_SPILL_BLKID); ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp)); dmu_buf_will_clone(dbuf, tx);