diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c index ec5d1acacf85..5ab13b470dc0 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -3322,13 +3322,22 @@ dump_znode_sa_xattr(sa_handle_t *hdl) (void) printf("\tSA xattrs: %d bytes, %d entries\n\n", sa_xattr_size, sa_xattr_entries); while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) { + boolean_t can_print = !dump_opt['P']; uchar_t *value; uint_t cnt, idx; (void) printf("\t\t%s = ", nvpair_name(elem)); nvpair_value_byte_array(elem, &value, &cnt); + for (idx = 0; idx < cnt; ++idx) { - if (isprint(value[idx])) + if (!isprint(value[idx])) { + can_print = B_FALSE; + break; + } + } + + for (idx = 0; idx < cnt; ++idx) { + if (can_print) (void) putchar(value[idx]); else (void) printf("\\%3.3o", value[idx]); @@ -8490,8 +8499,8 @@ zdb_read_block(char *thing, spa_t *spa) !(flags & ZDB_FLAG_DECOMPRESS)) { const blkptr_t *b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); - if (zfs_blkptr_verify(spa, b, B_FALSE, BLK_VERIFY_ONLY) == - B_FALSE) { + if (zfs_blkptr_verify(spa, b, + BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) { abd_return_buf_copy(pabd, buf, lsize); borrowed = B_FALSE; buf = lbuf; @@ -8499,8 +8508,8 @@ zdb_read_block(char *thing, spa_t *spa) lbuf, lsize, psize, flags); b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); - if (failed || zfs_blkptr_verify(spa, b, B_FALSE, - BLK_VERIFY_LOG) == B_FALSE) { + if (failed || zfs_blkptr_verify(spa, b, + BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) { printf("invalid block pointer at this DVA\n"); goto out; } diff --git a/sys/contrib/openzfs/cmd/zhack.c b/sys/contrib/openzfs/cmd/zhack.c index 0b6da31ec573..44611887dd25 100644 --- a/sys/contrib/openzfs/cmd/zhack.c +++ b/sys/contrib/openzfs/cmd/zhack.c @@ -58,6 +58,12 @@ static importargs_t g_importargs; static char *g_pool; static boolean_t g_readonly; +typedef enum { + ZHACK_REPAIR_OP_UNKNOWN = 
0, + ZHACK_REPAIR_OP_CKSUM = (1 << 0), + ZHACK_REPAIR_OP_UNDETACH = (1 << 1) +} zhack_repair_op_t; + static __attribute__((noreturn)) void usage(void) { @@ -81,7 +87,10 @@ usage(void) " : should be a feature guid\n" "\n" " label repair \n" - " repair corrupted label checksums\n" + " repair labels of a specified device according to options\n" + " which may be combined to do their functions in one call\n" + " -c repair corrupted label checksums\n" + " -u restore the label on a detached device\n" "\n" " : path to vdev\n"); exit(1); @@ -485,23 +494,374 @@ zhack_do_feature(int argc, char **argv) return (0); } +#define ASHIFT_UBERBLOCK_SHIFT(ashift) \ + MIN(MAX(ashift, UBERBLOCK_SHIFT), \ + MAX_UBERBLOCK_SHIFT) +#define ASHIFT_UBERBLOCK_SIZE(ashift) \ + (1ULL << ASHIFT_UBERBLOCK_SHIFT(ashift)) + +#define REPAIR_LABEL_STATUS_CKSUM (1 << 0) +#define REPAIR_LABEL_STATUS_UB (1 << 1) + static int -zhack_repair_label_cksum(int argc, char **argv) +zhack_repair_read_label(const int fd, vdev_label_t *vl, + const uint64_t label_offset, const int l) { - zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL]; + const int err = pread64(fd, vl, sizeof (vdev_label_t), label_offset); + + if (err == -1) { + (void) fprintf(stderr, + "error: cannot read label %d: %s\n", + l, strerror(errno)); + return (err); + } else if (err != sizeof (vdev_label_t)) { + (void) fprintf(stderr, + "error: bad label %d read size\n", l); + return (1); + } + + return (0); +} + +static void +zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset, + const uint64_t abdsize, zio_eck_t *eck, zio_cksum_t *cksum) +{ + zio_cksum_t verifier; + zio_cksum_t current_cksum; + zio_checksum_info_t *ci; + abd_t *abd; + + ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0); + + if (byteswap) + byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); + + current_cksum = eck->zec_cksum; + eck->zec_cksum = verifier; + + ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL]; + abd = abd_get_from_buf(data, 
abdsize); + ci->ci_func[byteswap](abd, abdsize, NULL, cksum); + abd_free(abd); + + eck->zec_cksum = current_cksum; +} + +static int +zhack_repair_check_label(uberblock_t *ub, const int l, const char **cfg_keys, + const size_t cfg_keys_len, nvlist_t *cfg, nvlist_t *vdev_tree_cfg, + uint64_t *ashift) +{ + int err; + + if (ub->ub_txg != 0) { + (void) fprintf(stderr, + "error: label %d: UB TXG of 0 expected, but got %" + PRIu64 "\n", + l, ub->ub_txg); + (void) fprintf(stderr, "It would appear the device was not " + "properly removed.\n"); + return (1); + } + + for (int i = 0; i < cfg_keys_len; i++) { + uint64_t val; + err = nvlist_lookup_uint64(cfg, cfg_keys[i], &val); + if (err) { + (void) fprintf(stderr, + "error: label %d, %d: " + "cannot find nvlist key %s\n", + l, i, cfg_keys[i]); + return (err); + } + } + + err = nvlist_lookup_nvlist(cfg, + ZPOOL_CONFIG_VDEV_TREE, &vdev_tree_cfg); + if (err) { + (void) fprintf(stderr, + "error: label %d: cannot find nvlist key %s\n", + l, ZPOOL_CONFIG_VDEV_TREE); + return (err); + } + + err = nvlist_lookup_uint64(vdev_tree_cfg, + ZPOOL_CONFIG_ASHIFT, ashift); + if (err) { + (void) fprintf(stderr, + "error: label %d: cannot find nvlist key %s\n", + l, ZPOOL_CONFIG_ASHIFT); + return (err); + } + + if (*ashift == 0) { + (void) fprintf(stderr, + "error: label %d: nvlist key %s is zero\n", + l, ZPOOL_CONFIG_ASHIFT); + return (1); + } + + return (0); +} + +static int +zhack_repair_undetach(uberblock_t *ub, nvlist_t *cfg, const int l) +{ + /* + * Uberblock root block pointer has valid birth TXG. 
+ * Copying it to the label NVlist + */ + if (ub->ub_rootbp.blk_birth != 0) { + const uint64_t txg = ub->ub_rootbp.blk_birth; + ub->ub_txg = txg; + + if (nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG) != 0) { + (void) fprintf(stderr, + "error: label %d: " + "Failed to remove pool creation TXG\n", + l); + return (1); + } + + if (nvlist_remove_all(cfg, ZPOOL_CONFIG_POOL_TXG) != 0) { + (void) fprintf(stderr, + "error: label %d: Failed to remove pool TXG to " + "be replaced.\n", + l); + return (1); + } + + if (nvlist_add_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, txg) != 0) { + (void) fprintf(stderr, + "error: label %d: " + "Failed to add pool TXG of %" PRIu64 "\n", + l, txg); + return (1); + } + } + + return (0); +} + +static boolean_t +zhack_repair_write_label(const int l, const int fd, const int byteswap, + void *data, zio_eck_t *eck, const uint64_t offset, const uint64_t abdsize) +{ + zio_cksum_t actual_cksum; + zhack_repair_calc_cksum(byteswap, data, offset, abdsize, eck, + &actual_cksum); + zio_cksum_t expected_cksum = eck->zec_cksum; + ssize_t err; + + if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) + return (B_FALSE); + + eck->zec_cksum = actual_cksum; + + err = pwrite64(fd, data, abdsize, offset); + if (err == -1) { + (void) fprintf(stderr, "error: cannot write label %d: %s\n", + l, strerror(errno)); + return (B_FALSE); + } else if (err != abdsize) { + (void) fprintf(stderr, "error: bad write size label %d\n", l); + return (B_FALSE); + } else { + (void) fprintf(stderr, + "label %d: wrote %" PRIu64 " bytes at offset %" PRIu64 "\n", + l, abdsize, offset); + } + + return (B_TRUE); +} + +static void +zhack_repair_write_uberblock(vdev_label_t *vl, const int l, + const uint64_t ashift, const int fd, const int byteswap, + const uint64_t label_offset, uint32_t *labels_repaired) +{ + void *ub_data = + (char *)vl + offsetof(vdev_label_t, vl_uberblock); + zio_eck_t *ub_eck = + (zio_eck_t *) + ((char *)(ub_data) + (ASHIFT_UBERBLOCK_SIZE(ashift))) - 1; + + if 
(ub_eck->zec_magic != 0) { + (void) fprintf(stderr, + "error: label %d: " + "Expected Uberblock checksum magic number to " + "be 0, but got %" PRIu64 "\n", + l, ub_eck->zec_magic); + (void) fprintf(stderr, "It would appear there's already " + "a checksum for the uberblock.\n"); + return; + } + + + ub_eck->zec_magic = byteswap ? BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC; + + if (zhack_repair_write_label(l, fd, byteswap, + ub_data, ub_eck, + label_offset + offsetof(vdev_label_t, vl_uberblock), + ASHIFT_UBERBLOCK_SIZE(ashift))) + labels_repaired[l] |= REPAIR_LABEL_STATUS_UB; +} + +static void +zhack_repair_print_cksum(FILE *stream, const zio_cksum_t *cksum) +{ + (void) fprintf(stream, + "%016llx:%016llx:%016llx:%016llx", + (u_longlong_t)cksum->zc_word[0], + (u_longlong_t)cksum->zc_word[1], + (u_longlong_t)cksum->zc_word[2], + (u_longlong_t)cksum->zc_word[3]); +} + +static int +zhack_repair_test_cksum(const int byteswap, void *vdev_data, + zio_eck_t *vdev_eck, const uint64_t vdev_phys_offset, const int l) +{ + const zio_cksum_t expected_cksum = vdev_eck->zec_cksum; + zio_cksum_t actual_cksum; + zhack_repair_calc_cksum(byteswap, vdev_data, vdev_phys_offset, + VDEV_PHYS_SIZE, vdev_eck, &actual_cksum); + const uint64_t expected_magic = byteswap ? 
BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC; + const uint64_t actual_magic = vdev_eck->zec_magic; + int err = 0; + if (actual_magic != expected_magic) { + (void) fprintf(stderr, "error: label %d: " + "Expected " + "the nvlist checksum magic number to be %" + PRIu64 " not %" PRIu64 "\n", + l, expected_magic, actual_magic); + err = ECKSUM; + } + if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { + (void) fprintf(stderr, "error: label %d: " + "Expected the nvlist checksum to be ", l); + (void) zhack_repair_print_cksum(stderr, + &expected_cksum); + (void) fprintf(stderr, " not "); + zhack_repair_print_cksum(stderr, &actual_cksum); + (void) fprintf(stderr, "\n"); + err = ECKSUM; + } + return (err); +} + +static void +zhack_repair_one_label(const zhack_repair_op_t op, const int fd, + vdev_label_t *vl, const uint64_t label_offset, const int l, + uint32_t *labels_repaired) +{ + ssize_t err; + uberblock_t *ub = (uberblock_t *)vl->vl_uberblock; + void *vdev_data = + (char *)vl + offsetof(vdev_label_t, vl_vdev_phys); + zio_eck_t *vdev_eck = + (zio_eck_t *)((char *)(vdev_data) + VDEV_PHYS_SIZE) - 1; + const uint64_t vdev_phys_offset = + label_offset + offsetof(vdev_label_t, vl_vdev_phys); const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION, ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID }; - boolean_t labels_repaired[VDEV_LABELS] = {0}; - boolean_t repaired = B_FALSE; + nvlist_t *cfg; + nvlist_t *vdev_tree_cfg = NULL; + uint64_t ashift; + int byteswap; + + err = zhack_repair_read_label(fd, vl, label_offset, l); + if (err) + return; + + if (vdev_eck->zec_magic == 0) { + (void) fprintf(stderr, "error: label %d: " + "Expected the nvlist checksum magic number to not be zero" + "\n", + l); + (void) fprintf(stderr, "There should already be a checksum " + "for the label.\n"); + return; + } + + byteswap = + (vdev_eck->zec_magic == BSWAP_64((uint64_t)ZEC_MAGIC)); + + if (byteswap) { + byteswap_uint64_array(&vdev_eck->zec_cksum, + sizeof (zio_cksum_t)); + vdev_eck->zec_magic = 
BSWAP_64(vdev_eck->zec_magic); + } + + if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 && + zhack_repair_test_cksum(byteswap, vdev_data, vdev_eck, + vdev_phys_offset, l) != 0) { + (void) fprintf(stderr, "It would appear checksums are " + "corrupted. Try zhack repair label -c \n"); + return; + } + + err = nvlist_unpack(vl->vl_vdev_phys.vp_nvlist, + VDEV_PHYS_SIZE - sizeof (zio_eck_t), &cfg, 0); + if (err) { + (void) fprintf(stderr, + "error: cannot unpack nvlist label %d\n", l); + return; + } + + err = zhack_repair_check_label(ub, + l, cfg_keys, ARRAY_SIZE(cfg_keys), cfg, vdev_tree_cfg, &ashift); + if (err) + return; + + if ((op & ZHACK_REPAIR_OP_UNDETACH) != 0) { + char *buf; + size_t buflen; + + err = zhack_repair_undetach(ub, cfg, l); + if (err) + return; + + buf = vl->vl_vdev_phys.vp_nvlist; + buflen = VDEV_PHYS_SIZE - sizeof (zio_eck_t); + if (nvlist_pack(cfg, &buf, &buflen, NV_ENCODE_XDR, 0) != 0) { + (void) fprintf(stderr, + "error: label %d: Failed to pack nvlist\n", l); + return; + } + + zhack_repair_write_uberblock(vl, + l, ashift, fd, byteswap, label_offset, labels_repaired); + } + + if (zhack_repair_write_label(l, fd, byteswap, vdev_data, vdev_eck, + vdev_phys_offset, VDEV_PHYS_SIZE)) + labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM; + + fsync(fd); +} + +static const char * +zhack_repair_label_status(const uint32_t label_status, + const uint32_t to_check) +{ + return ((label_status & to_check) != 0 ? 
"repaired" : "skipped"); +} + +static int +zhack_label_repair(const zhack_repair_op_t op, const int argc, char **argv) +{ + uint32_t labels_repaired[VDEV_LABELS] = {0}; vdev_label_t labels[VDEV_LABELS] = {{{0}}}; - struct stat st; + struct stat64 st; int fd; + off_t filesize; + uint32_t repaired = 0; abd_init(); - argc -= 1; - argv += 1; - if (argc < 1) { (void) fprintf(stderr, "error: missing device\n"); usage(); @@ -511,93 +871,21 @@ zhack_repair_label_cksum(int argc, char **argv) fatal(NULL, FTAG, "cannot open '%s': %s", argv[0], strerror(errno)); - if (stat(argv[0], &st) != 0) + if (fstat64_blk(fd, &st) != 0) fatal(NULL, FTAG, "cannot stat '%s': %s", argv[0], strerror(errno)); + filesize = st.st_size; + (void) fprintf(stderr, "Calculated filesize to be %jd\n", + (intmax_t)filesize); + + if (filesize % sizeof (vdev_label_t) != 0) + filesize = + (filesize / sizeof (vdev_label_t)) * sizeof (vdev_label_t); + for (int l = 0; l < VDEV_LABELS; l++) { - uint64_t label_offset, offset; - zio_cksum_t expected_cksum; - zio_cksum_t actual_cksum; - zio_cksum_t verifier; - zio_eck_t *eck; - nvlist_t *cfg; - int byteswap; - uint64_t val; - ssize_t err; - - vdev_label_t *vl = &labels[l]; - - label_offset = vdev_label_offset(st.st_size, l, 0); - err = pread64(fd, vl, sizeof (vdev_label_t), label_offset); - if (err == -1) { - (void) fprintf(stderr, "error: cannot read " - "label %d: %s\n", l, strerror(errno)); - continue; - } else if (err != sizeof (vdev_label_t)) { - (void) fprintf(stderr, "error: bad label %d read size " - "\n", l); - continue; - } - - err = nvlist_unpack(vl->vl_vdev_phys.vp_nvlist, - VDEV_PHYS_SIZE - sizeof (zio_eck_t), &cfg, 0); - if (err) { - (void) fprintf(stderr, "error: cannot unpack nvlist " - "label %d\n", l); - continue; - } - - for (int i = 0; i < ARRAY_SIZE(cfg_keys); i++) { - err = nvlist_lookup_uint64(cfg, cfg_keys[i], &val); - if (err) { - (void) fprintf(stderr, "error: label %d: " - "cannot find nvlist key %s\n", - l, cfg_keys[i]); - continue; - 
} - } - - void *data = (char *)vl + offsetof(vdev_label_t, vl_vdev_phys); - eck = (zio_eck_t *)((char *)(data) + VDEV_PHYS_SIZE) - 1; - - offset = label_offset + offsetof(vdev_label_t, vl_vdev_phys); - ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0); - - byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); - if (byteswap) - byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); - - expected_cksum = eck->zec_cksum; - eck->zec_cksum = verifier; - - abd_t *abd = abd_get_from_buf(data, VDEV_PHYS_SIZE); - ci->ci_func[byteswap](abd, VDEV_PHYS_SIZE, NULL, &actual_cksum); - abd_free(abd); - - if (byteswap) - byteswap_uint64_array(&expected_cksum, - sizeof (zio_cksum_t)); - - if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) - continue; - - eck->zec_cksum = actual_cksum; - - err = pwrite64(fd, data, VDEV_PHYS_SIZE, offset); - if (err == -1) { - (void) fprintf(stderr, "error: cannot write " - "label %d: %s\n", l, strerror(errno)); - continue; - } else if (err != VDEV_PHYS_SIZE) { - (void) fprintf(stderr, "error: bad write size " - "label %d\n", l); - continue; - } - - fsync(fd); - - labels_repaired[l] = B_TRUE; + zhack_repair_one_label(op, fd, &labels[l], + vdev_label_offset(filesize, l, 0), l, labels_repaired); } close(fd); @@ -605,17 +893,51 @@ zhack_repair_label_cksum(int argc, char **argv) abd_fini(); for (int l = 0; l < VDEV_LABELS; l++) { - (void) printf("label %d: %s\n", l, - labels_repaired[l] ? 
"repaired" : "skipped"); - repaired |= labels_repaired[l]; + const uint32_t lr = labels_repaired[l]; + (void) printf("label %d: ", l); + (void) printf("uberblock: %s ", + zhack_repair_label_status(lr, REPAIR_LABEL_STATUS_UB)); + (void) printf("checksum: %s\n", + zhack_repair_label_status(lr, REPAIR_LABEL_STATUS_CKSUM)); + repaired |= lr; } - if (repaired) + if (repaired > 0) return (0); return (1); } +static int +zhack_do_label_repair(int argc, char **argv) +{ + zhack_repair_op_t op = ZHACK_REPAIR_OP_UNKNOWN; + int c; + + optind = 1; + while ((c = getopt(argc, argv, "+cu")) != -1) { + switch (c) { + case 'c': + op |= ZHACK_REPAIR_OP_CKSUM; + break; + case 'u': + op |= ZHACK_REPAIR_OP_UNDETACH; + break; + default: + usage(); + break; + } + } + + argc -= optind; + argv += optind; + + if (op == ZHACK_REPAIR_OP_UNKNOWN) + op = ZHACK_REPAIR_OP_CKSUM; + + return (zhack_label_repair(op, argc, argv)); +} + static int zhack_do_label(int argc, char **argv) { @@ -632,7 +954,7 @@ zhack_do_label(int argc, char **argv) subcommand = argv[0]; if (strcmp(subcommand, "repair") == 0) { - err = zhack_repair_label_cksum(argc, argv); + err = zhack_do_label_repair(argc, argv); } else { (void) fprintf(stderr, "error: unknown subcommand: %s\n", subcommand); diff --git a/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c b/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c index 27c7d63781c5..979546ab3090 100644 --- a/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c +++ b/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c @@ -587,16 +587,11 @@ zfs_key_config_modify_session_counter(pam_handle_t *pamh, errno); return (-1); } - size_t runtime_path_len = strlen(runtime_path); - size_t counter_path_len = runtime_path_len + 1 + 10; - char *counter_path = malloc(counter_path_len + 1); - if (!counter_path) { + + char *counter_path; + if (asprintf(&counter_path, "%s/%u", runtime_path, config->uid) == -1) return (-1); - } - counter_path[0] = 0; - strcat(counter_path, runtime_path); - 
snprintf(counter_path + runtime_path_len, counter_path_len, "/%d", - config->uid); + const int fd = open(counter_path, O_RDWR | O_CLOEXEC | O_CREAT | O_NOFOLLOW, S_IRUSR | S_IWUSR); diff --git a/sys/contrib/openzfs/include/sys/dbuf.h b/sys/contrib/openzfs/include/sys/dbuf.h index fb26a83b1844..1800a7e31da0 100644 --- a/sys/contrib/openzfs/include/sys/dbuf.h +++ b/sys/contrib/openzfs/include/sys/dbuf.h @@ -61,16 +61,18 @@ extern "C" { /* * The simplified state transition diagram for dbufs looks like: * - * +----> READ ----+ - * | | - * | V - * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) - * | ^ ^ - * | | | - * +----> FILL ----+ | - * | | - * | | - * +--------> NOFILL -------+ + * +--> READ --+ + * | | + * | V + * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) + * ^ | ^ ^ + * | | | | + * | +--> FILL --+ | + * | | | + * | | | + * | +------> NOFILL -----+ + * | | + * +---------------+ * * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range * to find all dbufs in a range of a dnode and must be less than any other @@ -375,6 +377,7 @@ dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, uint64_t blkid, uint64_t *hash_out); int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); +void dmu_buf_will_clone(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); diff --git a/sys/contrib/openzfs/include/sys/dmu.h b/sys/contrib/openzfs/include/sys/dmu.h index 1b82ff620f27..5ee6704668a4 100644 --- a/sys/contrib/openzfs/include/sys/dmu.h +++ b/sys/contrib/openzfs/include/sys/dmu.h @@ -782,6 +782,9 @@ dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); +void dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); +void 
dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, + int len); void dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, @@ -1062,8 +1065,8 @@ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off); int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, dmu_tx_t *tx, struct blkptr *bps, size_t *nbpsp); -void dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, struct blkptr *bps, size_t *nbpsp); +int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps, boolean_t replay); diff --git a/sys/contrib/openzfs/include/sys/dmu_tx.h b/sys/contrib/openzfs/include/sys/dmu_tx.h index ca8514e5d2d0..aa55da626149 100644 --- a/sys/contrib/openzfs/include/sys/dmu_tx.h +++ b/sys/contrib/openzfs/include/sys/dmu_tx.h @@ -91,6 +91,7 @@ enum dmu_tx_hold_type { THT_SPACE, THT_SPILL, THT_CLONE, + THT_APPEND, THT_NUMTYPES }; diff --git a/sys/contrib/openzfs/include/sys/spa.h b/sys/contrib/openzfs/include/sys/spa.h index b96a9ef1d42f..460ea2bfee4e 100644 --- a/sys/contrib/openzfs/include/sys/spa.h +++ b/sys/contrib/openzfs/include/sys/spa.h @@ -1140,7 +1140,8 @@ extern const char *spa_state_to_name(spa_t *spa); struct zbookmark_phys; extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth); -extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb); +extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, + const uint64_t *birth); extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t state); extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd, diff --git a/sys/contrib/openzfs/include/sys/zio.h b/sys/contrib/openzfs/include/sys/zio.h index 3463682a1065..695bc09e6cb7 100644 --- 
a/sys/contrib/openzfs/include/sys/zio.h +++ b/sys/contrib/openzfs/include/sys/zio.h @@ -531,6 +531,12 @@ enum blk_verify_flag { BLK_VERIFY_HALT }; +enum blk_config_flag { + BLK_CONFIG_HELD, // SCL_VDEV held for writer + BLK_CONFIG_NEEDED, // SCL_VDEV should be obtained for reader + BLK_CONFIG_SKIP, // skip checks which require SCL_VDEV +}; + extern int zio_bookmark_compare(const void *, const void *); extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, @@ -646,7 +652,7 @@ extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, - boolean_t config_held, enum blk_verify_flag blk_verify); + enum blk_config_flag blk_config, enum blk_verify_flag blk_verify); /* * Initial setup and teardown. diff --git a/sys/contrib/openzfs/man/man1/zhack.1 b/sys/contrib/openzfs/man/man1/zhack.1 index 26b8156b4008..937f1e9168c2 100644 --- a/sys/contrib/openzfs/man/man1/zhack.1 +++ b/sys/contrib/openzfs/man/man1/zhack.1 @@ -98,10 +98,29 @@ feature is now required to read the pool MOS. .It Xo .Nm zhack .Cm label repair +.Op Fl cu .Ar device .Xc -Repair corrupted labels by rewriting the checksum using the presumed valid -contents of the label. +Repair labels of a specified +.Ar device +according to options. +.Pp +Flags may be combined to do their functions simultaneously. +. +.Pp +The +.Fl c +flag repairs corrupted label checksums +. +.Pp +The +.Fl u +flag restores the label on a detached device +.Pp +Example: +.Nm zhack Cm label repair Fl cu Ar device + Fix checksums and undetach a device +. .El . .Sh GLOBAL OPTIONS diff --git a/sys/contrib/openzfs/man/man7/zpool-features.7 b/sys/contrib/openzfs/man/man7/zpool-features.7 index efe9e833996a..2b7dcb63829c 100644 --- a/sys/contrib/openzfs/man/man7/zpool-features.7 +++ b/sys/contrib/openzfs/man/man7/zpool-features.7 @@ -562,13 +562,12 @@ This feature enables the upgraded version of errlog, which required an on-disk error log format change. 
Now the error log of each head dataset is stored separately in the zap object and keyed by the head id. -In case of encrypted filesystems with unloaded keys or unmounted encrypted -filesystems we are unable to check their snapshots or clones for errors and -these will not be reported. -In this case no filenames will be reported either. With this feature enabled, every dataset affected by an error block is listed in the output of .Nm zpool Cm status . +In case of encrypted filesystems with unloaded keys we are unable to check +their snapshots or clones for errors and these will not be reported. +An "access denied" error will be reported. .Pp \*[instant-never] . diff --git a/sys/contrib/openzfs/man/man8/zpool-status.8 b/sys/contrib/openzfs/man/man8/zpool-status.8 index ed572e29f51f..8f9580cf086e 100644 --- a/sys/contrib/openzfs/man/man8/zpool-status.8 +++ b/sys/contrib/openzfs/man/man8/zpool-status.8 @@ -119,6 +119,9 @@ See .It Fl v Displays verbose data error information, printing out a complete list of all data errors since the last complete pool scrub. +If the head_errlog feature is enabled and files containing errors have been +removed then the respective filenames will not be reported in subsequent runs +of this command. .It Fl x Only display status for pools that are exhibiting errors or are otherwise unavailable. 
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c index 26578491fd67..2520507b98aa 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c @@ -832,6 +832,7 @@ zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag) (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize)) return (SET_ERROR(EIO)); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); ssize_t start_resid = zfs_uio_resid(&uio); lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio), zfs_uio_resid(&uio), RL_READER); @@ -853,6 +854,7 @@ zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag) zfs_rangelock_exit(lr); int64_t nread = start_resid - zfs_uio_resid(&uio); dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); + rw_exit(&zv->zv_suspend_lock); return (error); } diff --git a/sys/contrib/openzfs/module/zfs/abd.c b/sys/contrib/openzfs/module/zfs/abd.c index 26222d2efe3f..745ee8f02ed4 100644 --- a/sys/contrib/openzfs/module/zfs/abd.c +++ b/sys/contrib/openzfs/module/zfs/abd.c @@ -370,7 +370,20 @@ abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) * will retain all the free_on_free settings after being * added to the parents list. */ +#ifdef ZFS_DEBUG + /* + * If cabd had abd_parent, we have to drop it here. We can't + * transfer it to pabd, nor can we clear abd_size leaving it. 
+ */ + if (cabd->abd_parent != NULL) { + (void) zfs_refcount_remove_many( + &cabd->abd_parent->abd_children, + cabd->abd_size, cabd); + cabd->abd_parent = NULL; + } +#endif pabd->abd_size += cabd->abd_size; + cabd->abd_size = 0; list_move_tail(&ABD_GANG(pabd).abd_gang_chain, &ABD_GANG(cabd).abd_gang_chain); ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); @@ -408,7 +421,6 @@ abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) */ if (abd_is_gang(cabd)) { ASSERT(!list_link_active(&cabd->abd_gang_link)); - ASSERT(!list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); return (abd_gang_add_gang(pabd, cabd, free_on_free)); } ASSERT(!abd_is_gang(cabd)); diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index c50228a2682f..a78f664c4fe8 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -5696,8 +5696,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, * and treat it as a checksum error. This allows an alternate blkptr * to be tried when one is available (e.g. ditto blocks). */ - if (!zfs_blkptr_verify(spa, bp, zio_flags & ZIO_FLAG_CONFIG_WRITER, - BLK_VERIFY_LOG)) { + if (!zfs_blkptr_verify(spa, bp, (zio_flags & ZIO_FLAG_CONFIG_WRITER) ? + BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { rc = SET_ERROR(ECKSUM); goto done; } @@ -8198,10 +8198,17 @@ l2arc_write_size(l2arc_dev_t *dev) * iteration can occur. */ dev_size = dev->l2ad_end - dev->l2ad_start; + + /* We need to add in the worst case scenario of log block overhead. */ tsize = size + l2arc_log_blk_overhead(size, dev); - if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) + if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { + /* + * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) + * times the writesize, whichever is greater. 
+ */ tsize += MAX(64 * 1024 * 1024, (tsize * l2arc_trim_ahead) / 100); + } if (tsize >= dev_size) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " @@ -8836,19 +8843,6 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) buflist = &dev->l2ad_buflist; - /* - * We need to add in the worst case scenario of log block overhead. - */ - distance += l2arc_log_blk_overhead(distance, dev); - if (vd->vdev_has_trim && l2arc_trim_ahead > 0) { - /* - * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) - * times the write size, whichever is greater. - */ - distance += MAX(64 * 1024 * 1024, - (distance * l2arc_trim_ahead) / 100); - } - top: rerun = B_FALSE; if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index 8193fb244079..049a62c1c171 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -1573,24 +1573,22 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, bpp = &bp; } } else { - struct dirty_leaf *dl; dbuf_dirty_record_t *dr; ASSERT3S(db->db_state, ==, DB_NOFILL); + /* + * Block cloning: If we have a pending block clone, + * we don't want to read the underlying block, but the content + * of the block being cloned, so we have the most recent data. 
+ */ dr = list_head(&db->db_dirty_records); - if (dr == NULL) { + if (dr == NULL || !dr->dt.dl.dr_brtwrite) { err = EIO; goto early_unlock; - } else { - dl = &dr->dt.dl; - if (!dl->dr_brtwrite) { - err = EIO; - goto early_unlock; - } - bp = dl->dr_overridden_by; - bpp = &bp; } + bp = dr->dt.dl.dr_overridden_by; + bpp = &bp; } err = dbuf_read_hole(db, dn, bpp); @@ -1906,6 +1904,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *bp = &dr->dt.dl.dr_overridden_by; uint64_t txg = dr->dr_txg; + boolean_t release; ASSERT(MUTEX_HELD(&db->db_mtx)); /* @@ -1926,8 +1925,10 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) zio_free(db->db_objset->os_spa, txg, bp); + release = !dr->dt.dl.dr_brtwrite; dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; + dr->dt.dl.dr_brtwrite = B_FALSE; dr->dt.dl.dr_has_raw_params = B_FALSE; /* @@ -1938,7 +1939,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) * the buf thawed to save the effort of freezing & * immediately re-thawing it. */ - if (!dr->dt.dl.dr_brtwrite) + if (release) arc_release(dr->dt.dl.dr_data, db); } @@ -2022,11 +2023,6 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, db->db_blkid > dn->dn_maxblkid) dn->dn_maxblkid = db->db_blkid; dbuf_unoverride(dr); - if (dr->dt.dl.dr_brtwrite) { - ASSERT(db->db.db_data == NULL); - mutex_exit(&db->db_mtx); - continue; - } } else { /* * This dbuf is not dirty in the open context. 
@@ -2613,6 +2609,7 @@ static void dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + boolean_t undirty = B_FALSE; ASSERT(tx->tx_txg != 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); @@ -2625,7 +2622,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) */ mutex_enter(&db->db_mtx); - if (db->db_state == DB_CACHED) { + if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) { dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); /* * It's possible that it is already dirty but not cached, @@ -2633,10 +2630,21 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) * go through dmu_buf_will_dirty(). */ if (dr != NULL) { - /* This dbuf is already dirty and cached. */ - dbuf_redirty(dr); - mutex_exit(&db->db_mtx); - return; + if (dr->dt.dl.dr_brtwrite) { + /* + * Block cloning: If we are dirtying a cloned + * block, we cannot simply redirty it, because + * this dr has no data associated with it. + * We will go through a full undirtying below, + * before dirtying it again. + */ + undirty = B_TRUE; + } else { + /* This dbuf is already dirty and cached. */ + dbuf_redirty(dr); + mutex_exit(&db->db_mtx); + return; + } } } mutex_exit(&db->db_mtx); @@ -2645,7 +2653,20 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) flags |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); + + /* + * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we + * want to make sure dbuf_read() will read the pending cloned block and + * not the uderlying block that is being replaced. dbuf_undirty() will + * do dbuf_unoverride(), so we will end up with cloned block content, + * without overridden BP. 
+ */ (void) dbuf_read(db, NULL, flags); + if (undirty) { + mutex_enter(&db->db_mtx); + VERIFY(!dbuf_undirty(db, tx)); + mutex_exit(&db->db_mtx); + } (void) dbuf_dirty(db, tx); } @@ -2668,6 +2689,28 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) return (dr != NULL); } +void +dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + /* + * Block cloning: We are going to clone into this block, so undirty + * modifications done to this block so far in this txg. This includes + * writes and clones into this block. + */ + mutex_enter(&db->db_mtx); + VERIFY(!dbuf_undirty(db, tx)); + ASSERT(list_head(&db->db_dirty_records) == NULL); + if (db->db_buf != NULL) { + arc_buf_destroy(db->db_buf, db); + db->db_buf = NULL; + } + mutex_exit(&db->db_mtx); + + dmu_buf_will_not_fill(db_fake, tx); +} + void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { @@ -2675,7 +2718,9 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) db->db_state = DB_NOFILL; DTRACE_SET_STATE(db, "allocating NOFILL buffer"); - dmu_buf_will_fill(db_fake, tx); + + dbuf_noread(db); + (void) dbuf_dirty(db, tx); } void @@ -2691,6 +2736,19 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); + if (db->db_state == DB_NOFILL) { + /* + * Block cloning: We will be completely overwriting a block + * cloned in this transaction group, so let's undirty the + * pending clone and mark the block as uncached. This will be + * as if the clone was never done. 
+ */ + mutex_enter(&db->db_mtx); + VERIFY(!dbuf_undirty(db, tx)); + mutex_exit(&db->db_mtx); + db->db_state = DB_UNCACHED; + } + dbuf_noread(db); (void) dbuf_dirty(db, tx); } @@ -4636,6 +4694,20 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) i += DNODE_MIN_SIZE; if (dnp->dn_type != DMU_OT_NONE) { fill++; + for (int j = 0; j < dnp->dn_nblkptr; + j++) { + (void) zfs_blkptr_verify(spa, + &dnp->dn_blkptr[j], + BLK_CONFIG_SKIP, + BLK_VERIFY_HALT); + } + if (dnp->dn_flags & + DNODE_FLAG_SPILL_BLKPTR) { + (void) zfs_blkptr_verify(spa, + DN_SPILL_BLKPTR(dnp), + BLK_CONFIG_SKIP, + BLK_VERIFY_HALT); + } i += dnp->dn_extra_slots * DNODE_MIN_SIZE; } @@ -4653,6 +4725,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; + (void) zfs_blkptr_verify(spa, ibp, + BLK_CONFIG_SKIP, BLK_VERIFY_HALT); fill += BP_GET_FILL(ibp); } } @@ -5139,6 +5213,7 @@ EXPORT_SYMBOL(dbuf_dirty); EXPORT_SYMBOL(dmu_buf_set_crypt_params); EXPORT_SYMBOL(dmu_buf_will_dirty); EXPORT_SYMBOL(dmu_buf_is_dirty); +EXPORT_SYMBOL(dmu_buf_will_clone); EXPORT_SYMBOL(dmu_buf_will_not_fill); EXPORT_SYMBOL(dmu_buf_will_fill); EXPORT_SYMBOL(dmu_buf_fill_done); diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index cda1472a77aa..8a13b8f410a1 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -2173,7 +2173,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, - dmu_tx_t *tx, blkptr_t *bps, size_t *nbpsp) + blkptr_t *bps, size_t *nbpsp) { dmu_buf_t **dbp, *dbuf; dmu_buf_impl_t *db; @@ -2197,10 +2197,6 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, mutex_enter(&db->db_mtx); - /* - * If the block is not on the disk yet, it has no BP assigned. - * There is not much we can do... 
- */ if (!list_is_empty(&db->db_dirty_records)) { dbuf_dirty_record_t *dr; @@ -2235,10 +2231,6 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, error = SET_ERROR(EAGAIN); goto out; } - if (dmu_buf_is_dirty(dbuf, tx)) { - error = SET_ERROR(EAGAIN); - goto out; - } /* * Make sure we clone only data blocks. */ @@ -2257,7 +2249,7 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, return (error); } -void +int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, dmu_tx_t *tx, const blkptr_t *bps, size_t nbps, boolean_t replay) { @@ -2267,7 +2259,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, struct dirty_leaf *dl; dbuf_dirty_record_t *dr; const blkptr_t *bp; - int numbufs; + int error = 0, i, numbufs; spa = os->os_spa; @@ -2275,27 +2267,37 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, &numbufs, &dbp)); ASSERT3U(nbps, ==, numbufs); - for (int i = 0; i < numbufs; i++) { + /* + * Before we start cloning make sure that the dbufs sizes match new BPs + * sizes. If they don't, that's a no-go, as we are not able to shrink + * dbufs. 
+ */ + for (i = 0; i < numbufs; i++) { dbuf = dbp[i]; db = (dmu_buf_impl_t *)dbuf; bp = &bps[i]; ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_SPILL_BLKID); + + if (!BP_IS_HOLE(bp) && BP_GET_LSIZE(bp) != dbuf->db_size) { + error = SET_ERROR(EXDEV); + goto out; + } + } + + for (i = 0; i < numbufs; i++) { + dbuf = dbp[i]; + db = (dmu_buf_impl_t *)dbuf; + bp = &bps[i]; + + ASSERT0(db->db_level); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_SPILL_BLKID); ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp)); - mutex_enter(&db->db_mtx); - - VERIFY(!dbuf_undirty(db, tx)); - ASSERT(list_head(&db->db_dirty_records) == NULL); - if (db->db_buf != NULL) { - arc_buf_destroy(db->db_buf, db); - db->db_buf = NULL; - } - - mutex_exit(&db->db_mtx); - - dmu_buf_will_not_fill(dbuf, tx); + dmu_buf_will_clone(dbuf, tx); mutex_enter(&db->db_mtx); @@ -2305,7 +2307,6 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, dl = &dr->dt.dl; dl->dr_overridden_by = *bp; dl->dr_brtwrite = B_TRUE; - dl->dr_override_state = DR_OVERRIDDEN; if (BP_IS_HOLE(bp)) { dl->dr_overridden_by.blk_birth = 0; @@ -2331,8 +2332,10 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, brt_pending_add(spa, bp, tx); } } - +out: dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (error); } void diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c index c2ce5ce000ac..c22a95f8647f 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_recv.c +++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c @@ -1353,7 +1353,7 @@ corrective_read_done(zio_t *zio) cr_cb_data_t *data = zio->io_private; /* Corruption corrected; update error log if needed */ if (zio->io_error == 0) - spa_remove_error(data->spa, &data->zb); + spa_remove_error(data->spa, &data->zb, &zio->io_bp->blk_birth); kmem_free(data, sizeof (cr_cb_data_t)); abd_free(zio->io_abd); } diff --git 
a/sys/contrib/openzfs/module/zfs/dmu_tx.c b/sys/contrib/openzfs/module/zfs/dmu_tx.c index 1c5608c4541b..c4e274bd4c42 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_tx.c +++ b/sys/contrib/openzfs/module/zfs/dmu_tx.c @@ -294,6 +294,53 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) } } +static void +dmu_tx_count_append(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ + dnode_t *dn = txh->txh_dnode; + int err = 0; + + if (len == 0) + return; + + (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); + + if (dn == NULL) + return; + + /* + * For i/o error checking, read the blocks that will be needed + * to perform the append; first level-0 block (if not aligned, i.e. + * if they are partial-block writes), no additional blocks are read. + */ + if (dn->dn_maxblkid == 0) { + if (off < dn->dn_datablksz && + (off > 0 || len < dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + } else { + zio_t *zio = zio_root(dn->dn_objset->os_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + + /* first level-0 block */ + uint64_t start = off >> dn->dn_datablkshift; + if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { + err = dmu_tx_check_ioerr(zio, dn, 0, start); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + + err = zio_wait(zio); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } +} + static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { @@ -334,6 +381,42 @@ dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) } } +/* + * Should be used when appending to an object and the exact offset is unknown. + * The write must occur at or beyond the specified offset. Only the L0 block + * at provided offset will be prefetched. 
+ */ +void +dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT3U(len, <=, DMU_MAX_ACCESS); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_APPEND, off, DMU_OBJECT_END); + if (txh != NULL) { + dmu_tx_count_append(txh, off, len); + dmu_tx_count_dnode(txh); + } +} + +void +dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT3U(len, <=, DMU_MAX_ACCESS); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_APPEND, off, DMU_OBJECT_END); + if (txh != NULL) { + dmu_tx_count_append(txh, off, len); + dmu_tx_count_dnode(txh); + } +} + /* * This function marks the transaction as being a "net free". The end * result is that refquotas will be disabled for this transaction, and @@ -668,6 +751,26 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) if (blkid == 0) match_offset = TRUE; break; + case THT_APPEND: + if (blkid >= beginblk && (blkid <= endblk || + txh->txh_arg2 == DMU_OBJECT_END)) + match_offset = TRUE; + + /* + * THT_WRITE used for bonus and spill blocks. + */ + ASSERT(blkid != DMU_BONUS_BLKID && + blkid != DMU_SPILL_BLKID); + + /* + * They might have to increase nlevels, + * thus dirtying the new TLIBs. Or the + * might have to change the block size, + * thus dirying the new lvl=0 blk=0. 
+ */ + if (blkid == 0) + match_offset = TRUE; + break; case THT_FREE: /* * We will dirty all the level 1 blocks in @@ -1454,6 +1557,8 @@ dmu_tx_fini(void) EXPORT_SYMBOL(dmu_tx_create); EXPORT_SYMBOL(dmu_tx_hold_write); EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode); +EXPORT_SYMBOL(dmu_tx_hold_append); +EXPORT_SYMBOL(dmu_tx_hold_append_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_free); EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_zap); diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c index d6a9365df120..d398b6705551 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_scan.c +++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c @@ -1970,7 +1970,8 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, DMU_USERUSED_OBJECT, tx); } arc_buf_destroy(buf, &buf); - } else if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) { + } else if (!zfs_blkptr_verify(spa, bp, + BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { /* * Sanity check the block pointer contents, this is handled * by arc_read() for the cases above. diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index dd4a442d97a1..16396170273c 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -2387,7 +2387,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, * When damaged consider it to be a metadata error since we cannot * trust the BP_GET_TYPE and BP_GET_LEVEL values. */ - if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) { + if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { atomic_inc_64(&sle->sle_meta_count); return (0); } @@ -6378,6 +6378,16 @@ spa_tryimport(nvlist_t *tryconfig) spa->spa_config_source = SPA_CONFIG_SRC_SCAN; } + /* + * spa_import() relies on a pool config fetched by spa_try_import() + * for spare/cache devices. 
Import flags are not passed to + * spa_tryimport(), which makes it return early due to a missing log + * device and missing retrieving the cache device and spare eventually. + * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch + * the correct configuration regardless of the missing log device. + */ + spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG; + error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); /* diff --git a/sys/contrib/openzfs/module/zfs/spa_errlog.c b/sys/contrib/openzfs/module/zfs/spa_errlog.c index 3bc8619b51a8..31719063a227 100644 --- a/sys/contrib/openzfs/module/zfs/spa_errlog.c +++ b/sys/contrib/openzfs/module/zfs/spa_errlog.c @@ -72,6 +72,11 @@ #define NAME_MAX_LEN 64 +typedef struct clones { + uint64_t clone_ds; + list_node_t node; +} clones_t; + /* * spa_upgrade_errlog_limit : A zfs module parameter that controls the number * of on-disk error log entries that will be converted to the new @@ -135,10 +140,6 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb) } #ifdef _KERNEL -static int check_clones(spa_t *spa, uint64_t zap_clone, uint64_t snap_count, - uint64_t *snap_obj_array, zbookmark_err_phys_t *zep, void* uaddr, - uint64_t *count); - static void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) { @@ -162,15 +163,15 @@ name_to_object(char *buf, uint64_t *obj) static int get_head_ds(spa_t *spa, uint64_t dsobj, uint64_t *head_ds) { dsl_dataset_t *ds; - int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, - dsobj, FTAG, &ds); + int error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, + dsobj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); ASSERT(head_ds); *head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } @@ -291,12 +292,13 @@ copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count) */ static int check_filesystem(spa_t *spa, uint64_t head_ds, 
zbookmark_err_phys_t *zep, - void *uaddr, uint64_t *count) + void *uaddr, uint64_t *count, list_t *clones_list) { dsl_dataset_t *ds; dsl_pool_t *dp = spa->spa_dsl_pool; - int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds); + int error = dsl_dataset_hold_obj_flags(dp, head_ds, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); @@ -305,23 +307,6 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, boolean_t check_snapshot = B_TRUE; error = find_birth_txg(ds, zep, &latest_txg); - /* - * If the filesystem is encrypted and the key is not loaded - * or the encrypted filesystem is not mounted the error will be EACCES. - * In that case report an error in the head filesystem and return. - */ - if (error == EACCES) { - dsl_dataset_rele(ds, FTAG); - zbookmark_phys_t zb; - zep_to_zb(head_ds, zep, &zb); - error = copyout_entry(&zb, uaddr, count); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - return (0); - } - /* * If find_birth_txg() errors out otherwise, let txg_to_consider be * equal to the spa's syncing txg: if check_filesystem() errors out @@ -333,7 +318,7 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, zep_to_zb(head_ds, zep, &zb); error = copyout_entry(&zb, uaddr, count); if (error != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } check_snapshot = B_FALSE; @@ -351,14 +336,14 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); if (error != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } } if (snap_count == 0) { /* Filesystem without snapshots. 
*/ - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (0); } @@ -370,20 +355,21 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; uint64_t zap_clone = dsl_dir_phys(ds->ds_dir)->dd_clones; - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); /* Check only snapshots created from this file system. */ while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && snap_obj_txg <= txg_to_consider) { - error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds); + error = dsl_dataset_hold_obj_flags(dp, snap_obj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) goto out; if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds) { snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); continue; } @@ -403,33 +389,20 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, zep_to_zb(snap_obj, zep, &zb); error = copyout_entry(&zb, uaddr, count); if (error != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, + FTAG); goto out; } } snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); } - if (zap_clone != 0 && aff_snap_count > 0) { - error = check_clones(spa, zap_clone, snap_count, snap_obj_array, - zep, uaddr, count); - } + if (zap_clone == 0 || aff_snap_count == 0) + return (0); -out: - kmem_free(snap_obj_array, sizeof (*snap_obj_array)); - return (error); -} - -/* - * Clone checking. 
- */ -static int check_clones(spa_t *spa, uint64_t zap_clone, uint64_t snap_count, - uint64_t *snap_obj_array, zbookmark_err_phys_t *zep, void* uaddr, - uint64_t *count) -{ - int error = 0; + /* Check clones. */ zap_cursor_t *zc; zap_attribute_t *za; @@ -440,10 +413,9 @@ static int check_clones(spa_t *spa, uint64_t zap_clone, uint64_t snap_count, zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { - dsl_pool_t *dp = spa->spa_dsl_pool; dsl_dataset_t *clone; - error = dsl_dataset_hold_obj(dp, za->za_first_integer, - FTAG, &clone); + error = dsl_dataset_hold_obj_flags(dp, za->za_first_integer, + DS_HOLD_FLAG_DECRYPT, FTAG, &clone); if (error != 0) break; @@ -458,22 +430,22 @@ static int check_clones(spa_t *spa, uint64_t zap_clone, uint64_t snap_count, == snap_obj_array[i]) found = B_TRUE; } - dsl_dataset_rele(clone, FTAG); + dsl_dataset_rele_flags(clone, DS_HOLD_FLAG_DECRYPT, FTAG); if (!found) continue; - error = check_filesystem(spa, za->za_first_integer, zep, - uaddr, count); - - if (error != 0) - break; + clones_t *ct = kmem_zalloc(sizeof (*ct), KM_SLEEP); + ct->clone_ds = za->za_first_integer; + list_insert_tail(clones_list, ct); } zap_cursor_fini(zc); kmem_free(za, sizeof (*za)); kmem_free(zc, sizeof (*zc)); +out: + kmem_free(snap_obj_array, sizeof (*snap_obj_array)); return (error); } @@ -488,14 +460,14 @@ find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, return (error); dsl_dataset_t *ds; - error = dsl_dataset_hold_obj(spa->spa_dsl_pool, oldest_dsobj, - FTAG, &ds); + error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, oldest_dsobj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); *top_affected_fs = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (0); } @@ -521,10 +493,43 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, } uint64_t top_affected_fs; + uint64_t init_count = *count; int 
error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs); if (error == 0) { + clones_t *ct; + list_t clones_list; + + list_create(&clones_list, sizeof (clones_t), + offsetof(clones_t, node)); + error = check_filesystem(spa, top_affected_fs, zep, - uaddr, count); + uaddr, count, &clones_list); + + while ((ct = list_remove_head(&clones_list)) != NULL) { + error = check_filesystem(spa, ct->clone_ds, zep, + uaddr, count, &clones_list); + kmem_free(ct, sizeof (*ct)); + + if (error) { + while (!list_is_empty(&clones_list)) { + ct = list_remove_head(&clones_list); + kmem_free(ct, sizeof (*ct)); + } + break; + } + } + + list_destroy(&clones_list); + } + if (error == 0 && init_count == *count) { + /* + * If we reach this point, no errors have been detected + * in the checked filesystems/snapshots. Before returning mark + * the error block to be removed from the error lists and logs. + */ + zbookmark_phys_t zb; + zep_to_zb(head_ds, zep, &zb); + spa_remove_error(spa, &zb, &zep->zb_birth); } return (error); @@ -536,37 +541,111 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, * so that we can later remove the related log entries in sync context. 
*/ static void -spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb) +spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb, + const uint64_t *birth) { char name[NAME_MAX_LEN]; if (obj == 0) return; - bookmark_to_name(healed_zb, name, sizeof (name)); - mutex_enter(&spa->spa_errlog_lock); - if (zap_contains(spa->spa_meta_objset, obj, name) == 0) { - /* - * Found an error matching healed zb, add zb to our - * tree of healed errors - */ - avl_tree_t *tree = &spa->spa_errlist_healed; - spa_error_entry_t search; - spa_error_entry_t *new; - avl_index_t where; - search.se_bookmark = *healed_zb; - mutex_enter(&spa->spa_errlist_lock); - if (avl_find(tree, &search, &where) != NULL) { - mutex_exit(&spa->spa_errlist_lock); - mutex_exit(&spa->spa_errlog_lock); - return; + boolean_t held_list = B_FALSE; + boolean_t held_log = B_FALSE; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + bookmark_to_name(healed_zb, name, sizeof (name)); + + if (zap_contains(spa->spa_meta_objset, healed_zb->zb_objset, + name) == 0) { + if (!MUTEX_HELD(&spa->spa_errlog_lock)) { + mutex_enter(&spa->spa_errlog_lock); + held_log = B_TRUE; + } + + /* + * Found an error matching healed zb, add zb to our + * tree of healed errors + */ + avl_tree_t *tree = &spa->spa_errlist_healed; + spa_error_entry_t search; + spa_error_entry_t *new; + avl_index_t where; + search.se_bookmark = *healed_zb; + if (!MUTEX_HELD(&spa->spa_errlist_lock)) { + mutex_enter(&spa->spa_errlist_lock); + held_list = B_TRUE; + } + if (avl_find(tree, &search, &where) != NULL) { + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); + return; + } + new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); + new->se_bookmark = *healed_zb; + avl_insert(tree, new, where); + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); } - new = kmem_zalloc(sizeof (spa_error_entry_t), 
KM_SLEEP); - new->se_bookmark = *healed_zb; - avl_insert(tree, new, where); - mutex_exit(&spa->spa_errlist_lock); + return; } - mutex_exit(&spa->spa_errlog_lock); + + zbookmark_err_phys_t healed_zep; + healed_zep.zb_object = healed_zb->zb_object; + healed_zep.zb_level = healed_zb->zb_level; + healed_zep.zb_blkid = healed_zb->zb_blkid; + + if (birth != NULL) + healed_zep.zb_birth = *birth; + else + healed_zep.zb_birth = 0; + + errphys_to_name(&healed_zep, name, sizeof (name)); + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_errlog_last); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + if (zap_contains(spa->spa_meta_objset, za.za_first_integer, + name) == 0) { + if (!MUTEX_HELD(&spa->spa_errlog_lock)) { + mutex_enter(&spa->spa_errlog_lock); + held_log = B_TRUE; + } + + avl_tree_t *tree = &spa->spa_errlist_healed; + spa_error_entry_t search; + spa_error_entry_t *new; + avl_index_t where; + search.se_bookmark = *healed_zb; + + if (!MUTEX_HELD(&spa->spa_errlist_lock)) { + mutex_enter(&spa->spa_errlist_lock); + held_list = B_TRUE; + } + + if (avl_find(tree, &search, &where) != NULL) { + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); + continue; + } + new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); + new->se_bookmark = *healed_zb; + new->se_zep = healed_zep; + avl_insert(tree, new, where); + + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); + } + } + zap_cursor_fini(&zc); } /* @@ -604,12 +683,36 @@ spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx) &cookie)) != NULL) { remove_error_from_list(spa, s, &se->se_bookmark); remove_error_from_list(spa, l, &se->se_bookmark); - bookmark_to_name(&se->se_bookmark, name, sizeof (name)); kmem_free(se, sizeof (spa_error_entry_t)); - (void) zap_remove(spa->spa_meta_objset, - spa->spa_errlog_last, name, tx); - 
(void) zap_remove(spa->spa_meta_objset, - spa->spa_errlog_scrub, name, tx); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + bookmark_to_name(&se->se_bookmark, name, sizeof (name)); + (void) zap_remove(spa->spa_meta_objset, + spa->spa_errlog_last, name, tx); + (void) zap_remove(spa->spa_meta_objset, + spa->spa_errlog_scrub, name, tx); + } else { + errphys_to_name(&se->se_zep, name, sizeof (name)); + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_errlog_last); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + zap_remove(spa->spa_meta_objset, + za.za_first_integer, name, tx); + } + zap_cursor_fini(&zc); + + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_errlog_scrub); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + zap_remove(spa->spa_meta_objset, + za.za_first_integer, name, tx); + } + zap_cursor_fini(&zc); + } } } @@ -618,14 +721,10 @@ spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx) * later in spa_remove_healed_errors(). 
*/ void -spa_remove_error(spa_t *spa, zbookmark_phys_t *zb) +spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, const uint64_t *birth) { - char name[NAME_MAX_LEN]; - - bookmark_to_name(zb, name, sizeof (name)); - - spa_add_healed_error(spa, spa->spa_errlog_last, zb); - spa_add_healed_error(spa, spa->spa_errlog_scrub, zb); + spa_add_healed_error(spa, spa->spa_errlog_last, zb, birth); + spa_add_healed_error(spa, spa->spa_errlog_scrub, zb, birth); } static uint64_t @@ -736,7 +835,8 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, dsl_dataset_t *ds; objset_t *os; - int error = dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds); + int error = dsl_dataset_hold_obj_flags(dp, zb.zb_objset, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) continue; @@ -751,7 +851,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, * truly persistent, it should re-appear after a scan. */ if (dmu_objset_from_ds(ds, &os) != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); continue; } @@ -759,7 +859,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, blkptr_t bp; if (dnode_hold(os, zep.zb_object, FTAG, &dn) != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); continue; } @@ -773,7 +873,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); if (error != 0 || BP_IS_HOLE(&bp)) continue; @@ -827,62 +927,84 @@ spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx) static int process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count) { - zap_cursor_t zc; - zap_attribute_t za; - if (obj == 0) return (0); + zap_cursor_t *zc; + zap_attribute_t *za; + + zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + if 
(!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { - for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { + for (zap_cursor_init(zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { if (*count == 0) { - zap_cursor_fini(&zc); + zap_cursor_fini(zc); + kmem_free(zc, sizeof (*zc)); + kmem_free(za, sizeof (*za)); return (SET_ERROR(ENOMEM)); } zbookmark_phys_t zb; - name_to_bookmark(za.za_name, &zb); + name_to_bookmark(za->za_name, &zb); int error = copyout_entry(&zb, uaddr, count); if (error != 0) { - zap_cursor_fini(&zc); + zap_cursor_fini(zc); + kmem_free(zc, sizeof (*zc)); + kmem_free(za, sizeof (*za)); return (error); } } - zap_cursor_fini(&zc); + zap_cursor_fini(zc); + kmem_free(zc, sizeof (*zc)); + kmem_free(za, sizeof (*za)); return (0); } - for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { + for (zap_cursor_init(zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { - zap_cursor_t head_ds_cursor; - zap_attribute_t head_ds_attr; + zap_cursor_t *head_ds_cursor; + zap_attribute_t *head_ds_attr; - uint64_t head_ds_err_obj = za.za_first_integer; + head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + head_ds_attr = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + + uint64_t head_ds_err_obj = za->za_first_integer; uint64_t head_ds; - name_to_object(za.za_name, &head_ds); - for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset, - head_ds_err_obj); zap_cursor_retrieve(&head_ds_cursor, - &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) { + name_to_object(za->za_name, &head_ds); + for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset, + head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor, + head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) { zbookmark_err_phys_t head_ds_block; - 
name_to_errphys(head_ds_attr.za_name, &head_ds_block); + name_to_errphys(head_ds_attr->za_name, &head_ds_block); int error = process_error_block(spa, head_ds, &head_ds_block, uaddr, count); if (error != 0) { - zap_cursor_fini(&head_ds_cursor); - zap_cursor_fini(&zc); + zap_cursor_fini(head_ds_cursor); + kmem_free(head_ds_cursor, + sizeof (*head_ds_cursor)); + kmem_free(head_ds_attr, sizeof (*head_ds_attr)); + + zap_cursor_fini(zc); + kmem_free(za, sizeof (*za)); + kmem_free(zc, sizeof (*zc)); return (error); } } - zap_cursor_fini(&head_ds_cursor); + zap_cursor_fini(head_ds_cursor); + kmem_free(head_ds_cursor, sizeof (*head_ds_cursor)); + kmem_free(head_ds_attr, sizeof (*head_ds_attr)); } - zap_cursor_fini(&zc); + zap_cursor_fini(zc); + kmem_free(za, sizeof (*za)); + kmem_free(zc, sizeof (*zc)); return (0); } @@ -1229,7 +1351,8 @@ find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head, dsl_dataset_t *ds; dsl_pool_t *dp = spa->spa_dsl_pool; - int error = dsl_dataset_hold_obj(dp, old_head, FTAG, &ds); + int error = dsl_dataset_hold_obj_flags(dp, old_head, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); @@ -1237,9 +1360,9 @@ find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head, uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; while (prev_obj != 0) { - dsl_dataset_rele(ds, FTAG); - if ((error = dsl_dataset_hold_obj(dp, prev_obj, - FTAG, &ds)) == 0 && + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + if ((error = dsl_dataset_hold_obj_flags(dp, prev_obj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds)) == 0 && dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj == new_head) break; @@ -1249,7 +1372,7 @@ find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head, prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; } - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); ASSERT(prev_obj != 0); *txg = 
prev_obj_txg; return (0); diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index 22e644f75f95..3b1e2ae5fb5d 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -7862,6 +7862,8 @@ zfs_kmod_fini(void) zfs_onexit_destroy(zs->zs_onexit); if (zs->zs_zevent) zfs_zevent_destroy(zs->zs_zevent); + if (zs != &zfsdev_state_listhead) + kmem_free(zs, sizeof (zfsdev_state_t)); } zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */ diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c index a6a27222bf4c..86706469acee 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c @@ -1072,6 +1072,15 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, inzfsvfs = ZTOZSB(inzp); outzfsvfs = ZTOZSB(outzp); + + /* + * We need to call zfs_enter() potentially on two different datasets, + * so we need a dedicated function for that. + */ + error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG); + if (error != 0) + return (error); + inos = inzfsvfs->z_os; outos = outzfsvfs->z_os; @@ -1083,14 +1092,6 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, return (SET_ERROR(EXDEV)); } - /* - * We need to call zfs_enter() potentially on two different datasets, - * so we need a dedicated function for that. - */ - error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG); - if (error != 0) - return (error); - ASSERT(!outzfsvfs->z_replay); error = zfs_verify_zp(inzp); @@ -1246,16 +1247,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, break; } - /* - * Start a transaction. 
- */ - tx = dmu_tx_create(outos); - nbps = maxblocks; - error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, tx, bps, + error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps, &nbps); if (error != 0) { - dmu_tx_abort(tx); /* * If we are tyring to clone a block that was created * in the current transaction group. Return an error, @@ -1276,12 +1271,15 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, */ if (BP_IS_PROTECTED(&bps[0])) { if (inzfsvfs != outzfsvfs) { - dmu_tx_abort(tx); error = SET_ERROR(EXDEV); break; } } + /* + * Start a transaction. + */ + tx = dmu_tx_create(outos); dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE); db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl); DB_DNODE_ENTER(db); @@ -1309,8 +1307,12 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, ((len - 1) / inblksz + 1) * inblksz); } - dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, bps, nbps, - B_FALSE); + error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, + bps, nbps, B_FALSE); + if (error != 0) { + dmu_tx_commit(tx); + break; + } zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr, &clear_setid_bits_txg, tx); diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index ec9da706a806..c37da89dd438 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -1866,6 +1866,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); ASSERT3U(wsz, <=, lwb->lwb_sz); zio_shrink(lwb->lwb_write_zio, wsz); + wsz = lwb->lwb_write_zio->io_size; } else { wsz = lwb->lwb_sz; diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index 0924fb6f40bc..c17ca5e1d651 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -935,9 +935,35 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); + 
zfs_dbgmsg("bad blkptr at %px: " + "DVA[0]=%#llx/%#llx " + "DVA[1]=%#llx/%#llx " + "DVA[2]=%#llx/%#llx " + "prop=%#llx " + "pad=%#llx,%#llx " + "phys_birth=%#llx " + "birth=%#llx " + "fill=%#llx " + "cksum=%#llx/%#llx/%#llx/%#llx", + bp, + (long long)bp->blk_dva[0].dva_word[0], + (long long)bp->blk_dva[0].dva_word[1], + (long long)bp->blk_dva[1].dva_word[0], + (long long)bp->blk_dva[1].dva_word[1], + (long long)bp->blk_dva[2].dva_word[0], + (long long)bp->blk_dva[2].dva_word[1], + (long long)bp->blk_prop, + (long long)bp->blk_pad[0], + (long long)bp->blk_pad[1], + (long long)bp->blk_phys_birth, + (long long)bp->blk_birth, + (long long)bp->blk_fill, + (long long)bp->blk_cksum.zc_word[0], + (long long)bp->blk_cksum.zc_word[1], + (long long)bp->blk_cksum.zc_word[2], + (long long)bp->blk_cksum.zc_word[3]); switch (blk_verify) { case BLK_VERIFY_HALT: - dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp); zfs_panic_recover("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_LOG: @@ -958,47 +984,54 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, * If everything checks out B_TRUE is returned. The zfs_blkptr_verify * argument controls the behavior when an invalid field is detected. 
* - * Modes for zfs_blkptr_verify: - * 1) BLK_VERIFY_ONLY (evaluate the block) - * 2) BLK_VERIFY_LOG (evaluate the block and log problems) - * 3) BLK_VERIFY_HALT (call zfs_panic_recover on error) + * Values for blk_verify_flag: + * BLK_VERIFY_ONLY: evaluate the block + * BLK_VERIFY_LOG: evaluate the block and log problems + * BLK_VERIFY_HALT: call zfs_panic_recover on error + * + * Values for blk_config_flag: + * BLK_CONFIG_HELD: caller holds SCL_VDEV for writer + * BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be + * obtained for reader + * BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better + * performance */ boolean_t -zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, - enum blk_verify_flag blk_verify) +zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, + enum blk_config_flag blk_config, enum blk_verify_flag blk_verify) { int errors = 0; if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid TYPE %llu", + "blkptr at %px has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid CHECKSUM %llu", + "blkptr at %px has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid COMPRESS %llu", + "blkptr at %px has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid LSIZE %llu", + "blkptr at %px has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid PSIZE %llu", + "blkptr at %px has invalid PSIZE %llu", bp, 
(longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid ETYPE %llu", + "blkptr at %px has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } @@ -1010,10 +1043,19 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, if (!spa->spa_trust_config) return (errors == 0); - if (!config_held) - spa_config_enter(spa, SCL_VDEV, bp, RW_READER); - else + switch (blk_config) { + case BLK_CONFIG_HELD: ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER)); + break; + case BLK_CONFIG_NEEDED: + spa_config_enter(spa, SCL_VDEV, bp, RW_READER); + break; + case BLK_CONFIG_SKIP: + return (errors == 0); + default: + panic("invalid blk_config %u", blk_config); + } + /* * Pool-specific checks. * @@ -1028,20 +1070,20 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, if (vdevid >= spa->spa_root_vdev->vdev_children) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has invalid VDEV %llu", + "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has invalid VDEV %llu", + "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_hole_ops) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has hole VDEV %llu", + "blkptr at %px DVA %u has hole VDEV %llu", bp, i, (longlong_t)vdevid); continue; } @@ -1059,13 +1101,11 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, asize = vdev_gang_header_asize(vd); if (offset + asize > vd->vdev_asize) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has invalid OFFSET %llu", + "blkptr at %px DVA %u has invalid OFFSET %llu", bp, i, 
(longlong_t)offset); } } - if (errors > 0) - dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp); - if (!config_held) + if (blk_config == BLK_CONFIG_NEEDED) spa_config_exit(spa, SCL_VDEV, bp); return (errors == 0); @@ -1203,7 +1243,7 @@ void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { - (void) zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_HALT); + (void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); /* * The check for EMBEDDED is a performance optimization. We @@ -1282,8 +1322,8 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, { zio_t *zio; - (void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER, - BLK_VERIFY_HALT); + (void) zfs_blkptr_verify(spa, bp, (flags & ZIO_FLAG_CONFIG_WRITER) ? + BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); @@ -2301,7 +2341,7 @@ zio_nowait(zio_t *zio) ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && - zio_unique_parent(zio) == NULL) { + list_is_empty(&zio->io_parent_list)) { zio_t *pio; /* diff --git a/sys/contrib/openzfs/tests/runfiles/common.run b/sys/contrib/openzfs/tests/runfiles/common.run index 55991cfeaf78..e2137ac596d9 100644 --- a/sys/contrib/openzfs/tests/runfiles/common.run +++ b/sys/contrib/openzfs/tests/runfiles/common.run @@ -325,7 +325,8 @@ tests = ['zfs_wait_deleteq', 'zfs_wait_getsubopt'] tags = ['functional', 'cli_root', 'zfs_wait'] [tests/functional/cli_root/zhack] -tests = ['zhack_label_checksum'] +tests = ['zhack_label_repair_001', 'zhack_label_repair_002', + 'zhack_label_repair_003', 'zhack_label_repair_004'] pre = post = tags = ['functional', 'cli_root', 'zhack'] @@ -421,7 +422,7 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'import_cachefile_mirror_detached', 'import_cachefile_paths_changed', 'import_cachefile_shared_device', - 'import_devices_missing', + 'import_devices_missing', 'import_log_missing', 
'import_paths_changed', 'import_rewind_config_changed', 'import_rewind_device_replaced'] diff --git a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in index f3cfca912a57..63470bc041c6 100755 --- a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in +++ b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in @@ -257,6 +257,7 @@ if sys.platform.startswith('freebsd'): 'resilver/resilver_restart_001': ['FAIL', known_reason], 'pool_checkpoint/checkpoint_big_rewind': ['FAIL', 12622], 'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623], + 'snapshot/snapshot_002_pos': ['FAIL', '14831'], }) elif sys.platform.startswith('linux'): maybe.update({ diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib b/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib index 02e6a500a71a..8521f271be54 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib +++ b/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib @@ -1951,6 +1951,7 @@ function check_pool_status # pool token keyword # is_pool_removing - to check if the pool removing is a vdev # is_pool_removed - to check if the pool remove is completed # is_pool_discarding - to check if the pool checkpoint is being discarded +# is_pool_replacing - to check if the pool is performing a replacement # function is_pool_resilvering #pool { @@ -1997,6 +1998,10 @@ function is_pool_discarding #pool { check_pool_status "$1" "checkpoint" "discarding" } +function is_pool_replacing #pool +{ + zpool status "$1" | grep -qE 'replacing-[0-9]+' +} function wait_for_degraded { @@ -2983,12 +2988,15 @@ function wait_freeing #pool # Wait for every device replace operation to complete # # $1 pool name +# $2 timeout # -function wait_replacing #pool +function wait_replacing #pool timeout { + typeset timeout=${2:-300} typeset pool=${1:-$TESTPOOL} - while zpool status $pool | grep -qE 'replacing-[0-9]+'; do - log_must sleep 1 + for (( timer = 
0; timer < $timeout; timer++ )); do + is_pool_replacing $pool || break; + sleep 1; done } diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/math.shlib b/sys/contrib/openzfs/tests/zfs-tests/include/math.shlib index 38d9fecea7cf..da1e77e5fb97 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/math.shlib +++ b/sys/contrib/openzfs/tests/zfs-tests/include/math.shlib @@ -118,9 +118,7 @@ function verify_ne # # A simple function to get a random number between two bounds (inclusive) # -# Probably not the most efficient for large ranges, but it's okay. -# -# Note since we're using $RANDOM, 32767 is the largest number we +# Note since we're using $RANDOM, $min+32767 is the largest number we # can accept as the upper bound. # # $1 lower bound @@ -129,11 +127,6 @@ function random_int_between { typeset -i min=$1 typeset -i max=$2 - typeset -i rand=0 - while [[ $rand -lt $min ]] ; do - rand=$(( $RANDOM % $max + 1)) - done - - echo $rand + echo $(( (RANDOM % (max - min + 1)) + min )) } diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am index 74295b86ddc2..9299a4ca9b47 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am @@ -250,6 +250,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/cli_root/zpool_upgrade/zpool_upgrade.cfg \ functional/cli_root/zpool_upgrade/zpool_upgrade.kshlib \ functional/cli_root/zpool_wait/zpool_wait.kshlib \ + functional/cli_root/zhack/library.kshlib \ functional/cli_user/misc/misc.cfg \ functional/cli_user/zfs_list/zfs_list.cfg \ functional/cli_user/zfs_list/zfs_list.kshlib \ @@ -932,7 +933,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zfs/zfs_001_neg.ksh \ functional/cli_root/zfs/zfs_002_pos.ksh \ functional/cli_root/zfs/zfs_003_neg.ksh \ - functional/cli_root/zhack/zhack_label_checksum.ksh \ + functional/cli_root/zhack/zhack_label_repair_001.ksh \ + 
functional/cli_root/zhack/zhack_label_repair_002.ksh \ + functional/cli_root/zhack/zhack_label_repair_003.ksh \ + functional/cli_root/zhack/zhack_label_repair_004.ksh \ functional/cli_root/zpool_add/add_nested_replacing_spare.ksh \ functional/cli_root/zpool_add/add-o_ashift.ksh \ functional/cli_root/zpool_add/add_prop_ashift.ksh \ @@ -1052,6 +1056,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_import/import_cachefile_paths_changed.ksh \ functional/cli_root/zpool_import/import_cachefile_shared_device.ksh \ functional/cli_root/zpool_import/import_devices_missing.ksh \ + functional/cli_root/zpool_import/import_log_missing.ksh \ functional/cli_root/zpool_import/import_paths_changed.ksh \ functional/cli_root/zpool_import/import_rewind_config_changed.ksh \ functional/cli_root/zpool_import/import_rewind_device_replaced.ksh \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh index 9ebde1cd9d32..261fc5eed8cb 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh @@ -163,7 +163,24 @@ corrupt_blocks_at_level "/$TESTPOOL/testfs5/$TESTFILE0" 0 log_must zfs unmount $TESTPOOL/testfs5 log_must zfs unload-key $TESTPOOL/testfs5 # test healing recv (on an encrypted dataset) using a raw send file -test_corrective_recv "$TESTPOOL/testfs5@snap1" $raw_backup +# This is a special case since with unloaded keys we cannot report errors +# in the filesystem. 
+log_must zpool scrub -w $TESTPOOL +log_must zpool status -v $TESTPOOL +log_mustnot eval "zpool status -v $TESTPOOL | \ + grep \"permission denied\"" +# make sure we will read the corruption from disk by flushing the ARC +log_must zinject -a +log_must eval "zfs recv -c $TESTPOOL/testfs5@snap1 < $raw_backup" + +log_must zpool scrub -w $TESTPOOL +log_must zpool status -v $TESTPOOL +log_mustnot eval "zpool status -v $TESTPOOL | \ + grep \"Permanent errors have been detected\"" +typeset cksum=$(md5digest $file) +[[ "$cksum" == "$checksum" ]] || \ + log_fail "Checksums differ ($cksum != $checksum)" + # non raw send file healing an encrypted dataset with an unloaded key will fail log_mustnot eval "zfs recv -c $TESTPOOL/testfs5@snap1 < $backup" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib new file mode 100644 index 000000000000..880a78861630 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib @@ -0,0 +1,361 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by vStack. All rights reserved. +# + +. "$STF_SUITE"/include/libtest.shlib +. "$STF_SUITE"/include/blkdev.shlib + +# +# Description: +# +# Test whether zhack label repair commands can recover detached devices +# and corrupted checksums with a variety of sizes, and ensure +# the purposes of either command is cleanly separated from the others. 
+# +# Strategy: +# +# Tests are done on loopback devices with sizes divisible by label size and sizes that are not. +# +# Test one: +# +# 1. Create pool on a loopback device with some test data +# 2. Export the pool. +# 3. Corrupt all label checksums in the pool +# 4. Check that pool cannot be imported +# 5. Verify that it cannot be imported after using zhack label repair -u +# to ensure that the -u option will quit on corrupted checksums. +# 6. Use zhack label repair -c on device +# 7. Check that pool can be imported and that data is intact +# +# Test two: +# +# 1. Create pool on a loopback device with some test data +# 2. Detach either device from the mirror +# 3. Export the pool +# 4. Remove the non-detached device and its backing file +# 5. Verify that the remaining detached device cannot be imported +# 6. Verify that it cannot be imported after using zhack label repair -c +# to ensure that the -c option will not undetach a device. +# 7. Use zhack label repair -u on device +# 8. Verify that the detached device can be imported and that data is intact +# +# Test three: +# +# 1. Create pool on a loopback device with some test data +# 2. Detach either device from the mirror +# 3. Export the pool +# 4. Remove the non-detached device and its backing file +# 5. Corrupt all label checksums on the remaining device +# 6. Verify that the remaining detached device cannot be imported +# 7. Verify that it cannot be imported after using zhack label repair -u +# to ensure that the -u option will quit on corrupted checksums. +# 8. Verify that it cannot be imported after using zhack label repair -c +# -c should repair the checksums, but not undetach a device. +# 9. Use zhack label repair -u on device +# 10. Verify that the detached device can be imported and that data is intact +# +# Test four: +# +# 1. Create pool on a loopback device with some test data +# 2. Detach either device from the mirror +# 3. Export the pool +# 4. 
Remove the non-detached device and its backing file +# 5. Corrupt all label checksums on the remaining device +# 6. Verify that the remaining detached device cannot be imported +# 7. Use zhack label repair -cu on device to attempt to fix checksums and +# undetach the device in a single operation. +# 8. Verify that the detached device can be imported and that data is intact +# + +log_assert "Verify zhack label repair will repair label checksums and uberblocks" +log_onexit cleanup + +LABEL_SIZE="$((2**18))" +LABEL_NVLIST_END="$((LABEL_SIZE / 2))" +LABEL_CKSUM_SIZE="32" +LABEL_CKSUM_START="$(( LABEL_NVLIST_END - LABEL_CKSUM_SIZE ))" + +VIRTUAL_DISK=$TEST_BASE_DIR/disk +VIRTUAL_MIRROR_DISK=$TEST_BASE_DIR/mirrordisk + +VIRTUAL_DEVICE= +VIRTUAL_MIRROR_DEVICE= + +function cleanup_lo +{ + L_DEVICE="$1" + + if [[ -e $L_DEVICE ]]; then + if is_linux; then + log_must losetup -d "$L_DEVICE" + elif is_freebsd; then + log_must mdconfig -d -u "$L_DEVICE" + else + log_must lofiadm -d "$L_DEVICE" + fi + fi +} + +function cleanup +{ + poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL" + cleanup_lo "$VIRTUAL_DEVICE" + cleanup_lo "$VIRTUAL_MIRROR_DEVICE" + VIRTUAL_DEVICE= + VIRTUAL_MIRROR_DEVICE= + [[ -f "$VIRTUAL_DISK" ]] && log_must rm "$VIRTUAL_DISK" + [[ -f "$VIRTUAL_MIRROR_DISK" ]] && log_must rm "$VIRTUAL_MIRROR_DISK" +} + +RAND_MAX="$((2**15 - 1))" +function get_devsize +{ + if [ "$RANDOM" -gt "$(( RAND_MAX / 2 ))" ]; then + echo "$(( MINVDEVSIZE + RANDOM ))" + else + echo "$MINVDEVSIZE" + fi +} + +function pick_logop +{ + L_SHOULD_SUCCEED="$1" + + l_logop="log_mustnot" + if [ "$L_SHOULD_SUCCEED" == true ]; then + l_logop="log_must" + fi + + echo "$l_logop" +} + +function check_dataset +{ + L_SHOULD_SUCCEED="$1" + L_LOGOP="$(pick_logop "$L_SHOULD_SUCCEED")" + + "$L_LOGOP" mounted "$TESTPOOL"/"$TESTFS" + + "$L_LOGOP" test -f "$TESTDIR"/"test" +} + +function setup_dataset +{ + log_must zfs create "$TESTPOOL"/"$TESTFS" + + log_must mkdir -p "$TESTDIR" + log_must zfs set 
mountpoint="$TESTDIR" "$TESTPOOL"/"$TESTFS" + + log_must mounted "$TESTPOOL"/"$TESTFS" + + log_must touch "$TESTDIR"/"test" + log_must test -f "$TESTDIR"/"test" + + log_must zpool sync "$TESTPOOL" + + check_dataset true +} + +function get_practical_size +{ + L_SIZE="$1" + + if [ "$((L_SIZE % LABEL_SIZE))" -ne 0 ]; then + echo "$(((L_SIZE / LABEL_SIZE) * LABEL_SIZE))" + else + echo "$L_SIZE" + fi +} + +function corrupt_sized_label_checksum +{ + L_SIZE="$1" + L_LABEL="$2" + L_DEVICE="$3" + + L_PRACTICAL_SIZE="$(get_practical_size "$L_SIZE")" + + typeset -a L_OFFSETS=("$LABEL_CKSUM_START" \ + "$((LABEL_SIZE + LABEL_CKSUM_START))" \ + "$(((L_PRACTICAL_SIZE - LABEL_SIZE*2) + LABEL_CKSUM_START))" \ + "$(((L_PRACTICAL_SIZE - LABEL_SIZE) + LABEL_CKSUM_START))") + + dd if=/dev/urandom of="$L_DEVICE" \ + seek="${L_OFFSETS["$L_LABEL"]}" bs=1 count="$LABEL_CKSUM_SIZE" \ + conv=notrunc +} + +function corrupt_labels +{ + L_SIZE="$1" + L_DISK="$2" + + corrupt_sized_label_checksum "$L_SIZE" 0 "$L_DISK" + corrupt_sized_label_checksum "$L_SIZE" 1 "$L_DISK" + corrupt_sized_label_checksum "$L_SIZE" 2 "$L_DISK" + corrupt_sized_label_checksum "$L_SIZE" 3 "$L_DISK" +} + +function try_import_and_repair +{ + L_REPAIR_SHOULD_SUCCEED="$1" + L_IMPORT_SHOULD_SUCCEED="$2" + L_OP="$3" + L_POOLDISK="$4" + L_REPAIR_LOGOP="$(pick_logop "$L_REPAIR_SHOULD_SUCCEED")" + L_IMPORT_LOGOP="$(pick_logop "$L_IMPORT_SHOULD_SUCCEED")" + + log_mustnot zpool import "$TESTPOOL" -d "$L_POOLDISK" + + "$L_REPAIR_LOGOP" zhack label repair "$L_OP" "$L_POOLDISK" + + "$L_IMPORT_LOGOP" zpool import "$TESTPOOL" -d "$L_POOLDISK" + + check_dataset "$L_IMPORT_SHOULD_SUCCEED" +} + +function prepare_vdev +{ + L_SIZE="$1" + L_BACKFILE="$2" + + l_devname= + if truncate -s "$L_SIZE" "$L_BACKFILE"; then + if is_linux; then + l_devname="$(losetup -f "$L_BACKFILE" --show)" + elif is_freebsd; then + l_devname=/dev/"$(mdconfig -a -t vnode -f "$L_BACKFILE")" + else + l_devname="$(lofiadm -a "$L_BACKFILE")" + fi + fi + echo "$l_devname" 
+} + +function run_test_one +{ + L_SIZE="$1" + + VIRTUAL_DEVICE="$(prepare_vdev "$L_SIZE" "$VIRTUAL_DISK")" + log_must test -e "$VIRTUAL_DEVICE" + + log_must zpool create "$TESTPOOL" "$VIRTUAL_DEVICE" + + setup_dataset + + log_must zpool export "$TESTPOOL" + + corrupt_labels "$L_SIZE" "$VIRTUAL_DISK" + + try_import_and_repair false false "-u" "$VIRTUAL_DEVICE" + + try_import_and_repair true true "-c" "$VIRTUAL_DEVICE" + + cleanup + + log_pass "zhack label repair corruption test passed with a randomized size of $L_SIZE" +} + +function make_mirrored_pool +{ + L_SIZE="$1" + + VIRTUAL_DEVICE="$(prepare_vdev "$L_SIZE" "$VIRTUAL_DISK")" + log_must test -e "$VIRTUAL_DEVICE" + VIRTUAL_MIRROR_DEVICE="$(prepare_vdev "$L_SIZE" "$VIRTUAL_MIRROR_DISK")" + log_must test -e "$VIRTUAL_MIRROR_DEVICE" + + log_must zpool create "$TESTPOOL" "$VIRTUAL_DEVICE" + log_must zpool attach "$TESTPOOL" "$VIRTUAL_DEVICE" "$VIRTUAL_MIRROR_DEVICE" +} + +function export_and_cleanup_vdisk +{ + log_must zpool export "$TESTPOOL" + + cleanup_lo "$VIRTUAL_DEVICE" + + VIRTUAL_DEVICE= + + log_must rm "$VIRTUAL_DISK" +} + +function run_test_two +{ + L_SIZE="$1" + + make_mirrored_pool "$L_SIZE" + + setup_dataset + + log_must zpool detach "$TESTPOOL" "$VIRTUAL_MIRROR_DEVICE" + + export_and_cleanup_vdisk + + try_import_and_repair false false "-c" "$VIRTUAL_MIRROR_DEVICE" + + try_import_and_repair true true "-u" "$VIRTUAL_MIRROR_DEVICE" + + cleanup + + log_pass "zhack label repair detached test passed with a randomized size of $L_SIZE" +} + +function run_test_three +{ + L_SIZE="$1" + + make_mirrored_pool "$L_SIZE" + + setup_dataset + + log_must zpool detach "$TESTPOOL" "$VIRTUAL_MIRROR_DEVICE" + + export_and_cleanup_vdisk + + corrupt_labels "$L_SIZE" "$VIRTUAL_MIRROR_DISK" + + try_import_and_repair false false "-u" "$VIRTUAL_MIRROR_DEVICE" + + try_import_and_repair true false "-c" "$VIRTUAL_MIRROR_DEVICE" + + try_import_and_repair true true "-u" "$VIRTUAL_MIRROR_DEVICE" + + cleanup + + log_pass "zhack label 
repair corruption and detached test passed with a randomized size of $L_SIZE" +} + +function run_test_four +{ + L_SIZE="$1" + + make_mirrored_pool "$L_SIZE" + + setup_dataset + + log_must zpool detach "$TESTPOOL" "$VIRTUAL_MIRROR_DEVICE" + + export_and_cleanup_vdisk + + corrupt_labels "$L_SIZE" "$VIRTUAL_MIRROR_DISK" + + try_import_and_repair true true "-cu" "$VIRTUAL_MIRROR_DEVICE" + + cleanup + + log_pass "zhack label repair corruption and detached single-command test passed with a randomized size of $L_SIZE." +} diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_checksum.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_checksum.ksh deleted file mode 100755 index 67c7e7c4487d..000000000000 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_checksum.ksh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/ksh - -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright (c) 2021 by vStack. All rights reserved. -# - -. $STF_SUITE/include/libtest.shlib -. $STF_SUITE/include/blkdev.shlib - -# -# Description: -# zhack label repair will calculate and rewrite label checksum if invalid -# -# Strategy: -# 1. Create pool with some number of vdevs and export it -# 2. Corrupt all labels checksums -# 3. Check that pool cannot be imported -# 4. Use zhack to repair labels checksums -# 5. 
Check that pool can be imported -# - -log_assert "Verify zhack label repair will repair labels checksums" -log_onexit cleanup - -VIRTUAL_DISK=$TEST_BASE_DIR/disk - -function cleanup -{ - poolexists $TESTPOOL && destroy_pool $TESTPOOL - [[ -f $VIRTUAL_DISK ]] && log_must rm $VIRTUAL_DISK -} - -log_must truncate -s $(($MINVDEVSIZE * 8)) $VIRTUAL_DISK - -log_must zpool create $TESTPOOL $VIRTUAL_DISK -log_must zpool export $TESTPOOL - -log_mustnot zhack label repair $VIRTUAL_DISK - -corrupt_label_checksum 0 $VIRTUAL_DISK -corrupt_label_checksum 1 $VIRTUAL_DISK -corrupt_label_checksum 2 $VIRTUAL_DISK -corrupt_label_checksum 3 $VIRTUAL_DISK - -log_mustnot zpool import $TESTPOOL -d $TEST_BASE_DIR - -log_must zhack label repair $VIRTUAL_DISK - -log_must zpool import $TESTPOOL -d $TEST_BASE_DIR - -cleanup - -log_pass "zhack label repair works correctly." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh new file mode 100755 index 000000000000..2a511e9efcb6 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# + +# +# Description: +# +# Test whether zhack label repair can recover +# corrupted checksums on devices of varied size, +# but not undetached devices. +# +# Strategy: +# +# 1. Create pool on a loopback device with some test data +# 2. Export the pool. +# 3. Corrupt all label checksums in the pool +# 4. Check that pool cannot be imported +# 5. Verify that it cannot be imported after using zhack label repair -u +# to ensure that the -u option will quit on corrupted checksums. +# 6. 
Use zhack label repair -c on device +# 7. Check that pool can be imported and that data is intact + +. "$STF_SUITE"/tests/functional/cli_root/zhack/library.kshlib + +run_test_one "$(get_devsize)" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_002.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_002.ksh new file mode 100755 index 000000000000..4f1e61a39857 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_002.ksh @@ -0,0 +1,31 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# + +# +# Description: +# +# Test whether zhack label repair can recover +# detached drives on devices of varied size, but not +# repair corrupted checksums. +# +# Strategy: +# +# 1. Create pool on a loopback device with some test data +# 2. Detach either device from the mirror +# 3. Export the pool +# 4. Remove the non-detached device and its backing file +# 5. Verify that the remaining detached device cannot be imported +# 6. Verify that it cannot be imported after using zhack label repair -c +# to ensure that the -c option will not undetach a device. +# 7. Use zhack label repair -u on device +# 8. Verify that the detached device can be imported and that data is intact + +. 
"$STF_SUITE"/tests/functional/cli_root/zhack/library.kshlib + +run_test_two "$(get_devsize)" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_003.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_003.ksh new file mode 100755 index 000000000000..7e82363d2f46 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_003.ksh @@ -0,0 +1,33 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# + +# +# Description: +# +# Test whether zhack label repair can recover a device of varied size with +# corrupted checksums and which has been detached. +# +# Strategy: +# +# 1. Create pool on a loopback device with some test data +# 2. Detach either device from the mirror +# 3. Export the pool +# 4. Remove the non-detached device and its backing file +# 5. Corrupt all label checksums on the remaining device +# 6. Verify that the remaining detached device cannot be imported +# 7. Verify that it cannot be imported after using zhack label repair -u +# to ensure that the -u option will quit on corrupted checksums. +# 8. Verify that it cannot be imported after using zhack label repair -c +# -c should repair the checksums, but not undetach a device. +# 9. Use zhack label repair -u on device +# 10. Verify that the detached device can be imported and that data is intact + +. 
"$STF_SUITE"/tests/functional/cli_root/zhack/library.kshlib + +run_test_three "$(get_devsize)" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_004.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_004.ksh new file mode 100755 index 000000000000..0b739402b199 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_004.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# + +# +# Description: +# +# Test whether zhack label repair can recover a device of varied size with +# corrupted checksums and which has been detached (in one command). +# +# Strategy: +# +# 1. Create pool on a loopback device with some test data +# 2. Detach either device from the mirror +# 3. Export the pool +# 4. Remove the non-detached device and its backing file +# 5. Corrupt all label checksums on the remaining device +# 6. Verify that the remaining detached device cannot be imported +# 7. Use zhack label repair -cu on device to attempt to fix checksums and +# undetach the device in a single operation. +# 8. Verify that the detached device can be imported and that data is intact + +. 
"$STF_SUITE"/tests/functional/cli_root/zhack/library.kshlib + +run_test_four "$(get_devsize)" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_log_missing.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_log_missing.ksh new file mode 100755 index 000000000000..f12cac78540f --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_log_missing.ksh @@ -0,0 +1,75 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# Import with missing log device should not remove spare/cache. +# +# STRATEGY: +# 1. Create a pool. +# 2. Add spare, cache and log devices to the pool. +# 3. Export the pool. +# 4. Remove the log device. +# 5. Import the pool with -m flag. +# 6. Verify that spare and cache are still present in the pool. +# + +verify_runnable "global" + +log_onexit cleanup + +function test_missing_log +{ + typeset poolcreate="$1" + typeset cachevdev="$2" + typeset sparevdev="$3" + typeset logvdev="$4" + typeset missingvdev="$4" + + log_note "$0: pool '$poolcreate', adding $cachevdev, $sparevdev," \ + "$logvdev then moving away $missingvdev." 
+ + log_must zpool create $TESTPOOL1 $poolcreate + + log_must zpool add $TESTPOOL1 cache $cachevdev spare $sparevdev \ + log $logvdev + + log_must_busy zpool export $TESTPOOL1 + + log_must mv $missingvdev $BACKUP_DEVICE_DIR + + log_must zpool import -m -d $DEVICE_DIR $TESTPOOL1 + + CACHE_PRESENT=$(zpool status -v $TESTPOOL1 | grep $cachevdev) + + SPARE_PRESENT=$(zpool status -v $TESTPOOL1 | grep $sparevdev) + + if [ -z "$CACHE_PRESENT" ] || [ -z "$SPARE_PRESENT" ] + then + log_fail "cache/spare vdev missing after importing with missing" \ + "log device" + fi + + # Cleanup + log_must zpool destroy $TESTPOOL1 + + log_note "" +} + +log_must mkdir -p $BACKUP_DEVICE_DIR + +test_missing_log "$VDEV0" "$VDEV1" "$VDEV2" "$VDEV3" + +log_pass "zpool import succeeded with missing log device" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_005_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_005_pos.ksh index 04cd1892380d..ec4c67fb42f5 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_005_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_005_pos.ksh @@ -29,7 +29,7 @@ # Verify correct output with 'zpool status -v' after corrupting a file # # STRATEGY: -# 1. Create a pool, an ancrypted filesystem and a file +# 1. Create a pool, an encrypted filesystem and a file # 2. zinject checksum errors # 3. Unmount the filesystem and unload the key # 4. 
Scrub the pool @@ -76,8 +76,8 @@ log_must zpool sync $TESTPOOL2 log_must zpool scrub $TESTPOOL2 log_must zpool wait -t scrub $TESTPOOL2 log_must zpool status -v $TESTPOOL2 -log_must eval "zpool status -v $TESTPOOL2 | \ - grep \"Permanent errors have been detected\"" +log_mustnot eval "zpool status -v $TESTPOOL2 | \ + grep \"permission denied\"" log_mustnot eval "zpool status -v $TESTPOOL2 | grep '$file'" log_must eval "cat /$TESTPOOL2/pwd | zfs load-key $TESTPOOL2/$TESTFS1" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_007_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_007_pos.ksh index c9849379f779..666ac9bfc9dd 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_007_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_007_pos.ksh @@ -39,6 +39,9 @@ # 7. Verify we report errors in the pool in 'zpool status -v' # 8. Promote clone1 # 9. Verify we report errors in the pool in 'zpool status -v' +# 10. Delete the corrupted file and origin snapshots. +# 11. Verify we do not report data errors anymore, without requiring +# a scrub. . 
$STF_SUITE/include/libtest.shlib @@ -95,4 +98,14 @@ log_mustnot eval "zpool status -v | grep '$TESTPOOL2/clonexx/$TESTFILE0'" log_must eval "zpool status -v | grep '$TESTPOOL2/clone2@snap3:/$TESTFILE0'" log_must eval "zpool status -v | grep '$TESTPOOL2/clone3/$TESTFILE0'" +log_must rm /$TESTPOOL2/clone1/$TESTFILE0 +log_must zfs destroy -R $TESTPOOL2/clone1@snap1 +log_must zfs destroy -R $TESTPOOL2/clone1@snap2 +log_must zfs list -r $TESTPOOL2 +log_must zpool status -v $TESTPOOL2 +log_must zpool sync +log_must zpool status -v $TESTPOOL2 +log_must eval "zpool status -v $TESTPOOL2 | \ + grep \"No known data errors\"" + log_pass "Verify reporting errors when deleting corrupted files after scrub" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh index 2846192d08eb..081e6c18430d 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh @@ -54,6 +54,7 @@ fi function cleanup { + zpool status $TESTPOOL destroy_pool $TESTPOOL sed -i '/alias scsidebug/d' $VDEVID_CONF unload_scsi_debug @@ -99,8 +100,8 @@ block_device_wait insert_disk $SD $SD_HOST # Wait for the new disk to be online and replaced -log_must wait_vdev_state $TESTPOOL "scsidebug" "ONLINE" $MAXTIMEOUT -log_must wait_replacing $TESTPOOL +log_must wait_vdev_state $TESTPOOL "scsidebug" "ONLINE" 60 +log_must wait_replacing $TESTPOOL 60 # Validate auto-replace was successful log_must check_state $TESTPOOL "" "ONLINE" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh index 0bbd08acdd3f..a93d0b3cc803 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh +++ 
b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh @@ -67,7 +67,7 @@ typeset VDEV_MIN_MB=$((MINVDEVSIZE * 0.30 / 1024 / 1024)) log_must zpool create -f $TESTPOOL $TRIM_VDEV1 cache $TRIM_VDEV2 verify_vdevs "-le" "$VDEV_MIN_MB" $TRIM_VDEV2 -typeset fill_mb=$(( floor(2 * MINVDEVSIZE) )) +typeset fill_mb=$(( floor(3 * MINVDEVSIZE) )) export DIRECTORY=/$TESTPOOL export NUMJOBS=1 export FILE_SIZE=${fill_mb} diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h index f489f0b6c69a..ec45185797b4 100644 --- a/sys/modules/zfs/zfs_config.h +++ b/sys/modules/zfs/zfs_config.h @@ -1048,7 +1048,7 @@ /* #undef ZFS_IS_GPL_COMPATIBLE */ /* Define the project alias string. */ -#define ZFS_META_ALIAS "zfs-2.1.99-FreeBSD_gd96e29576" +#define ZFS_META_ALIAS "zfs-2.1.99-FreeBSD_ge61076683" /* Define the project author. */ #define ZFS_META_AUTHOR "OpenZFS" @@ -1078,7 +1078,7 @@ #define ZFS_META_NAME "zfs" /* Define the project release. */ -#define ZFS_META_RELEASE "FreeBSD_gd96e29576" +#define ZFS_META_RELEASE "FreeBSD_ge61076683" /* Define the project version. */ #define ZFS_META_VERSION "2.1.99" diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h index e0a986d41c02..29954b8d4719 100644 --- a/sys/modules/zfs/zfs_gitrev.h +++ b/sys/modules/zfs/zfs_gitrev.h @@ -1 +1 @@ -#define ZFS_META_GITREV "zfs-2.1.99-1919-gd96e29576" +#define ZFS_META_GITREV "zfs-2.1.99-1944-ge61076683"