From c85ac731a0ec16e4277857b55ebe123c552365b6 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Wed, 25 Jan 2023 11:28:54 -0800
Subject: [PATCH 01/28] Improve resilver ETAs

When resilvering, the estimated time remaining is calculated using
the average issue rate over the current pass, where a pass starts
when a scan is started and restarts if the pool is exported and
imported.

For dRAID pools in particular this can result in wildly optimistic
estimates, since the issue rate will be very high while non-degraded
regions of the pool are scanned. Once repair I/O starts being issued,
performance drops to a realistic number, but the estimate remains
significantly skewed.

To address this, we redefine a pass such that it starts after the
scanning phase completes, so the issue rate is more reflective of
recent performance. Additionally, the zfs_scan_report_txgs module
option can be set to reset the pass statistics more often.

Reviewed-by: Akash B
Reviewed-by: Tony Hutter
Signed-off-by: Brian Behlendorf
Closes #14410
---
 cmd/zpool/zpool_main.c | 33 ++++++++++++++++++++++-----------
 man/man4/zfs.4 | 7 +++++++
 module/zfs/dsl_scan.c | 28 ++++++++++++++++++++++++++++
 module/zfs/spa_misc.c | 1 -
 4 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 93d6a18981cb..efb2d10e591b 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -7524,19 +7524,20 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
 	zfs_nicebytes(ps->pss_processed, processed_buf,
 	    sizeof (processed_buf));
 
-	assert(ps->pss_func == POOL_SCAN_SCRUB ||
-	    ps->pss_func == POOL_SCAN_RESILVER);
+	int is_resilver = ps->pss_func == POOL_SCAN_RESILVER;
+	int is_scrub = ps->pss_func == POOL_SCAN_SCRUB;
+	assert(is_resilver || is_scrub);
 
 	/* Scan is finished or canceled. */
 	if (ps->pss_state == DSS_FINISHED) {
 		secs_to_dhms(end - start, time_buf);
 
-		if (ps->pss_func == POOL_SCAN_SCRUB) {
+		if (is_scrub) {
 			(void) printf(gettext("scrub repaired %s "
 			    "in %s with %llu errors on %s"), processed_buf,
 			    time_buf, (u_longlong_t)ps->pss_errors,
 			    ctime(&end));
-		} else if (ps->pss_func == POOL_SCAN_RESILVER) {
+		} else if (is_resilver) {
 			(void) printf(gettext("resilvered %s "
 			    "in %s with %llu errors on %s"), processed_buf,
 			    time_buf, (u_longlong_t)ps->pss_errors,
@@ -7544,10 +7545,10 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
 		}
 		return;
 	} else if (ps->pss_state == DSS_CANCELED) {
-		if (ps->pss_func == POOL_SCAN_SCRUB) {
+		if (is_scrub) {
 			(void) printf(gettext("scrub canceled on %s"),
 			    ctime(&end));
-		} else if (ps->pss_func == POOL_SCAN_RESILVER) {
+		} else if (is_resilver) {
 			(void) printf(gettext("resilver canceled on %s"),
 			    ctime(&end));
 		}
@@ -7557,7 +7558,7 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
 	assert(ps->pss_state == DSS_SCANNING);
 
 	/* Scan is in progress. Resilvers can't be paused. 
*/ - if (ps->pss_func == POOL_SCAN_SCRUB) { + if (is_scrub) { if (pause == 0) { (void) printf(gettext("scrub in progress since %s"), ctime(&start)); @@ -7567,7 +7568,7 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) (void) printf(gettext("\tscrub started on %s"), ctime(&start)); } - } else if (ps->pss_func == POOL_SCAN_RESILVER) { + } else if (is_resilver) { (void) printf(gettext("resilver in progress since %s"), ctime(&start)); } @@ -7609,17 +7610,27 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) scanned_buf, issued_buf, total_buf); } - if (ps->pss_func == POOL_SCAN_RESILVER) { + if (is_resilver) { (void) printf(gettext("\t%s resilvered, %.2f%% done"), processed_buf, 100 * fraction_done); - } else if (ps->pss_func == POOL_SCAN_SCRUB) { + } else if (is_scrub) { (void) printf(gettext("\t%s repaired, %.2f%% done"), processed_buf, 100 * fraction_done); } if (pause == 0) { + /* + * Only provide an estimate iff: + * 1) the time remaining is valid, and + * 2) the issue rate exceeds 10 MB/s, and + * 3) it's either: + * a) a resilver which has started repairs, or + * b) a scrub which has entered the issue phase. + */ if (total_secs_left != UINT64_MAX && - issue_rate >= 10 * 1024 * 1024) { + issue_rate >= 10 * 1024 * 1024 && + ((is_resilver && ps->pss_processed > 0) || + (is_scrub && issued > 0))) { (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index e20d601340c6..6f260660e6d9 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1890,6 +1890,13 @@ I/O. In this case (unless the metadata scan is done) we stop issuing verification I/O and start scanning metadata again until we get to the hard limit. . +.It Sy zfs_scan_report_txgs Ns = Ns Sy 0 Ns | Ns 1 Pq uint +When reporting resilver throughput and estimated completion time use the +performance observed over roughly the last +.Sy zfs_scan_report_txgs +TXGs. +When set to zero performance is calculated over the time between checkpoints. +. .It Sy zfs_scan_strict_mem_lim Ns = Ns Sy 0 Ns | Ns 1 Pq int Enforce tight memory limits on pool scans when a sequential scan is in progress. When disabled, the memory limit may be exceeded by fast disks. diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index f9e437f0c947..8a5a02cea272 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -131,6 +131,15 @@ static uint64_t dsl_scan_count_data_disks(vdev_t *vd); extern uint_t zfs_vdev_async_write_active_min_dirty_percent; static int zfs_scan_blkstats = 0; +/* + * 'zpool status' uses bytes processed per pass to report throughput and + * estimate time remaining. We define a pass to start when the scanning + * phase completes for a sequential resilver. Optionally, this value + * may be used to reset the pass statistics every N txgs to provide an + * estimated completion time based on currently observed performance. + */ +static uint_t zfs_scan_report_txgs = 0; + /* * By default zfs will check to ensure it is not over the hard memory * limit before each txg. 
If finer-grained control of this is needed
@@ -604,6 +613,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 	}
 
 	spa_scan_stat_init(spa);
+	vdev_scan_stat_init(spa->spa_root_vdev);
+
 	return (0);
 }
 
@@ -763,6 +774,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 	scn->scn_last_checkpoint = 0;
 	scn->scn_checkpointing = B_FALSE;
 	spa_scan_stat_init(spa);
+	vdev_scan_stat_init(spa->spa_root_vdev);
 
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
 		scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
@@ -3652,6 +3664,16 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 		return;
 	}
 
+	/*
+	 * Disabled by default, set zfs_scan_report_txgs to report
+	 * average performance over the last zfs_scan_report_txgs TXGs.
+	 */
+	if (!dsl_scan_is_paused_scrub(scn) && zfs_scan_report_txgs != 0 &&
+	    tx->tx_txg % zfs_scan_report_txgs == 0) {
+		scn->scn_issued_before_pass += spa->spa_scan_pass_issued;
+		spa_scan_stat_init(spa);
+	}
+
 	/*
 	 * It is possible to switch from unsorted to sorted at any time,
 	 * but afterwards the scan will remain sorted unless reloaded from
@@ -3780,6 +3802,9 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 		if (scn->scn_is_sorted) {
 			scn->scn_checkpointing = B_TRUE;
 			scn->scn_clearing = B_TRUE;
+			scn->scn_issued_before_pass +=
+			    spa->spa_scan_pass_issued;
+			spa_scan_stat_init(spa);
 		}
 		zfs_dbgmsg("scan complete for %s txg %llu", spa->spa_name,
@@ -4507,5 +4532,8 @@ ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, UINT, ZMOD_RW,
 	"Tunable to adjust bias towards more filled segments during scans");
 
+ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW,
+	"Tunable to report resilver performance over the last N txgs");
+
 ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW,
 	"Process all resilvers immediately");
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 822fd0ee89b6..53763e915ca8 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -2556,7 +2556,6 @@ spa_scan_stat_init(spa_t *spa)
 	spa->spa_scan_pass_scrub_spent_paused = 0;
 	spa->spa_scan_pass_exam = 0;
 	spa->spa_scan_pass_issued = 0;
-	vdev_scan_stat_init(spa->spa_root_vdev);
 }
 
 /*

From dc5c8006f684b1df3f2d4b6b8c121447d2db0017 Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Wed, 25 Jan 2023 14:30:24 -0500
Subject: [PATCH 02/28] Prefetch on deadlists merge

During snapshot deletion, ZFS may issue several reads for each deadlist
to merge them into the next snapshot's or the pool's bpobj. The number
of deadlists increases with the number of snapshots. On HDD pools the
merge may take significant time, during which the sync thread is
blocked.

This patch introduces prescient prefetching of the required blocks for
up to 128 deadlists ahead. Tests show that the time required to delete
a dataset with 720 snapshots of a randomly overwritten file on a wide
HDD pool drops from 75-85 seconds to 22-28 seconds.

Reviewed-by: Allan Jude
Reviewed-by: Brian Behlendorf
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc. 
Issue #14276 Closes #14402 --- include/sys/bpobj.h | 1 + module/zfs/bpobj.c | 65 +++++++++++++++++++++++++++++++++-- module/zfs/dsl_deadlist.c | 71 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 131 insertions(+), 6 deletions(-) diff --git a/include/sys/bpobj.h b/include/sys/bpobj.h index 84f0ee76c44e..f3384f526454 100644 --- a/include/sys/bpobj.h +++ b/include/sys/bpobj.h @@ -87,6 +87,7 @@ int livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, int64_t start); void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx); +void bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj); void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx); diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index f7fded56518b..fa99f5141d4e 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -663,14 +663,13 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) } VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); - VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); - if (bpobj_is_empty(&subbpo)) { /* No point in having an empty subobj. */ bpobj_close(&subbpo); bpobj_free(bpo->bpo_os, subobj, tx); return; } + VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); mutex_enter(&bpo->bpo_lock); dmu_buf_will_dirty(bpo->bpo_dbuf, tx); @@ -780,6 +779,68 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) } +/* + * Prefetch metadata required for bpobj_enqueue_subobj(). + */ +void +bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj) +{ + dmu_object_info_t doi; + bpobj_t subbpo; + uint64_t subsubobjs; + boolean_t copy_subsub = B_TRUE; + boolean_t copy_bps = B_TRUE; + + ASSERT(bpobj_is_open(bpo)); + ASSERT(subobj != 0); + + if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) + return; + + if (bpobj_open(&subbpo, bpo->bpo_os, subobj) != 0) + return; + if (bpobj_is_empty(&subbpo)) { + bpobj_close(&subbpo); + return; + } + subsubobjs = subbpo.bpo_phys->bpo_subobjs; + bpobj_close(&subbpo); + + if (subsubobjs != 0) { + if (dmu_object_info(bpo->bpo_os, subsubobjs, &doi) != 0) + return; + if (doi.doi_max_offset > doi.doi_data_block_size) + copy_subsub = B_FALSE; + } + + if (dmu_object_info(bpo->bpo_os, subobj, &doi) != 0) + return; + if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) + copy_bps = B_FALSE; + + if (copy_subsub && subsubobjs != 0) { + if (bpo->bpo_phys->bpo_subobjs) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1, + ZIO_PRIORITY_ASYNC_READ); + } + dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1, + ZIO_PRIORITY_ASYNC_READ); + } + + if (copy_bps) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, + bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1, + ZIO_PRIORITY_ASYNC_READ); + dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1, + ZIO_PRIORITY_ASYNC_READ); + } else if (bpo->bpo_phys->bpo_subobjs) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1, + ZIO_PRIORITY_ASYNC_READ); + } +} + void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 2b33446e66af..d58820701f60 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -438,6 +438,18 @@ dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, } } +/* + * Prefetch metadata required for dle_enqueue_subobj(). 
+ */ +static void +dle_prefetch_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, + uint64_t obj) +{ + if (dle->dle_bpobj.bpo_object != + dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) + bpobj_prefetch_subobj(&dle->dle_bpobj, obj); +} + void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) @@ -810,6 +822,27 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, dle_enqueue_subobj(dl, dle, obj, tx); } +/* + * Prefetch metadata required for dsl_deadlist_insert_bpobj(). + */ +static void +dsl_deadlist_prefetch_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + + ASSERT(MUTEX_HELD(&dl->dl_lock)); + + dsl_deadlist_load_tree(dl); + + dle_tofind.dle_mintxg = birth; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + if (dle == NULL) + dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); + dle_prefetch_subobj(dl, dle, obj); +} + static int dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) @@ -826,12 +859,12 @@ dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) { - zap_cursor_t zc; - zap_attribute_t za; + zap_cursor_t zc, pzc; + zap_attribute_t za, pza; dmu_buf_t *bonus; dsl_deadlist_phys_t *dlp; dmu_object_info_t doi; - int error; + int error, perror, i; VERIFY0(dmu_object_info(dl->dl_os, obj, &doi)); if (doi.doi_type == DMU_OT_BPOBJ) { @@ -843,15 +876,32 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) } mutex_enter(&dl->dl_lock); + /* + * Prefetch up to 128 deadlists first and then more as we progress. + * The limit is a balance between ARC use and diminishing returns. + */ + for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0; + (perror = zap_cursor_retrieve(&pzc, &pza)) == 0 && i < 128; + zap_cursor_advance(&pzc), i++) { + dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer, + zfs_strtonum(pza.za_name, NULL)); + } for (zap_cursor_init(&zc, dl->dl_os, obj); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t mintxg = zfs_strtonum(za.za_name, NULL); dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx)); + if (perror == 0) { + dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer, + zfs_strtonum(pza.za_name, NULL)); + zap_cursor_advance(&pzc); + perror = zap_cursor_retrieve(&pzc, &pza); + } } VERIFY3U(error, ==, ENOENT); zap_cursor_fini(&zc); + zap_cursor_fini(&pzc); VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); dlp = bonus->db_data; @@ -869,8 +919,9 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; - dsl_deadlist_entry_t *dle; + dsl_deadlist_entry_t *dle, *pdle; avl_index_t where; + int i; ASSERT(!dl->dl_oldfmt); @@ -882,11 +933,23 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); + /* + * Prefetch up to 128 deadlists first and then more as we progress. + * The limit is a balance between ARC use and diminishing returns. 
+	 */
+	for (pdle = dle, i = 0; pdle && i < 128; i++) {
+		bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
+		pdle = AVL_NEXT(&dl->dl_tree, pdle);
+	}
 	while (dle) {
 		uint64_t used, comp, uncomp;
 		dsl_deadlist_entry_t *dle_next;
 
 		bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
+		if (pdle) {
+			bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
+			pdle = AVL_NEXT(&dl->dl_tree, pdle);
+		}
 
 		VERIFY0(bpobj_space(&dle->dle_bpobj,
 		    &used, &comp, &uncomp));
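
Both loops above follow the same sliding-window shape: prime up to 128
asynchronous read-aheads, then issue one more prefetch for each deadlist
consumed, so the reads stay well ahead of the synchronous merge work. A
minimal sketch of that pattern, where prefetch() and consume() are
hypothetical stand-ins for bpobj_prefetch_subobj() and
bpobj_enqueue_subobj():

```
#include <stdio.h>
#include <stddef.h>

/* Hypothetical stand-ins: prefetch() models bpobj_prefetch_subobj()
 * (asynchronous read-ahead) and consume() models bpobj_enqueue_subobj()
 * (the synchronous merge work). */
static void prefetch(int item) { printf("prefetch %d\n", item); }
static void consume(int item)  { printf("consume  %d\n", item); }

static void
merge_with_prefetch(const int *items, size_t n, size_t window)
{
	size_t ahead = 0;

	/* Prime the window: up to "window" reads in flight before any work. */
	while (ahead < n && ahead < window)
		prefetch(items[ahead++]);

	for (size_t i = 0; i < n; i++) {
		consume(items[i]);
		/* Refill, so reads stay "window" items ahead of the work. */
		if (ahead < n)
			prefetch(items[ahead++]);
	}
}

int main(void)
{
	int deadlists[] = { 0, 1, 2, 3, 4, 5, 6, 7 };

	merge_with_prefetch(deadlists, 8, 3);
	return (0);
}
```

The 128-entry window in the patch trades a bounded amount of ARC for
keeping the disks busy; per the comments above, larger windows showed
diminishing returns.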
From c0aea7cf4e86fc02db8046fbb3bca21a918053a2 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Tue, 24 Jan 2023 14:05:45 -0800
Subject: [PATCH 03/28] Increase default zfs_scan_vdev_limit to 16MB

For HDD-based pools the default zfs_scan_vdev_limit of 4M per vdev
can significantly limit the maximum scrub performance. Increasing
the default to 16M can double the scrub speed, from 80 MB/s per disk
to 160 MB/s per disk.

This does increase the memory footprint during scrub/resilver, but
given the performance win this is a reasonable trade-off. Memory
usage is capped at 1/4 of arc_c_max.

Note that the number of outstanding I/Os has not changed and is still
limited by zfs_vdev_scrub_max_active.

Reviewed-by: Akash B
Reviewed-by: Tony Nguyen
Reviewed-by: Alexander Motin
Signed-off-by: Brian Behlendorf
Closes #14428
---
 man/man4/zfs.4 | 2 +-
 module/zfs/dsl_scan.c | 28 ++++++++++++++++------------
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 6f260660e6d9..031981f9b143 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -1905,7 +1905,7 @@ When disabled, the memory limit may be exceeded by fast disks.
 Freezes a scrub/resilver in progress without actually pausing it.
 Intended for testing/debugging.
 .
-.It Sy zfs_scan_vdev_limit Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq int
+.It Sy zfs_scan_vdev_limit Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int
 Maximum amount of data that can be concurrently issued at once for scrubs and
 resilvers per leaf device, given in bytes.
 .
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 8a5a02cea272..2cc77eab6275 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -126,7 +127,7 @@ static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
 static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
 static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
 static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
-static uint64_t dsl_scan_count_data_disks(vdev_t *vd);
+static uint64_t dsl_scan_count_data_disks(spa_t *spa);
 
 extern uint_t zfs_vdev_async_write_active_min_dirty_percent;
 static int zfs_scan_blkstats = 0;
@@ -156,7 +157,7 @@
 * overload the drives with I/O, since that is protected by
 * zfs_vdev_scrub_max_active.
 */
-static uint64_t zfs_scan_vdev_limit = 4 << 20;
+static uint64_t zfs_scan_vdev_limit = 16 << 20;
 
 static uint_t zfs_scan_issue_strategy = 0;
@@ -475,11 +476,12 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 
 	/*
 	 * Calculate the max number of in-flight bytes for pool-wide
-	 * scanning operations (minimum 1MB). Limits for the issuing
-	 * phase are done per top-level vdev and are handled separately.
+	 * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
+	 * Limits for the issuing phase are done per top-level vdev and
+	 * are handled separately.
 	 */
-	scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
-	    dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20);
+	scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
+	    zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));
 
 	avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
 	    offsetof(scan_ds_t, sds_node));
@@ -2823,8 +2825,9 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
 }
 
 static uint64_t
-dsl_scan_count_data_disks(vdev_t *rvd)
+dsl_scan_count_data_disks(spa_t *spa)
 {
+	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t i, leaves = 0;
 
 	for (i = 0; i < rvd->vdev_children; i++) {
@@ -3733,12 +3736,13 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 		taskqid_t prefetch_tqid;
 
 		/*
-		 * Recalculate the max number of in-flight bytes for pool-wide
-		 * scanning operations (minimum 1MB). Limits for the issuing
-		 * phase are done per top-level vdev and are handled separately.
+		 * Calculate the max number of in-flight bytes for pool-wide
+		 * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
+		 * Limits for the issuing phase are done per top-level vdev and
+		 * are handled separately.
 		 */
-		scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
-		    dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20);
+		scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
+		    zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));
 
 		if (scnp->scn_ddt_bookmark.ddb_class <=
 		    scnp->scn_ddt_class_max) {
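
The new cap is a simple clamp: the per-disk limit times the number of
data disks, floored at 1 MiB and ceilinged at a quarter of arc_c_max. A
worked example (the 4 GiB arc_c_max here is an assumed value, chosen
only for illustration):

```
#include <stdio.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
	uint64_t arc_c_max = 4ULL << 30;	/* assumed 4 GiB ARC limit */
	uint64_t vdev_limit = 16ULL << 20;	/* new 16 MiB default */

	for (uint64_t disks = 2; disks <= 128; disks *= 4) {
		uint64_t inflight = MIN(arc_c_max / 4,
		    MAX(1ULL << 20, vdev_limit * disks));
		printf("%3llu disks -> %4llu MiB in flight\n",
		    (unsigned long long)disks,
		    (unsigned long long)(inflight >> 20));
	}
	return (0);
}
```

With these assumed numbers the ceiling engages at 64 data disks
(64 x 16 MiB = 1 GiB); below that the per-disk limit governs.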
A sequential - * rebuild was unable to saturate all of the drives using smaller values. - * With a value of 32MB the sequential resilver write rate was measured at - * 800MB/s sustained while rebuilding to a distributed spare. + * 64MB was observed to deliver the best performance and set as the default. + * Testing was performed with a 106-drive dRAID HDD pool (draid2:11d:106c) + * and a rebuild rate of 1.2GB/s was measured to the distribute spare. + * Smaller values were unable to fully saturate the available pool I/O. */ -static uint64_t zfs_rebuild_vdev_limit = 32 << 20; +static uint64_t zfs_rebuild_vdev_limit = 64 << 20; /* * Automatically start a pool scrub when the last active sequential resilver @@ -754,6 +754,7 @@ vdev_rebuild_thread(void *arg) { vdev_t *vd = arg; spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; int error = 0; /* @@ -786,9 +787,6 @@ vdev_rebuild_thread(void *arg) vr->vr_pass_bytes_scanned = 0; vr->vr_pass_bytes_issued = 0; - vr->vr_bytes_inflight_max = MAX(1ULL << 20, - zfs_rebuild_vdev_limit * vd->vdev_children); - uint64_t update_est_time = gethrtime(); vdev_rebuild_update_bytes_est(vd, 0); @@ -804,6 +802,17 @@ vdev_rebuild_thread(void *arg) metaslab_t *msp = vd->vdev_ms[i]; vr->vr_scan_msp = msp; + /* + * Calculate the max number of in-flight bytes for top-level + * vdev scanning operations (minimum 1MB, maximum 1/4 of + * arc_c_max shared by all top-level vdevs). Limits for the + * issuing phase are done per top-level vdev and are handled + * separately. + */ + uint64_t limit = (arc_c_max / 4) / MAX(rvd->vdev_children, 1); + vr->vr_bytes_inflight_max = MIN(limit, MAX(1ULL << 20, + zfs_rebuild_vdev_limit * vd->vdev_children)); + /* * Removal of vdevs from the vdev tree may eliminate the need * for the rebuild, in which case it should be canceled. The From 05b72415d16e3cfbdeb0f59d5f24640f10ee34d6 Mon Sep 17 00:00:00 2001 From: Ameer Hamza <106930537+ixhamza@users.noreply.github.com> Date: Fri, 3 Feb 2023 04:09:57 +0500 Subject: [PATCH 05/28] Fix console progress reporting for recursive send After commit 19d3961, progress reporting (-v) with replication flag enabled does not report the progress on the console. This commit fixes the issue by updating the logic to check for pa->progress instead of pa_verbosity in send_progress_thread(). Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: Ameer Hamza Closes #14448 --- cmd/zfs/zfs_main.c | 2 +- lib/libzfs/libzfs_sendrecv.c | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 9d48b2b32e83..cec025ce6de8 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -4532,7 +4532,7 @@ zfs_do_send(int argc, char **argv) } } - if (flags.parsable && flags.verbosity == 0) + if ((flags.parsable || flags.progressastitle) && flags.verbosity == 0) flags.verbosity = 1; if (excludes.count > 0 && !flags.replicate) { diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 038613a1fcfa..1d2ad1944051 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -84,6 +84,7 @@ typedef struct progress_arg { boolean_t pa_estimate; int pa_verbosity; boolean_t pa_astitle; + boolean_t pa_progress; uint64_t pa_size; } progress_arg_t; @@ -940,7 +941,7 @@ send_progress_thread(void *arg) struct tm tm; int err; - if (!pa->pa_parsable && pa->pa_verbosity != 0) { + if (!pa->pa_parsable && pa->pa_progress) { (void) fprintf(stderr, "TIME %s %sSNAPSHOT %s\n", pa->pa_estimate ? 
"BYTES" : " SENT", @@ -990,7 +991,7 @@ send_progress_thread(void *arg) (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, (u_longlong_t)bytes, zhp->zfs_name); - } else if (pa->pa_verbosity != 0) { + } else if (pa->pa_progress) { zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, @@ -1206,6 +1207,7 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) pa.pa_verbosity = sdd->verbosity; pa.pa_size = sdd->size; pa.pa_astitle = sdd->progressastitle; + pa.pa_progress = sdd->progress; if ((err = pthread_create(&tid, NULL, send_progress_thread, &pa)) != 0) { @@ -1886,6 +1888,7 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, pa.pa_verbosity = flags->verbosity; pa.pa_size = size; pa.pa_astitle = flags->progressastitle; + pa.pa_progress = flags->progress; error = pthread_create(&tid, NULL, send_progress_thread, &pa); @@ -2696,6 +2699,7 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd, pa.pa_verbosity = flags->verbosity; pa.pa_size = size; pa.pa_astitle = flags->progressastitle; + pa.pa_progress = flags->progress; err = pthread_create(&ptid, NULL, send_progress_thread, &pa); From f18e083bf8ce0c0d1997002f9986122be6d4ebe8 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Thu, 2 Feb 2023 18:11:35 -0500 Subject: [PATCH 06/28] rootdelay on zfs should be adaptive The 'rootdelay' boot option currently pauses the boot for a specified amount of time. The original intent was to ensure that slower configurations would have ample time to enumerate the devices to make importing the root pool successful. This, however, causes unnecessary boot delay for environments like Azure which set this parameter by default. This commit changes the initramfs logic to pause until it can successfully load the 'zfs' module. The timeout specified by 'rootdelay' now becomes the maximum amount of time that initramfs will wait before failing the boot. Reviewed-by: Brian Behlendorf Reviewed-by: Prakash Surya Signed-off-by: George Wilson Closes #14430 --- contrib/initramfs/scripts/zfs | 54 +++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/contrib/initramfs/scripts/zfs b/contrib/initramfs/scripts/zfs index c724f0c2cf57..41a4d96f8732 100644 --- a/contrib/initramfs/scripts/zfs +++ b/contrib/initramfs/scripts/zfs @@ -272,30 +272,46 @@ import_pool() # with more logging etc. load_module_initrd() { - [ -n "$ROOTDELAY" ] && ZFS_INITRD_PRE_MOUNTROOT_SLEEP="$ROOTDELAY" + ZFS_INITRD_PRE_MOUNTROOT_SLEEP=${ROOTDELAY:-0} - if [ "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP" -gt 0 ] 2>/dev/null - then - if [ "$quiet" != "y" ]; then - zfs_log_begin_msg "Sleeping for" \ - "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP seconds..." + if [ "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP" -gt 0 ]; then + [ "$quiet" != "y" ] && zfs_log_begin_msg "Delaying for up to '${ZFS_INITRD_PRE_MOUNTROOT_SLEEP}' seconds." + fi + + START=$(/bin/date -u +%s) + END=$((START+ZFS_INITRD_PRE_MOUNTROOT_SLEEP)) + while true; do + + # Wait for all of the /dev/{hd,sd}[a-z] device nodes to appear. + if command -v wait_for_udev > /dev/null 2>&1 ; then + wait_for_udev 10 + elif command -v wait_for_dev > /dev/null 2>&1 ; then + wait_for_dev fi - sleep "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP" + + # + # zpool import refuse to import without a valid + # /proc/self/mounts + # + [ ! 
-f /proc/self/mounts ] && mount proc /proc + + # Load the module + if load_module "zfs"; then + ret=0 + break + else + ret=1 + fi + + [ "$(/bin/date -u +%s)" -gt "$END" ] && break + sleep 1 + + done + if [ "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP" -gt 0 ]; then [ "$quiet" != "y" ] && zfs_log_end_msg fi - # Wait for all of the /dev/{hd,sd}[a-z] device nodes to appear. - if command -v wait_for_udev > /dev/null 2>&1 ; then - wait_for_udev 10 - elif command -v wait_for_dev > /dev/null 2>&1 ; then - wait_for_dev - fi - - # zpool import refuse to import without a valid /proc/self/mounts - [ ! -f /proc/self/mounts ] && mount proc /proc - - # Load the module - load_module "zfs" || return 1 + [ "$ret" -ne 0 ] && return 1 if [ "$ZFS_INITRD_POST_MODPROBE_SLEEP" -gt 0 ] 2>/dev/null then From c799866b970bee4fc11a7e8c9a1d70e02481fe66 Mon Sep 17 00:00:00 2001 From: Allan Jude Date: Thu, 2 Feb 2023 18:12:51 -0500 Subject: [PATCH 07/28] Resolve WS-2021-0184 vulnerability in zstd Pull in d40f55cd950919d7eac951b122668e55e33e5202 from upstream Reviewed-by: Brian Behlendorf Reviewed-by: Richard Yao Signed-off-by: Allan Jude Closes #14439 --- module/zstd/lib/compress/zstd_double_fast.c | 4 ++-- module/zstd/lib/compress/zstd_fast.c | 6 +++--- module/zstd/lib/compress/zstd_lazy.c | 12 ++++++++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/module/zstd/lib/compress/zstd_double_fast.c b/module/zstd/lib/compress/zstd_double_fast.c index 27eed66cfedd..4a95c01a090d 100644 --- a/module/zstd/lib/compress/zstd_double_fast.c +++ b/module/zstd/lib/compress/zstd_double_fast.c @@ -409,7 +409,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( hashSmall[hSmall] = hashLong[hLong] = current; /* update hash table */ if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */ - & (repIndex > dictStartIndex)) + & (offset_1 < current+1 - dictStartIndex)) /* note: we are searching at current+1 */ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; @@ -477,7 +477,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( U32 const repIndex2 = current2 - offset_2; const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */ - & (repIndex2 > dictStartIndex)) + & (offset_2 < current2 - dictStartIndex)) && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; diff --git a/module/zstd/lib/compress/zstd_fast.c b/module/zstd/lib/compress/zstd_fast.c index 85a3a7a91e49..17894b85472f 100644 --- a/module/zstd/lib/compress/zstd_fast.c +++ b/module/zstd/lib/compress/zstd_fast.c @@ -416,9 +416,9 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( const BYTE* const repMatch = repBase + repIndex; hashTable[h] = current; /* update hash table */ DEBUGLOG(7, "offset_1 = %u , current = %u", offset_1, current); - assert(offset_1 <= current +1); /* check repIndex */ - if ( (((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > dictStartIndex)) + if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ + & (offset_1 < current+1 - dictStartIndex) ) /* note: we are searching at current+1 */ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; @@ -453,7 +453,7 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( U32 const current2 = (U32)(ip-base); U32 const repIndex2 = current2 - offset_2; const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; - if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (repIndex2 > dictStartIndex)) /* intentional overflow */ + if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 < current - dictStartIndex)) /* intentional overflow */ && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; diff --git a/module/zstd/lib/compress/zstd_lazy.c b/module/zstd/lib/compress/zstd_lazy.c index 4cf5c88b5325..22d80597ec62 100644 --- a/module/zstd/lib/compress/zstd_lazy.c +++ b/module/zstd/lib/compress/zstd_lazy.c @@ -975,7 +975,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const U32 repIndex = (U32)(current+1 - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */ + & (offset_1 < current+1 - windowLow) ) /* note: we are searching at current+1 */ if (MEM_read32(ip+1) == MEM_read32(repMatch)) { /* repcode detected we should take it */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; @@ -1006,7 +1007,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const U32 repIndex = (U32)(current - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ + & (offset_1 < current - windowLow) ) /* equivalent to `current > repIndex >= windowLow` */ if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected */ const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; @@ -1037,7 +1039,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const U32 repIndex = (U32)(current - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ + & (offset_1 < current - windowLow) ) /* equivalent to `current > repIndex >= windowLow` */ if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; @@ -1083,7 +1086,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const U32 repIndex = repCurrent - offset_2; const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ + & (offset_2 < repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected we should take it */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; From 326f1e3d88dab158b8aad8c49b96fa292b5912e5 Mon Sep 17 00:00:00 2001 From: rob-wing <98866084+rob-wing@users.noreply.github.com> Date: Thu, 2 Feb 2023 14:16:40 -0900 Subject: [PATCH 08/28] zfs_main.c: fix unused variable error with GCC zfs_setproctitle_init() is stubbed out on FreeBSD. Reviewed-by: Ryan Moeller Reviewed-by: Ameer Hamza Reviewed-by: Brian Behlendorf Reviewed-by: Richard Yao Signed-off-by: Rob Wing Closes #14441 --- cmd/zfs/zfs_main.c | 1 - include/libzutil.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index cec025ce6de8..1ffe805aab70 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -8672,7 +8672,6 @@ main(int argc, char **argv) int i = 0; const char *cmdname; char **newargv; - extern char **environ; (void) setlocale(LC_ALL, ""); (void) setlocale(LC_NUMERIC, "C"); diff --git a/include/libzutil.h b/include/libzutil.h index 4d4bddaad5f3..948ac08cd772 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -183,6 +183,7 @@ _LIBZUTIL_H int printf_color(const char *color, const char *format, ...); _LIBZUTIL_H const char *zfs_basename(const char *path); _LIBZUTIL_H ssize_t zfs_dirnamelen(const char *path); #ifdef __linux__ +extern char **environ; _LIBZUTIL_H void zfs_setproctitle_init(int argc, char *argv[], char *envp[]); _LIBZUTIL_H void zfs_setproctitle(const char *fmt, ...); #else From ac2038a19c3b7e9ba913a850e05ba773725bd25d Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Fri, 3 Feb 2023 00:17:37 +0100 Subject: [PATCH 09/28] Teach zdb about DMU_OT_ERROR_LOG objects With the persistent error log feature we need to account for spa_errlog_{scrub, last} containing mappings to other error log objects, which need to be marked as in-use as well. 
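
Schematically, assuming the head_errlog on-disk layout implemented in
spa_errlog.c: when the feature is active, each value in the
spa_errlog_last and spa_errlog_scrub ZAPs is itself the object number of
a per-dataset DMU_OT_ERROR_LOG ZAP, so a MOS leak check must mark those
nested objects as referenced too. A sketch of the required accounting
(the patch below implements it as errorlog_count_refd() plus a feature
check in dump_mos_leaks()):

```
/*
 * Sketch only, assuming the head_errlog layout from spa_errlog.c; the
 * actual change is the errorlog_count_refd() helper in the diff below.
 */
static void
mark_errlog_refs(objset_t *mos, uint64_t errlog, boolean_t head_errlog_active)
{
	zap_cursor_t zc;
	zap_attribute_t za;

	mos_obj_refd(errlog);		/* the top-level ZAP itself */
	if (!head_errlog_active)
		return;			/* old format: no nested objects */

	/* Each value is the object number of a per-dataset error log ZAP. */
	for (zap_cursor_init(&zc, mos, errlog);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc))
		mos_obj_refd(za.za_first_integer);
	zap_cursor_fini(&zc);
}
```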
Reviewed-by: Matthew Ahrens
Reviewed-by: Brian Behlendorf
Signed-off-by: George Amanakis
Closes #14442
Closes #14434
---
 cmd/zdb/zdb.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index b04b220c768e..d6e3c5806944 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -7509,6 +7509,19 @@ mos_leak_log_spacemaps(spa_t *spa)
 		mos_obj_refd(sls->sls_sm_obj);
 }
 
+static void
+errorlog_count_refd(objset_t *mos, uint64_t errlog)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	for (zap_cursor_init(&zc, mos, errlog);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		mos_obj_refd(za.za_first_integer);
+	}
+	zap_cursor_fini(&zc);
+}
+
 static int
 dump_mos_leaks(spa_t *spa)
 {
@@ -7529,6 +7542,12 @@ dump_mos_leaks(spa_t *spa)
 	mos_obj_refd(spa->spa_history);
 	mos_obj_refd(spa->spa_errlog_last);
 	mos_obj_refd(spa->spa_errlog_scrub);
+
+	if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+		errorlog_count_refd(mos, spa->spa_errlog_last);
+		errorlog_count_refd(mos, spa->spa_errlog_scrub);
+	}
+
 	mos_obj_refd(spa->spa_all_vdev_zaps);
 	mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj);
 	mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj);

From 90f4e01f8ab89edc39bfa227032acb66ae511988 Mon Sep 17 00:00:00 2001
From: Paul Dagnelie
Date: Thu, 2 Feb 2023 15:19:26 -0800
Subject: [PATCH 10/28] Prevent error messages when running tests with no timeout

Reviewed-by: Richard Yao
Reviewed-by: Brian Behlendorf
Signed-off-by: Paul Dagnelie
Closes #14450
---
 tests/test-runner/bin/test-runner.py.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test-runner/bin/test-runner.py.in b/tests/test-runner/bin/test-runner.py.in
index cb453b266f3c..28276ebc47e3 100755
--- a/tests/test-runner/bin/test-runner.py.in
+++ b/tests/test-runner/bin/test-runner.py.in
@@ -297,7 +297,7 @@ User: %s
         proc = Popen(privcmd, stdout=PIPE, stderr=PIPE)
         # Allow a special timeout value of 0 to mean infinity
         if int(self.timeout) == 0:
-            self.timeout = sys.maxsize
+            self.timeout = sys.maxsize / (10 ** 9)
         t = Timer(int(self.timeout), self.kill_cmd, [proc])
 
     try:

From 6017fd9377b217481097dda1206132ec81fcc8ef Mon Sep 17 00:00:00 2001
From: Reno Reckling
Date: Fri, 3 Feb 2023 00:22:12 +0100
Subject: [PATCH 11/28] Fix variable shadowing in libzfs_mount

We accidentally reused the variable name "i" for both the inner and
outer loops.

Reviewed-by: Rich Ercolani
Reviewed-by: Ryan Moeller
Reviewed-by: Richard Yao
Signed-off-by: Reno Reckling
Closes #14452
Closes #14445
---
 lib/libzfs/libzfs_mount.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c
index 57737bc6c01a..8612e082ba34 100644
--- a/lib/libzfs/libzfs_mount.c
+++ b/lib/libzfs/libzfs_mount.c
@@ -1422,10 +1422,10 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
 	 * Walk through and first unshare everything. 
*/ for (i = 0; i < used; i++) { - for (enum sa_protocol i = 0; i < SA_PROTOCOL_COUNT; ++i) { - if (sa_is_shared(sets[i].mountpoint, i) && + for (enum sa_protocol p = 0; p < SA_PROTOCOL_COUNT; ++p) { + if (sa_is_shared(sets[i].mountpoint, p) && unshare_one(hdl, sets[i].mountpoint, - sets[i].mountpoint, i) != 0) + sets[i].mountpoint, p) != 0) goto out; } } From dffd40b3b6185b001d529db50561ec11098d75cc Mon Sep 17 00:00:00 2001 From: Jorgen Lundman Date: Tue, 7 Feb 2023 02:27:55 +0900 Subject: [PATCH 12/28] Unify assembly files with macOS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The remaining changes needed to make the assembly files work with macOS. Reviewed-by: Attila Fülöp Reviewed-by: Brian Behlendorf Signed-off-by: Jorgen Lundman Closes #14451 --- module/icp/asm-x86_64/blake3/blake3_avx2.S | 1 - module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S | 5 +++++ module/icp/asm-x86_64/modes/gcm_pclmulqdq.S | 2 +- module/icp/asm-x86_64/sha2/sha256_impl.S | 2 +- module/icp/asm-x86_64/sha2/sha512_impl.S | 3 +-- 5 files changed, 8 insertions(+), 5 deletions(-) diff --git a/module/icp/asm-x86_64/blake3/blake3_avx2.S b/module/icp/asm-x86_64/blake3/blake3_avx2.S index 8f9e766486f1..0ebec5c1095e 100644 --- a/module/icp/asm-x86_64/blake3/blake3_avx2.S +++ b/module/icp/asm-x86_64/blake3/blake3_avx2.S @@ -1791,7 +1791,6 @@ ENTRY_ALIGN(zfs_blake3_hash_many_avx2, 64) SET_SIZE(zfs_blake3_hash_many_avx2) SECTION_STATIC -.section .rodata .p2align 6 ADD0: diff --git a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S index 165492a0ed76..909b2147dff9 100644 --- a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S +++ b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S @@ -53,6 +53,11 @@ /* Windows userland links with OpenSSL */ #if !defined (_WIN32) || defined (_KERNEL) +/* Apple needs _ */ +#if defined (__APPLE__) +#define gcm_avx_can_use_movbe _gcm_avx_can_use_movbe +#endif + .extern gcm_avx_can_use_movbe .text diff --git a/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S b/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S index e40b3df32753..dec782fda33e 100644 --- a/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S +++ b/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S @@ -101,7 +101,7 @@ gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) { // static uint8_t byte_swap16_mask[] = { // 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 }; -.section .rodata +SECTION_STATIC .balign XMM_ALIGN .Lbyte_swap16_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 diff --git a/module/icp/asm-x86_64/sha2/sha256_impl.S b/module/icp/asm-x86_64/sha2/sha256_impl.S index f3d701528459..f1fde51c1d69 100644 --- a/module/icp/asm-x86_64/sha2/sha256_impl.S +++ b/module/icp/asm-x86_64/sha2/sha256_impl.S @@ -2063,7 +2063,7 @@ ENTRY_NP(SHA256TransformBlocks) .cfi_endproc SET_SIZE(SHA256TransformBlocks) -.section .rodata +SECTION_STATIC .balign 64 SET_OBJ(K256) K256: diff --git a/module/icp/asm-x86_64/sha2/sha512_impl.S b/module/icp/asm-x86_64/sha2/sha512_impl.S index 520f5b6dab24..b2f7d4863d8a 100644 --- a/module/icp/asm-x86_64/sha2/sha512_impl.S +++ b/module/icp/asm-x86_64/sha2/sha512_impl.S @@ -2064,7 +2064,7 @@ ENTRY_NP(SHA512TransformBlocks) .cfi_endproc SET_SIZE(SHA512TransformBlocks) -.section .rodata +SECTION_STATIC .balign 64 SET_OBJ(K512) K512: @@ -2113,4 +2113,3 @@ K512: #if defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif - From 0a5b942d4aadbf5be99ff62976384db83d342f8c Mon Sep 17 00:00:00 2001 From: Jorgen Lundman Date: Tue, 7 
Feb 2023 02:34:59 +0900
Subject: [PATCH 13/28] Restore FreeBSD to use .rodata
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In https://github.com/openzfs/zfs/pull/14228 the FreeBSD SECTION_STATIC
was set to ".data" instead of ".rodata". This commit restores it to
.rodata.

Reviewed-by: Attila Fülöp
Reviewed-by: Richard Yao
Reviewed-by: Brian Behlendorf
Signed-off-by: Jorgen Lundman
Closes #14460
---
 include/os/freebsd/spl/sys/ia32/asm_linkage.h | 2 +-
 lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/os/freebsd/spl/sys/ia32/asm_linkage.h b/include/os/freebsd/spl/sys/ia32/asm_linkage.h
index 058d600007af..1ebfd8350661 100644
--- a/include/os/freebsd/spl/sys/ia32/asm_linkage.h
+++ b/include/os/freebsd/spl/sys/ia32/asm_linkage.h
@@ -36,7 +36,7 @@
 #define ENDBR
 
 #define SECTION_TEXT .text
-#define SECTION_STATIC .data
+#define SECTION_STATIC .rodata
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h b/lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h
index 9964f183cc68..08c73037990f 100644
--- a/lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h
+++ b/lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h
@@ -40,7 +40,7 @@
 #define ENDBR
 
 #define SECTION_TEXT .text
-#define SECTION_STATIC .data
+#define SECTION_STATIC .rodata
 
 #ifdef __cplusplus
 extern "C" {

From 14872aaa4f909d72c6b5e4105dadcfa13c7d9d66 Mon Sep 17 00:00:00 2001
From: Matthew Ahrens
Date: Mon, 6 Feb 2023 09:37:06 -0800
Subject: [PATCH 14/28] EIO caused by encryption + recursive gang

Encrypted blocks cannot have 3 DVAs, because they use the space of the
3rd DVA for the IV+salt. zio_write_gang_block() takes this into
account, setting `gbh_copies` to no more than 2 in this case.

Gang members' BP's do not have the X (encrypted) bit set (nor do they
have the DMU level and type fields set), because encryption is not
handled at this level. The gang block is reassembled, and then
encryption (and compression) are handled.

To check if this gang block is encrypted, the code in
zio_write_gang_block() checks `pio->io_bp`. This is normally fine,
because the block that's being ganged is typically the encrypted BP.

The problem is that if there is "recursive ganging", where a gang
member is itself a gang block, then when zio_write_gang_block() is
called to create a gang block for a gang member, `pio->io_bp` is the
gang member's BP, which doesn't have the X bit set, so the number of
DVA's is not restricted to 2. It should instead be looking at the
"gang leader", i.e. the top-level gang block, to determine how many
DVA's can be used, to avoid an "NDVA's inversion" (where a child has
more DVA's than its parent).

gang leader BP: X (encrypted) bit set, 2 DVA's, IV+salt in 3rd DVA's
space:
```
DVA[0]=<1:...:100400> DVA[1]=<0:...:100400> salt=... iv=...
[L0 ZFS plain file] fletcher4 uncompressed encrypted LE gang
unique double size=100000L/100000P birth=... fill=1 cksum=...
```

leader's GBH contains a BP with gang bit set and 3 DVA's:
```
DVA[0]=<1:...:55600> DVA[1]=<0:...:55600> [L0 unallocated] fletcher4
uncompressed unencrypted LE contiguous unique double
size=55600L/55600P birth=... fill=0 cksum=...

DVA[0]=<1:...:55600> DVA[1]=<0:...:55600> [L0 unallocated] fletcher4
uncompressed unencrypted LE contiguous unique double
size=55600L/55600P birth=... fill=0 cksum=... 
DVA[0]=<1:...:55600> DVA[1]=<0:...:55600> DVA[2]=<1:...:200>
[L0 unallocated] fletcher4 uncompressed unencrypted LE gang
unique double size=55400L/55400P birth=... fill=0 cksum=...
```

On nondebug bits, having the 3rd DVA in the gang block works for the
most part, because it's true that all 3 DVA's are available in the gang
member BP (in the GBH). However, for accounting purposes, gang block
DVA's ASIZE include all the space allocated below them, i.e. the
512-byte gang block header (GBH) as well as the gang members below
that. We see that above where the gang leader BP is 1MB logical (and
after compression: 0x`100000P`), but the ASIZE of each DVA is 2 sectors
(1KB) more than 1MB (0x`100400`).

Since there are 3 copies of a block below it, we increment the ASIZE of
the 3rd DVA of the gang leader by the space used by the 3rd DVA of the
child (1 sector, in this case). But there isn't really a 3rd DVA of the
parent; the salt is stored in place of the 3rd DVA's ASIZE. So when
zio_write_gang_member_ready() increments the parent's BP's `DVA[2]`'s
ASIZE, it's actually incrementing the parent's salt. When we later try
to read the encrypted recursively-ganged block, the salt doesn't match
what we used to write it, so MAC verification fails and we get an EIO.

```
zio_encrypt():  encrypted 515/2/0/403 salt: 25 25 bb 9d ad d6 cd 89
zio_decrypt(): decrypting 515/2/0/403 salt: 26 25 bb 9d ad d6 cd 89
```

This commit addresses the problem by not increasing the number of
copies of the GBH beyond 2 (even for non-encrypted blocks). This
simplifies the logic while maintaining the ability to traverse all
metadata (including gang blocks) even if one copy is lost. (Note that
3 copies of the GBH will still be created if requested, e.g. for
`copies=3` or MOS blocks.)

Additionally, the code that increments the parent's DVA's ASIZE is made
to check the parent DVA's NDVAS even on nondebug bits. So if there's a
similar bug in the future, it will cause a panic when trying to write,
rather than corrupting the parent BP and causing an error when reading.

Reviewed-by: Brian Behlendorf
Co-authored-by: Brian Behlendorf
Signed-off-by: Matthew Ahrens
Caused-by: #14356
Closes #14440
Closes #14413
---
 module/zfs/zio.c | 16 ++--
 tests/runfiles/common.run | 2 +-
 tests/zfs-tests/tests/Makefile.am | 1 +
 .../functional/no_space/enospc_ganging.ksh | 86 +++++++++++++++++++
 4 files changed, 97 insertions(+), 8 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/no_space/enospc_ganging.ksh

diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 5d7ed6d582a2..d888a584a93c 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -2778,7 +2778,7 @@ zio_write_gang_member_ready(zio_t *zio)
 	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
 	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
 	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
-	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
+	VERIFY3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
 
 	mutex_enter(&pio->io_lock);
 	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
@@ -2816,18 +2816,20 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 	uint64_t resid = pio->io_size;
 	uint64_t lsize;
 	int copies = gio->io_prop.zp_copies;
-	int gbh_copies;
 	zio_prop_t zp;
 	int error;
 	boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
 
 	/*
-	 * encrypted blocks need DVA[2] free so encrypted gang headers can't
-	 * have a third copy.
+	 * If one copy was requested, store 2 copies of the GBH, so that we
+	 * can still traverse all the data (e.g. to free or scrub) even if a
+	 * block is damaged.  Note that we can't store 3 copies of the GBH in
+	 * all cases, e.g. with encryption, which uses DVA[2] for the IV+salt.
 	 */
-	gbh_copies = MIN(copies + 1, spa_max_replication(spa));
-	if (BP_IS_ENCRYPTED(bp) && gbh_copies >= SPA_DVAS_PER_BP)
-		gbh_copies = SPA_DVAS_PER_BP - 1;
+	int gbh_copies = copies;
+	if (gbh_copies == 1) {
+		gbh_copies = MIN(2, spa_max_replication(spa));
+	}
 
 	int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
 	if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 005c539fc89d..7a7cf927c77e 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -704,7 +704,7 @@ tags = ['functional', 'nestedfs']
 
 [tests/functional/no_space]
 tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos',
-    'enospc_df', 'enospc_rm']
+    'enospc_df', 'enospc_ganging', 'enospc_rm']
 tags = ['functional', 'no_space']
 
 [tests/functional/nopwrite]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index bbe94f9177ae..ad2ec4670556 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1539,6 +1539,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/no_space/enospc_002_pos.ksh \
 	functional/no_space/enospc_003_pos.ksh \
 	functional/no_space/enospc_df.ksh \
+	functional/no_space/enospc_ganging.ksh \
 	functional/no_space/enospc_rm.ksh \
 	functional/no_space/setup.ksh \
 	functional/online_offline/cleanup.ksh \
diff --git a/tests/zfs-tests/tests/functional/no_space/enospc_ganging.ksh b/tests/zfs-tests/tests/functional/no_space/enospc_ganging.ksh
new file mode 100755
index 000000000000..1d35fba5dbfa
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/no_space/enospc_ganging.ksh
@@ -0,0 +1,86 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Exercise gang block IO paths for non-encrypted and encrypted datasets.
+#
+
+verify_runnable "both"
+log_assert "Verify IO when file system is full and ganging."
+
+function cleanup
+{
+	log_must set_tunable64 METASLAB_FORCE_GANGING $metaslab_force_ganging
+	default_cleanup_noexit
+}
+
+log_onexit cleanup
+
+default_setup_noexit $DISKS
+
+typeset metaslab_force_ganging=$(get_tunable METASLAB_FORCE_GANGING)
+shift=$(random_int_between 15 17)
+log_must set_tunable64 METASLAB_FORCE_GANGING $((2**$shift))
+
+keyfile=/$TESTPOOL/keyencfods
+log_must eval "echo 'password' > $keyfile"
+bs=1024k
+count=512
+
+log_must dd if=/dev/urandom of=$TESTDIR/data bs=$bs count=$count
+data_checksum=$(sha256digest $TESTDIR/data)
+
+# Test common large block configuration.
+log_must zfs create -o recordsize=1m -o primarycache=metadata $TESTPOOL/gang
+mntpnt=$(get_prop mountpoint $TESTPOOL/gang)
+
+log_must dd if=$TESTDIR/data of=$mntpnt/file bs=$bs count=$count
+sync_pool $TESTPOOL
+log_must dd if=$mntpnt/file of=$TESTDIR/out bs=$bs count=$count
+out_checksum=$(sha256digest $TESTDIR/out)
+
+if [[ "$data_checksum" != "$out_checksum" ]]; then
+	log_fail "checksum mismatch ($data_checksum != $out_checksum)"
+fi
+
+log_must rm -f $TESTDIR/out
+log_must zfs destroy $TESTPOOL/gang
+
+# Test common large block configuration with encryption.
+log_must zfs create \
+	-o recordsize=1m \
+	-o primarycache=metadata \
+	-o compression=off \
+	-o encryption=on \
+	-o keyformat=passphrase \
+	-o keylocation=file://$keyfile \
+	-o copies=2 \
+	$TESTPOOL/gang
+mntpnt=$(get_prop mountpoint $TESTPOOL/gang)
+
+log_must dd if=$TESTDIR/data of=$mntpnt/file bs=$bs count=$count
+sync_pool $TESTPOOL
+log_must dd if=$mntpnt/file of=$TESTDIR/out bs=$bs count=$count
+out_checksum=$(sha256digest $TESTDIR/out)
+
+if [[ "$data_checksum" != "$out_checksum" ]]; then
+	log_fail "checksum mismatch ($data_checksum != $out_checksum)"
+fi
+
+log_must rm -f $TESTDIR/out
+log_must zfs destroy $TESTPOOL/gang
+
+log_pass "Verified IO when file system is full and ganging."
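
The replaced gbh_copies logic is small enough to check in isolation; a
standalone table of requested copies versus GBH copies, with
spa_max_replication() assumed to return 3 for a healthy pool:

```
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	int spa_max_replication = 3;	/* assumed healthy, 3-way capable */

	for (int copies = 1; copies <= 3; copies++) {
		int gbh_copies = copies;
		if (gbh_copies == 1)
			gbh_copies = MIN(2, spa_max_replication);
		printf("requested copies=%d -> gbh_copies=%d\n",
		    copies, gbh_copies);
	}
	return (0);
}
```

This prints 1 -> 2, 2 -> 2, and 3 -> 3, matching the commit message:
the GBH never gets an extra copy beyond 2, but three copies are still
honored when explicitly requested.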
From 3a7d2a0ce0b7e7a2b005256be54fda00feaf86c0 Mon Sep 17 00:00:00 2001
From: Richard Yao
Date: Fri, 23 Dec 2022 00:00:38 -0500
Subject: [PATCH 15/28] zfs_get_temporary_prop() should not pass NULL to strcpy()

`dsl_dir_activity_in_progress()` can call `zfs_get_temporary_prop()`
with the fourth value set to NULL, which will pass NULL to `strcpy()`
when there is a match.

Clang's static analyzer caught this with the help of CodeChecker for
Cross Translation Unit analysis.

Reviewed-by: Brian Behlendorf
Reviewed-by: Brian Atkinson
Signed-off-by: Richard Yao
Closes #14456
---
 module/os/freebsd/zfs/zfs_vfsops.c | 3 ++-
 module/os/linux/zfs/zfs_vfsops.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c
index 2820f10b5de8..a1e0595bda34 100644
--- a/module/os/freebsd/zfs/zfs_vfsops.c
+++ b/module/os/freebsd/zfs/zfs_vfsops.c
@@ -230,7 +230,8 @@ zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
 	vfs_unbusy(vfsp);
 	if (tmp != *val) {
-		(void) strcpy(setpoint, "temporary");
+		if (setpoint)
+			(void) strcpy(setpoint, "temporary");
 		*val = tmp;
 	}
 	return (0);
diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c
index c921e587c75c..2d9b27a90884 100644
--- a/module/os/linux/zfs/zfs_vfsops.c
+++ b/module/os/linux/zfs/zfs_vfsops.c
@@ -608,7 +608,8 @@ zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
 	}
 
 	if (tmp != *val) {
-		(void) strcpy(setpoint, "temporary");
+		if (setpoint)
+			(void) strcpy(setpoint, "temporary");
 		*val = tmp;
 	}
 	return (0);

From cfb49616cdeb02c7cbb3dd314c55b9452ee21182 Mon Sep 17 00:00:00 2001
From: Richard Yao
Date: Mon, 23 Jan 2023 15:03:21 -0500
Subject: [PATCH 16/28] Cleanup: spa vdev processing should check NULL pointers

The PVS Studio 2016 FreeBSD kernel report stated:

\contrib\opensolaris\uts\common\fs\zfs\spa.c (1341): error V595: The
'spa->spa_spares.sav_vdevs' pointer was utilized before it was verified
against nullptr. Check lines: 1341, 1342.

\sys\cddl\contrib\opensolaris\uts\common\fs\zfs\spa.c (1355): error
V595: The 'spa->spa_l2cache.sav_vdevs' pointer was utilized before it
was verified against nullptr. Check lines: 1355, 1357. 
\sys\cddl\contrib\opensolaris\uts\common\fs\zfs\spa.c (1398): error V595: The 'spa->spa_spares.sav_vdevs' pointer was utilized before it was verified against nullptr. Check lines: 1398, 1408. \sys\cddl\contrib\opensolaris\uts\common\fs\zfs\spa.c (1583): error V595: The 'oldvdevs' pointer was utilized before it was verified against nullptr. Check lines: 1583, 1595. In practice, all of these uses were safe because a NULL pointer implied a 0 vdev count, which kept us from iterating over vdevs. However, rearranging the code to check the pointer first is not a terrible micro-optimization and makes it more readable, so let us do that. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Richard Yao Closes #14456 --- module/zfs/spa.c | 58 +++++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 67b3a03a951a..bbb83fc610b1 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1714,9 +1714,9 @@ spa_unload(spa_t *spa) */ spa_l2cache_drop(spa); - for (int i = 0; i < spa->spa_spares.sav_count; i++) - vdev_free(spa->spa_spares.sav_vdevs[i]); if (spa->spa_spares.sav_vdevs) { + for (int i = 0; i < spa->spa_spares.sav_count; i++) + vdev_free(spa->spa_spares.sav_vdevs[i]); kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); spa->spa_spares.sav_vdevs = NULL; @@ -1727,11 +1727,11 @@ spa_unload(spa_t *spa) } spa->spa_spares.sav_count = 0; - for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { - vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); - vdev_free(spa->spa_l2cache.sav_vdevs[i]); - } if (spa->spa_l2cache.sav_vdevs) { + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); + vdev_free(spa->spa_l2cache.sav_vdevs[i]); + } kmem_free(spa->spa_l2cache.sav_vdevs, spa->spa_l2cache.sav_count * sizeof (void *)); spa->spa_l2cache.sav_vdevs = NULL; @@ -1789,20 +1789,21 @@ spa_load_spares(spa_t *spa) /* * First, close and free any existing spare vdevs. 
*/ - for (i = 0; i < spa->spa_spares.sav_count; i++) { - vd = spa->spa_spares.sav_vdevs[i]; + if (spa->spa_spares.sav_vdevs) { + for (i = 0; i < spa->spa_spares.sav_count; i++) { + vd = spa->spa_spares.sav_vdevs[i]; - /* Undo the call to spa_activate() below */ - if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, - B_FALSE)) != NULL && tvd->vdev_isspare) - spa_spare_remove(tvd); - vdev_close(vd); - vdev_free(vd); - } + /* Undo the call to spa_activate() below */ + if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, + B_FALSE)) != NULL && tvd->vdev_isspare) + spa_spare_remove(tvd); + vdev_close(vd); + vdev_free(vd); + } - if (spa->spa_spares.sav_vdevs) kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); + } if (spa->spa_spares.sav_config == NULL) nspares = 0; @@ -2013,23 +2014,24 @@ spa_load_l2cache(spa_t *spa) /* * Purge vdevs that were dropped */ - for (i = 0; i < oldnvdevs; i++) { - uint64_t pool; + if (oldvdevs) { + for (i = 0; i < oldnvdevs; i++) { + uint64_t pool; - vd = oldvdevs[i]; - if (vd != NULL) { - ASSERT(vd->vdev_isl2cache); + vd = oldvdevs[i]; + if (vd != NULL) { + ASSERT(vd->vdev_isl2cache); - if (spa_l2cache_exists(vd->vdev_guid, &pool) && - pool != 0ULL && l2arc_vdev_present(vd)) - l2arc_remove_vdev(vd); - vdev_clear_stats(vd); - vdev_free(vd); + if (spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL && l2arc_vdev_present(vd)) + l2arc_remove_vdev(vd); + vdev_clear_stats(vd); + vdev_free(vd); + } } - } - if (oldvdevs) kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); + } for (i = 0; i < sav->sav_count; i++) nvlist_free(l2cache[i]); From 66953686c05901277115e837bbaab4df4ffd63c6 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Thu, 2 Feb 2023 17:30:39 -0500 Subject: [PATCH 17/28] Add assertion and make variables unsigned in abd_alloc_chunks() Clang's static analyzer pointed out that if alloc_pages >= nr_pages before the loop, the value of page will be undefined and will be used anyway. This should not be possible, but as cleanup, we add an assertion. We also recognize that the local variables should be unsigned in the first place, so we make them unsigned. This is not enough to avoid the need for the assertion, since there is still the case that alloc_pages == nr_pages and nr_pages == 0, which the assertion implicitly checks. 
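The pattern the analyzer flagged reduces to the following minimal
userspace sketch (hypothetical names, not the abd code itself): a
variable assigned only inside a loop is read after it, so at least one
iteration must be guaranteed, and the assertion documents exactly that
precondition:

    #include <assert.h>
    #include <stdio.h>

    int
    main(void)
    {
        unsigned int alloc_pages = 0, nr_pages = 4;
        int last_order;        /* only assigned inside the loop */

        /* Documents the precondition: the loop body runs at least once. */
        assert(alloc_pages < nr_pages);

        while (alloc_pages < nr_pages) {
            last_order = 1;                  /* stands in for the allocation */
            alloc_pages += 1U << last_order; /* two pages per pass */
        }
        /* Safe only because of the assertion above. */
        printf("last order used: %d\n", last_order);
        return (0);
    }
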
Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Richard Yao Closes #14456 --- module/os/linux/zfs/abd_os.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 16530d82693e..13150adbe0cf 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -272,18 +272,20 @@ abd_alloc_chunks(abd_t *abd, size_t size) struct page *page, *tmp_page = NULL; gfp_t gfp = __GFP_NOWARN | GFP_NOIO; gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM; - int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1); - int nr_pages = abd_chunkcnt_for_bytes(size); - int chunks = 0, zones = 0; + unsigned int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1); + unsigned int nr_pages = abd_chunkcnt_for_bytes(size); + unsigned int chunks = 0, zones = 0; size_t remaining_size; int nid = NUMA_NO_NODE; - int alloc_pages = 0; + unsigned int alloc_pages = 0; INIT_LIST_HEAD(&pages); + ASSERT3U(alloc_pages, <, nr_pages); + while (alloc_pages < nr_pages) { - unsigned chunk_pages; - int order; + unsigned int chunk_pages; + unsigned int order; order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order); chunk_pages = (1U << order); From eb823cbc76d28a7cafdf6a7aafdefe7e74fe26bc Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Mon, 6 Feb 2023 14:16:01 -0500 Subject: [PATCH 18/28] initramfs: Make mountpoint=none work In initramfs, mount.zfs fails to mount a dataset with mountpoint=none, but mount.zfs -o zfsutil works. Use -o zfsutil when mountpoint=none. Reviewed-by: Brian Behlendorf Reviewed-by: Richard Yao Signed-off-by: Ryan Moeller Closes #14455 --- contrib/initramfs/scripts/zfs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/contrib/initramfs/scripts/zfs b/contrib/initramfs/scripts/zfs index 41a4d96f8732..d1a99e4a87ba 100644 --- a/contrib/initramfs/scripts/zfs +++ b/contrib/initramfs/scripts/zfs @@ -359,9 +359,11 @@ mount_fs() # isn't the root fs. return 0 fi - ZFS_CMD="mount.zfs" # Last hail-mary: Hope 'rootmnt' is set! mountpoint="" + if [ "$mountpoint" = "legacy" ]; then + ZFS_CMD="mount.zfs" + fi else mountpoint="$mountpoint1" fi From ac7648179c856750b719c7a9e0464466df390b26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rob=20N=20=E2=98=85?= Date: Wed, 8 Feb 2023 08:48:22 +1100 Subject: [PATCH 19/28] zdb: zero-pad checksum output The leading zeroes are part of the checksum so we should show them. 
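The failure mode is easy to reproduce in a standalone sketch with
made-up checksum words; without the fixed width, a word with leading
zeroes prints short, so the four colon-separated fields no longer have
a uniform width and can be misread:

    #include <stdio.h>

    int
    main(void)
    {
        unsigned long long w[4] = { 0x00000000000abcdeULL,
            0x1122334455667788ULL, 0x0ULL, 0xffULL };

        /* Old format: leading zeroes silently dropped. */
        printf("cksum=%llx:%llx:%llx:%llx\n", w[0], w[1], w[2], w[3]);
        /* New format: each 64-bit word is always 16 hex digits. */
        printf("cksum=%016llx:%016llx:%016llx:%016llx\n",
            w[0], w[1], w[2], w[3]);
        return (0);
    }
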
Reviewed-by: Richard Yao Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #14464 --- cmd/zdb/zdb.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index d6e3c5806944..d239da67613c 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -2377,7 +2377,8 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " %s", "FREE"); (void) snprintf(blkbuf + strlen(blkbuf), - buflen - strlen(blkbuf), " cksum=%llx:%llx:%llx:%llx", + buflen - strlen(blkbuf), + " cksum=%016llx:%016llx:%016llx:%016llx", (u_longlong_t)bp->blk_cksum.zc_word[0], (u_longlong_t)bp->blk_cksum.zc_word[1], (u_longlong_t)bp->blk_cksum.zc_word[2], @@ -8383,7 +8384,9 @@ zdb_read_block(char *thing, spa_t *spa) DVA_GET_OFFSET(&bp->blk_dva[0]); ck_zio->io_bp = bp; zio_checksum_compute(ck_zio, ck, pabd, lsize); - printf("%12s\tcksum=%llx:%llx:%llx:%llx\n", + printf( + "%12s\t" + "cksum=%016llx:%016llx:%016llx:%016llx\n", zio_checksum_table[ck].ci_name, (u_longlong_t)bp->blk_cksum.zc_word[0], (u_longlong_t)bp->blk_cksum.zc_word[1], From 7883ea2234fac0cd976eb2ca0c6b51e7a5da7668 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20V=C3=B6gele?= Date: Thu, 9 Feb 2023 20:57:50 +0100 Subject: [PATCH 20/28] rpm: Use libtirpc-devel and /usr/lib on SUSE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SUSE Linux distributions require libtirpc-devel. The dracut and udev directories are /usr/lib/dracut and /usr/lib/udev. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Reviewed-by: Richard Yao Signed-off-by: Andreas Vögele Closes #14467 Closes #14468 --- rpm/generic/zfs.spec.in | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 251470a67fba..8c538a00d203 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -3,7 +3,7 @@ # Set the default udev directory based on distribution. %if %{undefined _udevdir} -%if 0%{?fedora}%{?rhel}%{?centos}%{?openEuler} +%if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler} %global _udevdir %{_prefix}/lib/udev %else %global _udevdir /lib/udev @@ -12,7 +12,7 @@ # Set the default udevrule directory based on distribution. %if %{undefined _udevruledir} -%if 0%{?fedora}%{?rhel}%{?centos}%{?openEuler} +%if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler} %global _udevruledir %{_prefix}/lib/udev/rules.d %else %global _udevruledir /lib/udev/rules.d @@ -21,7 +21,7 @@ # Set the default dracut directory based on distribution. 
 %if %{undefined _dracutdir}
-%if 0%{?fedora}%{?rhel}%{?centos}%{?openEuler}
+%if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler}
 %global _dracutdir %{_prefix}/lib/dracut
 %else
 %global _dracutdir %{_prefix}/share/dracut
 %endif
@@ -110,7 +110,7 @@ BuildRequires:  libblkid-devel
 BuildRequires:  libudev-devel
 BuildRequires:  libattr-devel
 BuildRequires:  openssl-devel
-%if 0%{?fedora}%{?openEuler} || 0%{?rhel} >= 8 || 0%{?centos} >= 8
+%if 0%{?fedora}%{?suse_version}%{?openEuler} || 0%{?rhel} >= 8 || 0%{?centos} >= 8
 BuildRequires: libtirpc-devel
 %endif

From 87a4dfa561900dafaa446661538faa485af5f17a Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Mon, 13 Feb 2023 16:21:53 -0500
Subject: [PATCH 21/28] Improve arc_read() error reporting

Debugging a reported NULL dereference panic in dnode_hold_impl(), I
found that for certain types of errors arc_read() may only return an
error code, but not properly report it via the done and pio arguments.
Lack of done calls may result in reference and/or memory leaks in
higher level code.  Lack of error reporting via pio may result in
unnoticed errors there.  For example, dbuf_read(), where
dbuf_read_impl() ignores the arc_read() return value, relies completely
on the pio mechanism and misses the errors.

This patch makes arc_read() always call the done callback and always
propagate errors to the parent zio, if either is provided.

Reviewed-by: Brian Behlendorf
Reviewed-by: Brian Atkinson
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #14454
---
 module/zfs/arc.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 2a52d0d24572..aa806706de29 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -5958,6 +5958,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 	    (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
 	boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp);
 	boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF;
+	arc_buf_t *buf = NULL;
 	int rc = 0;

 	ASSERT(!embedded_bp ||
@@ -5987,7 +5988,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 	if (!zfs_blkptr_verify(spa, bp, zio_flags & ZIO_FLAG_CONFIG_WRITER,
 	    BLK_VERIFY_LOG)) {
 		rc = SET_ERROR(ECKSUM);
-		goto out;
+		goto done;
 	}

 	if (!embedded_bp) {
@@ -6008,14 +6009,13 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) ||
 	    (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) {
 		boolean_t is_data = !HDR_ISTYPE_METADATA(hdr);
-		arc_buf_t *buf = NULL;

 		if (HDR_IO_IN_PROGRESS(hdr)) {
 			if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
 				mutex_exit(hash_lock);
 				ARCSTAT_BUMP(arcstat_cached_only_in_progress);
 				rc = SET_ERROR(ENOENT);
-				goto out;
+				goto done;
 			}

 			zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
@@ -6144,9 +6144,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 		ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
 		    demand, prefetch, is_data, data, metadata, hits);
 		*arc_flags |= ARC_FLAG_CACHED;
-
-		if (done)
-			done(NULL, zb, bp, buf, private);
+		goto done;
 	} else {
 		uint64_t lsize = BP_GET_LSIZE(bp);
 		uint64_t psize = BP_GET_PSIZE(bp);
@@ -6159,10 +6157,10 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 		int alloc_flags = encrypted_read ?
ARC_HDR_ALLOC_RDATA : 0; if (*arc_flags & ARC_FLAG_CACHED_ONLY) { - rc = SET_ERROR(ENOENT); if (hash_lock != NULL) mutex_exit(hash_lock); - goto out; + rc = SET_ERROR(ENOENT); + goto done; } if (hdr == NULL) { @@ -6482,6 +6480,16 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, spa_read_history_add(spa, zb, *arc_flags); spl_fstrans_unmark(cookie); return (rc); + +done: + if (done) + done(NULL, zb, bp, buf, private); + if (pio && rc != 0) { + zio_t *zio = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); + zio->io_error = rc; + zio_nowait(zio); + } + goto out; } arc_prune_t * From 13312e2fa107b2a10595d67de82ecdf74db317ed Mon Sep 17 00:00:00 2001 From: Prakash Surya Date: Mon, 13 Feb 2023 16:35:59 -0800 Subject: [PATCH 22/28] Reduce need for contiguous memory for ioctls We've had cases where we trigger an OOM despite having memory freely available on the system. For example, here, we had about 21GB free: kernel: Node 0 Normal: 2418758*4kB (UME) 1549533*8kB (UE) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 22071296kB The problem being, all the memory is in 4K and 8K contiguous regions, but the allocation request was for a 16K contiguous region: kernel: SafeExecutors-4 invoked oom-killer: gfp_mask=0x42dc0(GFP_KERNEL|__GFP_NOWARN|__GFP_COMP|__GFP_ZERO), order=2, oom_score_adj=0 The offending allocation came from this call trace: kernel: Call Trace: kernel: dump_stack+0x57/0x7a kernel: dump_header+0x4f/0x1e1 kernel: oom_kill_process.cold.33+0xb/0x10 kernel: out_of_memory+0x1ad/0x490 kernel: __alloc_pages_slowpath+0xd55/0xe40 kernel: __alloc_pages_nodemask+0x2df/0x330 kernel: kmalloc_large_node+0x42/0x90 kernel: __kmalloc_node+0x25a/0x320 kernel: ? spl_kmem_free_impl+0x21/0x30 [spl] kernel: spl_kmem_alloc_impl+0xa5/0x100 [spl] kernel: spl_kmem_zalloc+0x19/0x20 [spl] kernel: zfsdev_ioctl+0x2b/0xe0 [zfs] kernel: do_vfs_ioctl+0xa9/0x640 kernel: ? __audit_syscall_entry+0xdd/0x130 kernel: ksys_ioctl+0x67/0x90 kernel: __x64_sys_ioctl+0x1a/0x20 kernel: do_syscall_64+0x5e/0x200 kernel: entry_SYSCALL_64_after_hwframe+0x44/0xa9 kernel: RIP: 0033:0x7fdca3674317 The problem is, for each ioctl that ZFS makes, it has to allocate a zfs_cmd_t structure, which is 13744 bytes in size (on my system): sdb> sizeof zfs_cmd (size_t)13744 This size, coupled with the fact that we currently allocate it with kmem_zalloc, means we need a 16K contiguous region of memory to satisfy the request. The solution taken by this change, is to use "vmem" instead of "kmem" to do the allocation, such that we don't necessarily need a contiguous 16K memory region to satisfy the allocation. Arguably, a better solution would be not to require such a large allocation to begin with (e.g. reduce the size of the zfs_cmd_t structure), but that'd be a much larger change than this "one liner". Thus, I've opted for this approach for now; we can always circle back and attempt to reduce the size of the structure in the future. 
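As a sanity check on the numbers above, a small standalone sketch
(assuming 4 KiB pages; not ZFS code) reproduces the order-2 result
seen in the OOM report:

    #include <stdio.h>

    /*
     * 13744 bytes rounds up to four contiguous 4K pages, i.e. an
     * order-2 buddy allocation, matching "order=2" in the OOM report.
     */
    int
    main(void)
    {
        size_t size = 13744, page = 4096;
        size_t pages = (size + page - 1) / page;    /* 4 pages */
        int order = 0;

        while ((1UL << order) < pages)
            order++;                                /* order-2, 16K */
        printf("%zu pages, order-%d allocation\n", pages, order);
        return (0);
    }
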
Reviewed-by: Matthew Ahrens
Reviewed-by: Brian Behlendorf
Reviewed-by: Richard Yao
Reviewed-by: Mark Maybee
Reviewed-by: Don Brady
Signed-off-by: Prakash Surya
Closes #14474
---
 module/os/freebsd/zfs/kmod_core.c  | 10 +++++-----
 module/os/linux/zfs/zfs_ioctl_os.c |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/module/os/freebsd/zfs/kmod_core.c b/module/os/freebsd/zfs/kmod_core.c
index a1b1c8e4e6d9..f4c87013dbf0 100644
--- a/module/os/freebsd/zfs/kmod_core.c
+++ b/module/os/freebsd/zfs/kmod_core.c
@@ -142,7 +142,7 @@ zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag,
 		return (EINVAL);

 	uaddr = (void *)zp->zfs_cmd;
-	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+	zc = vmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
 #ifdef ZFS_LEGACY_SUPPORT
 	/*
 	 * Remap ioctl code for legacy user binaries
@@ -150,10 +150,10 @@ zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag,
 	if (zp->zfs_ioctl_version == ZFS_IOCVER_LEGACY) {
 		vecnum = zfs_ioctl_legacy_to_ozfs(vecnum);
 		if (vecnum < 0) {
-			kmem_free(zc, sizeof (zfs_cmd_t));
+			vmem_free(zc, sizeof (zfs_cmd_t));
 			return (ENOTSUP);
 		}
-		zcl = kmem_zalloc(sizeof (zfs_cmd_legacy_t), KM_SLEEP);
+		zcl = vmem_zalloc(sizeof (zfs_cmd_legacy_t), KM_SLEEP);
 		if (copyin(uaddr, zcl, sizeof (zfs_cmd_legacy_t))) {
 			error = SET_ERROR(EFAULT);
 			goto out;
@@ -180,9 +180,9 @@ zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag,
 out:
 #ifdef ZFS_LEGACY_SUPPORT
 	if (zcl)
-		kmem_free(zcl, sizeof (zfs_cmd_legacy_t));
+		vmem_free(zcl, sizeof (zfs_cmd_legacy_t));
 #endif
-	kmem_free(zc, sizeof (zfs_cmd_t));
+	vmem_free(zc, sizeof (zfs_cmd_t));
 	MPASS(tsd_get(rrw_tsd_key) == NULL);
 	return (error);
 }
diff --git a/module/os/linux/zfs/zfs_ioctl_os.c b/module/os/linux/zfs/zfs_ioctl_os.c
index f8d1777b07a6..f068f544f0ec 100644
--- a/module/os/linux/zfs/zfs_ioctl_os.c
+++ b/module/os/linux/zfs/zfs_ioctl_os.c
@@ -135,7 +135,7 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg)

 	vecnum = cmd - ZFS_IOC_FIRST;

-	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+	zc = vmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);

 	if (ddi_copyin((void *)(uintptr_t)arg, zc, sizeof (zfs_cmd_t), 0)) {
 		error = -SET_ERROR(EFAULT);
@@ -146,7 +146,7 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
 	if (error == 0 && rc != 0)
 		error = -SET_ERROR(EFAULT);
 out:
-	kmem_free(zc, sizeof (zfs_cmd_t));
+	vmem_free(zc, sizeof (zfs_cmd_t));
 	return (error);
 }

From 34ce4c42ffdcd8933768533343d8b29f9612fbae Mon Sep 17 00:00:00 2001
From: George Amanakis
Date: Tue, 14 Feb 2023 01:37:46 +0100
Subject: [PATCH 23/28] Fix a race condition in dsl_dataset_sync() when
 activating features

The zio returned from arc_write() in dmu_objset_sync() uses
zio_nowait().  However, we may reach the end of dsl_dataset_sync(),
which checks if we need to activate features in the filesystem, without
knowing whether that zio has even run through the ZIO pipeline yet.  In
that case we will flag features to be activated in
dsl_dataset_block_born(), but dsl_dataset_sync() has already completed
its run and those features will not actually be activated.  Mitigate
this by moving the feature activation code into
dsl_dataset_sync_done().  Also add new ASSERTs in dsl_scan_visitbp()
checking if a block contradicts any filesystem flags.
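The shape of the race can be reduced to a small (deliberately
simplified, and itself intentionally racy) userspace sketch; the thread
stands in for the nowait'ed zio and the join for the completion point
that dsl_dataset_sync_done() represents:

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static int feature_flagged;    /* ds_feature_activation[] analogue */

    static void *
    zio_pipeline(void *arg)
    {
        (void) arg;
        usleep(10000);             /* the write zio completes "later" */
        feature_flagged = 1;       /* dsl_dataset_block_born() analogue */
        return (NULL);
    }

    int
    main(void)
    {
        pthread_t t;

        (void) pthread_create(&t, NULL, zio_pipeline, NULL);
        /* End of "dsl_dataset_sync()": the flag may not be set yet. */
        printf("checked during sync: flagged=%d\n", feature_flagged);
        (void) pthread_join(&t, NULL);
        /* "dsl_dataset_sync_done()": the zio has completed by now. */
        printf("checked after done:  flagged=%d\n", feature_flagged);
        return (0);
    }
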
Reviewed-by: Matthew Ahrens Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Reviewed-by: Brian Atkinson Signed-off-by: George Amanakis Closes #13816 --- include/sys/dsl_dataset.h | 1 + module/zfs/dmu_send.c | 1 + module/zfs/dsl_dataset.c | 23 ++++++++++++----------- module/zfs/dsl_scan.c | 20 ++++++++++++++++++++ 4 files changed, 34 insertions(+), 11 deletions(-) diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index 3450527af7e0..b0d8c7994c07 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -372,6 +372,7 @@ int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive); int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, minor_t cleanup_minor, const char *htag); +boolean_t zfeature_active(spa_feature_t f, void *arg); blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 7f8de23f0e29..33beb04b19b1 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -493,6 +493,7 @@ dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object, (bp != NULL ? BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && io_compressed : lsize != psize); if (raw || compressed) { + ASSERT(bp != NULL); ASSERT(raw || dscp->dsc_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); ASSERT(!BP_IS_EMBEDDED(bp)); diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 57a58f88cec5..0584a356f9ac 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -1039,7 +1039,7 @@ dsl_dataset_has_owner(dsl_dataset_t *ds) return (rv); } -static boolean_t +boolean_t zfeature_active(spa_feature_t f, void *arg) { switch (spa_feature_table[f].fi_type) { @@ -2121,16 +2121,6 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) } dmu_objset_sync(ds->ds_objset, zio, tx); - - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (zfeature_active(f, ds->ds_feature_activation[f])) { - if (zfeature_active(f, ds->ds_feature[f])) - continue; - dsl_dataset_activate_feature(ds->ds_object, f, - ds->ds_feature_activation[f], tx); - ds->ds_feature[f] = ds->ds_feature_activation[f]; - } - } } /* @@ -2303,6 +2293,17 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx))); dmu_buf_rele(ds->ds_dbuf, ds); + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (zfeature_active(f, + ds->ds_feature_activation[f])) { + if (zfeature_active(f, ds->ds_feature[f])) + continue; + dsl_dataset_activate_feature(ds->ds_object, f, + ds->ds_feature_activation[f], tx); + ds->ds_feature[f] = ds->ds_feature_activation[f]; + } + } } int diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 2cc77eab6275..f971aa211e0c 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2038,6 +2038,26 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, return; } + /* + * Check if this block contradicts any filesystem flags. 
+ */ + spa_feature_t f = SPA_FEATURE_LARGE_BLOCKS; + if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) + ASSERT(dsl_dataset_feature_is_active(ds, f)); + + f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); + if (f != SPA_FEATURE_NONE) + ASSERT(dsl_dataset_feature_is_active(ds, f)); + + f = zio_compress_to_feature(BP_GET_COMPRESS(bp)); + if (f != SPA_FEATURE_NONE) + ASSERT(dsl_dataset_feature_is_active(ds, f)); + + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { + scn->scn_lt_min_this_txg++; + return; + } + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { scn->scn_lt_min_this_txg++; return; From cfd57573ffea911f464aeaf7ba415daf32fe0bd5 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Mon, 13 Feb 2023 19:40:13 -0500 Subject: [PATCH 24/28] quick fix for lingering snapdir unmount problems Unfortunately, even after e79b6807, I still, much more rarely, tripped asserts when playing with many ctldir mounts at once. Since this appears to happen if we dispatched twice too fast, just ignore it. We don't actually need to do anything if someone already started doing it for us. Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #14462 --- module/os/linux/zfs/zfs_ctldir.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index 519f13212fac..cc4bc9702f8e 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -392,7 +392,20 @@ zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay) zfsctl_snapshot_hold(se); rw_enter(&se->se_taskqid_lock, RW_WRITER); - ASSERT3S(se->se_taskqid, ==, TASKQID_INVALID); + /* + * If this condition happens, we managed to: + * - dispatch once + * - want to dispatch _again_ before it returned + * + * So let's just return - if that task fails at unmounting, + * we'll eventually dispatch again, and if it succeeds, + * no problem. + */ + if (se->se_taskqid != TASKQID_INVALID) { + rw_exit(&se->se_taskqid_lock); + zfsctl_snapshot_rele(se); + return; + } se->se_taskqid = taskq_dispatch_delay(system_delay_taskq, snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ); rw_exit(&se->se_taskqid_lock); From ab672133a9bde75d20afd59d8db1405c7300a557 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 14 Feb 2023 14:03:42 -0500 Subject: [PATCH 25/28] Give strlcat() full buffer lengths rather than smaller buffer lengths strlcat() is supposed to be given the length of the destination buffer, including the existing contents. Unfortunately, I had been overzealous when I wrote a51288aabbbc176a8a73a8b3cd56f79607db32cf, since I gave it the length of the destination buffer, minus the existing contents. This likely caused a regression on large strings. On the topic of being overzealous, the use of strlcat() in dmu_send_estimate_fast() was unnecessary because recv_clone_name is a fixed length string. We continue using strlcat() mostly as defensive programming, in case the string length is ever changed, even though it is unnecessary. 
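The difference between the two size arguments is easy to demonstrate
standalone (hypothetical dataset names; on glibc systems strlcat()
comes from libbsd, on FreeBSD it is in libc):

    #include <stdio.h>
    #include <string.h>
    #include <bsd/string.h>    /* strlcat(); link with -lbsd on glibc */

    /*
     * strlcat() wants the total size of the destination buffer and
     * finds the existing string's length itself. Passing sizeof (buf)
     * minus the current length double-subtracts that length, and once
     * the remaining "size" drops to the string length or below,
     * nothing is appended at all.
     */
    int
    main(void)
    {
        char a[24] = "/pool/dataset", b[24] = "/pool/dataset";

        (void) strlcat(a, "@snap", sizeof (a));              /* right */
        (void) strlcat(b, "@snap", sizeof (b) - strlen(b));  /* wrong */
        printf("full size:  '%s'\n", a);    /* "/pool/dataset@snap" */
        printf("shortened:  '%s'\n", b);    /* unchanged */
        return (0);
    }
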
Reviewed-by: Ryan Moeller Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #14476 --- lib/libshare/nfs.c | 2 +- lib/libzfs/libzfs_sendrecv.c | 4 ++-- module/zfs/dmu_send.c | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/libshare/nfs.c b/lib/libshare/nfs.c index 118ad7ef2209..3962c87453d4 100644 --- a/lib/libshare/nfs.c +++ b/lib/libshare/nfs.c @@ -97,7 +97,7 @@ nfs_init_tmpfile(const char *prefix, const char *mdir, struct tmpfile *tmpf) } strlcpy(tmpf->name, prefix, sizeof (tmpf->name)); - strlcat(tmpf->name, ".XXXXXXXX", sizeof (tmpf->name) - strlen(prefix)); + strlcat(tmpf->name, ".XXXXXXXX", sizeof (tmpf->name)); int fd = mkostemp(tmpf->name, O_CLOEXEC); if (fd == -1) { diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 1d2ad1944051..66a22e333663 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -4590,7 +4590,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, B_FALSE, destsnap) == 0) { *strchr(destsnap, '@') = '\0'; (void) strlcat(destsnap, suffix, - sizeof (destsnap) - strlen(destsnap)); + sizeof (destsnap)); } } } else { @@ -4626,7 +4626,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, B_FALSE, destsnap) == 0) { *strchr(destsnap, '@') = '\0'; (void) strlcat(destsnap, snap, - sizeof (destsnap) - strlen(destsnap)); + sizeof (destsnap)); } } } diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 33beb04b19b1..f86a0a5b1c57 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -3029,8 +3029,7 @@ dmu_send_estimate_fast(dsl_dataset_t *origds, dsl_dataset_t *fromds, dsl_dataset_name(origds, dsname); (void) strcat(dsname, "/"); - (void) strlcat(dsname, recv_clone_name, - sizeof (dsname) - strlen(dsname)); + (void) strlcat(dsname, recv_clone_name, sizeof (dsname)); err = dsl_dataset_hold(origds->ds_dir->dd_pool, dsname, FTAG, &ds); From 3fc92adc409a36de229c78c7ca3d4689e9386bd3 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 14 Feb 2023 11:04:34 -0800 Subject: [PATCH 26/28] Linux: use filemap_range_has_page() As of the 4.13 kernel filemap_range_has_page() can be used to check if there is a page mapped in a given file range. When available this interface should be used which eliminates the need for the zp->z_is_mapped boolean. 
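A rough userspace analogue of the new predicate (using mincore(), which
reports page residency for a mapped range; not a ZFS interface): the
old boolean answered "was this file ever mmap'ed?", while a range query
answers "are pages actually resident in this range?":

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static int
    range_has_page(void *addr, size_t len)
    {
        long pagesz = sysconf(_SC_PAGESIZE);
        size_t npages = (len + pagesz - 1) / pagesz;
        unsigned char *vec = malloc(npages);
        int found = 0;

        if (vec != NULL && mincore(addr, len, vec) == 0) {
            for (size_t i = 0; i < npages; i++)
                if (vec[i] & 1)    /* low bit: page is resident */
                    found = 1;
        }
        free(vec);
        return (found);
    }

    int
    main(void)
    {
        long pagesz = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, 4 * pagesz, PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        p[0] = 1;    /* fault in only the first page */
        printf("page 0 resident: %d\n", range_has_page(p, pagesz));
        printf("page 3 resident: %d\n",
            range_has_page(p + 3 * pagesz, pagesz));
        return (0);
    }
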
Reviewed-by: Brian Atkinson Signed-off-by: Brian Behlendorf Closes #14493 --- config/kernel-filemap.m4 | 26 +++++++++++++++++++++ config/kernel.m4 | 2 ++ include/os/freebsd/zfs/sys/zfs_znode_impl.h | 3 ++- include/os/linux/zfs/sys/trace_acl.h | 9 +++---- include/os/linux/zfs/sys/zfs_znode_impl.h | 16 ++++++++++++- include/sys/zfs_znode.h | 1 - module/os/linux/zfs/zfs_ctldir.c | 2 ++ module/os/linux/zfs/zfs_vnops_os.c | 5 ++-- module/os/linux/zfs/zfs_znode.c | 4 +++- module/os/linux/zfs/zpl_file.c | 6 +++-- module/zfs/zfs_vnops.c | 8 ++++--- 11 files changed, 65 insertions(+), 17 deletions(-) create mode 100644 config/kernel-filemap.m4 diff --git a/config/kernel-filemap.m4 b/config/kernel-filemap.m4 new file mode 100644 index 000000000000..745928168f92 --- /dev/null +++ b/config/kernel-filemap.m4 @@ -0,0 +1,26 @@ +dnl # +dnl # filemap_range_has_page was not available till 4.13 +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_FILEMAP], [ + ZFS_LINUX_TEST_SRC([filemap_range_has_page], [ + #include + ],[ + struct address_space *mapping = NULL; + loff_t lstart = 0; + loff_t lend = 0; + bool ret __attribute__ ((unused)); + + ret = filemap_range_has_page(mapping, lstart, lend); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FILEMAP], [ + AC_MSG_CHECKING([whether filemap_range_has_page() is available]) + ZFS_LINUX_TEST_RESULT([filemap_range_has_page], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FILEMAP_RANGE_HAS_PAGE, 1, + [filemap_range_has_page() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 353988e9c867..121d73ef641a 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -150,6 +150,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_USER_NS_COMMON_INUM ZFS_AC_KERNEL_SRC_IDMAP_MNT_API ZFS_AC_KERNEL_SRC_IATTR_VFSID + ZFS_AC_KERNEL_SRC_FILEMAP AC_MSG_CHECKING([for available kernel interfaces]) ZFS_LINUX_TEST_COMPILE_ALL([kabi]) @@ -273,6 +274,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_USER_NS_COMMON_INUM ZFS_AC_KERNEL_IDMAP_MNT_API ZFS_AC_KERNEL_IATTR_VFSID + ZFS_AC_KERNEL_FILEMAP ]) dnl # diff --git a/include/os/freebsd/zfs/sys/zfs_znode_impl.h b/include/os/freebsd/zfs/sys/zfs_znode_impl.h index 41a5bb218c12..8cde33dbcbbb 100644 --- a/include/os/freebsd/zfs/sys/zfs_znode_impl.h +++ b/include/os/freebsd/zfs/sys/zfs_znode_impl.h @@ -116,7 +116,8 @@ typedef struct zfs_soft_state { #define Z_ISLNK(type) ((type) == VLNK) #define Z_ISDIR(type) ((type) == VDIR) -#define zn_has_cached_data(zp) vn_has_cached_data(ZTOV(zp)) +#define zn_has_cached_data(zp, start, end) \ + vn_has_cached_data(ZTOV(zp)) #define zn_flush_cached_data(zp, sync) vn_flush_cached_data(ZTOV(zp), sync) #define zn_rlimit_fsize(zp, uio) \ vn_rlimit_fsize(ZTOV(zp), GET_UIO_STRUCT(uio), zfs_uio_td(uio)) diff --git a/include/os/linux/zfs/sys/trace_acl.h b/include/os/linux/zfs/sys/trace_acl.h index 2c734322267a..15dc77edafac 100644 --- a/include/os/linux/zfs/sys/trace_acl.h +++ b/include/os/linux/zfs/sys/trace_acl.h @@ -62,7 +62,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __field(uint32_t, z_async_writes_cnt) __field(mode_t, z_mode) __field(boolean_t, z_is_sa) - __field(boolean_t, z_is_mapped) __field(boolean_t, z_is_ctldir) __field(uint32_t, i_uid) @@ -96,7 +95,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __entry->z_async_writes_cnt = zn->z_async_writes_cnt; __entry->z_mode = zn->z_mode; __entry->z_is_sa = zn->z_is_sa; - __entry->z_is_mapped = zn->z_is_mapped; __entry->z_is_ctldir = zn->z_is_ctldir; __entry->i_uid = KUID_TO_SUID(ZTOI(zn)->i_uid); @@ -119,7 +117,7 @@ 
DECLARE_EVENT_CLASS(zfs_ace_class, "zn_prefetch %u blksz %u seq %u " "mapcnt %llu size %llu pflags %llu " "sync_cnt %u sync_writes_cnt %u async_writes_cnt %u " - "mode 0x%x is_sa %d is_mapped %d is_ctldir %d " + "mode 0x%x is_sa %d is_ctldir %d " "inode { uid %u gid %u ino %lu nlink %u size %lli " "blkbits %u bytes %u mode 0x%x generation %x } } " "ace { type %u flags %u access_mask %u } mask_matched %u", @@ -128,9 +126,8 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __entry->z_seq, __entry->z_mapcnt, __entry->z_size, __entry->z_pflags, __entry->z_sync_cnt, __entry->z_sync_writes_cnt, __entry->z_async_writes_cnt, - __entry->z_mode, __entry->z_is_sa, __entry->z_is_mapped, - __entry->z_is_ctldir, __entry->i_uid, - __entry->i_gid, __entry->i_ino, __entry->i_nlink, + __entry->z_mode, __entry->z_is_sa, __entry->z_is_ctldir, + __entry->i_uid, __entry->i_gid, __entry->i_ino, __entry->i_nlink, __entry->i_size, __entry->i_blkbits, __entry->i_bytes, __entry->i_mode, __entry->i_generation, __entry->z_type, __entry->z_flags, __entry->z_access_mask, diff --git a/include/os/linux/zfs/sys/zfs_znode_impl.h b/include/os/linux/zfs/sys/zfs_znode_impl.h index 52568781011f..81607ef2a25e 100644 --- a/include/os/linux/zfs/sys/zfs_znode_impl.h +++ b/include/os/linux/zfs/sys/zfs_znode_impl.h @@ -47,9 +47,16 @@ extern "C" { #endif +#if defined(HAVE_FILEMAP_RANGE_HAS_PAGE) #define ZNODE_OS_FIELDS \ inode_timespec_t z_btime; /* creation/birth time (cached) */ \ struct inode z_inode; +#else +#define ZNODE_OS_FIELDS \ + inode_timespec_t z_btime; /* creation/birth time (cached) */ \ + struct inode z_inode; \ + boolean_t z_is_mapped; /* we are mmap'ed */ +#endif /* * Convert between znode pointers and inode pointers @@ -70,7 +77,14 @@ extern "C" { #define Z_ISDEV(type) (S_ISCHR(type) || S_ISBLK(type) || S_ISFIFO(type)) #define Z_ISDIR(type) S_ISDIR(type) -#define zn_has_cached_data(zp) ((zp)->z_is_mapped) +#if defined(HAVE_FILEMAP_RANGE_HAS_PAGE) +#define zn_has_cached_data(zp, start, end) \ + filemap_range_has_page(ZTOI(zp)->i_mapping, start, end) +#else +#define zn_has_cached_data(zp, start, end) \ + ((zp)->z_is_mapped) +#endif + #define zn_flush_cached_data(zp, sync) write_inode_now(ZTOI(zp), sync) #define zn_rlimit_fsize(zp, uio) (0) diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index de38f56dc32d..fcee55b0199d 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -188,7 +188,6 @@ typedef struct znode { boolean_t z_atime_dirty; /* atime needs to be synced */ boolean_t z_zn_prefetch; /* Prefetch znodes? */ boolean_t z_is_sa; /* are we native sa? */ - boolean_t z_is_mapped; /* are we mmap'ed */ boolean_t z_is_ctldir; /* are we .zfs entry */ boolean_t z_suspended; /* extra ref from a suspend? 
*/ uint_t z_blksz; /* block size in bytes */ diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index cc4bc9702f8e..dca48e1e4010 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -498,7 +498,9 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, zp->z_atime_dirty = B_FALSE; zp->z_zn_prefetch = B_FALSE; zp->z_is_sa = B_FALSE; +#if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE) zp->z_is_mapped = B_FALSE; +#endif zp->z_is_ctldir = B_TRUE; zp->z_sa_hdl = NULL; zp->z_blksz = 0; diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 47f132a38abe..302a88c2d353 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -987,7 +987,7 @@ zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) mutex_enter(&zp->z_lock); may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && - !(zp->z_is_mapped); + !zn_has_cached_data(zp, 0, LLONG_MAX); mutex_exit(&zp->z_lock); /* @@ -1075,7 +1075,8 @@ zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); delete_now = may_delete_now && !toobig && atomic_read(&ZTOI(zp)->i_count) == 1 && - !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked && + !zn_has_cached_data(zp, 0, LLONG_MAX) && + xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == acl_obj; } diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index 1faf25d93cc7..7b802a9bace0 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -551,7 +551,9 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, ASSERT3P(zp->z_xattr_cached, ==, NULL); zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; +#if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE) zp->z_is_mapped = B_FALSE; +#endif zp->z_is_ctldir = B_FALSE; zp->z_suspended = B_FALSE; zp->z_sa_hdl = NULL; @@ -1641,7 +1643,7 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) * Zero partial page cache entries. This must be done under a * range lock in order to keep the ARC and page cache in sync. 
*/ - if (zp->z_is_mapped) { + if (zn_has_cached_data(zp, off, off + len - 1)) { loff_t first_page, last_page, page_len; loff_t first_page_offset, last_page_offset; diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index c56e3691e70a..e42b15042a3c 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -625,7 +625,6 @@ static int zpl_mmap(struct file *filp, struct vm_area_struct *vma) { struct inode *ip = filp->f_mapping->host; - znode_t *zp = ITOZ(ip); int error; fstrans_cookie_t cookie; @@ -640,9 +639,12 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma) if (error) return (error); +#if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE) + znode_t *zp = ITOZ(ip); mutex_enter(&zp->z_lock); zp->z_is_mapped = B_TRUE; mutex_exit(&zp->z_lock); +#endif return (error); } @@ -937,7 +939,7 @@ zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice) case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_WILLNEED: #ifdef HAVE_GENERIC_FADVISE - if (zn_has_cached_data(zp)) + if (zn_has_cached_data(zp, offset, offset + len - 1)) error = generic_fadvise(filp, offset, len, advice); #endif /* diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 0c392b9da0fb..10677d8d9947 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -106,7 +106,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) hole = B_FALSE; /* Flush any mmap()'d data to disk */ - if (zn_has_cached_data(zp)) + if (zn_has_cached_data(zp, 0, file_sz - 1)) zn_flush_cached_data(zp, B_FALSE); lr = zfs_rangelock_enter(&zp->z_rangelock, 0, file_sz, RL_READER); @@ -288,7 +288,8 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) error = mappedread_sf(zp, nbytes, uio); else #endif - if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) { + if (zn_has_cached_data(zp, zfs_uio_offset(uio), + zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) { error = mappedread(zp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), @@ -696,7 +697,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) zfs_uioskip(uio, nbytes); tx_bytes = nbytes; } - if (tx_bytes && zn_has_cached_data(zp) && + if (tx_bytes && + zn_has_cached_data(zp, woff, woff + tx_bytes - 1) && !(ioflag & O_DIRECT)) { update_pages(zp, woff, tx_bytes, zfsvfs->z_os); } From f04cb31e7c1775cb6fb1fac716525c7926a14484 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 14 Feb 2023 14:05:41 -0500 Subject: [PATCH 27/28] Suppress Clang static analyzer complaint in zfs_replay_create() Clang's static analyzer incorrectly complains about an undefined value here when lr->lr_common.lrc_txtype == TX_SYMLINK and txtype == TX_CREATE. This is impossible, because of this line: txtype = (lr->lr_common.lrc_txtype & ~TX_CI((uint64_t)0x1 << 63)); Changing the code to compare against txtype suppresses the report. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Richard Yao Closes #14472 --- module/zfs/zfs_replay.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index 0293e46d5858..32be27a8ba6e 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -512,9 +512,9 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) * * The _ATTR versions will grab the fuid info in their subcases. 
*/ - if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK && - (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR && - (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) { + if (txtype != TX_SYMLINK && + txtype != TX_MKDIR_ATTR && + txtype != TX_CREATE_ATTR) { start = (lr + 1); zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, From 57cfae4a2f04aaff10c45b3f7975e0fe3ef3e8b8 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 15 Feb 2023 09:06:29 -0800 Subject: [PATCH 28/28] zdb: zero-pad checksum output follow up Apply zero padding for checksums consistently. The SNPRINTF_BLKPTR macro was not updated in commit ac7648179c8 which results in the `cli_root/zdb/zdb_checksum.ksh` test case reliably failing. Reviewed-by: Igor Kozhukhov Reviewed-by: Akash B Reviewed-by: Brian Atkinson Signed-off-by: Brian Behlendorf Closes #14497 --- include/sys/spa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sys/spa.h b/include/sys/spa.h index 500eb3491a99..c9d03bf645a9 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -678,7 +678,7 @@ typedef struct blkptr { len += func(buf + len, size - len, \ "[L%llu %s] %s %s %s %s %s %s %s%c" \ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ - "cksum=%llx:%llx:%llx:%llx", \ + "cksum=%016llx:%016llx:%016llx:%016llx", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ checksum, \