From c85ac731a0ec16e4277857b55ebe123c552365b6 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Wed, 25 Jan 2023 11:28:54 -0800
Subject: [PATCH 01/28] Improve resilver ETAs

When resilvering, the estimated time remaining is calculated using
the average issue rate over the current pass, where a pass starts
when a scan is started and restarts if the pool is exported and
imported.

For dRAID pools in particular this can result in wildly optimistic
estimates, since the issue rate will be very high while non-degraded
regions of the pool are scanned. Once repair I/O starts being issued,
performance drops to a realistic number, but the estimate remains
significantly skewed.

To address this, we redefine a pass such that it starts after the
scanning phase completes, so the issue rate is more reflective of
recent performance. Additionally, the zfs_scan_report_txgs module
option can be set to reset the pass statistics more often.

Reviewed-by: Akash B
Reviewed-by: Tony Hutter
Signed-off-by: Brian Behlendorf
Closes #14410
---
 cmd/zpool/zpool_main.c | 33 ++++++++++++++++++++++-----------
 man/man4/zfs.4 | 7 +++++++
 module/zfs/dsl_scan.c | 28 ++++++++++++++++++++++++++++
 module/zfs/spa_misc.c | 1 -
 4 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 93d6a18981cb..efb2d10e591b 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -7524,19 +7524,20 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
 	zfs_nicebytes(ps->pss_processed, processed_buf,
 	    sizeof (processed_buf));
 
-	assert(ps->pss_func == POOL_SCAN_SCRUB ||
-	    ps->pss_func == POOL_SCAN_RESILVER);
+	int is_resilver = ps->pss_func == POOL_SCAN_RESILVER;
+	int is_scrub = ps->pss_func == POOL_SCAN_SCRUB;
+	assert(is_resilver || is_scrub);
 
 	/* Scan is finished or canceled. */
 	if (ps->pss_state == DSS_FINISHED) {
 		secs_to_dhms(end - start, time_buf);
 
-		if (ps->pss_func == POOL_SCAN_SCRUB) {
+		if (is_scrub) {
 			(void) printf(gettext("scrub repaired %s "
 			    "in %s with %llu errors on %s"), processed_buf,
 			    time_buf, (u_longlong_t)ps->pss_errors,
 			    ctime(&end));
-		} else if (ps->pss_func == POOL_SCAN_RESILVER) {
+		} else if (is_resilver) {
 			(void) printf(gettext("resilvered %s "
 			    "in %s with %llu errors on %s"), processed_buf,
 			    time_buf, (u_longlong_t)ps->pss_errors,
@@ -7544,10 +7545,10 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
 		}
 		return;
 	} else if (ps->pss_state == DSS_CANCELED) {
-		if (ps->pss_func == POOL_SCAN_SCRUB) {
+		if (is_scrub) {
 			(void) printf(gettext("scrub canceled on %s"),
 			    ctime(&end));
-		} else if (ps->pss_func == POOL_SCAN_RESILVER) {
+		} else if (is_resilver) {
 			(void) printf(gettext("resilver canceled on %s"),
 			    ctime(&end));
 		}
@@ -7557,7 +7558,7 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
 	assert(ps->pss_state == DSS_SCANNING);
 
 	/* Scan is in progress. Resilvers can't be paused. 
*/ - if (ps->pss_func == POOL_SCAN_SCRUB) { + if (is_scrub) { if (pause == 0) { (void) printf(gettext("scrub in progress since %s"), ctime(&start)); @@ -7567,7 +7568,7 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) (void) printf(gettext("\tscrub started on %s"), ctime(&start)); } - } else if (ps->pss_func == POOL_SCAN_RESILVER) { + } else if (is_resilver) { (void) printf(gettext("resilver in progress since %s"), ctime(&start)); } @@ -7609,17 +7610,27 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) scanned_buf, issued_buf, total_buf); } - if (ps->pss_func == POOL_SCAN_RESILVER) { + if (is_resilver) { (void) printf(gettext("\t%s resilvered, %.2f%% done"), processed_buf, 100 * fraction_done); - } else if (ps->pss_func == POOL_SCAN_SCRUB) { + } else if (is_scrub) { (void) printf(gettext("\t%s repaired, %.2f%% done"), processed_buf, 100 * fraction_done); } if (pause == 0) { + /* + * Only provide an estimate iff: + * 1) the time remaining is valid, and + * 2) the issue rate exceeds 10 MB/s, and + * 3) it's either: + * a) a resilver which has started repairs, or + * b) a scrub which has entered the issue phase. + */ if (total_secs_left != UINT64_MAX && - issue_rate >= 10 * 1024 * 1024) { + issue_rate >= 10 * 1024 * 1024 && + ((is_resilver && ps->pss_processed > 0) || + (is_scrub && issued > 0))) { (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index e20d601340c6..6f260660e6d9 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1890,6 +1890,13 @@ I/O. In this case (unless the metadata scan is done) we stop issuing verification I/O and start scanning metadata again until we get to the hard limit. . +.It Sy zfs_scan_report_txgs Ns = Ns Sy 0 Ns | Ns 1 Pq uint +When reporting resilver throughput and estimated completion time use the +performance observed over roughly the last +.Sy zfs_scan_report_txgs +TXGs. +When set to zero performance is calculated over the time between checkpoints. +. .It Sy zfs_scan_strict_mem_lim Ns = Ns Sy 0 Ns | Ns 1 Pq int Enforce tight memory limits on pool scans when a sequential scan is in progress. When disabled, the memory limit may be exceeded by fast disks. diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index f9e437f0c947..8a5a02cea272 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -131,6 +131,15 @@ static uint64_t dsl_scan_count_data_disks(vdev_t *vd); extern uint_t zfs_vdev_async_write_active_min_dirty_percent; static int zfs_scan_blkstats = 0; +/* + * 'zpool status' uses bytes processed per pass to report throughput and + * estimate time remaining. We define a pass to start when the scanning + * phase completes for a sequential resilver. Optionally, this value + * may be used to reset the pass statistics every N txgs to provide an + * estimated completion time based on currently observed performance. + */ +static uint_t zfs_scan_report_txgs = 0; + /* * By default zfs will check to ensure it is not over the hard memory * limit before each txg. 
If finer-grained control of this is needed
@@ -604,6 +613,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 	}
 
 	spa_scan_stat_init(spa);
+	vdev_scan_stat_init(spa->spa_root_vdev);
+
 	return (0);
 }
 
@@ -763,6 +774,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 	scn->scn_last_checkpoint = 0;
 	scn->scn_checkpointing = B_FALSE;
 	spa_scan_stat_init(spa);
+	vdev_scan_stat_init(spa->spa_root_vdev);
 
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
 		scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
@@ -3652,6 +3664,16 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 		return;
 	}
 
+	/*
+	 * Disabled by default, set zfs_scan_report_txgs to report
+	 * average performance over the last zfs_scan_report_txgs TXGs.
+	 */
+	if (!dsl_scan_is_paused_scrub(scn) && zfs_scan_report_txgs != 0 &&
+	    tx->tx_txg % zfs_scan_report_txgs == 0) {
+		scn->scn_issued_before_pass += spa->spa_scan_pass_issued;
+		spa_scan_stat_init(spa);
+	}
+
 	/*
 	 * It is possible to switch from unsorted to sorted at any time,
 	 * but afterwards the scan will remain sorted unless reloaded from
@@ -3780,6 +3802,9 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 		if (scn->scn_is_sorted) {
 			scn->scn_checkpointing = B_TRUE;
 			scn->scn_clearing = B_TRUE;
+			scn->scn_issued_before_pass +=
+			    spa->spa_scan_pass_issued;
+			spa_scan_stat_init(spa);
 		}
 		zfs_dbgmsg("scan complete for %s txg %llu", spa->spa_name,
@@ -4507,5 +4532,8 @@ ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, UINT, ZMOD_RW,
 	"Tunable to adjust bias towards more filled segments during scans");
 
+ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW,
+	"Tunable to report resilver performance over the last N txgs");
+
 ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW,
 	"Process all resilvers immediately");
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 822fd0ee89b6..53763e915ca8 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -2556,7 +2556,6 @@ spa_scan_stat_init(spa_t *spa)
 	spa->spa_scan_pass_scrub_spent_paused = 0;
 	spa->spa_scan_pass_exam = 0;
 	spa->spa_scan_pass_issued = 0;
-	vdev_scan_stat_init(spa->spa_root_vdev);
 }
 
 /*

From dc5c8006f684b1df3f2d4b6b8c121447d2db0017 Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Wed, 25 Jan 2023 14:30:24 -0500
Subject: [PATCH 02/28] Prefetch on deadlists merge

During snapshot deletion, ZFS may issue several reads for each deadlist
to merge them into the next snapshot's or the pool's bpobj. The number
of deadlists increases with the number of snapshots. On HDD pools the
merge may take significant time, during which the sync thread is
blocked.

This patch introduces prescient prefetching of the required blocks for
up to 128 deadlists ahead. Tests show that the time required to delete
a dataset with 720 snapshots of a randomly overwritten file on a wide
HDD pool drops from 75-85 seconds to 22-28 seconds.

Reviewed-by: Allan Jude
Reviewed-by: Brian Behlendorf
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc. 
Issue #14276 Closes #14402 --- include/sys/bpobj.h | 1 + module/zfs/bpobj.c | 65 +++++++++++++++++++++++++++++++++-- module/zfs/dsl_deadlist.c | 71 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 131 insertions(+), 6 deletions(-) diff --git a/include/sys/bpobj.h b/include/sys/bpobj.h index 84f0ee76c44e..f3384f526454 100644 --- a/include/sys/bpobj.h +++ b/include/sys/bpobj.h @@ -87,6 +87,7 @@ int livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, int64_t start); void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx); +void bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj); void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx); diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index f7fded56518b..fa99f5141d4e 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -663,14 +663,13 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) } VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); - VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); - if (bpobj_is_empty(&subbpo)) { /* No point in having an empty subobj. */ bpobj_close(&subbpo); bpobj_free(bpo->bpo_os, subobj, tx); return; } + VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); mutex_enter(&bpo->bpo_lock); dmu_buf_will_dirty(bpo->bpo_dbuf, tx); @@ -780,6 +779,68 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) } +/* + * Prefetch metadata required for bpobj_enqueue_subobj(). + */ +void +bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj) +{ + dmu_object_info_t doi; + bpobj_t subbpo; + uint64_t subsubobjs; + boolean_t copy_subsub = B_TRUE; + boolean_t copy_bps = B_TRUE; + + ASSERT(bpobj_is_open(bpo)); + ASSERT(subobj != 0); + + if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) + return; + + if (bpobj_open(&subbpo, bpo->bpo_os, subobj) != 0) + return; + if (bpobj_is_empty(&subbpo)) { + bpobj_close(&subbpo); + return; + } + subsubobjs = subbpo.bpo_phys->bpo_subobjs; + bpobj_close(&subbpo); + + if (subsubobjs != 0) { + if (dmu_object_info(bpo->bpo_os, subsubobjs, &doi) != 0) + return; + if (doi.doi_max_offset > doi.doi_data_block_size) + copy_subsub = B_FALSE; + } + + if (dmu_object_info(bpo->bpo_os, subobj, &doi) != 0) + return; + if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) + copy_bps = B_FALSE; + + if (copy_subsub && subsubobjs != 0) { + if (bpo->bpo_phys->bpo_subobjs) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1, + ZIO_PRIORITY_ASYNC_READ); + } + dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1, + ZIO_PRIORITY_ASYNC_READ); + } + + if (copy_bps) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, + bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1, + ZIO_PRIORITY_ASYNC_READ); + dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1, + ZIO_PRIORITY_ASYNC_READ); + } else if (bpo->bpo_phys->bpo_subobjs) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1, + ZIO_PRIORITY_ASYNC_READ); + } +} + void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 2b33446e66af..d58820701f60 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -438,6 +438,18 @@ dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, } } +/* + * Prefetch metadata required for dle_enqueue_subobj(). 
+ */ +static void +dle_prefetch_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, + uint64_t obj) +{ + if (dle->dle_bpobj.bpo_object != + dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) + bpobj_prefetch_subobj(&dle->dle_bpobj, obj); +} + void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) @@ -810,6 +822,27 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, dle_enqueue_subobj(dl, dle, obj, tx); } +/* + * Prefetch metadata required for dsl_deadlist_insert_bpobj(). + */ +static void +dsl_deadlist_prefetch_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + + ASSERT(MUTEX_HELD(&dl->dl_lock)); + + dsl_deadlist_load_tree(dl); + + dle_tofind.dle_mintxg = birth; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + if (dle == NULL) + dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); + dle_prefetch_subobj(dl, dle, obj); +} + static int dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) @@ -826,12 +859,12 @@ dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) { - zap_cursor_t zc; - zap_attribute_t za; + zap_cursor_t zc, pzc; + zap_attribute_t za, pza; dmu_buf_t *bonus; dsl_deadlist_phys_t *dlp; dmu_object_info_t doi; - int error; + int error, perror, i; VERIFY0(dmu_object_info(dl->dl_os, obj, &doi)); if (doi.doi_type == DMU_OT_BPOBJ) { @@ -843,15 +876,32 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) } mutex_enter(&dl->dl_lock); + /* + * Prefetch up to 128 deadlists first and then more as we progress. + * The limit is a balance between ARC use and diminishing returns. + */ + for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0; + (perror = zap_cursor_retrieve(&pzc, &pza)) == 0 && i < 128; + zap_cursor_advance(&pzc), i++) { + dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer, + zfs_strtonum(pza.za_name, NULL)); + } for (zap_cursor_init(&zc, dl->dl_os, obj); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t mintxg = zfs_strtonum(za.za_name, NULL); dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx)); + if (perror == 0) { + dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer, + zfs_strtonum(pza.za_name, NULL)); + zap_cursor_advance(&pzc); + perror = zap_cursor_retrieve(&pzc, &pza); + } } VERIFY3U(error, ==, ENOENT); zap_cursor_fini(&zc); + zap_cursor_fini(&pzc); VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); dlp = bonus->db_data; @@ -869,8 +919,9 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; - dsl_deadlist_entry_t *dle; + dsl_deadlist_entry_t *dle, *pdle; avl_index_t where; + int i; ASSERT(!dl->dl_oldfmt); @@ -882,11 +933,23 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); + /* + * Prefetch up to 128 deadlists first and then more as we progress. + * The limit is a balance between ARC use and diminishing returns. 
+	 */
+	for (pdle = dle, i = 0; pdle && i < 128; i++) {
+		bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
+		pdle = AVL_NEXT(&dl->dl_tree, pdle);
+	}
 	while (dle) {
 		uint64_t used, comp, uncomp;
 		dsl_deadlist_entry_t *dle_next;
 
 		bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
+		if (pdle) {
+			bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
+			pdle = AVL_NEXT(&dl->dl_tree, pdle);
+		}
 
 		VERIFY0(bpobj_space(&dle->dle_bpobj,
 		    &used, &comp, &uncomp));
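
Both loops above follow the same sliding-window shape: prime up to 128
asynchronous read-aheads, then issue one more prefetch for each deadlist
consumed, so the reads stay well ahead of the synchronous merge work. A
minimal sketch of that pattern, where prefetch() and consume() are
hypothetical stand-ins for bpobj_prefetch_subobj() and
bpobj_enqueue_subobj():

```
#include <stdio.h>
#include <stddef.h>

/* Hypothetical stand-ins: prefetch() models bpobj_prefetch_subobj()
 * (asynchronous read-ahead) and consume() models bpobj_enqueue_subobj()
 * (the synchronous merge work). */
static void prefetch(int item) { printf("prefetch %d\n", item); }
static void consume(int item)  { printf("consume  %d\n", item); }

static void
merge_with_prefetch(const int *items, size_t n, size_t window)
{
	size_t ahead = 0;

	/* Prime the window: up to "window" reads in flight before any work. */
	while (ahead < n && ahead < window)
		prefetch(items[ahead++]);

	for (size_t i = 0; i < n; i++) {
		consume(items[i]);
		/* Refill, so reads stay "window" items ahead of the work. */
		if (ahead < n)
			prefetch(items[ahead++]);
	}
}

int main(void)
{
	int deadlists[] = { 0, 1, 2, 3, 4, 5, 6, 7 };

	merge_with_prefetch(deadlists, 8, 3);
	return (0);
}
```

The 128-entry window in the patch trades a bounded amount of ARC for
keeping the disks busy; per the comments above, larger windows showed
diminishing returns.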
From c0aea7cf4e86fc02db8046fbb3bca21a918053a2 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Tue, 24 Jan 2023 14:05:45 -0800
Subject: [PATCH 03/28] Increase default zfs_scan_vdev_limit to 16MB

For HDD-based pools the default zfs_scan_vdev_limit of 4M per vdev
can significantly limit the maximum scrub performance. Increasing
the default to 16M can double the scrub speed, from 80 MB/s per disk
to 160 MB/s per disk.

This does increase the memory footprint during scrub/resilver, but
given the performance win this is a reasonable trade-off. Memory
usage is capped at 1/4 of arc_c_max.

Note that the number of outstanding I/Os has not changed and is still
limited by zfs_vdev_scrub_max_active.

Reviewed-by: Akash B
Reviewed-by: Tony Nguyen
Reviewed-by: Alexander Motin
Signed-off-by: Brian Behlendorf
Closes #14428
---
 man/man4/zfs.4 | 2 +-
 module/zfs/dsl_scan.c | 28 ++++++++++++++++------------
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 6f260660e6d9..031981f9b143 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -1905,7 +1905,7 @@ When disabled, the memory limit may be exceeded by fast disks.
 Freezes a scrub/resilver in progress without actually pausing it.
 Intended for testing/debugging.
 .
-.It Sy zfs_scan_vdev_limit Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq int
+.It Sy zfs_scan_vdev_limit Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int
 Maximum amount of data that can be concurrently issued at once for scrubs and
 resilvers per leaf device, given in bytes.
 .
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 8a5a02cea272..2cc77eab6275 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -126,7 +127,7 @@ static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
 static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
 static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
 static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
-static uint64_t dsl_scan_count_data_disks(vdev_t *vd);
+static uint64_t dsl_scan_count_data_disks(spa_t *spa);
 
 extern uint_t zfs_vdev_async_write_active_min_dirty_percent;
 static int zfs_scan_blkstats = 0;
@@ -156,7 +157,7 @@
 * overload the drives with I/O, since that is protected by
 * zfs_vdev_scrub_max_active.
 */
-static uint64_t zfs_scan_vdev_limit = 4 << 20;
+static uint64_t zfs_scan_vdev_limit = 16 << 20;
 
 static uint_t zfs_scan_issue_strategy = 0;
@@ -475,11 +476,12 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 
 	/*
 	 * Calculate the max number of in-flight bytes for pool-wide
-	 * scanning operations (minimum 1MB). Limits for the issuing
-	 * phase are done per top-level vdev and are handled separately.
+	 * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
+	 * Limits for the issuing phase are done per top-level vdev and
+	 * are handled separately.
 	 */
-	scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
-	    dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20);
+	scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
+	    zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));
 
 	avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
 	    offsetof(scan_ds_t, sds_node));
@@ -2823,8 +2825,9 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
 }
 
 static uint64_t
-dsl_scan_count_data_disks(vdev_t *rvd)
+dsl_scan_count_data_disks(spa_t *spa)
 {
+	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t i, leaves = 0;
 
 	for (i = 0; i < rvd->vdev_children; i++) {
@@ -3733,12 +3736,13 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 		taskqid_t prefetch_tqid;
 
 		/*
-		 * Recalculate the max number of in-flight bytes for pool-wide
-		 * scanning operations (minimum 1MB). Limits for the issuing
-		 * phase are done per top-level vdev and are handled separately.
+		 * Calculate the max number of in-flight bytes for pool-wide
+		 * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
+		 * Limits for the issuing phase are done per top-level vdev and
+		 * are handled separately.
 		 */
-		scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
-		    dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20);
+		scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
+		    zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));
 
 		if (scnp->scn_ddt_bookmark.ddb_class <=
 		    scnp->scn_ddt_class_max) {
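
The new cap is a simple clamp: the per-disk limit times the number of
data disks, floored at 1 MiB and ceilinged at a quarter of arc_c_max. A
worked example (the 4 GiB arc_c_max here is an assumed value, chosen
only for illustration):

```
#include <stdio.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
	uint64_t arc_c_max = 4ULL << 30;	/* assumed 4 GiB ARC limit */
	uint64_t vdev_limit = 16ULL << 20;	/* new 16 MiB default */

	for (uint64_t disks = 2; disks <= 128; disks *= 4) {
		uint64_t inflight = MIN(arc_c_max / 4,
		    MAX(1ULL << 20, vdev_limit * disks));
		printf("%3llu disks -> %4llu MiB in flight\n",
		    (unsigned long long)disks,
		    (unsigned long long)(inflight >> 20));
	}
	return (0);
}
```

With these assumed numbers the ceiling engages at 64 data disks
(64 x 16 MiB = 1 GiB); below that the per-disk limit governs.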
A sequential - * rebuild was unable to saturate all of the drives using smaller values. - * With a value of 32MB the sequential resilver write rate was measured at - * 800MB/s sustained while rebuilding to a distributed spare. + * 64MB was observed to deliver the best performance and set as the default. + * Testing was performed with a 106-drive dRAID HDD pool (draid2:11d:106c) + * and a rebuild rate of 1.2GB/s was measured to the distribute spare. + * Smaller values were unable to fully saturate the available pool I/O. */ -static uint64_t zfs_rebuild_vdev_limit = 32 << 20; +static uint64_t zfs_rebuild_vdev_limit = 64 << 20; /* * Automatically start a pool scrub when the last active sequential resilver @@ -754,6 +754,7 @@ vdev_rebuild_thread(void *arg) { vdev_t *vd = arg; spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; int error = 0; /* @@ -786,9 +787,6 @@ vdev_rebuild_thread(void *arg) vr->vr_pass_bytes_scanned = 0; vr->vr_pass_bytes_issued = 0; - vr->vr_bytes_inflight_max = MAX(1ULL << 20, - zfs_rebuild_vdev_limit * vd->vdev_children); - uint64_t update_est_time = gethrtime(); vdev_rebuild_update_bytes_est(vd, 0); @@ -804,6 +802,17 @@ vdev_rebuild_thread(void *arg) metaslab_t *msp = vd->vdev_ms[i]; vr->vr_scan_msp = msp; + /* + * Calculate the max number of in-flight bytes for top-level + * vdev scanning operations (minimum 1MB, maximum 1/4 of + * arc_c_max shared by all top-level vdevs). Limits for the + * issuing phase are done per top-level vdev and are handled + * separately. + */ + uint64_t limit = (arc_c_max / 4) / MAX(rvd->vdev_children, 1); + vr->vr_bytes_inflight_max = MIN(limit, MAX(1ULL << 20, + zfs_rebuild_vdev_limit * vd->vdev_children)); + /* * Removal of vdevs from the vdev tree may eliminate the need * for the rebuild, in which case it should be canceled. The From 05b72415d16e3cfbdeb0f59d5f24640f10ee34d6 Mon Sep 17 00:00:00 2001 From: Ameer Hamza <106930537+ixhamza@users.noreply.github.com> Date: Fri, 3 Feb 2023 04:09:57 +0500 Subject: [PATCH 05/28] Fix console progress reporting for recursive send After commit 19d3961, progress reporting (-v) with replication flag enabled does not report the progress on the console. This commit fixes the issue by updating the logic to check for pa->progress instead of pa_verbosity in send_progress_thread(). Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: Ameer Hamza Closes #14448 --- cmd/zfs/zfs_main.c | 2 +- lib/libzfs/libzfs_sendrecv.c | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 9d48b2b32e83..cec025ce6de8 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -4532,7 +4532,7 @@ zfs_do_send(int argc, char **argv) } } - if (flags.parsable && flags.verbosity == 0) + if ((flags.parsable || flags.progressastitle) && flags.verbosity == 0) flags.verbosity = 1; if (excludes.count > 0 && !flags.replicate) { diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 038613a1fcfa..1d2ad1944051 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -84,6 +84,7 @@ typedef struct progress_arg { boolean_t pa_estimate; int pa_verbosity; boolean_t pa_astitle; + boolean_t pa_progress; uint64_t pa_size; } progress_arg_t; @@ -940,7 +941,7 @@ send_progress_thread(void *arg) struct tm tm; int err; - if (!pa->pa_parsable && pa->pa_verbosity != 0) { + if (!pa->pa_parsable && pa->pa_progress) { (void) fprintf(stderr, "TIME %s %sSNAPSHOT %s\n", pa->pa_estimate ? 
"BYTES" : " SENT", @@ -990,7 +991,7 @@ send_progress_thread(void *arg) (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, (u_longlong_t)bytes, zhp->zfs_name); - } else if (pa->pa_verbosity != 0) { + } else if (pa->pa_progress) { zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, @@ -1206,6 +1207,7 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) pa.pa_verbosity = sdd->verbosity; pa.pa_size = sdd->size; pa.pa_astitle = sdd->progressastitle; + pa.pa_progress = sdd->progress; if ((err = pthread_create(&tid, NULL, send_progress_thread, &pa)) != 0) { @@ -1886,6 +1888,7 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, pa.pa_verbosity = flags->verbosity; pa.pa_size = size; pa.pa_astitle = flags->progressastitle; + pa.pa_progress = flags->progress; error = pthread_create(&tid, NULL, send_progress_thread, &pa); @@ -2696,6 +2699,7 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd, pa.pa_verbosity = flags->verbosity; pa.pa_size = size; pa.pa_astitle = flags->progressastitle; + pa.pa_progress = flags->progress; err = pthread_create(&ptid, NULL, send_progress_thread, &pa); From f18e083bf8ce0c0d1997002f9986122be6d4ebe8 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Thu, 2 Feb 2023 18:11:35 -0500 Subject: [PATCH 06/28] rootdelay on zfs should be adaptive The 'rootdelay' boot option currently pauses the boot for a specified amount of time. The original intent was to ensure that slower configurations would have ample time to enumerate the devices to make importing the root pool successful. This, however, causes unnecessary boot delay for environments like Azure which set this parameter by default. This commit changes the initramfs logic to pause until it can successfully load the 'zfs' module. The timeout specified by 'rootdelay' now becomes the maximum amount of time that initramfs will wait before failing the boot. Reviewed-by: Brian Behlendorf Reviewed-by: Prakash Surya Signed-off-by: George Wilson Closes #14430 --- contrib/initramfs/scripts/zfs | 54 +++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/contrib/initramfs/scripts/zfs b/contrib/initramfs/scripts/zfs index c724f0c2cf57..41a4d96f8732 100644 --- a/contrib/initramfs/scripts/zfs +++ b/contrib/initramfs/scripts/zfs @@ -272,30 +272,46 @@ import_pool() # with more logging etc. load_module_initrd() { - [ -n "$ROOTDELAY" ] && ZFS_INITRD_PRE_MOUNTROOT_SLEEP="$ROOTDELAY" + ZFS_INITRD_PRE_MOUNTROOT_SLEEP=${ROOTDELAY:-0} - if [ "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP" -gt 0 ] 2>/dev/null - then - if [ "$quiet" != "y" ]; then - zfs_log_begin_msg "Sleeping for" \ - "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP seconds..." + if [ "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP" -gt 0 ]; then + [ "$quiet" != "y" ] && zfs_log_begin_msg "Delaying for up to '${ZFS_INITRD_PRE_MOUNTROOT_SLEEP}' seconds." + fi + + START=$(/bin/date -u +%s) + END=$((START+ZFS_INITRD_PRE_MOUNTROOT_SLEEP)) + while true; do + + # Wait for all of the /dev/{hd,sd}[a-z] device nodes to appear. + if command -v wait_for_udev > /dev/null 2>&1 ; then + wait_for_udev 10 + elif command -v wait_for_dev > /dev/null 2>&1 ; then + wait_for_dev fi - sleep "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP" + + # + # zpool import refuse to import without a valid + # /proc/self/mounts + # + [ ! 
-f /proc/self/mounts ] && mount proc /proc + + # Load the module + if load_module "zfs"; then + ret=0 + break + else + ret=1 + fi + + [ "$(/bin/date -u +%s)" -gt "$END" ] && break + sleep 1 + + done + if [ "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP" -gt 0 ]; then [ "$quiet" != "y" ] && zfs_log_end_msg fi - # Wait for all of the /dev/{hd,sd}[a-z] device nodes to appear. - if command -v wait_for_udev > /dev/null 2>&1 ; then - wait_for_udev 10 - elif command -v wait_for_dev > /dev/null 2>&1 ; then - wait_for_dev - fi - - # zpool import refuse to import without a valid /proc/self/mounts - [ ! -f /proc/self/mounts ] && mount proc /proc - - # Load the module - load_module "zfs" || return 1 + [ "$ret" -ne 0 ] && return 1 if [ "$ZFS_INITRD_POST_MODPROBE_SLEEP" -gt 0 ] 2>/dev/null then From c799866b970bee4fc11a7e8c9a1d70e02481fe66 Mon Sep 17 00:00:00 2001 From: Allan Jude Date: Thu, 2 Feb 2023 18:12:51 -0500 Subject: [PATCH 07/28] Resolve WS-2021-0184 vulnerability in zstd Pull in d40f55cd950919d7eac951b122668e55e33e5202 from upstream Reviewed-by: Brian Behlendorf Reviewed-by: Richard Yao Signed-off-by: Allan Jude Closes #14439 --- module/zstd/lib/compress/zstd_double_fast.c | 4 ++-- module/zstd/lib/compress/zstd_fast.c | 6 +++--- module/zstd/lib/compress/zstd_lazy.c | 12 ++++++++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/module/zstd/lib/compress/zstd_double_fast.c b/module/zstd/lib/compress/zstd_double_fast.c index 27eed66cfedd..4a95c01a090d 100644 --- a/module/zstd/lib/compress/zstd_double_fast.c +++ b/module/zstd/lib/compress/zstd_double_fast.c @@ -409,7 +409,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( hashSmall[hSmall] = hashLong[hLong] = current; /* update hash table */ if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */ - & (repIndex > dictStartIndex)) + & (offset_1 < current+1 - dictStartIndex)) /* note: we are searching at current+1 */ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; @@ -477,7 +477,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( U32 const repIndex2 = current2 - offset_2; const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */ - & (repIndex2 > dictStartIndex)) + & (offset_2 < current2 - dictStartIndex)) && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; diff --git a/module/zstd/lib/compress/zstd_fast.c b/module/zstd/lib/compress/zstd_fast.c index 85a3a7a91e49..17894b85472f 100644 --- a/module/zstd/lib/compress/zstd_fast.c +++ b/module/zstd/lib/compress/zstd_fast.c @@ -416,9 +416,9 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( const BYTE* const repMatch = repBase + repIndex; hashTable[h] = current; /* update hash table */ DEBUGLOG(7, "offset_1 = %u , current = %u", offset_1, current); - assert(offset_1 <= current +1); /* check repIndex */ - if ( (((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > dictStartIndex)) + if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ + & (offset_1 < current+1 - dictStartIndex) ) /* note: we are searching at current+1 */ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; @@ -453,7 +453,7 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( U32 const current2 = (U32)(ip-base); U32 const repIndex2 = current2 - offset_2; const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; - if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (repIndex2 > dictStartIndex)) /* intentional overflow */ + if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 < current - dictStartIndex)) /* intentional overflow */ && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; diff --git a/module/zstd/lib/compress/zstd_lazy.c b/module/zstd/lib/compress/zstd_lazy.c index 4cf5c88b5325..22d80597ec62 100644 --- a/module/zstd/lib/compress/zstd_lazy.c +++ b/module/zstd/lib/compress/zstd_lazy.c @@ -975,7 +975,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const U32 repIndex = (U32)(current+1 - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */ + & (offset_1 < current+1 - windowLow) ) /* note: we are searching at current+1 */ if (MEM_read32(ip+1) == MEM_read32(repMatch)) { /* repcode detected we should take it */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; @@ -1006,7 +1007,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const U32 repIndex = (U32)(current - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ + & (offset_1 < current - windowLow) ) /* equivalent to `current > repIndex >= windowLow` */ if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected */ const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; @@ -1037,7 +1039,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const U32 repIndex = (U32)(current - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ + & (offset_1 < current - windowLow) ) /* equivalent to `current > repIndex >= windowLow` */ if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; @@ -1083,7 +1086,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const U32 repIndex = repCurrent - offset_2; const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ + & (offset_2 < repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected we should take it */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; From 326f1e3d88dab158b8aad8c49b96fa292b5912e5 Mon Sep 17 00:00:00 2001 From: rob-wing <98866084+rob-wing@users.noreply.github.com> Date: Thu, 2 Feb 2023 14:16:40 -0900 Subject: [PATCH 08/28] zfs_main.c: fix unused variable error with GCC zfs_setproctitle_init() is stubbed out on FreeBSD. Reviewed-by: Ryan Moeller Reviewed-by: Ameer Hamza Reviewed-by: Brian Behlendorf Reviewed-by: Richard Yao Signed-off-by: Rob Wing Closes #14441 --- cmd/zfs/zfs_main.c | 1 - include/libzutil.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index cec025ce6de8..1ffe805aab70 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -8672,7 +8672,6 @@ main(int argc, char **argv) int i = 0; const char *cmdname; char **newargv; - extern char **environ; (void) setlocale(LC_ALL, ""); (void) setlocale(LC_NUMERIC, "C"); diff --git a/include/libzutil.h b/include/libzutil.h index 4d4bddaad5f3..948ac08cd772 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -183,6 +183,7 @@ _LIBZUTIL_H int printf_color(const char *color, const char *format, ...); _LIBZUTIL_H const char *zfs_basename(const char *path); _LIBZUTIL_H ssize_t zfs_dirnamelen(const char *path); #ifdef __linux__ +extern char **environ; _LIBZUTIL_H void zfs_setproctitle_init(int argc, char *argv[], char *envp[]); _LIBZUTIL_H void zfs_setproctitle(const char *fmt, ...); #else From ac2038a19c3b7e9ba913a850e05ba773725bd25d Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Fri, 3 Feb 2023 00:17:37 +0100 Subject: [PATCH 09/28] Teach zdb about DMU_OT_ERROR_LOG objects With the persistent error log feature we need to account for spa_errlog_{scrub, last} containing mappings to other error log objects, which need to be marked as in-use as well. 
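
Schematically, assuming the head_errlog on-disk layout implemented in
spa_errlog.c: when the feature is active, each value in the
spa_errlog_last and spa_errlog_scrub ZAPs is itself the object number of
a per-dataset DMU_OT_ERROR_LOG ZAP, so a MOS leak check must mark those
nested objects as referenced too. A sketch of the required accounting
(the patch below implements it as errorlog_count_refd() plus a feature
check in dump_mos_leaks()):

```
/*
 * Sketch only, assuming the head_errlog layout from spa_errlog.c; the
 * actual change is the errorlog_count_refd() helper in the diff below.
 */
static void
mark_errlog_refs(objset_t *mos, uint64_t errlog, boolean_t head_errlog_active)
{
	zap_cursor_t zc;
	zap_attribute_t za;

	mos_obj_refd(errlog);		/* the top-level ZAP itself */
	if (!head_errlog_active)
		return;			/* old format: no nested objects */

	/* Each value is the object number of a per-dataset error log ZAP. */
	for (zap_cursor_init(&zc, mos, errlog);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc))
		mos_obj_refd(za.za_first_integer);
	zap_cursor_fini(&zc);
}
```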
Reviewed-by: Matthew Ahrens
Reviewed-by: Brian Behlendorf
Signed-off-by: George Amanakis
Closes #14442
Closes #14434
---
 cmd/zdb/zdb.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index b04b220c768e..d6e3c5806944 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -7509,6 +7509,19 @@ mos_leak_log_spacemaps(spa_t *spa)
 		mos_obj_refd(sls->sls_sm_obj);
 }
 
+static void
+errorlog_count_refd(objset_t *mos, uint64_t errlog)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	for (zap_cursor_init(&zc, mos, errlog);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		mos_obj_refd(za.za_first_integer);
+	}
+	zap_cursor_fini(&zc);
+}
+
 static int
 dump_mos_leaks(spa_t *spa)
 {
@@ -7529,6 +7542,12 @@ dump_mos_leaks(spa_t *spa)
 	mos_obj_refd(spa->spa_history);
 	mos_obj_refd(spa->spa_errlog_last);
 	mos_obj_refd(spa->spa_errlog_scrub);
+
+	if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+		errorlog_count_refd(mos, spa->spa_errlog_last);
+		errorlog_count_refd(mos, spa->spa_errlog_scrub);
+	}
+
 	mos_obj_refd(spa->spa_all_vdev_zaps);
 	mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj);
 	mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj);

From 90f4e01f8ab89edc39bfa227032acb66ae511988 Mon Sep 17 00:00:00 2001
From: Paul Dagnelie
Date: Thu, 2 Feb 2023 15:19:26 -0800
Subject: [PATCH 10/28] Prevent error messages when running tests with no timeout

Reviewed-by: Richard Yao
Reviewed-by: Brian Behlendorf
Signed-off-by: Paul Dagnelie
Closes #14450
---
 tests/test-runner/bin/test-runner.py.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test-runner/bin/test-runner.py.in b/tests/test-runner/bin/test-runner.py.in
index cb453b266f3c..28276ebc47e3 100755
--- a/tests/test-runner/bin/test-runner.py.in
+++ b/tests/test-runner/bin/test-runner.py.in
@@ -297,7 +297,7 @@ User: %s
         proc = Popen(privcmd, stdout=PIPE, stderr=PIPE)
         # Allow a special timeout value of 0 to mean infinity
         if int(self.timeout) == 0:
-            self.timeout = sys.maxsize
+            self.timeout = sys.maxsize / (10 ** 9)
         t = Timer(int(self.timeout), self.kill_cmd, [proc])
 
     try:

From 6017fd9377b217481097dda1206132ec81fcc8ef Mon Sep 17 00:00:00 2001
From: Reno Reckling
Date: Fri, 3 Feb 2023 00:22:12 +0100
Subject: [PATCH 11/28] Fix variable shadowing in libzfs_mount

We accidentally reused the variable name "i" for both the inner and
outer loops.

Reviewed-by: Rich Ercolani
Reviewed-by: Ryan Moeller
Reviewed-by: Richard Yao
Signed-off-by: Reno Reckling
Closes #14452
Closes #14445
---
 lib/libzfs/libzfs_mount.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c
index 57737bc6c01a..8612e082ba34 100644
--- a/lib/libzfs/libzfs_mount.c
+++ b/lib/libzfs/libzfs_mount.c
@@ -1422,10 +1422,10 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
 	 * Walk through and first unshare everything. 
*/ for (i = 0; i < used; i++) { - for (enum sa_protocol i = 0; i < SA_PROTOCOL_COUNT; ++i) { - if (sa_is_shared(sets[i].mountpoint, i) && + for (enum sa_protocol p = 0; p < SA_PROTOCOL_COUNT; ++p) { + if (sa_is_shared(sets[i].mountpoint, p) && unshare_one(hdl, sets[i].mountpoint, - sets[i].mountpoint, i) != 0) + sets[i].mountpoint, p) != 0) goto out; } } From dffd40b3b6185b001d529db50561ec11098d75cc Mon Sep 17 00:00:00 2001 From: Jorgen Lundman Date: Tue, 7 Feb 2023 02:27:55 +0900 Subject: [PATCH 12/28] Unify assembly files with macOS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The remaining changes needed to make the assembly files work with macOS. Reviewed-by: Attila Fülöp Reviewed-by: Brian Behlendorf Signed-off-by: Jorgen Lundman Closes #14451 --- module/icp/asm-x86_64/blake3/blake3_avx2.S | 1 - module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S | 5 +++++ module/icp/asm-x86_64/modes/gcm_pclmulqdq.S | 2 +- module/icp/asm-x86_64/sha2/sha256_impl.S | 2 +- module/icp/asm-x86_64/sha2/sha512_impl.S | 3 +-- 5 files changed, 8 insertions(+), 5 deletions(-) diff --git a/module/icp/asm-x86_64/blake3/blake3_avx2.S b/module/icp/asm-x86_64/blake3/blake3_avx2.S index 8f9e766486f1..0ebec5c1095e 100644 --- a/module/icp/asm-x86_64/blake3/blake3_avx2.S +++ b/module/icp/asm-x86_64/blake3/blake3_avx2.S @@ -1791,7 +1791,6 @@ ENTRY_ALIGN(zfs_blake3_hash_many_avx2, 64) SET_SIZE(zfs_blake3_hash_many_avx2) SECTION_STATIC -.section .rodata .p2align 6 ADD0: diff --git a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S index 165492a0ed76..909b2147dff9 100644 --- a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S +++ b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S @@ -53,6 +53,11 @@ /* Windows userland links with OpenSSL */ #if !defined (_WIN32) || defined (_KERNEL) +/* Apple needs _ */ +#if defined (__APPLE__) +#define gcm_avx_can_use_movbe _gcm_avx_can_use_movbe +#endif + .extern gcm_avx_can_use_movbe .text diff --git a/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S b/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S index e40b3df32753..dec782fda33e 100644 --- a/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S +++ b/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S @@ -101,7 +101,7 @@ gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) { // static uint8_t byte_swap16_mask[] = { // 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 }; -.section .rodata +SECTION_STATIC .balign XMM_ALIGN .Lbyte_swap16_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 diff --git a/module/icp/asm-x86_64/sha2/sha256_impl.S b/module/icp/asm-x86_64/sha2/sha256_impl.S index f3d701528459..f1fde51c1d69 100644 --- a/module/icp/asm-x86_64/sha2/sha256_impl.S +++ b/module/icp/asm-x86_64/sha2/sha256_impl.S @@ -2063,7 +2063,7 @@ ENTRY_NP(SHA256TransformBlocks) .cfi_endproc SET_SIZE(SHA256TransformBlocks) -.section .rodata +SECTION_STATIC .balign 64 SET_OBJ(K256) K256: diff --git a/module/icp/asm-x86_64/sha2/sha512_impl.S b/module/icp/asm-x86_64/sha2/sha512_impl.S index 520f5b6dab24..b2f7d4863d8a 100644 --- a/module/icp/asm-x86_64/sha2/sha512_impl.S +++ b/module/icp/asm-x86_64/sha2/sha512_impl.S @@ -2064,7 +2064,7 @@ ENTRY_NP(SHA512TransformBlocks) .cfi_endproc SET_SIZE(SHA512TransformBlocks) -.section .rodata +SECTION_STATIC .balign 64 SET_OBJ(K512) K512: @@ -2113,4 +2113,3 @@ K512: #if defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif - From 0a5b942d4aadbf5be99ff62976384db83d342f8c Mon Sep 17 00:00:00 2001 From: Jorgen Lundman Date: Tue, 7 
Feb 2023 02:34:59 +0900
Subject: [PATCH 13/28] Restore FreeBSD to use .rodata
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In https://github.com/openzfs/zfs/pull/14228 the FreeBSD SECTION_STATIC
was set to ".data" instead of ".rodata". This commit restores it to
.rodata.

Reviewed-by: Attila Fülöp
Reviewed-by: Richard Yao
Reviewed-by: Brian Behlendorf
Signed-off-by: Jorgen Lundman
Closes #14460
---
 include/os/freebsd/spl/sys/ia32/asm_linkage.h | 2 +-
 lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/os/freebsd/spl/sys/ia32/asm_linkage.h b/include/os/freebsd/spl/sys/ia32/asm_linkage.h
index 058d600007af..1ebfd8350661 100644
--- a/include/os/freebsd/spl/sys/ia32/asm_linkage.h
+++ b/include/os/freebsd/spl/sys/ia32/asm_linkage.h
@@ -36,7 +36,7 @@
 #define ENDBR
 
 #define SECTION_TEXT .text
-#define SECTION_STATIC .data
+#define SECTION_STATIC .rodata
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h b/lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h
index 9964f183cc68..08c73037990f 100644
--- a/lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h
+++ b/lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h
@@ -40,7 +40,7 @@
 #define ENDBR
 
 #define SECTION_TEXT .text
-#define SECTION_STATIC .data
+#define SECTION_STATIC .rodata
 
 #ifdef __cplusplus
 extern "C" {

From 14872aaa4f909d72c6b5e4105dadcfa13c7d9d66 Mon Sep 17 00:00:00 2001
From: Matthew Ahrens
Date: Mon, 6 Feb 2023 09:37:06 -0800
Subject: [PATCH 14/28] EIO caused by encryption + recursive gang

Encrypted blocks cannot have 3 DVAs, because they use the space of the
3rd DVA for the IV+salt. zio_write_gang_block() takes this into
account, setting `gbh_copies` to no more than 2 in this case.

Gang members' BP's do not have the X (encrypted) bit set (nor do they
have the DMU level and type fields set), because encryption is not
handled at this level. The gang block is reassembled, and then
encryption (and compression) are handled.

To check if this gang block is encrypted, the code in
zio_write_gang_block() checks `pio->io_bp`. This is normally fine,
because the block that's being ganged is typically the encrypted BP.

The problem is that if there is "recursive ganging", where a gang
member is itself a gang block, then when zio_write_gang_block() is
called to create a gang block for a gang member, `pio->io_bp` is the
gang member's BP, which doesn't have the X bit set, so the number of
DVA's is not restricted to 2. It should instead be looking at the
"gang leader", i.e. the top-level gang block, to determine how many
DVA's can be used, to avoid an "NDVA's inversion" (where a child has
more DVA's than its parent).

gang leader BP: X (encrypted) bit set, 2 DVA's, IV+salt in 3rd DVA's
space:
```
DVA[0]=<1:...:100400> DVA[1]=<0:...:100400> salt=... iv=...
[L0 ZFS plain file] fletcher4 uncompressed encrypted LE gang
unique double size=100000L/100000P birth=... fill=1 cksum=...
```

leader's GBH contains a BP with gang bit set and 3 DVA's:
```
DVA[0]=<1:...:55600> DVA[1]=<0:...:55600> [L0 unallocated] fletcher4
uncompressed unencrypted LE contiguous unique double
size=55600L/55600P birth=... fill=0 cksum=...

DVA[0]=<1:...:55600> DVA[1]=<0:...:55600> [L0 unallocated] fletcher4
uncompressed unencrypted LE contiguous unique double
size=55600L/55600P birth=... fill=0 cksum=... 
DVA[0]=<1:...:55600> DVA[1]=<0:...:55600> DVA[2]=<1:...:200>
[L0 unallocated] fletcher4 uncompressed unencrypted LE gang
unique double size=55400L/55400P birth=... fill=0 cksum=...
```

On nondebug bits, having the 3rd DVA in the gang block works for the
most part, because it's true that all 3 DVA's are available in the gang
member BP (in the GBH). However, for accounting purposes, gang block
DVA's ASIZE include all the space allocated below them, i.e. the
512-byte gang block header (GBH) as well as the gang members below
that. We see that above where the gang leader BP is 1MB logical (and
after compression: 0x`100000P`), but the ASIZE of each DVA is 2 sectors
(1KB) more than 1MB (0x`100400`).

Since there are 3 copies of a block below it, we increment the ASIZE of
the 3rd DVA of the gang leader by the space used by the 3rd DVA of the
child (1 sector, in this case). But there isn't really a 3rd DVA of the
parent; the salt is stored in place of the 3rd DVA's ASIZE. So when
zio_write_gang_member_ready() increments the parent's BP's `DVA[2]`'s
ASIZE, it's actually incrementing the parent's salt. When we later try
to read the encrypted recursively-ganged block, the salt doesn't match
what we used to write it, so MAC verification fails and we get an EIO.

```
zio_encrypt():  encrypted 515/2/0/403 salt: 25 25 bb 9d ad d6 cd 89
zio_decrypt(): decrypting 515/2/0/403 salt: 26 25 bb 9d ad d6 cd 89
```

This commit addresses the problem by not increasing the number of
copies of the GBH beyond 2 (even for non-encrypted blocks). This
simplifies the logic while maintaining the ability to traverse all
metadata (including gang blocks) even if one copy is lost. (Note that
3 copies of the GBH will still be created if requested, e.g. for
`copies=3` or MOS blocks.)

Additionally, the code that increments the parent's DVA's ASIZE is made
to check the parent DVA's NDVAS even on nondebug bits. So if there's a
similar bug in the future, it will cause a panic when trying to write,
rather than corrupting the parent BP and causing an error when reading.

Reviewed-by: Brian Behlendorf
Co-authored-by: Brian Behlendorf
Signed-off-by: Matthew Ahrens
Caused-by: #14356
Closes #14440
Closes #14413
---
 module/zfs/zio.c | 16 ++--
 tests/runfiles/common.run | 2 +-
 tests/zfs-tests/tests/Makefile.am | 1 +
 .../functional/no_space/enospc_ganging.ksh | 86 +++++++++++++++++++
 4 files changed, 97 insertions(+), 8 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/no_space/enospc_ganging.ksh

diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 5d7ed6d582a2..d888a584a93c 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -2778,7 +2778,7 @@ zio_write_gang_member_ready(zio_t *zio)
 	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
 	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
 	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
-	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
+	VERIFY3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
 
 	mutex_enter(&pio->io_lock);
 	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
@@ -2816,18 +2816,20 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 	uint64_t resid = pio->io_size;
 	uint64_t lsize;
 	int copies = gio->io_prop.zp_copies;
-	int gbh_copies;
 	zio_prop_t zp;
 	int error;
 	boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
 
 	/*
-	 * encrypted blocks need DVA[2] free so encrypted gang headers can't
-	 * have a third copy.
+	 * If one copy was requested, store 2 copies of the GBH, so that we
+	 * can still traverse all the data (e.g. to free or scrub) even if a
+	 * block is damaged.  Note that we can't store 3 copies of the GBH in
+	 * all cases, e.g. with encryption, which uses DVA[2] for the IV+salt.
 	 */
-	gbh_copies = MIN(copies + 1, spa_max_replication(spa));
-	if (BP_IS_ENCRYPTED(bp) && gbh_copies >= SPA_DVAS_PER_BP)
-		gbh_copies = SPA_DVAS_PER_BP - 1;
+	int gbh_copies = copies;
+	if (gbh_copies == 1) {
+		gbh_copies = MIN(2, spa_max_replication(spa));
+	}
 
 	int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
 	if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 005c539fc89d..7a7cf927c77e 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -704,7 +704,7 @@ tags = ['functional', 'nestedfs']
 
 [tests/functional/no_space]
 tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos',
-    'enospc_df', 'enospc_rm']
+    'enospc_df', 'enospc_ganging', 'enospc_rm']
 tags = ['functional', 'no_space']
 
 [tests/functional/nopwrite]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index bbe94f9177ae..ad2ec4670556 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1539,6 +1539,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/no_space/enospc_002_pos.ksh \
 	functional/no_space/enospc_003_pos.ksh \
 	functional/no_space/enospc_df.ksh \
+	functional/no_space/enospc_ganging.ksh \
 	functional/no_space/enospc_rm.ksh \
 	functional/no_space/setup.ksh \
 	functional/online_offline/cleanup.ksh \
diff --git a/tests/zfs-tests/tests/functional/no_space/enospc_ganging.ksh b/tests/zfs-tests/tests/functional/no_space/enospc_ganging.ksh
new file mode 100755
index 000000000000..1d35fba5dbfa
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/no_space/enospc_ganging.ksh
@@ -0,0 +1,86 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Exercise gang block IO paths for non-encrypted and encrypted datasets.
+#
+
+verify_runnable "both"
+log_assert "Verify IO when file system is full and ganging."
+
+function cleanup
+{
+	log_must set_tunable64 METASLAB_FORCE_GANGING $metaslab_force_ganging
+	default_cleanup_noexit
+}
+
+log_onexit cleanup
+
+default_setup_noexit $DISKS
+
+typeset metaslab_force_ganging=$(get_tunable METASLAB_FORCE_GANGING)
+shift=$(random_int_between 15 17)
+log_must set_tunable64 METASLAB_FORCE_GANGING $((2**$shift))
+
+keyfile=/$TESTPOOL/keyencfods
+log_must eval "echo 'password' > $keyfile"
+bs=1024k
+count=512
+
+log_must dd if=/dev/urandom of=$TESTDIR/data bs=$bs count=$count
+data_checksum=$(sha256digest $TESTDIR/data)
+
+# Test common large block configuration.
+log_must zfs create -o recordsize=1m -o primarycache=metadata $TESTPOOL/gang
+mntpnt=$(get_prop mountpoint $TESTPOOL/gang)
+
+log_must dd if=$TESTDIR/data of=$mntpnt/file bs=$bs count=$count
+sync_pool $TESTPOOL
+log_must dd if=$mntpnt/file of=$TESTDIR/out bs=$bs count=$count
+out_checksum=$(sha256digest $TESTDIR/out)
+
+if [[ "$data_checksum" != "$out_checksum" ]]; then
+	log_fail "checksum mismatch ($data_checksum != $out_checksum)"
+fi
+
+log_must rm -f $TESTDIR/out
+log_must zfs destroy $TESTPOOL/gang
+
+# Test common large block configuration with encryption.
+log_must zfs create \
+	-o recordsize=1m \
+	-o primarycache=metadata \
+	-o compression=off \
+	-o encryption=on \
+	-o keyformat=passphrase \
+	-o keylocation=file://$keyfile \
+	-o copies=2 \
+	$TESTPOOL/gang
+mntpnt=$(get_prop mountpoint $TESTPOOL/gang)
+
+log_must dd if=$TESTDIR/data of=$mntpnt/file bs=$bs count=$count
+sync_pool $TESTPOOL
+log_must dd if=$mntpnt/file of=$TESTDIR/out bs=$bs count=$count
+out_checksum=$(sha256digest $TESTDIR/out)
+
+if [[ "$data_checksum" != "$out_checksum" ]]; then
+	log_fail "checksum mismatch ($data_checksum != $out_checksum)"
+fi
+
+log_must rm -f $TESTDIR/out
+log_must zfs destroy $TESTPOOL/gang
+
+log_pass "Verified IO when file system is full and ganging."
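
The replaced gbh_copies logic is small enough to check in isolation; a
standalone table of requested copies versus GBH copies, with
spa_max_replication() assumed to return 3 for a healthy pool:

```
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	int spa_max_replication = 3;	/* assumed healthy, 3-way capable */

	for (int copies = 1; copies <= 3; copies++) {
		int gbh_copies = copies;
		if (gbh_copies == 1)
			gbh_copies = MIN(2, spa_max_replication);
		printf("requested copies=%d -> gbh_copies=%d\n",
		    copies, gbh_copies);
	}
	return (0);
}
```

This prints 1 -> 2, 2 -> 2, and 3 -> 3, matching the commit message:
the GBH never gets an extra copy beyond 2, but three copies are still
honored when explicitly requested.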
From 3a7d2a0ce0b7e7a2b005256be54fda00feaf86c0 Mon Sep 17 00:00:00 2001
From: Richard Yao
Date: Fri, 23 Dec 2022 00:00:38 -0500
Subject: [PATCH 15/28] zfs_get_temporary_prop() should not pass NULL to strcpy()

`dsl_dir_activity_in_progress()` can call `zfs_get_temporary_prop()`
with the fourth value set to NULL, which will pass NULL to `strcpy()`
when there is a match.

Clang's static analyzer caught this with the help of CodeChecker for
Cross Translation Unit analysis.

Reviewed-by: Brian Behlendorf
Reviewed-by: Brian Atkinson
Signed-off-by: Richard Yao
Closes #14456
---
 module/os/freebsd/zfs/zfs_vfsops.c | 3 ++-
 module/os/linux/zfs/zfs_vfsops.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c
index 2820f10b5de8..a1e0595bda34 100644
--- a/module/os/freebsd/zfs/zfs_vfsops.c
+++ b/module/os/freebsd/zfs/zfs_vfsops.c
@@ -230,7 +230,8 @@ zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
 	vfs_unbusy(vfsp);
 	if (tmp != *val) {
-		(void) strcpy(setpoint, "temporary");
+		if (setpoint)
+			(void) strcpy(setpoint, "temporary");
 		*val = tmp;
 	}
 	return (0);
diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c
index c921e587c75c..2d9b27a90884 100644
--- a/module/os/linux/zfs/zfs_vfsops.c
+++ b/module/os/linux/zfs/zfs_vfsops.c
@@ -608,7 +608,8 @@ zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
 	}
 
 	if (tmp != *val) {
-		(void) strcpy(setpoint, "temporary");
+		if (setpoint)
+			(void) strcpy(setpoint, "temporary");
 		*val = tmp;
 	}
 	return (0);

From cfb49616cdeb02c7cbb3dd314c55b9452ee21182 Mon Sep 17 00:00:00 2001
From: Richard Yao
Date: Mon, 23 Jan 2023 15:03:21 -0500
Subject: [PATCH 16/28] Cleanup: spa vdev processing should check NULL pointers

The PVS Studio 2016 FreeBSD kernel report stated:

\contrib\opensolaris\uts\common\fs\zfs\spa.c (1341): error V595: The
'spa->spa_spares.sav_vdevs' pointer was utilized before it was verified
against nullptr. Check lines: 1341, 1342.

\sys\cddl\contrib\opensolaris\uts\common\fs\zfs\spa.c (1355): error
V595: The 'spa->spa_l2cache.sav_vdevs' pointer was utilized before it
was verified against nullptr. Check lines: 1355, 1357. 
\sys\cddl\contrib\opensolaris\uts\common\fs\zfs\spa.c (1398): error V595: The 'spa->spa_spares.sav_vdevs' pointer was utilized before it was verified against nullptr. Check lines: 1398, 1408. \sys\cddl\contrib\opensolaris\uts\common\fs\zfs\spa.c (1583): error V595: The 'oldvdevs' pointer was utilized before it was verified against nullptr. Check lines: 1583, 1595. In practice, all of these uses were safe because a NULL pointer implied a 0 vdev count, which kept us from iterating over vdevs. However, rearranging the code to check the pointer first is not a terrible micro-optimization and makes it more readable, so let us do that. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Richard Yao Closes #14456 --- module/zfs/spa.c | 58 +++++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 67b3a03a951a..bbb83fc610b1 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1714,9 +1714,9 @@ spa_unload(spa_t *spa) */ spa_l2cache_drop(spa); - for (int i = 0; i < spa->spa_spares.sav_count; i++) - vdev_free(spa->spa_spares.sav_vdevs[i]); if (spa->spa_spares.sav_vdevs) { + for (int i = 0; i < spa->spa_spares.sav_count; i++) + vdev_free(spa->spa_spares.sav_vdevs[i]); kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); spa->spa_spares.sav_vdevs = NULL; @@ -1727,11 +1727,11 @@ spa_unload(spa_t *spa) } spa->spa_spares.sav_count = 0; - for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { - vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); - vdev_free(spa->spa_l2cache.sav_vdevs[i]); - } if (spa->spa_l2cache.sav_vdevs) { + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); + vdev_free(spa->spa_l2cache.sav_vdevs[i]); + } kmem_free(spa->spa_l2cache.sav_vdevs, spa->spa_l2cache.sav_count * sizeof (void *)); spa->spa_l2cache.sav_vdevs = NULL; @@ -1789,20 +1789,21 @@ spa_load_spares(spa_t *spa) /* * First, close and free any existing spare vdevs. 
*/ - for (i = 0; i < spa->spa_spares.sav_count; i++) { - vd = spa->spa_spares.sav_vdevs[i]; + if (spa->spa_spares.sav_vdevs) { + for (i = 0; i < spa->spa_spares.sav_count; i++) { + vd = spa->spa_spares.sav_vdevs[i]; - /* Undo the call to spa_activate() below */ - if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, - B_FALSE)) != NULL && tvd->vdev_isspare) - spa_spare_remove(tvd); - vdev_close(vd); - vdev_free(vd); - } + /* Undo the call to spa_activate() below */ + if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, + B_FALSE)) != NULL && tvd->vdev_isspare) + spa_spare_remove(tvd); + vdev_close(vd); + vdev_free(vd); + } - if (spa->spa_spares.sav_vdevs) kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); + } if (spa->spa_spares.sav_config == NULL) nspares = 0; @@ -2013,23 +2014,24 @@ spa_load_l2cache(spa_t *spa) /* * Purge vdevs that were dropped */ - for (i = 0; i < oldnvdevs; i++) { - uint64_t pool; + if (oldvdevs) { + for (i = 0; i < oldnvdevs; i++) { + uint64_t pool; - vd = oldvdevs[i]; - if (vd != NULL) { - ASSERT(vd->vdev_isl2cache); + vd = oldvdevs[i]; + if (vd != NULL) { + ASSERT(vd->vdev_isl2cache); - if (spa_l2cache_exists(vd->vdev_guid, &pool) && - pool != 0ULL && l2arc_vdev_present(vd)) - l2arc_remove_vdev(vd); - vdev_clear_stats(vd); - vdev_free(vd); + if (spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL && l2arc_vdev_present(vd)) + l2arc_remove_vdev(vd); + vdev_clear_stats(vd); + vdev_free(vd); + } } - } - if (oldvdevs) kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); + } for (i = 0; i < sav->sav_count; i++) nvlist_free(l2cache[i]); From 66953686c05901277115e837bbaab4df4ffd63c6 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Thu, 2 Feb 2023 17:30:39 -0500 Subject: [PATCH 17/28] Add assertion and make variables unsigned in abd_alloc_chunks() Clang's static analyzer pointed out that if alloc_pages >= nr_pages before the loop, the value of page will be undefined and will be used anyway. This should not be possible, but as cleanup, we add an assertion. We also recognize that the local variables should be unsigned in the first place, so we make them unsigned. This is not enough to avoid the need for the assertion, since there is still the case that alloc_pages == nr_pages and nr_pages == 0, which the assertion implicitly checks. 
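The pattern the analyzer flagged reduces to the following minimal
userspace sketch (hypothetical names, not the abd code itself): a
variable assigned only inside a loop is read after it, so at least one
iteration must be guaranteed, and the assertion documents exactly that
precondition:

    #include <assert.h>
    #include <stdio.h>

    int
    main(void)
    {
        unsigned int alloc_pages = 0, nr_pages = 4;
        int last_order;        /* only assigned inside the loop */

        /* Documents the precondition: the loop body runs at least once. */
        assert(alloc_pages < nr_pages);

        while (alloc_pages < nr_pages) {
            last_order = 1;                  /* stands in for the allocation */
            alloc_pages += 1U << last_order; /* two pages per pass */
        }
        /* Safe only because of the assertion above. */
        printf("last order used: %d\n", last_order);
        return (0);
    }
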
Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Richard Yao Closes #14456 --- module/os/linux/zfs/abd_os.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 16530d82693e..13150adbe0cf 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -272,18 +272,20 @@ abd_alloc_chunks(abd_t *abd, size_t size) struct page *page, *tmp_page = NULL; gfp_t gfp = __GFP_NOWARN | GFP_NOIO; gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM; - int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1); - int nr_pages = abd_chunkcnt_for_bytes(size); - int chunks = 0, zones = 0; + unsigned int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1); + unsigned int nr_pages = abd_chunkcnt_for_bytes(size); + unsigned int chunks = 0, zones = 0; size_t remaining_size; int nid = NUMA_NO_NODE; - int alloc_pages = 0; + unsigned int alloc_pages = 0; INIT_LIST_HEAD(&pages); + ASSERT3U(alloc_pages, <, nr_pages); + while (alloc_pages < nr_pages) { - unsigned chunk_pages; - int order; + unsigned int chunk_pages; + unsigned int order; order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order); chunk_pages = (1U << order); From eb823cbc76d28a7cafdf6a7aafdefe7e74fe26bc Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Mon, 6 Feb 2023 14:16:01 -0500 Subject: [PATCH 18/28] initramfs: Make mountpoint=none work In initramfs, mount.zfs fails to mount a dataset with mountpoint=none, but mount.zfs -o zfsutil works. Use -o zfsutil when mountpoint=none. Reviewed-by: Brian Behlendorf Reviewed-by: Richard Yao Signed-off-by: Ryan Moeller Closes #14455 --- contrib/initramfs/scripts/zfs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/contrib/initramfs/scripts/zfs b/contrib/initramfs/scripts/zfs index 41a4d96f8732..d1a99e4a87ba 100644 --- a/contrib/initramfs/scripts/zfs +++ b/contrib/initramfs/scripts/zfs @@ -359,9 +359,11 @@ mount_fs() # isn't the root fs. return 0 fi - ZFS_CMD="mount.zfs" # Last hail-mary: Hope 'rootmnt' is set! mountpoint="" + if [ "$mountpoint" = "legacy" ]; then + ZFS_CMD="mount.zfs" + fi else mountpoint="$mountpoint1" fi From ac7648179c856750b719c7a9e0464466df390b26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rob=20N=20=E2=98=85?= Date: Wed, 8 Feb 2023 08:48:22 +1100 Subject: [PATCH 19/28] zdb: zero-pad checksum output The leading zeroes are part of the checksum so we should show them. 
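The failure mode is easy to reproduce in a standalone sketch with
made-up checksum words; without the fixed width, a word with leading
zeroes prints short, so the four colon-separated fields no longer have
a uniform width and can be misread:

    #include <stdio.h>

    int
    main(void)
    {
        unsigned long long w[4] = { 0x00000000000abcdeULL,
            0x1122334455667788ULL, 0x0ULL, 0xffULL };

        /* Old format: leading zeroes silently dropped. */
        printf("cksum=%llx:%llx:%llx:%llx\n", w[0], w[1], w[2], w[3]);
        /* New format: each 64-bit word is always 16 hex digits. */
        printf("cksum=%016llx:%016llx:%016llx:%016llx\n",
            w[0], w[1], w[2], w[3]);
        return (0);
    }
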
Reviewed-by: Richard Yao Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #14464 --- cmd/zdb/zdb.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index d6e3c5806944..d239da67613c 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -2377,7 +2377,8 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " %s", "FREE"); (void) snprintf(blkbuf + strlen(blkbuf), - buflen - strlen(blkbuf), " cksum=%llx:%llx:%llx:%llx", + buflen - strlen(blkbuf), + " cksum=%016llx:%016llx:%016llx:%016llx", (u_longlong_t)bp->blk_cksum.zc_word[0], (u_longlong_t)bp->blk_cksum.zc_word[1], (u_longlong_t)bp->blk_cksum.zc_word[2], @@ -8383,7 +8384,9 @@ zdb_read_block(char *thing, spa_t *spa) DVA_GET_OFFSET(&bp->blk_dva[0]); ck_zio->io_bp = bp; zio_checksum_compute(ck_zio, ck, pabd, lsize); - printf("%12s\tcksum=%llx:%llx:%llx:%llx\n", + printf( + "%12s\t" + "cksum=%016llx:%016llx:%016llx:%016llx\n", zio_checksum_table[ck].ci_name, (u_longlong_t)bp->blk_cksum.zc_word[0], (u_longlong_t)bp->blk_cksum.zc_word[1], From 7883ea2234fac0cd976eb2ca0c6b51e7a5da7668 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20V=C3=B6gele?= Date: Thu, 9 Feb 2023 20:57:50 +0100 Subject: [PATCH 20/28] rpm: Use libtirpc-devel and /usr/lib on SUSE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SUSE Linux distributions require libtirpc-devel. The dracut and udev directories are /usr/lib/dracut and /usr/lib/udev. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Reviewed-by: Richard Yao Signed-off-by: Andreas Vögele Closes #14467 Closes #14468 --- rpm/generic/zfs.spec.in | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 251470a67fba..8c538a00d203 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -3,7 +3,7 @@ # Set the default udev directory based on distribution. %if %{undefined _udevdir} -%if 0%{?fedora}%{?rhel}%{?centos}%{?openEuler} +%if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler} %global _udevdir %{_prefix}/lib/udev %else %global _udevdir /lib/udev @@ -12,7 +12,7 @@ # Set the default udevrule directory based on distribution. %if %{undefined _udevruledir} -%if 0%{?fedora}%{?rhel}%{?centos}%{?openEuler} +%if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler} %global _udevruledir %{_prefix}/lib/udev/rules.d %else %global _udevruledir /lib/udev/rules.d @@ -21,7 +21,7 @@ # Set the default dracut directory based on distribution. 
 %if %{undefined _dracutdir}
-%if 0%{?fedora}%{?rhel}%{?centos}%{?openEuler}
+%if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler}
 %global _dracutdir %{_prefix}/lib/dracut
 %else
 %global _dracutdir %{_prefix}/share/dracut
 %endif
@@ -110,7 +110,7 @@ BuildRequires:  libblkid-devel
 BuildRequires:  libudev-devel
 BuildRequires:  libattr-devel
 BuildRequires:  openssl-devel
-%if 0%{?fedora}%{?openEuler} || 0%{?rhel} >= 8 || 0%{?centos} >= 8
+%if 0%{?fedora}%{?suse_version}%{?openEuler} || 0%{?rhel} >= 8 || 0%{?centos} >= 8
 BuildRequires: libtirpc-devel
 %endif

From 87a4dfa561900dafaa446661538faa485af5f17a Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Mon, 13 Feb 2023 16:21:53 -0500
Subject: [PATCH 21/28] Improve arc_read() error reporting

Debugging a reported NULL dereference panic in dnode_hold_impl(), I
found that for certain types of errors arc_read() may only return an
error code, but not properly report it via the done and pio arguments.
Lack of done calls may result in reference and/or memory leaks in
higher level code.  Lack of error reporting via pio may result in
unnoticed errors there.  For example, dbuf_read(), where
dbuf_read_impl() ignores the arc_read() return value, relies completely
on the pio mechanism and misses the errors.

This patch makes arc_read() always call the done callback and always
propagate errors to the parent zio, if either is provided.

Reviewed-by: Brian Behlendorf
Reviewed-by: Brian Atkinson
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #14454
---
 module/zfs/arc.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 2a52d0d24572..aa806706de29 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -5958,6 +5958,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 	    (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
 	boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp);
 	boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF;
+	arc_buf_t *buf = NULL;
 	int rc = 0;

 	ASSERT(!embedded_bp ||
@@ -5987,7 +5988,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 	if (!zfs_blkptr_verify(spa, bp, zio_flags & ZIO_FLAG_CONFIG_WRITER,
 	    BLK_VERIFY_LOG)) {
 		rc = SET_ERROR(ECKSUM);
-		goto out;
+		goto done;
 	}

 	if (!embedded_bp) {
@@ -6008,14 +6009,13 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) ||
 	    (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) {
 		boolean_t is_data = !HDR_ISTYPE_METADATA(hdr);
-		arc_buf_t *buf = NULL;

 		if (HDR_IO_IN_PROGRESS(hdr)) {
 			if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
 				mutex_exit(hash_lock);
 				ARCSTAT_BUMP(arcstat_cached_only_in_progress);
 				rc = SET_ERROR(ENOENT);
-				goto out;
+				goto done;
 			}

 			zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
@@ -6144,9 +6144,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 		ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
 		    demand, prefetch, is_data, data, metadata, hits);
 		*arc_flags |= ARC_FLAG_CACHED;
-
-		if (done)
-			done(NULL, zb, bp, buf, private);
+		goto done;
 	} else {
 		uint64_t lsize = BP_GET_LSIZE(bp);
 		uint64_t psize = BP_GET_PSIZE(bp);
@@ -6159,10 +6157,10 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 		int alloc_flags = encrypted_read ?
ARC_HDR_ALLOC_RDATA : 0; if (*arc_flags & ARC_FLAG_CACHED_ONLY) { - rc = SET_ERROR(ENOENT); if (hash_lock != NULL) mutex_exit(hash_lock); - goto out; + rc = SET_ERROR(ENOENT); + goto done; } if (hdr == NULL) { @@ -6482,6 +6480,16 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, spa_read_history_add(spa, zb, *arc_flags); spl_fstrans_unmark(cookie); return (rc); + +done: + if (done) + done(NULL, zb, bp, buf, private); + if (pio && rc != 0) { + zio_t *zio = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); + zio->io_error = rc; + zio_nowait(zio); + } + goto out; } arc_prune_t * From 13312e2fa107b2a10595d67de82ecdf74db317ed Mon Sep 17 00:00:00 2001 From: Prakash Surya Date: Mon, 13 Feb 2023 16:35:59 -0800 Subject: [PATCH 22/28] Reduce need for contiguous memory for ioctls We've had cases where we trigger an OOM despite having memory freely available on the system. For example, here, we had about 21GB free: kernel: Node 0 Normal: 2418758*4kB (UME) 1549533*8kB (UE) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 22071296kB The problem being, all the memory is in 4K and 8K contiguous regions, but the allocation request was for a 16K contiguous region: kernel: SafeExecutors-4 invoked oom-killer: gfp_mask=0x42dc0(GFP_KERNEL|__GFP_NOWARN|__GFP_COMP|__GFP_ZERO), order=2, oom_score_adj=0 The offending allocation came from this call trace: kernel: Call Trace: kernel: dump_stack+0x57/0x7a kernel: dump_header+0x4f/0x1e1 kernel: oom_kill_process.cold.33+0xb/0x10 kernel: out_of_memory+0x1ad/0x490 kernel: __alloc_pages_slowpath+0xd55/0xe40 kernel: __alloc_pages_nodemask+0x2df/0x330 kernel: kmalloc_large_node+0x42/0x90 kernel: __kmalloc_node+0x25a/0x320 kernel: ? spl_kmem_free_impl+0x21/0x30 [spl] kernel: spl_kmem_alloc_impl+0xa5/0x100 [spl] kernel: spl_kmem_zalloc+0x19/0x20 [spl] kernel: zfsdev_ioctl+0x2b/0xe0 [zfs] kernel: do_vfs_ioctl+0xa9/0x640 kernel: ? __audit_syscall_entry+0xdd/0x130 kernel: ksys_ioctl+0x67/0x90 kernel: __x64_sys_ioctl+0x1a/0x20 kernel: do_syscall_64+0x5e/0x200 kernel: entry_SYSCALL_64_after_hwframe+0x44/0xa9 kernel: RIP: 0033:0x7fdca3674317 The problem is, for each ioctl that ZFS makes, it has to allocate a zfs_cmd_t structure, which is 13744 bytes in size (on my system): sdb> sizeof zfs_cmd (size_t)13744 This size, coupled with the fact that we currently allocate it with kmem_zalloc, means we need a 16K contiguous region of memory to satisfy the request. The solution taken by this change, is to use "vmem" instead of "kmem" to do the allocation, such that we don't necessarily need a contiguous 16K memory region to satisfy the allocation. Arguably, a better solution would be not to require such a large allocation to begin with (e.g. reduce the size of the zfs_cmd_t structure), but that'd be a much larger change than this "one liner". Thus, I've opted for this approach for now; we can always circle back and attempt to reduce the size of the structure in the future. 
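As a sanity check on the numbers above, a small standalone sketch
(assuming 4 KiB pages; not ZFS code) reproduces the order-2 result
seen in the OOM report:

    #include <stdio.h>

    /*
     * 13744 bytes rounds up to four contiguous 4K pages, i.e. an
     * order-2 buddy allocation, matching "order=2" in the OOM report.
     */
    int
    main(void)
    {
        size_t size = 13744, page = 4096;
        size_t pages = (size + page - 1) / page;    /* 4 pages */
        int order = 0;

        while ((1UL << order) < pages)
            order++;                                /* order-2, 16K */
        printf("%zu pages, order-%d allocation\n", pages, order);
        return (0);
    }
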
Reviewed-by: Matthew Ahrens
Reviewed-by: Brian Behlendorf
Reviewed-by: Richard Yao
Reviewed-by: Mark Maybee
Reviewed-by: Don Brady
Signed-off-by: Prakash Surya
Closes #14474
---
 module/os/freebsd/zfs/kmod_core.c  | 10 +++++-----
 module/os/linux/zfs/zfs_ioctl_os.c |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/module/os/freebsd/zfs/kmod_core.c b/module/os/freebsd/zfs/kmod_core.c
index a1b1c8e4e6d9..f4c87013dbf0 100644
--- a/module/os/freebsd/zfs/kmod_core.c
+++ b/module/os/freebsd/zfs/kmod_core.c
@@ -142,7 +142,7 @@ zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag,
 		return (EINVAL);

 	uaddr = (void *)zp->zfs_cmd;
-	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+	zc = vmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
 #ifdef ZFS_LEGACY_SUPPORT
 	/*
 	 * Remap ioctl code for legacy user binaries
@@ -150,10 +150,10 @@ zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag,
 	if (zp->zfs_ioctl_version == ZFS_IOCVER_LEGACY) {
 		vecnum = zfs_ioctl_legacy_to_ozfs(vecnum);
 		if (vecnum < 0) {
-			kmem_free(zc, sizeof (zfs_cmd_t));
+			vmem_free(zc, sizeof (zfs_cmd_t));
 			return (ENOTSUP);
 		}
-		zcl = kmem_zalloc(sizeof (zfs_cmd_legacy_t), KM_SLEEP);
+		zcl = vmem_zalloc(sizeof (zfs_cmd_legacy_t), KM_SLEEP);
 		if (copyin(uaddr, zcl, sizeof (zfs_cmd_legacy_t))) {
 			error = SET_ERROR(EFAULT);
 			goto out;
@@ -180,9 +180,9 @@ zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag,
 out:
 #ifdef ZFS_LEGACY_SUPPORT
 	if (zcl)
-		kmem_free(zcl, sizeof (zfs_cmd_legacy_t));
+		vmem_free(zcl, sizeof (zfs_cmd_legacy_t));
 #endif
-	kmem_free(zc, sizeof (zfs_cmd_t));
+	vmem_free(zc, sizeof (zfs_cmd_t));
 	MPASS(tsd_get(rrw_tsd_key) == NULL);
 	return (error);
 }
diff --git a/module/os/linux/zfs/zfs_ioctl_os.c b/module/os/linux/zfs/zfs_ioctl_os.c
index f8d1777b07a6..f068f544f0ec 100644
--- a/module/os/linux/zfs/zfs_ioctl_os.c
+++ b/module/os/linux/zfs/zfs_ioctl_os.c
@@ -135,7 +135,7 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg)

 	vecnum = cmd - ZFS_IOC_FIRST;

-	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+	zc = vmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);

 	if (ddi_copyin((void *)(uintptr_t)arg, zc, sizeof (zfs_cmd_t), 0)) {
 		error = -SET_ERROR(EFAULT);
@@ -146,7 +146,7 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
 	if (error == 0 && rc != 0)
 		error = -SET_ERROR(EFAULT);
 out:
-	kmem_free(zc, sizeof (zfs_cmd_t));
+	vmem_free(zc, sizeof (zfs_cmd_t));
 	return (error);
 }

From 34ce4c42ffdcd8933768533343d8b29f9612fbae Mon Sep 17 00:00:00 2001
From: George Amanakis
Date: Tue, 14 Feb 2023 01:37:46 +0100
Subject: [PATCH 23/28] Fix a race condition in dsl_dataset_sync() when
 activating features

The zio returned from arc_write() in dmu_objset_sync() uses
zio_nowait().  However, we may reach the end of dsl_dataset_sync(),
which checks if we need to activate features in the filesystem, without
knowing whether that zio has even run through the ZIO pipeline yet.  In
that case we will flag features to be activated in
dsl_dataset_block_born(), but dsl_dataset_sync() has already completed
its run and those features will not actually be activated.  Mitigate
this by moving the feature activation code into
dsl_dataset_sync_done().  Also add new ASSERTs in dsl_scan_visitbp()
checking if a block contradicts any filesystem flags.
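The shape of the race can be reduced to a small (deliberately
simplified, and itself intentionally racy) userspace sketch; the thread
stands in for the nowait'ed zio and the join for the completion point
that dsl_dataset_sync_done() represents:

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static int feature_flagged;    /* ds_feature_activation[] analogue */

    static void *
    zio_pipeline(void *arg)
    {
        (void) arg;
        usleep(10000);             /* the write zio completes "later" */
        feature_flagged = 1;       /* dsl_dataset_block_born() analogue */
        return (NULL);
    }

    int
    main(void)
    {
        pthread_t t;

        (void) pthread_create(&t, NULL, zio_pipeline, NULL);
        /* End of "dsl_dataset_sync()": the flag may not be set yet. */
        printf("checked during sync: flagged=%d\n", feature_flagged);
        (void) pthread_join(&t, NULL);
        /* "dsl_dataset_sync_done()": the zio has completed by now. */
        printf("checked after done:  flagged=%d\n", feature_flagged);
        return (0);
    }
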
Reviewed-by: Matthew Ahrens Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Reviewed-by: Brian Atkinson Signed-off-by: George Amanakis Closes #13816 --- include/sys/dsl_dataset.h | 1 + module/zfs/dmu_send.c | 1 + module/zfs/dsl_dataset.c | 23 ++++++++++++----------- module/zfs/dsl_scan.c | 20 ++++++++++++++++++++ 4 files changed, 34 insertions(+), 11 deletions(-) diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index 3450527af7e0..b0d8c7994c07 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -372,6 +372,7 @@ int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive); int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, minor_t cleanup_minor, const char *htag); +boolean_t zfeature_active(spa_feature_t f, void *arg); blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 7f8de23f0e29..33beb04b19b1 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -493,6 +493,7 @@ dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object, (bp != NULL ? BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && io_compressed : lsize != psize); if (raw || compressed) { + ASSERT(bp != NULL); ASSERT(raw || dscp->dsc_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); ASSERT(!BP_IS_EMBEDDED(bp)); diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 57a58f88cec5..0584a356f9ac 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -1039,7 +1039,7 @@ dsl_dataset_has_owner(dsl_dataset_t *ds) return (rv); } -static boolean_t +boolean_t zfeature_active(spa_feature_t f, void *arg) { switch (spa_feature_table[f].fi_type) { @@ -2121,16 +2121,6 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) } dmu_objset_sync(ds->ds_objset, zio, tx); - - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (zfeature_active(f, ds->ds_feature_activation[f])) { - if (zfeature_active(f, ds->ds_feature[f])) - continue; - dsl_dataset_activate_feature(ds->ds_object, f, - ds->ds_feature_activation[f], tx); - ds->ds_feature[f] = ds->ds_feature_activation[f]; - } - } } /* @@ -2303,6 +2293,17 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx))); dmu_buf_rele(ds->ds_dbuf, ds); + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (zfeature_active(f, + ds->ds_feature_activation[f])) { + if (zfeature_active(f, ds->ds_feature[f])) + continue; + dsl_dataset_activate_feature(ds->ds_object, f, + ds->ds_feature_activation[f], tx); + ds->ds_feature[f] = ds->ds_feature_activation[f]; + } + } } int diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 2cc77eab6275..f971aa211e0c 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2038,6 +2038,26 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, return; } + /* + * Check if this block contradicts any filesystem flags. 
+ */ + spa_feature_t f = SPA_FEATURE_LARGE_BLOCKS; + if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) + ASSERT(dsl_dataset_feature_is_active(ds, f)); + + f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); + if (f != SPA_FEATURE_NONE) + ASSERT(dsl_dataset_feature_is_active(ds, f)); + + f = zio_compress_to_feature(BP_GET_COMPRESS(bp)); + if (f != SPA_FEATURE_NONE) + ASSERT(dsl_dataset_feature_is_active(ds, f)); + + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { + scn->scn_lt_min_this_txg++; + return; + } + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { scn->scn_lt_min_this_txg++; return; From cfd57573ffea911f464aeaf7ba415daf32fe0bd5 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Mon, 13 Feb 2023 19:40:13 -0500 Subject: [PATCH 24/28] quick fix for lingering snapdir unmount problems Unfortunately, even after e79b6807, I still, much more rarely, tripped asserts when playing with many ctldir mounts at once. Since this appears to happen if we dispatched twice too fast, just ignore it. We don't actually need to do anything if someone already started doing it for us. Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #14462 --- module/os/linux/zfs/zfs_ctldir.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index 519f13212fac..cc4bc9702f8e 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -392,7 +392,20 @@ zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay) zfsctl_snapshot_hold(se); rw_enter(&se->se_taskqid_lock, RW_WRITER); - ASSERT3S(se->se_taskqid, ==, TASKQID_INVALID); + /* + * If this condition happens, we managed to: + * - dispatch once + * - want to dispatch _again_ before it returned + * + * So let's just return - if that task fails at unmounting, + * we'll eventually dispatch again, and if it succeeds, + * no problem. + */ + if (se->se_taskqid != TASKQID_INVALID) { + rw_exit(&se->se_taskqid_lock); + zfsctl_snapshot_rele(se); + return; + } se->se_taskqid = taskq_dispatch_delay(system_delay_taskq, snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ); rw_exit(&se->se_taskqid_lock); From ab672133a9bde75d20afd59d8db1405c7300a557 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 14 Feb 2023 14:03:42 -0500 Subject: [PATCH 25/28] Give strlcat() full buffer lengths rather than smaller buffer lengths strlcat() is supposed to be given the length of the destination buffer, including the existing contents. Unfortunately, I had been overzealous when I wrote a51288aabbbc176a8a73a8b3cd56f79607db32cf, since I gave it the length of the destination buffer, minus the existing contents. This likely caused a regression on large strings. On the topic of being overzealous, the use of strlcat() in dmu_send_estimate_fast() was unnecessary because recv_clone_name is a fixed length string. We continue using strlcat() mostly as defensive programming, in case the string length is ever changed, even though it is unnecessary. 
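The difference between the two size arguments is easy to demonstrate
standalone (hypothetical dataset names; on glibc systems strlcat()
comes from libbsd, on FreeBSD it is in libc):

    #include <stdio.h>
    #include <string.h>
    #include <bsd/string.h>    /* strlcat(); link with -lbsd on glibc */

    /*
     * strlcat() wants the total size of the destination buffer and
     * finds the existing string's length itself. Passing sizeof (buf)
     * minus the current length double-subtracts that length, and once
     * the remaining "size" drops to the string length or below,
     * nothing is appended at all.
     */
    int
    main(void)
    {
        char a[24] = "/pool/dataset", b[24] = "/pool/dataset";

        (void) strlcat(a, "@snap", sizeof (a));              /* right */
        (void) strlcat(b, "@snap", sizeof (b) - strlen(b));  /* wrong */
        printf("full size:  '%s'\n", a);    /* "/pool/dataset@snap" */
        printf("shortened:  '%s'\n", b);    /* unchanged */
        return (0);
    }
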
Reviewed-by: Ryan Moeller Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #14476 --- lib/libshare/nfs.c | 2 +- lib/libzfs/libzfs_sendrecv.c | 4 ++-- module/zfs/dmu_send.c | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/libshare/nfs.c b/lib/libshare/nfs.c index 118ad7ef2209..3962c87453d4 100644 --- a/lib/libshare/nfs.c +++ b/lib/libshare/nfs.c @@ -97,7 +97,7 @@ nfs_init_tmpfile(const char *prefix, const char *mdir, struct tmpfile *tmpf) } strlcpy(tmpf->name, prefix, sizeof (tmpf->name)); - strlcat(tmpf->name, ".XXXXXXXX", sizeof (tmpf->name) - strlen(prefix)); + strlcat(tmpf->name, ".XXXXXXXX", sizeof (tmpf->name)); int fd = mkostemp(tmpf->name, O_CLOEXEC); if (fd == -1) { diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 1d2ad1944051..66a22e333663 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -4590,7 +4590,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, B_FALSE, destsnap) == 0) { *strchr(destsnap, '@') = '\0'; (void) strlcat(destsnap, suffix, - sizeof (destsnap) - strlen(destsnap)); + sizeof (destsnap)); } } } else { @@ -4626,7 +4626,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, B_FALSE, destsnap) == 0) { *strchr(destsnap, '@') = '\0'; (void) strlcat(destsnap, snap, - sizeof (destsnap) - strlen(destsnap)); + sizeof (destsnap)); } } } diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 33beb04b19b1..f86a0a5b1c57 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -3029,8 +3029,7 @@ dmu_send_estimate_fast(dsl_dataset_t *origds, dsl_dataset_t *fromds, dsl_dataset_name(origds, dsname); (void) strcat(dsname, "/"); - (void) strlcat(dsname, recv_clone_name, - sizeof (dsname) - strlen(dsname)); + (void) strlcat(dsname, recv_clone_name, sizeof (dsname)); err = dsl_dataset_hold(origds->ds_dir->dd_pool, dsname, FTAG, &ds); From 3fc92adc409a36de229c78c7ca3d4689e9386bd3 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 14 Feb 2023 11:04:34 -0800 Subject: [PATCH 26/28] Linux: use filemap_range_has_page() As of the 4.13 kernel filemap_range_has_page() can be used to check if there is a page mapped in a given file range. When available this interface should be used which eliminates the need for the zp->z_is_mapped boolean. 
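A rough userspace analogue of the new predicate (using mincore(), which
reports page residency for a mapped range; not a ZFS interface): the
old boolean answered "was this file ever mmap'ed?", while a range query
answers "are pages actually resident in this range?":

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static int
    range_has_page(void *addr, size_t len)
    {
        long pagesz = sysconf(_SC_PAGESIZE);
        size_t npages = (len + pagesz - 1) / pagesz;
        unsigned char *vec = malloc(npages);
        int found = 0;

        if (vec != NULL && mincore(addr, len, vec) == 0) {
            for (size_t i = 0; i < npages; i++)
                if (vec[i] & 1)    /* low bit: page is resident */
                    found = 1;
        }
        free(vec);
        return (found);
    }

    int
    main(void)
    {
        long pagesz = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, 4 * pagesz, PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        p[0] = 1;    /* fault in only the first page */
        printf("page 0 resident: %d\n", range_has_page(p, pagesz));
        printf("page 3 resident: %d\n",
            range_has_page(p + 3 * pagesz, pagesz));
        return (0);
    }
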
Reviewed-by: Brian Atkinson Signed-off-by: Brian Behlendorf Closes #14493 --- config/kernel-filemap.m4 | 26 +++++++++++++++++++++ config/kernel.m4 | 2 ++ include/os/freebsd/zfs/sys/zfs_znode_impl.h | 3 ++- include/os/linux/zfs/sys/trace_acl.h | 9 +++---- include/os/linux/zfs/sys/zfs_znode_impl.h | 16 ++++++++++++- include/sys/zfs_znode.h | 1 - module/os/linux/zfs/zfs_ctldir.c | 2 ++ module/os/linux/zfs/zfs_vnops_os.c | 5 ++-- module/os/linux/zfs/zfs_znode.c | 4 +++- module/os/linux/zfs/zpl_file.c | 6 +++-- module/zfs/zfs_vnops.c | 8 ++++--- 11 files changed, 65 insertions(+), 17 deletions(-) create mode 100644 config/kernel-filemap.m4 diff --git a/config/kernel-filemap.m4 b/config/kernel-filemap.m4 new file mode 100644 index 000000000000..745928168f92 --- /dev/null +++ b/config/kernel-filemap.m4 @@ -0,0 +1,26 @@ +dnl # +dnl # filemap_range_has_page was not available till 4.13 +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_FILEMAP], [ + ZFS_LINUX_TEST_SRC([filemap_range_has_page], [ + #include + ],[ + struct address_space *mapping = NULL; + loff_t lstart = 0; + loff_t lend = 0; + bool ret __attribute__ ((unused)); + + ret = filemap_range_has_page(mapping, lstart, lend); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FILEMAP], [ + AC_MSG_CHECKING([whether filemap_range_has_page() is available]) + ZFS_LINUX_TEST_RESULT([filemap_range_has_page], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FILEMAP_RANGE_HAS_PAGE, 1, + [filemap_range_has_page() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 353988e9c867..121d73ef641a 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -150,6 +150,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_USER_NS_COMMON_INUM ZFS_AC_KERNEL_SRC_IDMAP_MNT_API ZFS_AC_KERNEL_SRC_IATTR_VFSID + ZFS_AC_KERNEL_SRC_FILEMAP AC_MSG_CHECKING([for available kernel interfaces]) ZFS_LINUX_TEST_COMPILE_ALL([kabi]) @@ -273,6 +274,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_USER_NS_COMMON_INUM ZFS_AC_KERNEL_IDMAP_MNT_API ZFS_AC_KERNEL_IATTR_VFSID + ZFS_AC_KERNEL_FILEMAP ]) dnl # diff --git a/include/os/freebsd/zfs/sys/zfs_znode_impl.h b/include/os/freebsd/zfs/sys/zfs_znode_impl.h index 41a5bb218c12..8cde33dbcbbb 100644 --- a/include/os/freebsd/zfs/sys/zfs_znode_impl.h +++ b/include/os/freebsd/zfs/sys/zfs_znode_impl.h @@ -116,7 +116,8 @@ typedef struct zfs_soft_state { #define Z_ISLNK(type) ((type) == VLNK) #define Z_ISDIR(type) ((type) == VDIR) -#define zn_has_cached_data(zp) vn_has_cached_data(ZTOV(zp)) +#define zn_has_cached_data(zp, start, end) \ + vn_has_cached_data(ZTOV(zp)) #define zn_flush_cached_data(zp, sync) vn_flush_cached_data(ZTOV(zp), sync) #define zn_rlimit_fsize(zp, uio) \ vn_rlimit_fsize(ZTOV(zp), GET_UIO_STRUCT(uio), zfs_uio_td(uio)) diff --git a/include/os/linux/zfs/sys/trace_acl.h b/include/os/linux/zfs/sys/trace_acl.h index 2c734322267a..15dc77edafac 100644 --- a/include/os/linux/zfs/sys/trace_acl.h +++ b/include/os/linux/zfs/sys/trace_acl.h @@ -62,7 +62,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __field(uint32_t, z_async_writes_cnt) __field(mode_t, z_mode) __field(boolean_t, z_is_sa) - __field(boolean_t, z_is_mapped) __field(boolean_t, z_is_ctldir) __field(uint32_t, i_uid) @@ -96,7 +95,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __entry->z_async_writes_cnt = zn->z_async_writes_cnt; __entry->z_mode = zn->z_mode; __entry->z_is_sa = zn->z_is_sa; - __entry->z_is_mapped = zn->z_is_mapped; __entry->z_is_ctldir = zn->z_is_ctldir; __entry->i_uid = KUID_TO_SUID(ZTOI(zn)->i_uid); @@ -119,7 +117,7 @@ 
DECLARE_EVENT_CLASS(zfs_ace_class, "zn_prefetch %u blksz %u seq %u " "mapcnt %llu size %llu pflags %llu " "sync_cnt %u sync_writes_cnt %u async_writes_cnt %u " - "mode 0x%x is_sa %d is_mapped %d is_ctldir %d " + "mode 0x%x is_sa %d is_ctldir %d " "inode { uid %u gid %u ino %lu nlink %u size %lli " "blkbits %u bytes %u mode 0x%x generation %x } } " "ace { type %u flags %u access_mask %u } mask_matched %u", @@ -128,9 +126,8 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __entry->z_seq, __entry->z_mapcnt, __entry->z_size, __entry->z_pflags, __entry->z_sync_cnt, __entry->z_sync_writes_cnt, __entry->z_async_writes_cnt, - __entry->z_mode, __entry->z_is_sa, __entry->z_is_mapped, - __entry->z_is_ctldir, __entry->i_uid, - __entry->i_gid, __entry->i_ino, __entry->i_nlink, + __entry->z_mode, __entry->z_is_sa, __entry->z_is_ctldir, + __entry->i_uid, __entry->i_gid, __entry->i_ino, __entry->i_nlink, __entry->i_size, __entry->i_blkbits, __entry->i_bytes, __entry->i_mode, __entry->i_generation, __entry->z_type, __entry->z_flags, __entry->z_access_mask, diff --git a/include/os/linux/zfs/sys/zfs_znode_impl.h b/include/os/linux/zfs/sys/zfs_znode_impl.h index 52568781011f..81607ef2a25e 100644 --- a/include/os/linux/zfs/sys/zfs_znode_impl.h +++ b/include/os/linux/zfs/sys/zfs_znode_impl.h @@ -47,9 +47,16 @@ extern "C" { #endif +#if defined(HAVE_FILEMAP_RANGE_HAS_PAGE) #define ZNODE_OS_FIELDS \ inode_timespec_t z_btime; /* creation/birth time (cached) */ \ struct inode z_inode; +#else +#define ZNODE_OS_FIELDS \ + inode_timespec_t z_btime; /* creation/birth time (cached) */ \ + struct inode z_inode; \ + boolean_t z_is_mapped; /* we are mmap'ed */ +#endif /* * Convert between znode pointers and inode pointers @@ -70,7 +77,14 @@ extern "C" { #define Z_ISDEV(type) (S_ISCHR(type) || S_ISBLK(type) || S_ISFIFO(type)) #define Z_ISDIR(type) S_ISDIR(type) -#define zn_has_cached_data(zp) ((zp)->z_is_mapped) +#if defined(HAVE_FILEMAP_RANGE_HAS_PAGE) +#define zn_has_cached_data(zp, start, end) \ + filemap_range_has_page(ZTOI(zp)->i_mapping, start, end) +#else +#define zn_has_cached_data(zp, start, end) \ + ((zp)->z_is_mapped) +#endif + #define zn_flush_cached_data(zp, sync) write_inode_now(ZTOI(zp), sync) #define zn_rlimit_fsize(zp, uio) (0) diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index de38f56dc32d..fcee55b0199d 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -188,7 +188,6 @@ typedef struct znode { boolean_t z_atime_dirty; /* atime needs to be synced */ boolean_t z_zn_prefetch; /* Prefetch znodes? */ boolean_t z_is_sa; /* are we native sa? */ - boolean_t z_is_mapped; /* are we mmap'ed */ boolean_t z_is_ctldir; /* are we .zfs entry */ boolean_t z_suspended; /* extra ref from a suspend? 
*/ uint_t z_blksz; /* block size in bytes */ diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index cc4bc9702f8e..dca48e1e4010 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -498,7 +498,9 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, zp->z_atime_dirty = B_FALSE; zp->z_zn_prefetch = B_FALSE; zp->z_is_sa = B_FALSE; +#if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE) zp->z_is_mapped = B_FALSE; +#endif zp->z_is_ctldir = B_TRUE; zp->z_sa_hdl = NULL; zp->z_blksz = 0; diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 47f132a38abe..302a88c2d353 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -987,7 +987,7 @@ zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) mutex_enter(&zp->z_lock); may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && - !(zp->z_is_mapped); + !zn_has_cached_data(zp, 0, LLONG_MAX); mutex_exit(&zp->z_lock); /* @@ -1075,7 +1075,8 @@ zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); delete_now = may_delete_now && !toobig && atomic_read(&ZTOI(zp)->i_count) == 1 && - !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked && + !zn_has_cached_data(zp, 0, LLONG_MAX) && + xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == acl_obj; } diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index 1faf25d93cc7..7b802a9bace0 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -551,7 +551,9 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, ASSERT3P(zp->z_xattr_cached, ==, NULL); zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; +#if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE) zp->z_is_mapped = B_FALSE; +#endif zp->z_is_ctldir = B_FALSE; zp->z_suspended = B_FALSE; zp->z_sa_hdl = NULL; @@ -1641,7 +1643,7 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) * Zero partial page cache entries. This must be done under a * range lock in order to keep the ARC and page cache in sync. 
*/ - if (zp->z_is_mapped) { + if (zn_has_cached_data(zp, off, off + len - 1)) { loff_t first_page, last_page, page_len; loff_t first_page_offset, last_page_offset; diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index c56e3691e70a..e42b15042a3c 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -625,7 +625,6 @@ static int zpl_mmap(struct file *filp, struct vm_area_struct *vma) { struct inode *ip = filp->f_mapping->host; - znode_t *zp = ITOZ(ip); int error; fstrans_cookie_t cookie; @@ -640,9 +639,12 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma) if (error) return (error); +#if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE) + znode_t *zp = ITOZ(ip); mutex_enter(&zp->z_lock); zp->z_is_mapped = B_TRUE; mutex_exit(&zp->z_lock); +#endif return (error); } @@ -937,7 +939,7 @@ zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice) case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_WILLNEED: #ifdef HAVE_GENERIC_FADVISE - if (zn_has_cached_data(zp)) + if (zn_has_cached_data(zp, offset, offset + len - 1)) error = generic_fadvise(filp, offset, len, advice); #endif /* diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 0c392b9da0fb..10677d8d9947 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -106,7 +106,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) hole = B_FALSE; /* Flush any mmap()'d data to disk */ - if (zn_has_cached_data(zp)) + if (zn_has_cached_data(zp, 0, file_sz - 1)) zn_flush_cached_data(zp, B_FALSE); lr = zfs_rangelock_enter(&zp->z_rangelock, 0, file_sz, RL_READER); @@ -288,7 +288,8 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) error = mappedread_sf(zp, nbytes, uio); else #endif - if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) { + if (zn_has_cached_data(zp, zfs_uio_offset(uio), + zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) { error = mappedread(zp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), @@ -696,7 +697,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) zfs_uioskip(uio, nbytes); tx_bytes = nbytes; } - if (tx_bytes && zn_has_cached_data(zp) && + if (tx_bytes && + zn_has_cached_data(zp, woff, woff + tx_bytes - 1) && !(ioflag & O_DIRECT)) { update_pages(zp, woff, tx_bytes, zfsvfs->z_os); } From f04cb31e7c1775cb6fb1fac716525c7926a14484 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 14 Feb 2023 14:05:41 -0500 Subject: [PATCH 27/28] Suppress Clang static analyzer complaint in zfs_replay_create() Clang's static analyzer incorrectly complains about an undefined value here when lr->lr_common.lrc_txtype == TX_SYMLINK and txtype == TX_CREATE. This is impossible, because of this line: txtype = (lr->lr_common.lrc_txtype & ~TX_CI((uint64_t)0x1 << 63)); Changing the code to compare against txtype suppresses the report. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Richard Yao Closes #14472 --- module/zfs/zfs_replay.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index 0293e46d5858..32be27a8ba6e 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -512,9 +512,9 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) * * The _ATTR versions will grab the fuid info in their subcases. 
*/ - if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK && - (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR && - (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) { + if (txtype != TX_SYMLINK && + txtype != TX_MKDIR_ATTR && + txtype != TX_CREATE_ATTR) { start = (lr + 1); zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, From 57cfae4a2f04aaff10c45b3f7975e0fe3ef3e8b8 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 15 Feb 2023 09:06:29 -0800 Subject: [PATCH 28/28] zdb: zero-pad checksum output follow up Apply zero padding for checksums consistently. The SNPRINTF_BLKPTR macro was not updated in commit ac7648179c8 which results in the `cli_root/zdb/zdb_checksum.ksh` test case reliably failing. Reviewed-by: Igor Kozhukhov Reviewed-by: Akash B Reviewed-by: Brian Atkinson Signed-off-by: Brian Behlendorf Closes #14497 --- include/sys/spa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sys/spa.h b/include/sys/spa.h index 500eb3491a99..c9d03bf645a9 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -678,7 +678,7 @@ typedef struct blkptr { len += func(buf + len, size - len, \ "[L%llu %s] %s %s %s %s %s %s %s%c" \ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ - "cksum=%llx:%llx:%llx:%llx", \ + "cksum=%016llx:%016llx:%016llx:%016llx", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ checksum, \