From c4dff1c2563b495ebe757f50c160425a576d9abf Mon Sep 17 00:00:00 2001
From: Andriy Gapon
Date: Wed, 16 Oct 2019 06:11:11 +0000
Subject: [PATCH 1/2] 10230 zfs mishandles partial writes

illumos/illumos-gate@b0ef425652e5cfce27df9fa5826a9cd64cee110a
https://github.com/illumos/illumos-gate/commit/b0ef425652e5cfce27df9fa5826a9cd64cee110a

https://www.illumos.org/issues/10230
  The trinity fuzzer calls pwritev with an iovec in which one or more
  entries point to some initial valid data and the remaining entries
  point to addresses that are not mapped. This yields EFAULT once the
  write hits the invalid address, but some amount of data is
  successfully written first. The zfs_write code does not handle this
  properly: it loses track of the error returned by dmu_write_uio_dbuf,
  and it has an invalid ASSERT that does not account for the
  partial-write case.

Author: Jerry Jelinek

(A user-level sketch of the failure mode follows this patch.)
---
 uts/common/fs/zfs/zfs_vnops.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/uts/common/fs/zfs/zfs_vnops.c b/uts/common/fs/zfs/zfs_vnops.c
index c57982d9b4a2..a68fc3dd3417 100644
--- a/uts/common/fs/zfs/zfs_vnops.c
+++ b/uts/common/fs/zfs/zfs_vnops.c
@@ -23,7 +23,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
  * Copyright 2017 Nexenta Systems, Inc.
  */
 
@@ -665,6 +665,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 	ssize_t	n, nbytes;
 	int	max_blksz = zfsvfs->z_max_blksz;
 	int	error = 0;
+	int	prev_error;
 	arc_buf_t	*abuf;
 	iovec_t	*aiov = NULL;
 	xuio_t	*xuio = NULL;
@@ -972,7 +973,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 		while ((end_size = zp->z_size) < uio->uio_loffset) {
 			(void) atomic_cas_64(&zp->z_size, end_size,
 			    uio->uio_loffset);
-			ASSERT(error == 0);
 		}
 		/*
 		 * If we are replaying and eof is non zero then force
@@ -982,12 +982,17 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
 			zp->z_size = zfsvfs->z_replay_eof;
 
+		/*
+		 * Keep track of a possible pre-existing error from a partial
+		 * write via dmu_write_uio_dbuf above.
+		 */
+		prev_error = error;
 		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 
 		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
 		dmu_tx_commit(tx);
 
-		if (error != 0)
+		if (prev_error != 0 || error != 0)
 			break;
 		ASSERT(tx_bytes == nbytes);
 		n -= nbytes;
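
The failure mode fixed above is easy to reproduce from user space. Below is a
minimal sketch (not part of the patch; the demo file path, page layout, and
output format are illustrative assumptions). It builds an iovec whose buffer
starts in mapped memory and runs into an unmapped page, so the kernel moves
some bytes and then faults mid-transfer. Depending on the platform, pwritev
either returns a short count or fails with EFAULT even though part of the data
already reached the file; either way the filesystem must report the real error
without tripping over the partially completed write.

#include <sys/mman.h>
#include <sys/uio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	long pagesz = sysconf(_SC_PAGESIZE);

	/* Map two pages, then unmap the second to create a hole. */
	char *buf = mmap(NULL, 2 * pagesz, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, 0);
	if (buf == MAP_FAILED)
		return (1);
	(void) memset(buf, 'A', pagesz);
	(void) munmap(buf + pagesz, pagesz);

	int fd = open("/tmp/partial-write-demo", O_CREAT | O_RDWR, 0644);
	if (fd == -1)
		return (1);

	/* One entry: starts in valid memory, runs into the hole. */
	struct iovec iov = { .iov_base = buf, .iov_len = 2 * pagesz };
	ssize_t n = pwritev(fd, &iov, 1, 0);
	if (n == -1)
		(void) printf("pwritev: %s\n", strerror(errno));
	else
		(void) printf("wrote %zd of %ld bytes\n", n, 2 * pagesz);

	/*
	 * Either way, up to one page of data may now be in the file.
	 * Before the fix above, zfs_write could overwrite the EFAULT
	 * from dmu_write_uio_dbuf with sa_bulk_update's return value
	 * and trip ASSERT(error == 0) on the partial write.
	 */
	(void) close(fd);
	return (0);
}

With the fix, zfs_write saves the dmu_write_uio_dbuf error in prev_error
before sa_bulk_update can clobber it, still commits the transaction for the
bytes that did land, and then breaks out of the write loop.
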
From b7cab79de23a8bfb689bfb5bddb4deb2e09b360d Mon Sep 17 00:00:00 2001
From: Andriy Gapon
Date: Wed, 16 Oct 2019 06:18:37 +0000
Subject: [PATCH 2/2] 10330 merge recent ZoL vdev and metaslab changes

illumos/illumos-gate@a0b03b161c4df3cfc54fbc741db09b3bdc23ffba
https://github.com/illumos/illumos-gate/commit/a0b03b161c4df3cfc54fbc741db09b3bdc23ffba

https://www.illumos.org/issues/10330
  Three recent ZoL changes to the vdev and metaslab code that we can
  pull over:

  PR 8324 c853f382db 8324 Change target size of metaslabs from 256GB to 16GB
  PR 8290 b194fab0fb 8290 Factor metaslab_load_wait() in metaslab_load()
  PR 8286 419ba59145 8286 Update vdev_is_spacemap_addressable() for new spacemap encoding

Author: Serapheim Dimitropoulos

(Stand-alone sketches of all three changes follow the patch series.)
---
 cmd/zdb/zdb.c                         |  7 +--
 uts/common/fs/zfs/metaslab.c          | 82 ++++++++++++++-----------
 uts/common/fs/zfs/sys/metaslab.h      |  1 -
 uts/common/fs/zfs/sys/metaslab_impl.h |  4 +-
 uts/common/fs/zfs/vdev.c              | 88 ++++++++++++++++-----------
 uts/common/fs/zfs/vdev_initialize.c   | 18 ++----
 6 files changed, 108 insertions(+), 92 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 3b56a34fb707..65ef35f171a8 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -901,11 +901,8 @@ dump_metaslab(metaslab_t *msp)
 
 	if (dump_opt['m'] > 2 && !dump_opt['L']) {
 		mutex_enter(&msp->ms_lock);
-		metaslab_load_wait(msp);
-		if (!msp->ms_loaded) {
-			VERIFY0(metaslab_load(msp));
-			range_tree_stat_verify(msp->ms_allocatable);
-		}
+		VERIFY0(metaslab_load(msp));
+		range_tree_stat_verify(msp->ms_allocatable);
 		dump_metaslab_stats(msp);
 		metaslab_unload(msp);
 		mutex_exit(&msp->ms_lock);
diff --git a/uts/common/fs/zfs/metaslab.c b/uts/common/fs/zfs/metaslab.c
index 1839bf79483c..44fc2b0e8604 100644
--- a/uts/common/fs/zfs/metaslab.c
+++ b/uts/common/fs/zfs/metaslab.c
@@ -1456,7 +1456,7 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 /*
  * Wait for any in-progress metaslab loads to complete.
  */
-void
+static void
 metaslab_load_wait(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -1467,20 +1467,17 @@ metaslab_load_wait(metaslab_t *msp)
 	}
 }
 
-int
-metaslab_load(metaslab_t *msp)
+static int
+metaslab_load_impl(metaslab_t *msp)
 {
 	int error = 0;
-	boolean_t success = B_FALSE;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
-	ASSERT(!msp->ms_loaded);
-	ASSERT(!msp->ms_loading);
+	ASSERT(msp->ms_loading);
 
-	msp->ms_loading = B_TRUE;
 	/*
 	 * Nobody else can manipulate a loading metaslab, so it's now safe
-	 * to drop the lock. This way we don't have to hold the lock while
+	 * to drop the lock.  This way we don't have to hold the lock while
 	 * reading the spacemap from disk.
 	 */
 	mutex_exit(&msp->ms_lock);
@@ -1497,29 +1494,49 @@ metaslab_load(metaslab_t *msp)
 		    msp->ms_start, msp->ms_size);
 	}
 
-	success = (error == 0);
-
 	mutex_enter(&msp->ms_lock);
-	msp->ms_loading = B_FALSE;
 
-	if (success) {
-		ASSERT3P(msp->ms_group, !=, NULL);
-		msp->ms_loaded = B_TRUE;
+	if (error != 0)
+		return (error);
 
-		/*
-		 * If the metaslab already has a spacemap, then we need to
-		 * remove all segments from the defer tree; otherwise, the
-		 * metaslab is completely empty and we can skip this.
-		 */
-		if (msp->ms_sm != NULL) {
-			for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-				range_tree_walk(msp->ms_defer[t],
-				    range_tree_remove, msp->ms_allocatable);
-			}
+	ASSERT3P(msp->ms_group, !=, NULL);
+	msp->ms_loaded = B_TRUE;
+
+	/*
+	 * If the metaslab already has a spacemap, then we need to
+	 * remove all segments from the defer tree; otherwise, the
+	 * metaslab is completely empty and we can skip this.
+	 */
+	if (msp->ms_sm != NULL) {
+		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+			range_tree_walk(msp->ms_defer[t],
+			    range_tree_remove, msp->ms_allocatable);
 		}
-		msp->ms_max_size = metaslab_block_maxsize(msp);
 	}
+	msp->ms_max_size = metaslab_block_maxsize(msp);
+
+	return (0);
+}
+
+int
+metaslab_load(metaslab_t *msp)
+{
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	/*
+	 * There may be another thread loading the same metaslab, if that's
+	 * the case just wait until the other thread is done and return.
+	 */
+	metaslab_load_wait(msp);
+	if (msp->ms_loaded)
+		return (0);
+	VERIFY(!msp->ms_loading);
+
+	msp->ms_loading = B_TRUE;
+	int error = metaslab_load_impl(msp);
+	msp->ms_loading = B_FALSE;
 	cv_broadcast(&msp->ms_load_cv);
+
 	return (error);
 }
@@ -2078,13 +2095,10 @@ metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
-		int error = 0;
-		metaslab_load_wait(msp);
-		if (!msp->ms_loaded) {
-			if ((error = metaslab_load(msp)) != 0) {
-				metaslab_group_sort(msp->ms_group, msp, 0);
-				return (error);
-			}
+		int error = metaslab_load(msp);
+		if (error != 0) {
+			metaslab_group_sort(msp->ms_group, msp, 0);
+			return (error);
 		}
 		if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
 			/*
@@ -2196,9 +2210,7 @@ metaslab_preload(void *arg)
 	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
 
 	mutex_enter(&msp->ms_lock);
-	metaslab_load_wait(msp);
-	if (!msp->ms_loaded)
-		(void) metaslab_load(msp);
+	(void) metaslab_load(msp);
 	msp->ms_selected_txg = spa_syncing_txg(spa);
 	mutex_exit(&msp->ms_lock);
 }
diff --git a/uts/common/fs/zfs/sys/metaslab.h b/uts/common/fs/zfs/sys/metaslab.h
index 32a19ca64584..f0c68c77fc06 100644
--- a/uts/common/fs/zfs/sys/metaslab.h
+++ b/uts/common/fs/zfs/sys/metaslab.h
@@ -48,7 +48,6 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
     metaslab_t **);
 void metaslab_fini(metaslab_t *);
 
-void metaslab_load_wait(metaslab_t *);
 int metaslab_load(metaslab_t *);
 void metaslab_unload(metaslab_t *);
 
diff --git a/uts/common/fs/zfs/sys/metaslab_impl.h b/uts/common/fs/zfs/sys/metaslab_impl.h
index fe09c274a958..a2c8e6051772 100644
--- a/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -369,8 +369,8 @@ struct metaslab {
 	uint64_t	ms_initializing; /* leaves initializing this ms */
 
 	/*
-	 * We must hold both ms_lock and ms_group->mg_lock in order to
-	 * modify ms_loaded.
+	 * We must always hold the ms_lock when modifying ms_loaded
+	 * and ms_loading.
 	 */
 	boolean_t	ms_loaded;
 	boolean_t	ms_loading;
diff --git a/uts/common/fs/zfs/vdev.c b/uts/common/fs/zfs/vdev.c
index 27d1dcf97f5b..5441c9059d28 100644
--- a/uts/common/fs/zfs/vdev.c
+++ b/uts/common/fs/zfs/vdev.c
@@ -72,20 +72,20 @@ static vdev_ops_t *vdev_ops_table[] = {
 /* maximum scrub/resilver I/O queue per leaf vdev */
 int zfs_scrub_limit = 10;
 
-/* target number of metaslabs per top-level vdev */
-int vdev_max_ms_count = 200;
+/* default target for number of metaslabs per top-level vdev */
+int zfs_vdev_default_ms_count = 200;
 
 /* minimum number of metaslabs per top-level vdev */
-int vdev_min_ms_count = 16;
+int zfs_vdev_min_ms_count = 16;
 
 /* practical upper limit of total metaslabs per top-level vdev */
-int vdev_ms_count_limit = 1ULL << 17;
+int zfs_vdev_ms_count_limit = 1ULL << 17;
 
 /* lower limit for metaslab size (512M) */
-int vdev_default_ms_shift = 29;
+int zfs_vdev_default_ms_shift = 29;
 
-/* upper limit for metaslab size (256G) */
-int vdev_max_ms_shift = 38;
+/* upper limit for metaslab size (16G) */
+int zfs_vdev_max_ms_shift = 34;
 
 boolean_t vdev_validate_skip = B_FALSE;
 
@@ -2034,16 +2034,24 @@ void
 vdev_metaslab_set_size(vdev_t *vd)
 {
 	uint64_t asize = vd->vdev_asize;
-	uint64_t ms_count = asize >> vdev_default_ms_shift;
+	uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
 	uint64_t ms_shift;
 
 	/*
 	 * There are two dimensions to the metaslab sizing calculation:
 	 * the size of the metaslab and the count of metaslabs per vdev.
-	 * In general, we aim for vdev_max_ms_count (200) metaslabs. The
-	 * range of the dimensions are as follows:
 	 *
-	 * 2^29 <= ms_size <= 2^38
+	 * The default values used below are a good balance between memory
+	 * usage (larger metaslab size means more memory needed for loaded
+	 * metaslabs; more metaslabs means more memory needed for the
+	 * metaslab_t structs), metaslab load time (larger metaslabs take
+	 * longer to load), and metaslab sync time (more metaslabs means
+	 * more time spent syncing all of them).
+	 *
+	 * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
+	 * The range of the dimensions are as follows:
+	 *
+	 * 2^29 <= ms_size <= 2^34
 	 * 16 <= ms_count <= 131,072
 	 *
 	 * On the lower end of vdev sizes, we aim for metaslabs sizes of
 	 * of at least 16 metaslabs will override this minimum size goal.
 	 *
 	 * On the upper end of vdev sizes, we aim for a maximum metaslab
-	 * size of 256GB. However, we will cap the total count to 2^17
-	 * metaslabs to keep our memory footprint in check.
+	 * size of 16GB. However, we will cap the total count to 2^17
+	 * metaslabs to keep our memory footprint in check and let the
+	 * metaslab size grow from there if that limit is hit.
 	 *
 	 * The net effect of applying above constrains is summarized below.
 	 *
-	 * vdev size	metaslab count
-	 * -------------|-----------------
-	 * < 8GB	~16
-	 * 8GB - 100GB	one per 512MB
-	 * 100GB - 50TB	~200
-	 * 50TB - 32PB	one per 256GB
-	 * > 32PB	~131,072
-	 * -------------------------------
+	 * vdev size	metaslab count
+	 * --------------|-----------------
+	 * < 8GB	~16
+	 * 8GB - 100GB	one per 512MB
+	 * 100GB - 3TB	~200
+	 * 3TB - 2PB	one per 16GB
+	 * > 2PB	~131,072
+	 * --------------------------------
+	 *
+	 * Finally, note that all of the above calculate the initial
+	 * number of metaslabs. Expanding a top-level vdev will result
+	 * in additional metaslabs being allocated making it possible
+	 * to exceed the zfs_vdev_ms_count_limit.
 	 */
-	if (ms_count < vdev_min_ms_count)
-		ms_shift = highbit64(asize / vdev_min_ms_count);
-	else if (ms_count > vdev_max_ms_count)
-		ms_shift = highbit64(asize / vdev_max_ms_count);
+	if (ms_count < zfs_vdev_min_ms_count)
+		ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
+	else if (ms_count > zfs_vdev_default_ms_count)
+		ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
 	else
-		ms_shift = vdev_default_ms_shift;
+		ms_shift = zfs_vdev_default_ms_shift;
 
 	if (ms_shift < SPA_MAXBLOCKSHIFT) {
 		ms_shift = SPA_MAXBLOCKSHIFT;
-	} else if (ms_shift > vdev_max_ms_shift) {
-		ms_shift = vdev_max_ms_shift;
+	} else if (ms_shift > zfs_vdev_max_ms_shift) {
+		ms_shift = zfs_vdev_max_ms_shift;
 		/* cap the total count to constrain memory footprint */
-		if ((asize >> ms_shift) > vdev_ms_count_limit)
-			ms_shift = highbit64(asize / vdev_ms_count_limit);
+		if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
+			ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
 	}
 
 	vd->vdev_ms_shift = ms_shift;
@@ -3386,13 +3400,17 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
 boolean_t
 vdev_is_spacemap_addressable(vdev_t *vd)
 {
+	if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
+		return (B_TRUE);
+
 	/*
-	 * Assuming 47 bits of the space map entry dedicated for the entry's
-	 * offset (see description in space_map.h), we calculate the maximum
-	 * address that can be described by a space map entry for the given
-	 * device.
+	 * If double-word space map entries are not enabled we assume
+	 * 47 bits of the space map entry are dedicated to the entry's
+	 * offset (see SM_OFFSET_BITS in space_map.h). We then use that
+	 * to calculate the maximum address that can be described by a
+	 * space map entry for the given device.
 	 */
-	uint64_t shift = vd->vdev_ashift + 47;
+	uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
 
 	if (shift >= 63) /* detect potential overflow */
 		return (B_TRUE);
 
diff --git a/uts/common/fs/zfs/vdev_initialize.c b/uts/common/fs/zfs/vdev_initialize.c
index 559c0153d6cc..bf246cd8ddcf 100644
--- a/uts/common/fs/zfs/vdev_initialize.c
+++ b/uts/common/fs/zfs/vdev_initialize.c
@@ -352,16 +352,6 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
 	return (0);
 }
 
-static void
-vdev_initialize_ms_load(metaslab_t *msp)
-{
-	ASSERT(MUTEX_HELD(&msp->ms_lock));
-
-	metaslab_load_wait(msp);
-	if (!msp->ms_loaded)
-		VERIFY0(metaslab_load(msp));
-}
-
 static void
 vdev_initialize_mg_wait(metaslab_group_t *mg)
 {
@@ -484,10 +474,10 @@ vdev_initialize_calculate_progress(vdev_t *vd)
 	 * metaslab. Load it and walk the free tree for more accurate
 	 * progress estimation.
	 */
-	vdev_initialize_ms_load(msp);
+	VERIFY0(metaslab_load(msp));
 
-	for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs;
-	    rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
+	for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
+	    rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
 		logical_rs.rs_start = rs->rs_start;
 		logical_rs.rs_end = rs->rs_end;
 		vdev_xlate(vd, &logical_rs, &physical_rs);
@@ -615,7 +605,7 @@ vdev_initialize_thread(void *arg)
 		vdev_initialize_ms_mark(msp);
 		mutex_enter(&msp->ms_lock);
 
-		vdev_initialize_ms_load(msp);
+		VERIFY0(metaslab_load(msp));
 
 		range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
 		    vd);
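
A note on PR 8290 (b194fab0fb): folding metaslab_load_wait() into
metaslab_load() turns every call site into the classic load-once pattern:
waiters sleep on a condition variable while a single loader holds an
in-progress flag, does the expensive work with the lock dropped, and
broadcasts when done. The stand-alone pthreads sketch below restates that
pattern; the loader_* names are invented for illustration and none of this
is ZFS code.

#include <pthread.h>

typedef struct loader {
	pthread_mutex_t	l_lock;
	pthread_cond_t	l_cv;
	int		l_loaded;	/* analogous to ms_loaded */
	int		l_loading;	/* analogous to ms_loading */
} loader_t;

/* Expensive part, done with the lock dropped (like metaslab_load_impl). */
static int
loader_impl(loader_t *l)
{
	/* Nobody else can touch a loading object, so drop the lock. */
	pthread_mutex_unlock(&l->l_lock);
	int error = 0;	/* ... expensive read from disk would go here ... */
	pthread_mutex_lock(&l->l_lock);
	if (error != 0)
		return (error);
	l->l_loaded = 1;
	return (0);
}

/* Callers just call loader_load(); the wait is folded in. */
static int
loader_load(loader_t *l)
{
	/* Caller must hold l_lock, as metaslab_load() asserts ms_lock. */
	while (l->l_loading)
		pthread_cond_wait(&l->l_cv, &l->l_lock);
	if (l->l_loaded)
		return (0);

	l->l_loading = 1;
	int error = loader_impl(l);
	l->l_loading = 0;
	pthread_cond_broadcast(&l->l_cv);
	return (error);
}

int
main(void)
{
	loader_t l = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0
	};

	pthread_mutex_lock(&l.l_lock);
	int error = loader_load(&l);	/* first call loads */
	(void) loader_load(&l);		/* second call returns immediately */
	pthread_mutex_unlock(&l.l_lock);
	return (error);
}
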
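A note on PR 8324 (c853f382db): the sizing rules in vdev_metaslab_set_size()
are easy to sanity-check with a worked example. The sketch below copies the
constants and clamping logic from the patch into a stand-alone program;
highbit64() is reimplemented here for illustration, and SPA_MAXBLOCKSHIFT is
24 (16MB) as in the real headers. For a 10TB vdev, asize/200 is about 55GB,
whose highbit64 of 36 exceeds the new zfs_vdev_max_ms_shift of 34, so the
metaslab size clamps to 16GB and the vdev gets ~640 metaslabs, matching the
"3TB - 2PB: one per 16GB" row of the table above.

#include <stdint.h>
#include <stdio.h>

#define	SPA_MAXBLOCKSHIFT	24	/* 16MB, as in the real headers */

static const int zfs_vdev_min_ms_count = 16;
static const int zfs_vdev_default_ms_count = 200;
static const int zfs_vdev_ms_count_limit = 1 << 17;
static const int zfs_vdev_default_ms_shift = 29;	/* 512MB */
static const int zfs_vdev_max_ms_shift = 34;		/* 16GB */

/* Index of the highest set bit plus one, like illumos highbit64(). */
static int
highbit64(uint64_t i)
{
	int h = 0;

	while (i != 0) {
		h++;
		i >>= 1;
	}
	return (h);
}

static uint64_t
ms_shift_for(uint64_t asize)
{
	uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
	uint64_t ms_shift;

	if (ms_count < zfs_vdev_min_ms_count)
		ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
	else if (ms_count > zfs_vdev_default_ms_count)
		ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
	else
		ms_shift = zfs_vdev_default_ms_shift;

	if (ms_shift < SPA_MAXBLOCKSHIFT) {
		ms_shift = SPA_MAXBLOCKSHIFT;
	} else if (ms_shift > zfs_vdev_max_ms_shift) {
		ms_shift = zfs_vdev_max_ms_shift;
		/* cap the total count to constrain memory footprint */
		if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
			ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
	}
	return (ms_shift);
}

int
main(void)
{
	/* A 10TB vdev: 10TB/200 = ~55GB, clamped to 2^34 = 16GB, ~640 slabs */
	uint64_t asize = 10ULL << 40;
	uint64_t shift = ms_shift_for(asize);

	(void) printf("ms_size 2^%llu, ~%llu metaslabs\n",
	    (unsigned long long)shift,
	    (unsigned long long)(asize >> shift));
	return (0);
}
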
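A note on PR 8286 (419ba59145): vdev_is_spacemap_addressable() is simple
arithmetic. A one-word space map entry carries SM_OFFSET_BITS (47) bits of
offset in units of the vdev's sector size, so a vdev with ashift 12 (4K
sectors) can describe offsets up to 2^(12+47) = 2^59 bytes; double-word
entries carry 63 offset bits, which is why an active SPACEMAP_V2 feature
short-circuits to B_TRUE. The sketch below restates the check outside the
kernel; the final comparison against asize is paraphrased from the
surrounding code, not quoted from the hunk above.

#include <stdint.h>
#include <stdio.h>

#define	SM_OFFSET_BITS	47	/* one-word entry offset bits (space_map.h) */

typedef int boolean_t;
#define	B_TRUE	1
#define	B_FALSE	0

/* Mirrors the patched check, minus the feature-flag short circuit. */
static boolean_t
spacemap_addressable(uint64_t ashift, uint64_t asize)
{
	uint64_t shift = ashift + SM_OFFSET_BITS;

	if (shift >= 63)	/* detect potential overflow */
		return (B_TRUE);
	return (asize < (1ULL << shift));
}

int
main(void)
{
	/* 4K-sector vdev: max addressable offset is 2^(12+47) = 2^59. */
	(void) printf("1PB vdev addressable: %d\n",
	    spacemap_addressable(12, 1ULL << 50));
	return (0);
}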