MFC r353611: 10330 merge recent ZoL vdev and metaslab changes

illumos/illumos-gate@a0b03b161c

https://www.illumos.org/issues/10330
  3 recent ZoL changes in the vdev and metaslab code which we can pull over:
  PR 8324 c853f382db 8324 Change target size of metaslabs from 256GB to 16GB
  PR 8290 b194fab0fb 8290 Factor metaslab_load_wait() in metaslab_load()
  PR 8286 419ba59145 8286 Update vdev_is_spacemap_addressable() for new spacemap
  encoding

Author: Serapheim Dimitropoulos <serapheimd@gmail.com>
Obtained from:	illumos, ZoL
MFC after:	2 weeks
commit 6cb9ab2bad
Committed by:	Andriy Gapon
Commit date:	2019-10-16 06:26:51 +00:00
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=353612
6 changed files with 114 additions and 98 deletions

cddl/contrib/opensolaris/cmd/zdb/zdb.c

@@ -901,11 +901,8 @@ dump_metaslab(metaslab_t *msp)
 
 	if (dump_opt['m'] > 2 && !dump_opt['L']) {
 		mutex_enter(&msp->ms_lock);
-		metaslab_load_wait(msp);
-		if (!msp->ms_loaded) {
-			VERIFY0(metaslab_load(msp));
-			range_tree_stat_verify(msp->ms_allocatable);
-		}
+		VERIFY0(metaslab_load(msp));
+		range_tree_stat_verify(msp->ms_allocatable);
 		dump_metaslab_stats(msp);
 		metaslab_unload(msp);
 		mutex_exit(&msp->ms_lock);

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c

@@ -1468,7 +1468,7 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 /*
  * Wait for any in-progress metaslab loads to complete.
  */
-void
+static void
 metaslab_load_wait(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -1479,20 +1479,17 @@ metaslab_load_wait(metaslab_t *msp)
 	}
 }
 
-int
-metaslab_load(metaslab_t *msp)
+static int
+metaslab_load_impl(metaslab_t *msp)
 {
 	int error = 0;
-	boolean_t success = B_FALSE;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
-	ASSERT(!msp->ms_loaded);
-	ASSERT(!msp->ms_loading);
+	ASSERT(msp->ms_loading);
 
-	msp->ms_loading = B_TRUE;
 	/*
 	 * Nobody else can manipulate a loading metaslab, so it's now safe
-	 * to drop the lock. This way we don't have to hold the lock while
+	 * to drop the lock.  This way we don't have to hold the lock while
 	 * reading the spacemap from disk.
 	 */
 	mutex_exit(&msp->ms_lock);
@@ -1509,29 +1506,49 @@ metaslab_load(metaslab_t *msp)
 		    msp->ms_start, msp->ms_size);
 	}
 
-	success = (error == 0);
-
 	mutex_enter(&msp->ms_lock);
-	msp->ms_loading = B_FALSE;
 
-	if (success) {
-		ASSERT3P(msp->ms_group, !=, NULL);
-		msp->ms_loaded = B_TRUE;
-
-		/*
-		 * If the metaslab already has a spacemap, then we need to
-		 * remove all segments from the defer tree; otherwise, the
-		 * metaslab is completely empty and we can skip this.
-		 */
-		if (msp->ms_sm != NULL) {
-			for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-				range_tree_walk(msp->ms_defer[t],
-				    range_tree_remove, msp->ms_allocatable);
-			}
-		}
-		msp->ms_max_size = metaslab_block_maxsize(msp);
-	}
+	if (error != 0)
+		return (error);
+
+	ASSERT3P(msp->ms_group, !=, NULL);
+	msp->ms_loaded = B_TRUE;
+
+	/*
+	 * If the metaslab already has a spacemap, then we need to
+	 * remove all segments from the defer tree; otherwise, the
+	 * metaslab is completely empty and we can skip this.
+	 */
+	if (msp->ms_sm != NULL) {
+		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+			range_tree_walk(msp->ms_defer[t],
+			    range_tree_remove, msp->ms_allocatable);
+		}
+	}
+
+	msp->ms_max_size = metaslab_block_maxsize(msp);
+
+	return (0);
+}
+
+int
+metaslab_load(metaslab_t *msp)
+{
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	/*
+	 * There may be another thread loading the same metaslab, if that's
+	 * the case just wait until the other thread is done and return.
+	 */
+	metaslab_load_wait(msp);
+	if (msp->ms_loaded)
+		return (0);
+	VERIFY(!msp->ms_loading);
+
+	msp->ms_loading = B_TRUE;
+	int error = metaslab_load_impl(msp);
+	msp->ms_loading = B_FALSE;
 	cv_broadcast(&msp->ms_load_cv);
 	return (error);
 }
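
The point of the 8290 factoring above is that metaslab_load() is now idempotent and safe against concurrent callers: it waits out any in-flight load, returns immediately if the metaslab is already loaded, and otherwise marks ms_loading and delegates to metaslab_load_impl(). The remaining hunks in this file are call sites shedding the old wait-then-check boilerplate. A minimal userland sketch of the same flag-plus-condvar pattern, using POSIX threads in place of the kernel mutex/cv primitives (slab_t and its fields are illustrative stand-ins, not the real metaslab_t):

	#include <assert.h>
	#include <pthread.h>

	typedef struct slab {
		pthread_mutex_t	lock;
		pthread_cond_t	load_cv;
		int		loaded;
		int		loading;
	} slab_t;

	/* Slow part, like metaslab_load_impl(): runs with the lock dropped. */
	static int
	slab_load_impl(slab_t *s)
	{
		assert(s->loading && !s->loaded);
		pthread_mutex_unlock(&s->lock);
		/* ... expensive read from disk, no lock held ... */
		pthread_mutex_lock(&s->lock);
		s->loaded = 1;
		return (0);
	}

	/* Like the new metaslab_load(): idempotent, caller holds s->lock. */
	static int
	slab_load(slab_t *s)
	{
		while (s->loading)		/* the metaslab_load_wait() part */
			pthread_cond_wait(&s->load_cv, &s->lock);
		if (s->loaded)			/* another thread already loaded it */
			return (0);
		assert(!s->loading);

		s->loading = 1;
		int error = slab_load_impl(s);
		s->loading = 0;
		pthread_cond_broadcast(&s->load_cv);
		return (error);
	}

	int
	main(void)
	{
		slab_t s = {
			PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0
		};

		pthread_mutex_lock(&s.lock);
		assert(slab_load(&s) == 0);	/* does the work */
		assert(slab_load(&s) == 0);	/* immediate: already loaded */
		pthread_mutex_unlock(&s.lock);
		return (0);
	}

Setting the loading flag before dropping the lock is what makes it safe for slab_load_impl() to release it: any concurrent caller parks in the while loop until the broadcast.
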
@@ -2091,13 +2108,10 @@ metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
-		int error = 0;
-		metaslab_load_wait(msp);
-		if (!msp->ms_loaded) {
-			if ((error = metaslab_load(msp)) != 0) {
-				metaslab_group_sort(msp->ms_group, msp, 0);
-				return (error);
-			}
+		int error = metaslab_load(msp);
+		if (error != 0) {
+			metaslab_group_sort(msp->ms_group, msp, 0);
+			return (error);
 		}
 
 		if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
 			/*
@@ -2209,9 +2223,7 @@ metaslab_preload(void *arg)
 	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
 
 	mutex_enter(&msp->ms_lock);
-	metaslab_load_wait(msp);
-	if (!msp->ms_loaded)
-		(void) metaslab_load(msp);
+	(void) metaslab_load(msp);
 	msp->ms_selected_txg = spa_syncing_txg(spa);
 	mutex_exit(&msp->ms_lock);
 }

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h

@@ -48,7 +48,6 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
     metaslab_t **);
 void metaslab_fini(metaslab_t *);
 
-void metaslab_load_wait(metaslab_t *);
 int metaslab_load(metaslab_t *);
 void metaslab_unload(metaslab_t *);

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h

@@ -370,8 +370,8 @@ struct metaslab {
 	uint64_t	ms_initializing; /* leaves initializing this ms */
 
 	/*
-	 * We must hold both ms_lock and ms_group->mg_lock in order to
-	 * modify ms_loaded.
+	 * We must always hold the ms_lock when modifying ms_loaded
+	 * and ms_loading.
 	 */
 	boolean_t	ms_loaded;
 	boolean_t	ms_loading;

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c

@@ -163,34 +163,34 @@ static vdev_ops_t *vdev_ops_table[] = {
 };
 
-/* target number of metaslabs per top-level vdev */
-int vdev_max_ms_count = 200;
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count, CTLFLAG_RWTUN,
-    &vdev_max_ms_count, 0,
+/* default target for number of metaslabs per top-level vdev */
+int zfs_vdev_default_ms_count = 200;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_count, CTLFLAG_RWTUN,
+    &zfs_vdev_default_ms_count, 0,
     "Target number of metaslabs per top-level vdev");
 
 /* minimum number of metaslabs per top-level vdev */
-int vdev_min_ms_count = 16;
+int zfs_vdev_min_ms_count = 16;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, CTLFLAG_RWTUN,
-    &vdev_min_ms_count, 0,
+    &zfs_vdev_min_ms_count, 0,
     "Minimum number of metaslabs per top-level vdev");
 
 /* practical upper limit of total metaslabs per top-level vdev */
-int vdev_ms_count_limit = 1ULL << 17;
+int zfs_vdev_ms_count_limit = 1ULL << 17;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count_limit, CTLFLAG_RWTUN,
-    &vdev_ms_count_limit, 0,
+    &zfs_vdev_ms_count_limit, 0,
     "Maximum number of metaslabs per top-level vdev");
 
 /* lower limit for metaslab size (512M) */
-int vdev_default_ms_shift = 29;
+int zfs_vdev_default_ms_shift = 29;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RWTUN,
-    &vdev_default_ms_shift, 0,
+    &zfs_vdev_default_ms_shift, 0,
     "Default shift between vdev size and number of metaslabs");
 
-/* upper limit for metaslab size (256G) */
-int vdev_max_ms_shift = 38;
+/* upper limit for metaslab size (16G) */
+int zfs_vdev_max_ms_shift = 34;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_shift, CTLFLAG_RWTUN,
-    &vdev_max_ms_shift, 0,
+    &zfs_vdev_max_ms_shift, 0,
     "Maximum shift between vdev size and number of metaslabs");
 
 boolean_t vdev_validate_skip = B_FALSE;
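
Note the renames above: the C identifiers gain the zfs_ prefix to match ZoL, and the old vfs.zfs.vdev.max_ms_count OID becomes vfs.zfs.vdev.default_ms_count. As a quick sanity check on a kernel with this change, a tunable can be read back from userland with sysctlbyname(3); a small sketch (the OID follows from the SYSCTL_INT() declaration above):

	#include <sys/types.h>
	#include <sys/sysctl.h>

	#include <stdio.h>

	int
	main(void)
	{
		int count;
		size_t len = sizeof(count);

		/* OID from SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_count, ...) */
		if (sysctlbyname("vfs.zfs.vdev.default_ms_count", &count, &len,
		    NULL, 0) == -1) {
			perror("sysctlbyname");
			return (1);
		}
		printf("target metaslabs per top-level vdev: %d\n", count);
		return (0);
	}
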
@@ -2205,16 +2205,24 @@ void
 vdev_metaslab_set_size(vdev_t *vd)
 {
 	uint64_t asize = vd->vdev_asize;
-	uint64_t ms_count = asize >> vdev_default_ms_shift;
+	uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
 	uint64_t ms_shift;
 
 	/*
 	 * There are two dimensions to the metaslab sizing calculation:
 	 * the size of the metaslab and the count of metaslabs per vdev.
-	 * In general, we aim for vdev_max_ms_count (200) metaslabs. The
-	 * range of the dimensions are as follows:
-	 *
-	 * 2^29 <= ms_size <= 2^38
+	 *
+	 * The default values used below are a good balance between memory
+	 * usage (larger metaslab size means more memory needed for loaded
+	 * metaslabs; more metaslabs means more memory needed for the
+	 * metaslab_t structs), metaslab load time (larger metaslabs take
+	 * longer to load), and metaslab sync time (more metaslabs means
+	 * more time spent syncing all of them).
+	 *
+	 * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
+	 * The range of the dimensions are as follows:
+	 *
+	 * 2^29 <= ms_size <= 2^34
 	 * 16 <= ms_count <= 131,072
 	 *
 	 * On the lower end of vdev sizes, we aim for metaslabs sizes of
@@ -2223,35 +2231,41 @@ vdev_metaslab_set_size(vdev_t *vd)
 	 * of at least 16 metaslabs will override this minimum size goal.
 	 *
 	 * On the upper end of vdev sizes, we aim for a maximum metaslab
-	 * size of 256GB. However, we will cap the total count to 2^17
-	 * metaslabs to keep our memory footprint in check.
+	 * size of 16GB. However, we will cap the total count to 2^17
+	 * metaslabs to keep our memory footprint in check and let the
+	 * metaslab size grow from there if that limit is hit.
 	 *
 	 * The net effect of applying above constrains is summarized below.
 	 *
-	 * vdev size	metaslab count
-	 * -------------|-----------------
-	 * < 8GB		~16
-	 * 8GB - 100GB	one per 512MB
-	 * 100GB - 50TB	~200
-	 * 50TB - 32PB	one per 256GB
-	 * > 32PB		~131,072
-	 * -------------------------------
+	 * vdev size	metaslab count
+	 * --------------|-----------------
+	 * < 8GB		~16
+	 * 8GB - 100GB	one per 512MB
+	 * 100GB - 3TB	~200
+	 * 3TB - 2PB	one per 16GB
+	 * > 2PB		~131,072
+	 * --------------------------------
+	 *
+	 * Finally, note that all of the above calculate the initial
+	 * number of metaslabs. Expanding a top-level vdev will result
+	 * in additional metaslabs being allocated making it possible
+	 * to exceed the zfs_vdev_ms_count_limit.
 	 */
-	if (ms_count < vdev_min_ms_count)
-		ms_shift = highbit64(asize / vdev_min_ms_count);
-	else if (ms_count > vdev_max_ms_count)
-		ms_shift = highbit64(asize / vdev_max_ms_count);
+	if (ms_count < zfs_vdev_min_ms_count)
+		ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
+	else if (ms_count > zfs_vdev_default_ms_count)
+		ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
 	else
-		ms_shift = vdev_default_ms_shift;
+		ms_shift = zfs_vdev_default_ms_shift;
 
 	if (ms_shift < SPA_MAXBLOCKSHIFT) {
 		ms_shift = SPA_MAXBLOCKSHIFT;
-	} else if (ms_shift > vdev_max_ms_shift) {
-		ms_shift = vdev_max_ms_shift;
+	} else if (ms_shift > zfs_vdev_max_ms_shift) {
+		ms_shift = zfs_vdev_max_ms_shift;
 		/* cap the total count to constrain memory footprint */
-		if ((asize >> ms_shift) > vdev_ms_count_limit)
-			ms_shift = highbit64(asize / vdev_ms_count_limit);
+		if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
+			ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
 	}
 
 	vd->vdev_ms_shift = ms_shift;
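
The effect of the new constants is easiest to see by replaying the clamping logic outside the kernel. A standalone sketch under two assumptions: SPA_MAXBLOCKSHIFT is 24 (the 16MB maximum block size) and highbit64() returns the 1-based index of the highest set bit, as in illumos:

	#include <stdint.h>
	#include <stdio.h>

	#define	SPA_MAXBLOCKSHIFT	24	/* 16MB maximum block size */

	static const uint64_t zfs_vdev_min_ms_count = 16;
	static const uint64_t zfs_vdev_default_ms_count = 200;
	static const uint64_t zfs_vdev_ms_count_limit = 1ULL << 17;
	static const uint64_t zfs_vdev_default_ms_shift = 29;	/* 512MB */
	static const uint64_t zfs_vdev_max_ms_shift = 34;	/* 16GB */

	/* 1-based index of the highest set bit; 0 for no bits set */
	static uint64_t
	highbit64(uint64_t i)
	{
		uint64_t h = 0;

		while (i != 0) {
			h++;
			i >>= 1;
		}
		return (h);
	}

	/* mirrors the shift selection in vdev_metaslab_set_size() above */
	static uint64_t
	ms_shift_for(uint64_t asize)
	{
		uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
		uint64_t ms_shift;

		if (ms_count < zfs_vdev_min_ms_count)
			ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
		else if (ms_count > zfs_vdev_default_ms_count)
			ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
		else
			ms_shift = zfs_vdev_default_ms_shift;

		if (ms_shift < SPA_MAXBLOCKSHIFT) {
			ms_shift = SPA_MAXBLOCKSHIFT;
		} else if (ms_shift > zfs_vdev_max_ms_shift) {
			ms_shift = zfs_vdev_max_ms_shift;
			/* cap the total count to constrain memory footprint */
			if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
				ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
		}
		return (ms_shift);
	}

	int
	main(void)
	{
		uint64_t sizes[] = {
			100ULL << 30,	/* 100GB: default path */
			1ULL << 40,	/* 1TB: count > 200, metaslabs grow */
			4ULL << 50,	/* 4PB: 2^17 count cap kicks in */
		};

		for (int i = 0; i < 3; i++) {
			uint64_t shift = ms_shift_for(sizes[i]);
			printf("asize %lluGB: ms_shift %llu, ~%llu metaslabs\n",
			    (unsigned long long)(sizes[i] >> 30),
			    (unsigned long long)shift,
			    (unsigned long long)(sizes[i] >> shift));
		}
		return (0);
	}

A 100GB vdev stays on the default path (200 metaslabs of 512MB); a 1TB vdev exceeds the 200 target and grows to 128 metaslabs of 8GB; a 4PB vdev hits the 2^17 count cap, so the metaslab size keeps growing past zfs_vdev_max_ms_shift, ending at 65536 metaslabs of 64GB.
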
@@ -3611,13 +3625,17 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
 boolean_t
 vdev_is_spacemap_addressable(vdev_t *vd)
 {
+	if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
+		return (B_TRUE);
+
 	/*
-	 * Assuming 47 bits of the space map entry dedicated for the entry's
-	 * offset (see description in space_map.h), we calculate the maximum
-	 * address that can be described by a space map entry for the given
-	 * device.
+	 * If double-word space map entries are not enabled we assume
+	 * 47 bits of the space map entry are dedicated to the entry's
+	 * offset (see SM_OFFSET_BITS in space_map.h). We then use that
+	 * to calculate the maximum address that can be described by a
+	 * space map entry for the given device.
 	 */
-	uint64_t shift = vd->vdev_ashift + 47;
+	uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
 
 	if (shift >= 63) /* detect potential overflow */
 		return (B_TRUE);
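
The hunk cuts off before the function's final comparison, so the following standalone sketch fills in the tail with an assumption: that the vdev counts as addressable while its asize stays below 1 << (ashift + SM_OFFSET_BITS). SM_OFFSET_BITS is hard-coded to 47 per the new comment, and the SPACEMAP_V2 short-circuit is modeled by the caller simply not invoking the check:

	#include <stdint.h>
	#include <stdio.h>

	#define	SM_OFFSET_BITS	47	/* offset bits in a single-word entry */

	typedef int boolean_t;
	#define	B_TRUE	1
	#define	B_FALSE	0

	/*
	 * Sketch of the single-word-entry addressability check: the maximum
	 * offset a space map entry can express is 1 << (ashift + 47) bytes.
	 */
	static boolean_t
	spacemap_addressable(uint64_t ashift, uint64_t asize)
	{
		uint64_t shift = ashift + SM_OFFSET_BITS;

		if (shift >= 63)	/* detect potential overflow */
			return (B_TRUE);

		return (asize < (1ULL << shift) ? B_TRUE : B_FALSE);
	}

	int
	main(void)
	{
		/* ashift 9 (512B sectors): limit is 2^56 bytes = 64PiB */
		printf("%d\n", spacemap_addressable(9, (1ULL << 56) - 1));	/* 1 */
		printf("%d\n", spacemap_addressable(9, 1ULL << 56));		/* 0 */
		/* ashift 16: 16 + 47 >= 63, so the overflow guard fires */
		printf("%d\n", spacemap_addressable(16, ~0ULL));		/* 1 */
		return (0);
	}

With 512-byte sectors the single-word limit works out to 2^56 bytes (64PiB); from ashift 16 upward the shift reaches 63 bits and the overflow guard declares the vdev addressable outright.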

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c

@@ -352,16 +352,6 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
 	return (0);
 }
 
-static void
-vdev_initialize_ms_load(metaslab_t *msp)
-{
-	ASSERT(MUTEX_HELD(&msp->ms_lock));
-
-	metaslab_load_wait(msp);
-	if (!msp->ms_loaded)
-		VERIFY0(metaslab_load(msp));
-}
-
 static void
 vdev_initialize_mg_wait(metaslab_group_t *mg)
 {
@@ -484,10 +474,10 @@ vdev_initialize_calculate_progress(vdev_t *vd)
 		 * metaslab. Load it and walk the free tree for more accurate
 		 * progress estimation.
 		 */
-		vdev_initialize_ms_load(msp);
+		VERIFY0(metaslab_load(msp));
 
-		for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs;
-		    rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
+		for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
+		    rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
 			logical_rs.rs_start = rs->rs_start;
 			logical_rs.rs_end = rs->rs_end;
 			vdev_xlate(vd, &logical_rs, &physical_rs);
@@ -615,7 +605,7 @@ vdev_initialize_thread(void *arg)
 		vdev_initialize_ms_mark(msp);
 
 		mutex_enter(&msp->ms_lock);
-		vdev_initialize_ms_load(msp);
+		VERIFY0(metaslab_load(msp));
 
 		range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
 		    vd);