Metaslab max_size should be persisted while unloaded

When ZFS unloads a metaslab today, the cached max_size value is
discarded. We instead use the histogram to determine whether or not we
think we can satisfy an allocation from the metaslab. If we are doing
I/Os of a size not aligned to a histogram bucket, this can result in a
metaslab being loaded even though it cannot satisfy the allocation we
think it can. For example, a metaslab with 16 entries in the 16k-32k
bucket may consist entirely of 16kB entries. If we try to allocate a
24kB buffer, we will load that metaslab because we think it should be
able to handle the allocation. Doing so is expensive in CPU time, disk
reads, and average I/O latency, and it is exacerbated if the write
being attempted is a sync write.
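
To make the bucket mismatch concrete, here is a minimal standalone C
sketch. It is illustrative only: bucket_index() is a stand-in for the
space map histogram's power-of-two bucketing, not a ZFS function, and
the sizes are the ones from the example above.

/*
 * A 24kB request and a 16kB free segment land in the same 16k-32k
 * bucket, so a nonzero bucket count cannot prove the request will fit.
 */
#include <stdint.h>
#include <stdio.h>

/* Index of the highest set bit, i.e. floor(log2(size)). */
static int
bucket_index(uint64_t size)
{
	int b = -1;
	while (size != 0) {
		size >>= 1;
		b++;
	}
	return (b);
}

int
main(void)
{
	uint64_t seg = 16 * 1024;	/* every free segment is 16kB */
	uint64_t req = 24 * 1024;	/* allocation we want to place */

	/* Both print 14: the 16k-32k bucket. */
	printf("segment bucket %d, request bucket %d\n",
	    bucket_index(seg), bucket_index(req));
	printf("fits: %s\n", seg >= req ? "yes" : "no");	/* no */
	return (0);
}

Both sizes land in bucket 14, yet no 16kB segment can hold the 24kB
request; that is exactly the case the cached max_size is meant to
catch.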

This change makes ZFS cache the max_size after the metaslab is
unloaded. If we ever get a free (or a coalesced group of frees) larger
than the max_size, we will update it. Otherwise, we leave it as is. When
attempting to allocate, we use the max_size as a lower bound, and
respect it unless we are in try_hard. However, we do age the max_size
out at some point, since we expect the actual max_size to increase as we
do more frees. A more sophisticated algorithm here might be helpful, but
this works reasonably well.
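
As a rough sketch of that policy, here is simplified, standalone C
using toy types. The authoritative logic is the new
metaslab_should_allocate() and metaslab_weight() code in the diff
below; the toy_-prefixed names are invented for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_NANOSEC 1000000000ULL

/* Toy stand-ins for the relevant metaslab state; not the real structs. */
typedef struct {
	bool		ms_loaded;
	uint64_t	ms_max_size;	/* cached largest free segment (bytes) */
	uint64_t	ms_unload_time;	/* when the metaslab was unloaded (ns) */
} toy_metaslab_t;

static uint64_t toy_max_size_cache_sec = 3600;	/* mirrors the new tunable */

/*
 * Returns true and sets *fits when the cached max_size can answer the
 * question; returns false when the caller must fall back to the
 * histogram (cache absent, aged out, or the allocator is in try_hard).
 */
static bool
cached_max_size_decides(const toy_metaslab_t *ms, uint64_t asize,
    bool try_hard, uint64_t now, bool *fits)
{
	if (ms->ms_loaded) {
		*fits = (ms->ms_max_size >= asize);
		return (true);
	}
	if (ms->ms_max_size != 0 && !try_hard &&
	    now < ms->ms_unload_time + toy_max_size_cache_sec * TOY_NANOSEC) {
		*fits = (ms->ms_max_size >= asize);
		return (true);
	}
	return (false);
}

int
main(void)
{
	toy_metaslab_t ms = { false, 16 * 1024, 0 };
	bool fits;

	/* 10 minutes after unload: trust the cache; a 24kB request is rejected. */
	if (cached_max_size_decides(&ms, 24 * 1024, false, 600 * TOY_NANOSEC, &fits))
		printf("cache says fits: %s\n", fits ? "yes" : "no");

	/* 2 hours after unload: the cache has aged out; consult the histogram. */
	if (!cached_max_size_decides(&ms, 24 * 1024, false, 7200 * TOY_NANOSEC, &fits))
		printf("cache aged out, fall back to histogram\n");
	return (0);
}

The real change also keeps raising the cached value from the unflushed
frees while the metaslab stays unloaded, so the lower bound can grow as
frees accumulate.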

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Closes #9055
Author:     Paul Dagnelie <pcd@delphix.com>
Date:       2019-08-05 14:34:27 -07:00
Committer:  Matthew Ahrens
Commit:     c81f1790e2
Parent:     99e755d653
7 changed files with 190 additions and 40 deletions

cmd/zdb/zdb.c

@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2011, 2019 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Nexenta Systems, Inc.
* Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
@@ -955,7 +955,7 @@ dump_metaslab_stats(metaslab_t *msp)
/* make sure nicenum has enough space */
CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ);
zdb_nicenum(metaslab_block_maxsize(msp), maxbuf, sizeof (maxbuf));
zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf));
(void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n",
"segments", avl_numnodes(t), "maxsize", maxbuf,

include/sys/metaslab.h

@@ -66,7 +66,7 @@ uint64_t metaslab_allocated_space(metaslab_t *);
void metaslab_sync(metaslab_t *, uint64_t);
void metaslab_sync_done(metaslab_t *, uint64_t);
void metaslab_sync_reassess(metaslab_group_t *);
uint64_t metaslab_block_maxsize(metaslab_t *);
uint64_t metaslab_largest_allocatable(metaslab_t *);
/*
* metaslab alloc flags

include/sys/metaslab_impl.h

@@ -475,6 +475,12 @@ struct metaslab {
* stay cached.
*/
uint64_t ms_selected_txg;
/*
* ms_load/unload_time can be used for performance monitoring
* (e.g. by dtrace or mdb).
*/
hrtime_t ms_load_time; /* time last loaded */
hrtime_t ms_unload_time; /* time last unloaded */
uint64_t ms_alloc_txg; /* last successful alloc (debug only) */
uint64_t ms_max_size; /* maximum allocatable size */
@@ -495,6 +501,7 @@ struct metaslab {
* segment sizes.
*/
avl_tree_t ms_allocatable_by_size;
avl_tree_t ms_unflushed_frees_by_size;
uint64_t ms_lbas[MAX_LBAS];
metaslab_group_t *ms_group; /* metaslab group */

include/sys/range_tree.h

@@ -89,6 +89,8 @@ range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg,
range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
void range_tree_destroy(range_tree_t *rt);
boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
boolean_t range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size,
uint64_t *ostart, uint64_t *osize);
void range_tree_verify_not_present(range_tree_t *rt,
uint64_t start, uint64_t size);
range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);

man/man5/zfs-module-parameters.5

@@ -370,6 +370,22 @@ larger).
Use \fB1\fR for yes and \fB0\fR for no (default).
.RE
.sp
.ne 2
.na
\fBzfs_metaslab_max_size_cache_sec\fR (ulong)
.ad
.RS 12n
When we unload a metaslab, we cache the size of the largest free chunk. We use
that cached size to determine whether or not to load a metaslab for a given
allocation. As more frees accumulate in that metaslab while it's unloaded, the
cached max size becomes less and less accurate. After a number of seconds
controlled by this tunable, we stop considering the cached max size and start
considering only the histogram instead.
.sp
Default value: \fB3600 seconds\fR (one hour)
.RE
.sp
.ne 2
.na

module/zfs/metaslab.c

@@ -272,6 +272,12 @@ uint64_t metaslab_trace_max_entries = 5000;
*/
int max_disabled_ms = 3;
/*
* Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
* To avoid 64-bit overflow, don't set above UINT32_MAX.
*/
unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
@@ -1165,17 +1171,83 @@ metaslab_rangesize_compare(const void *x1, const void *x2)
* Return the maximum contiguous segment within the metaslab.
*/
uint64_t
metaslab_block_maxsize(metaslab_t *msp)
metaslab_largest_allocatable(metaslab_t *msp)
{
avl_tree_t *t = &msp->ms_allocatable_by_size;
range_seg_t *rs;
if (t == NULL || (rs = avl_last(t)) == NULL)
return (0ULL);
if (t == NULL)
return (0);
rs = avl_last(t);
if (rs == NULL)
return (0);
return (rs->rs_end - rs->rs_start);
}
/*
* Return the maximum contiguous segment within the unflushed frees of this
* metaslab.
*/
uint64_t
metaslab_largest_unflushed_free(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
if (msp->ms_unflushed_frees == NULL)
return (0);
range_seg_t *rs = avl_last(&msp->ms_unflushed_frees_by_size);
if (rs == NULL)
return (0);
/*
* When a range is freed from the metaslab, that range is added to
* both the unflushed frees and the deferred frees. While the block
* will eventually be usable, if the metaslab were loaded the range
* would not be added to the ms_allocatable tree until TXG_DEFER_SIZE
* txgs had passed. As a result, when attempting to estimate an upper
* bound for the largest currently-usable free segment in the
* metaslab, we need to not consider any ranges currently in the defer
* trees. This algorithm approximates the largest available chunk in
* the largest range in the unflushed_frees tree by taking the first
* chunk. While this may be a poor estimate, it should only remain so
* briefly and should eventually self-correct as frees are no longer
* deferred. Similar logic applies to the ms_freed tree. See
* metaslab_load() for more details.
*
* There are two primary sources of inaccuracy in this estimate. Both
* are tolerated for performance reasons. The first source is that we
* only check the largest segment for overlaps. Smaller segments may
* have more favorable overlaps with the other trees, resulting in
* larger usable chunks. Second, we only look at the first chunk in
* the largest segment; there may be other usable chunks in the
* largest segment, but we ignore them.
*/
uint64_t rstart = rs->rs_start;
uint64_t rsize = rs->rs_end - rstart;
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
uint64_t start = 0;
uint64_t size = 0;
boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart,
rsize, &start, &size);
if (found) {
if (rstart == start)
return (0);
rsize = start - rstart;
}
}
uint64_t start = 0;
uint64_t size = 0;
boolean_t found = range_tree_find_in(msp->ms_freed, rstart,
rsize, &start, &size);
if (found)
rsize = start - rstart;
return (rsize);
}
static range_seg_t *
metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
{
@@ -1269,7 +1341,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
* If we're running low on space, find a segment based on size,
* rather than iterating based on offset.
*/
if (metaslab_block_maxsize(msp) < metaslab_df_alloc_threshold ||
if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
free_pct < metaslab_df_free_pct) {
offset = -1;
} else {
@@ -1375,7 +1447,7 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
range_seg_t *rs, rsearch;
uint64_t hbit = highbit64(size);
uint64_t *cursor = &msp->ms_lbas[hbit - 1];
uint64_t max_size = metaslab_block_maxsize(msp);
uint64_t max_size = metaslab_largest_allocatable(msp);
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT3U(avl_numnodes(t), ==,
@@ -1693,7 +1765,6 @@ metaslab_verify_weight_and_frag(metaslab_t *msp)
msp->ms_weight = 0;
msp->ms_fragmentation = 0;
msp->ms_max_size = 0;
/*
* This function is used for verification purposes. Regardless of
@@ -1883,18 +1954,21 @@ metaslab_load_impl(metaslab_t *msp)
* comment for ms_synchist and ms_deferhist[] for more info]
*/
uint64_t weight = msp->ms_weight;
uint64_t max_size = msp->ms_max_size;
metaslab_recalculate_weight_and_sort(msp);
if (!WEIGHT_IS_SPACEBASED(weight))
ASSERT3U(weight, <=, msp->ms_weight);
msp->ms_max_size = metaslab_block_maxsize(msp);
msp->ms_max_size = metaslab_largest_allocatable(msp);
ASSERT3U(max_size, <=, msp->ms_max_size);
hrtime_t load_end = gethrtime();
msp->ms_load_time = load_end;
if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, "
"ms_id %llu, smp_length %llu, "
"unflushed_allocs %llu, unflushed_frees %llu, "
"freed %llu, defer %llu + %llu, "
"loading_time %lld ms",
"loading_time %lld ms, ms_max_size %llu, "
"max size error %llu",
spa_syncing_txg(spa), spa_name(spa),
msp->ms_group->mg_vd->vdev_id, msp->ms_id,
space_map_length(msp->ms_sm),
@@ -1903,7 +1977,8 @@ metaslab_load_impl(metaslab_t *msp)
range_tree_space(msp->ms_freed),
range_tree_space(msp->ms_defer[0]),
range_tree_space(msp->ms_defer[1]),
(longlong_t)((load_end - load_start) / 1000000));
(longlong_t)((load_end - load_start) / 1000000),
msp->ms_max_size, msp->ms_max_size - max_size);
}
metaslab_verify_space(msp, spa_syncing_txg(spa));
@@ -1967,10 +2042,10 @@ metaslab_unload(metaslab_t *msp)
range_tree_vacate(msp->ms_allocatable, NULL, NULL);
msp->ms_loaded = B_FALSE;
msp->ms_unload_time = gethrtime();
msp->ms_activation_weight = 0;
msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
msp->ms_max_size = 0;
/*
* We explicitly recalculate the metaslab's weight based on its space
@@ -2527,13 +2602,19 @@ metaslab_segment_weight(metaslab_t *msp)
* weights we rely on the entire weight (excluding the weight-type bit).
*/
boolean_t
metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
{
if (msp->ms_loaded) {
/*
* If the metaslab is loaded, ms_max_size is definitive and we can use
* the fast check. If it's not, the ms_max_size is a lower bound (once
* set), and we should use the fast check as long as we're not in
* try_hard and it's been less than zfs_metaslab_max_size_cache_sec
* seconds since the metaslab was unloaded.
*/
if (msp->ms_loaded ||
(msp->ms_max_size != 0 && !try_hard && gethrtime() <
msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec)))
return (msp->ms_max_size >= asize);
} else {
ASSERT0(msp->ms_max_size);
}
boolean_t should_allocate;
if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
@@ -2571,14 +2652,21 @@ metaslab_weight(metaslab_t *msp)
metaslab_set_fragmentation(msp);
/*
* Update the maximum size if the metaslab is loaded. This will
* Update the maximum size. If the metaslab is loaded, this will
* ensure that we get an accurate maximum size if newly freed space
* has been added back into the free tree.
* has been added back into the free tree. If the metaslab is
* unloaded, we check if there's a larger free segment in the
* unflushed frees. This is a lower bound on the largest allocatable
* segment size. Coalescing of adjacent entries may reveal larger
* allocatable segments, but we aren't aware of those until loading
* the space map into a range tree.
*/
if (msp->ms_loaded)
msp->ms_max_size = metaslab_block_maxsize(msp);
else
ASSERT0(msp->ms_max_size);
if (msp->ms_loaded) {
msp->ms_max_size = metaslab_largest_allocatable(msp);
} else {
msp->ms_max_size = MAX(msp->ms_max_size,
metaslab_largest_unflushed_free(msp));
}
/*
* Segment-based weighting requires space map histogram support.
@@ -3595,7 +3683,9 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
ASSERT3P(msp->ms_unflushed_allocs, ==, NULL);
msp->ms_unflushed_allocs = range_tree_create(NULL, NULL);
ASSERT3P(msp->ms_unflushed_frees, ==, NULL);
msp->ms_unflushed_frees = range_tree_create(NULL, NULL);
msp->ms_unflushed_frees = range_tree_create_impl(&rt_avl_ops,
&msp->ms_unflushed_frees_by_size,
metaslab_rangesize_compare, 0);
metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
}
@@ -3992,7 +4082,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
* Now that we've attempted the allocation we need to update the
* metaslab's maximum block size since it may have changed.
*/
msp->ms_max_size = metaslab_block_maxsize(msp);
msp->ms_max_size = metaslab_largest_allocatable(msp);
return (start);
}
@@ -4010,7 +4100,8 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
static metaslab_t *
find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
boolean_t *was_active)
{
avl_index_t idx;
avl_tree_t *t = &mg->mg_metaslab_tree;
@@ -4020,7 +4111,7 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
int i;
if (!metaslab_should_allocate(msp, asize)) {
if (!metaslab_should_allocate(msp, asize, try_hard)) {
metaslab_trace_add(zal, mg, msp, asize, d,
TRACE_TOO_SMALL, allocator);
continue;
@@ -4100,8 +4191,8 @@ metaslab_active_mask_verify(metaslab_t *msp)
/* ARGSUSED */
static uint64_t
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
int d, int allocator)
uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
int allocator, boolean_t try_hard)
{
metaslab_t *msp = NULL;
uint64_t offset = -1ULL;
@@ -4174,8 +4265,8 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
was_active = B_TRUE;
} else {
msp = find_valid_metaslab(mg, activation_weight, dva, d,
want_unique, asize, allocator, zal, search,
&was_active);
want_unique, asize, allocator, try_hard, zal,
search, &was_active);
}
mutex_exit(&mg->mg_lock);
@@ -4282,7 +4373,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
* can accurately determine if the allocation attempt should
* proceed.
*/
if (!metaslab_should_allocate(msp, asize)) {
if (!metaslab_should_allocate(msp, asize, try_hard)) {
/* Passivate this metaslab and select a new one. */
metaslab_trace_add(zal, mg, msp, asize, d,
TRACE_TOO_SMALL, allocator);
@@ -4360,7 +4451,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
*/
uint64_t weight;
if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
weight = metaslab_block_maxsize(msp);
weight = metaslab_largest_allocatable(msp);
WEIGHT_SET_SPACEBASED(weight);
} else {
weight = metaslab_weight_from_range_tree(msp);
@@ -4392,7 +4483,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
* we may end up in an infinite loop retrying the same
* metaslab.
*/
ASSERT(!metaslab_should_allocate(msp, asize));
ASSERT(!metaslab_should_allocate(msp, asize, try_hard));
mutex_exit(&msp->ms_lock);
}
@@ -4403,14 +4494,14 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
int d, int allocator)
uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
int allocator, boolean_t try_hard)
{
uint64_t offset;
ASSERT(mg->mg_initialized);
offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
dva, d, allocator);
dva, d, allocator, try_hard);
mutex_enter(&mg->mg_lock);
if (offset == -1ULL) {
@@ -4592,7 +4683,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
* allow any metaslab to be used (unique=false).
*/
uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
!try_hard, dva, d, allocator);
!try_hard, dva, d, allocator, try_hard);
if (offset != -1ULL) {
/*
@@ -5615,6 +5706,10 @@ MODULE_PARM_DESC(metaslab_df_max_search,
module_param(metaslab_df_use_largest_segment, int, 0644);
MODULE_PARM_DESC(metaslab_df_use_largest_segment,
"when looking in size tree, use largest segment instead of exact fit");
module_param(zfs_metaslab_max_size_cache_sec, ulong, 0644);
MODULE_PARM_DESC(zfs_metaslab_max_size_cache_sec,
"how long to trust the cached max chunk size of a metaslab");
/* END CSTYLED */
#endif

module/zfs/range_tree.c

@@ -524,6 +524,36 @@ range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
return (range_tree_find(rt, start, size) != NULL);
}
/*
* Returns the first subset of the given range which overlaps with the range
* tree. Returns true if there is a segment in the range, and false if there
* isn't.
*/
boolean_t
range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size,
uint64_t *ostart, uint64_t *osize)
{
range_seg_t rsearch;
rsearch.rs_start = start;
rsearch.rs_end = start + 1;
avl_index_t where;
range_seg_t *rs = avl_find(&rt->rt_root, &rsearch, &where);
if (rs != NULL) {
*ostart = start;
*osize = MIN(size, rs->rs_end - start);
return (B_TRUE);
}
rs = avl_nearest(&rt->rt_root, where, AVL_AFTER);
if (rs == NULL || rs->rs_start > start + size)
return (B_FALSE);
*ostart = rs->rs_start;
*osize = MIN(start + size, rs->rs_end) - rs->rs_start;
return (B_TRUE);
}
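
For readers unfamiliar with the contract above, the following is a
small, standalone illustration of the same clipping behavior using a
sorted array rather than the AVL tree; find_in() and the sizes are
invented for the example and are not part of the patch.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t start, end; } seg_t;

/*
 * Return the first part of [start, start + size) that overlaps one of
 * the (sorted, non-overlapping) segments, clipped to the query range,
 * mirroring what range_tree_find_in() reports via *ostart and *osize.
 */
static bool
find_in(const seg_t *segs, int nsegs, uint64_t start, uint64_t size,
    uint64_t *ostart, uint64_t *osize)
{
	uint64_t end = start + size;

	for (int i = 0; i < nsegs; i++) {
		if (segs[i].end <= start)
			continue;	/* entirely before the query */
		if (segs[i].start >= end)
			break;		/* sorted, so nothing later overlaps */
		*ostart = segs[i].start > start ? segs[i].start : start;
		*osize = (segs[i].end < end ? segs[i].end : end) - *ostart;
		return (true);
	}
	return (false);
}

int
main(void)
{
	/* One deferred free overlapping the tail of a larger free segment. */
	seg_t defer[] = { { 96 * 1024, 128 * 1024 } };
	uint64_t ostart, osize;

	/* Query the unflushed free segment [64k, 128k). */
	if (find_in(defer, 1, 64 * 1024, 64 * 1024, &ostart, &osize)) {
		/* Keep only the part before the overlap: 32768 bytes. */
		printf("usable estimate: %llu bytes\n",
		    (unsigned long long)(ostart - 64 * 1024));
	}
	return (0);
}

This mirrors how metaslab_largest_unflushed_free() clips its largest
candidate segment against the defer and freed trees before using it as
the cached maximum size.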
/*
* Ensure that this range is not in the tree, regardless of whether
* it is currently in the tree.