From 137146f48c713e9d14234ab8c5b1c371ed978ea6 Mon Sep 17 00:00:00 2001 From: Josh Paetzel Date: Wed, 15 Mar 2017 04:16:08 +0000 Subject: [PATCH 1/2] 7303 dynamic metaslab selection illumos/illumos-gate@8363e80ae72609660f6090766ca8c2c18aa53f0c https://github.com/illumos/illumos-gate/commit/8363e80ae72609660f6090766ca8c2c18aa53f0 https://www.illumos.org/issues/7303 This change introduces a new weighting algorithm to improve metaslab selection. The new weighting algorithm relies on the SPACEMAP_HISTOGRAM feature. As a result, the metaslab weight now encodes the type of weighting algorithm used (size-based vs segment-based). This also introduce a new allocation tracing facility and two new dcmds to help debug allocation problems. Each zio now contains a zio_alloc_list_t structure that is populated as the zio goes through the allocations stage. Here's an example of how to use the tracing facility: > c5ec000::print zio_t io_alloc_list | ::walk list | ::metaslab_trace MSID DVA ASIZE WEIGHT RESULT VDEV - 0 400 0 NOT_ALLOCATABLE ztest.0a - 0 400 0 NOT_ALLOCATABLE ztest.0a - 0 400 0 ENOSPC ztest.0a - 0 200 0 NOT_ALLOCATABLE ztest.0a - 0 200 0 NOT_ALLOCATABLE ztest.0a - 0 200 0 ENOSPC ztest.0a 1 0 400 1 x 8M 17b1a00 ztest.0a > 1ff2400::print zio_t io_alloc_list | ::walk list | ::metaslab_trace MSID DVA ASIZE WEIGHT RESULT VDEV - 0 200 0 NOT_ALLOCATABLE mirror-2 - 0 200 0 NOT_ALLOCATABLE mirror-0 1 0 200 1 x 4M 112ae00 mirror-1 - 1 200 0 NOT_ALLOCATABLE mirror-2 - 1 200 0 NOT_ALLOCATABLE mirror-0 1 1 200 1 x 4M 112b000 mirror-1 - 2 200 0 NOT_ALLOCATABLE mirror-2 If the metaslab is using segment-based weighting then the WEIGHT column will display the number of segments available in the bucket where the allocation attempt was made. Author: George Wilson Reviewed by: Alex Reece Reviewed by: Chris Siden Reviewed by: Dan Kimmel Reviewed by: Matthew Ahrens Reviewed by: Paul Dagnelie Reviewed by: Pavel Zakharov Reviewed by: Prakash Surya Reviewed by: Don Brady Approved by: Richard Lowe --- cmd/zdb/zdb.c | 25 +++++++++++++++++++++---- cmd/ztest/ztest.c | 2 +- lib/libzpool/common/kernel.c | 5 +++++ lib/libzpool/common/sys/zfs_context.h | 1 + 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 54bfef151637..577a3d585cd8 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -2562,10 +2562,21 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) if (!dump_opt['L']) { vdev_t *rvd = spa->spa_root_vdev; + + /* + * We are going to be changing the meaning of the metaslab's + * ms_tree. Ensure that the allocator doesn't try to + * use the tree. + */ + spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; + spa->spa_log_class->mc_ops = &zdb_metaslab_ops; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; + metaslab_group_t *mg = vd->vdev_mg; for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; + ASSERT3P(msp->ms_group, ==, mg); mutex_enter(&msp->ms_lock); metaslab_unload(msp); @@ -2586,8 +2597,6 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) (longlong_t)m, (longlong_t)vd->vdev_ms_count); - msp->ms_ops = &zdb_metaslab_ops; - /* * We don't want to spend the CPU * manipulating the size-ordered @@ -2597,7 +2606,10 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) msp->ms_tree->rt_ops = NULL; VERIFY0(space_map_load(msp->ms_sm, msp->ms_tree, SM_ALLOC)); - msp->ms_loaded = B_TRUE; + + if (!msp->ms_loaded) { + msp->ms_loaded = B_TRUE; + } } mutex_exit(&msp->ms_lock); } @@ -2619,8 +2631,10 @@ zdb_leak_fini(spa_t *spa) vdev_t *rvd = spa->spa_root_vdev; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; + metaslab_group_t *mg = vd->vdev_mg; for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; + ASSERT3P(mg, ==, msp->ms_group); mutex_enter(&msp->ms_lock); /* @@ -2634,7 +2648,10 @@ zdb_leak_fini(spa_t *spa) * from the ms_tree. */ range_tree_vacate(msp->ms_tree, zdb_leak, vd); - msp->ms_loaded = B_FALSE; + + if (msp->ms_loaded) { + msp->ms_loaded = B_FALSE; + } mutex_exit(&msp->ms_lock); } diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index e50a14395ea6..f21886b97d9f 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -171,7 +171,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = { .zo_mirrors = 2, .zo_raidz = 4, .zo_raidz_parity = 1, - .zo_vdev_size = SPA_MINDEVSIZE * 2, + .zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */ .zo_datasets = 7, .zo_threads = 23, .zo_passtime = 60, /* 60 seconds */ diff --git a/lib/libzpool/common/kernel.c b/lib/libzpool/common/kernel.c index 087ee993b115..07f9e53b6670 100644 --- a/lib/libzpool/common/kernel.c +++ b/lib/libzpool/common/kernel.c @@ -92,6 +92,11 @@ kstat_create(const char *module, int instance, const char *name, return (NULL); } +/*ARGSUSED*/ +void +kstat_named_init(kstat_named_t *knp, const char *name, uchar_t type) +{} + /*ARGSUSED*/ void kstat_install(kstat_t *ksp) diff --git a/lib/libzpool/common/sys/zfs_context.h b/lib/libzpool/common/sys/zfs_context.h index 2a4a699b48a6..7c08c644e586 100644 --- a/lib/libzpool/common/sys/zfs_context.h +++ b/lib/libzpool/common/sys/zfs_context.h @@ -301,6 +301,7 @@ extern void cv_broadcast(kcondvar_t *cv); */ extern kstat_t *kstat_create(const char *, int, const char *, const char *, uchar_t, ulong_t, uchar_t); +extern void kstat_named_init(kstat_named_t *, const char *, uchar_t); extern void kstat_install(kstat_t *); extern void kstat_delete(kstat_t *); extern void kstat_waitq_enter(kstat_io_t *); From 7e45ac57cb520e0104bac10531cd59ad60249e3d Mon Sep 17 00:00:00 2001 From: Josh Paetzel Date: Wed, 15 Mar 2017 04:18:40 +0000 Subject: [PATCH 2/2] 7303 dynamic metaslab selection illumos/illumos-gate@8363e80ae72609660f6090766ca8c2c18aa53f0c https://github.com/illumos/illumos-gate/commit/8363e80ae72609660f6090766ca8c2c18aa53f0 https://www.illumos.org/issues/7303 This change introduces a new weighting algorithm to improve metaslab selection. The new weighting algorithm relies on the SPACEMAP_HISTOGRAM feature. As a result, the metaslab weight now encodes the type of weighting algorithm used (size-based vs segment-based). This also introduce a new allocation tracing facility and two new dcmds to help debug allocation problems. Each zio now contains a zio_alloc_list_t structure that is populated as the zio goes through the allocations stage. Here's an example of how to use the tracing facility: > c5ec000::print zio_t io_alloc_list | ::walk list | ::metaslab_trace MSID DVA ASIZE WEIGHT RESULT VDEV - 0 400 0 NOT_ALLOCATABLE ztest.0a - 0 400 0 NOT_ALLOCATABLE ztest.0a - 0 400 0 ENOSPC ztest.0a - 0 200 0 NOT_ALLOCATABLE ztest.0a - 0 200 0 NOT_ALLOCATABLE ztest.0a - 0 200 0 ENOSPC ztest.0a 1 0 400 1 x 8M 17b1a00 ztest.0a > 1ff2400::print zio_t io_alloc_list | ::walk list | ::metaslab_trace MSID DVA ASIZE WEIGHT RESULT VDEV - 0 200 0 NOT_ALLOCATABLE mirror-2 - 0 200 0 NOT_ALLOCATABLE mirror-0 1 0 200 1 x 4M 112ae00 mirror-1 - 1 200 0 NOT_ALLOCATABLE mirror-2 - 1 200 0 NOT_ALLOCATABLE mirror-0 1 1 200 1 x 4M 112b000 mirror-1 - 2 200 0 NOT_ALLOCATABLE mirror-2 If the metaslab is using segment-based weighting then the WEIGHT column will display the number of segments available in the bucket where the allocation attempt was made. Author: George Wilson Reviewed by: Alex Reece Reviewed by: Chris Siden Reviewed by: Dan Kimmel Reviewed by: Matthew Ahrens Reviewed by: Paul Dagnelie Reviewed by: Pavel Zakharov Reviewed by: Prakash Surya Reviewed by: Don Brady Approved by: Richard Lowe --- uts/common/fs/zfs/metaslab.c | 982 ++++++++++++++++++++------ uts/common/fs/zfs/spa.c | 13 + uts/common/fs/zfs/spa_misc.c | 2 + uts/common/fs/zfs/space_map.c | 1 - uts/common/fs/zfs/sys/metaslab.h | 13 +- uts/common/fs/zfs/sys/metaslab_impl.h | 106 ++- uts/common/fs/zfs/sys/zfs_debug.h | 17 +- uts/common/fs/zfs/sys/zio.h | 6 + uts/common/fs/zfs/zio.c | 18 +- 9 files changed, 941 insertions(+), 217 deletions(-) diff --git a/uts/common/fs/zfs/metaslab.c b/uts/common/fs/zfs/metaslab.c index e7624f040eda..edff16b52dc0 100644 --- a/uts/common/fs/zfs/metaslab.c +++ b/uts/common/fs/zfs/metaslab.c @@ -38,18 +38,13 @@ #define GANG_ALLOCATION(flags) \ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) -#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) -#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) -#define METASLAB_ACTIVE_MASK \ - (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) - uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ /* * The in-core space map representation is more compact than its on-disk form. * The zfs_condense_pct determines how much more compact the in-core - * space_map representation must be before we compact it on-disk. + * space map representation must be before we compact it on-disk. * Values should be greater than or equal to 100. */ int zfs_condense_pct = 200; @@ -122,7 +117,7 @@ uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; /* * The minimum free space, in percent, which must be available * in a space map to continue allocations in a first-fit fashion. - * Once the space_map's free space drops below this level we dynamically + * Once the space map's free space drops below this level we dynamically * switch to using best-fit allocations. */ int metaslab_df_free_pct = 4; @@ -170,7 +165,38 @@ boolean_t metaslab_lba_weighting_enabled = B_TRUE; */ boolean_t metaslab_bias_enabled = B_TRUE; -static uint64_t metaslab_fragmentation(metaslab_t *); +/* + * Enable/disable segment-based metaslab selection. + */ +boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE; + +/* + * When using segment-based metaslab selection, we will continue + * allocating from the active metaslab until we have exhausted + * zfs_metaslab_switch_threshold of its buckets. + */ +int zfs_metaslab_switch_threshold = 2; + +/* + * Internal switch to enable/disable the metaslab allocation tracing + * facility. + */ +boolean_t metaslab_trace_enabled = B_TRUE; + +/* + * Maximum entries that the metaslab allocation tracing facility will keep + * in a given list when running in non-debug mode. We limit the number + * of entries in non-debug mode to prevent us from using up too much memory. + * The limit should be sufficiently large that we don't expect any allocation + * to every exceed this value. In debug mode, the system will panic if this + * limit is ever reached allowing for further investigation. + */ +uint64_t metaslab_trace_max_entries = 5000; + +static uint64_t metaslab_weight(metaslab_t *); +static void metaslab_set_fragmentation(metaslab_t *); + +kmem_cache_t *metaslab_alloc_trace_cache; /* * ========================================================================== @@ -388,11 +414,6 @@ metaslab_class_expandable_space(metaslab_class_t *mc) return (space); } -/* - * ========================================================================== - * Metaslab groups - * ========================================================================== - */ static int metaslab_compare(const void *x1, const void *x2) { @@ -417,6 +438,57 @@ metaslab_compare(const void *x1, const void *x2) return (0); } +/* + * Verify that the space accounting on disk matches the in-core range_trees. + */ +void +metaslab_verify_space(metaslab_t *msp, uint64_t txg) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + uint64_t allocated = 0; + uint64_t freed = 0; + uint64_t sm_free_space, msp_free_space; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) + return; + + /* + * We can only verify the metaslab space when we're called + * from syncing context with a loaded metaslab that has an allocated + * space map. Calling this in non-syncing context does not + * provide a consistent view of the metaslab since we're performing + * allocations in the future. + */ + if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || + !msp->ms_loaded) + return; + + sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) - + space_map_alloc_delta(msp->ms_sm); + + /* + * Account for future allocations since we would have already + * deducted that space from the ms_freetree. + */ + for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { + allocated += + range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]); + } + freed = range_tree_space(msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]); + + msp_free_space = range_tree_space(msp->ms_tree) + allocated + + msp->ms_deferspace + freed; + + VERIFY3U(sm_free_space, ==, msp_free_space); +} + +/* + * ========================================================================== + * Metaslab groups + * ========================================================================== + */ /* * Update the allocatable flag and the metaslab group's capacity. * The allocatable flag is set to true if the capacity is below @@ -989,7 +1061,7 @@ static range_tree_ops_t metaslab_rt_ops = { /* * ========================================================================== - * Metaslab block operations + * Common allocator routines * ========================================================================== */ @@ -1008,31 +1080,22 @@ metaslab_block_maxsize(metaslab_t *msp) return (rs->rs_end - rs->rs_start); } -uint64_t -metaslab_block_alloc(metaslab_t *msp, uint64_t size) +static range_seg_t * +metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) { - uint64_t start; - range_tree_t *rt = msp->ms_tree; + range_seg_t *rs, rsearch; + avl_index_t where; - VERIFY(!msp->ms_condensing); + rsearch.rs_start = start; + rsearch.rs_end = start + size; - start = msp->ms_ops->msop_alloc(msp, size); - if (start != -1ULL) { - vdev_t *vd = msp->ms_group->mg_vd; - - VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); - VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); - range_tree_remove(rt, start, size); + rs = avl_find(t, &rsearch, &where); + if (rs == NULL) { + rs = avl_nearest(t, where, AVL_AFTER); } - return (start); -} -/* - * ========================================================================== - * Common allocator routines - * ========================================================================== - */ + return (rs); +} /* * This is a helper function that can be used by the allocator to find @@ -1043,15 +1106,7 @@ static uint64_t metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, uint64_t align) { - range_seg_t *rs, rsearch; - avl_index_t where; - - rsearch.rs_start = *cursor; - rsearch.rs_end = *cursor + size; - - rs = avl_find(t, &rsearch, &where); - if (rs == NULL) - rs = avl_nearest(t, where, AVL_AFTER); + range_seg_t *rs = metaslab_block_find(t, *cursor, size); while (rs != NULL) { uint64_t offset = P2ROUNDUP(rs->rs_start, align); @@ -1276,6 +1331,7 @@ int metaslab_load(metaslab_t *msp) { int error = 0; + boolean_t success = B_FALSE; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(!msp->ms_loaded); @@ -1293,14 +1349,18 @@ metaslab_load(metaslab_t *msp) else range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size); - msp->ms_loaded = (error == 0); + success = (error == 0); msp->ms_loading = B_FALSE; - if (msp->ms_loaded) { + if (success) { + ASSERT3P(msp->ms_group, !=, NULL); + msp->ms_loaded = B_TRUE; + for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defertree[t], range_tree_remove, msp->ms_tree); } + msp->ms_max_size = metaslab_block_maxsize(msp); } cv_broadcast(&msp->ms_load_cv); return (error); @@ -1313,6 +1373,7 @@ metaslab_unload(metaslab_t *msp) range_tree_vacate(msp->ms_tree, NULL, NULL); msp->ms_loaded = B_FALSE; msp->ms_weight &= ~METASLAB_ACTIVE_MASK; + msp->ms_max_size = 0; } int @@ -1357,21 +1418,23 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock); metaslab_group_add(mg, ms); - ms->ms_fragmentation = metaslab_fragmentation(ms); - ms->ms_ops = mg->mg_class->mc_ops; + metaslab_set_fragmentation(ms); /* * If we're opening an existing pool (txg == 0) or creating * a new one (txg == TXG_INITIAL), all space is available now. * If we're adding space to an existing pool, the new space * does not become available until after this txg has synced. + * The metaslab's weight will also be initialized when we sync + * out this txg. This ensures that we don't attempt to allocate + * from it before we have initialized it completely. */ if (txg <= TXG_INITIAL) metaslab_sync_done(ms, 0); /* * If metaslab_debug_load is set and we're initializing a metaslab - * that has an allocated space_map object then load the its space + * that has an allocated space map object then load the its space * map so that can verify frees. */ if (metaslab_debug_load && ms->ms_sm != NULL) { @@ -1398,7 +1461,6 @@ metaslab_fini(metaslab_t *msp) metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); - VERIFY(msp->ms_group == NULL); vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm), 0, -msp->ms_size); @@ -1471,8 +1533,8 @@ int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { * not support this metric. Otherwise, the return value should be in the * range [0, 100]. */ -static uint64_t -metaslab_fragmentation(metaslab_t *msp) +static void +metaslab_set_fragmentation(metaslab_t *msp) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; uint64_t fragmentation = 0; @@ -1480,18 +1542,22 @@ metaslab_fragmentation(metaslab_t *msp) boolean_t feature_enabled = spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM); - if (!feature_enabled) - return (ZFS_FRAG_INVALID); + if (!feature_enabled) { + msp->ms_fragmentation = ZFS_FRAG_INVALID; + return; + } /* * A null space map means that the entire metaslab is free * and thus is not fragmented. */ - if (msp->ms_sm == NULL) - return (0); + if (msp->ms_sm == NULL) { + msp->ms_fragmentation = 0; + return; + } /* - * If this metaslab's space_map has not been upgraded, flag it + * If this metaslab's space map has not been upgraded, flag it * so that we upgrade next time we encounter it. */ if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { @@ -1504,12 +1570,14 @@ metaslab_fragmentation(metaslab_t *msp) spa_dbgmsg(spa, "txg %llu, requesting force condense: " "msp %p, vd %p", txg, msp, vd); } - return (ZFS_FRAG_INVALID); + msp->ms_fragmentation = ZFS_FRAG_INVALID; + return; } for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { uint64_t space = 0; uint8_t shift = msp->ms_sm->sm_shift; + int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, FRAGMENTATION_TABLE_SIZE - 1); @@ -1526,7 +1594,8 @@ metaslab_fragmentation(metaslab_t *msp) if (total > 0) fragmentation /= total; ASSERT3U(fragmentation, <=, 100); - return (fragmentation); + + msp->ms_fragmentation = fragmentation; } /* @@ -1535,30 +1604,20 @@ metaslab_fragmentation(metaslab_t *msp) * the LBA range, and whether the metaslab is loaded. */ static uint64_t -metaslab_weight(metaslab_t *msp) +metaslab_space_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; uint64_t weight, space; ASSERT(MUTEX_HELD(&msp->ms_lock)); - - /* - * This vdev is in the process of being removed so there is nothing - * for us to do here. - */ - if (vd->vdev_removing) { - ASSERT0(space_map_allocated(msp->ms_sm)); - ASSERT0(vd->vdev_ms_shift); - return (0); - } + ASSERT(!vd->vdev_removing); /* * The baseline weight is the metaslab's free space. */ space = msp->ms_size - space_map_allocated(msp->ms_sm); - msp->ms_fragmentation = metaslab_fragmentation(msp); if (metaslab_fragmentation_factor_enabled && msp->ms_fragmentation != ZFS_FRAG_INVALID) { /* @@ -1607,6 +1666,210 @@ metaslab_weight(metaslab_t *msp) weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); } + WEIGHT_SET_SPACEBASED(weight); + return (weight); +} + +/* + * Return the weight of the specified metaslab, according to the segment-based + * weighting algorithm. The metaslab must be loaded. This function can + * be called within a sync pass since it relies only on the metaslab's + * range tree which is always accurate when the metaslab is loaded. + */ +static uint64_t +metaslab_weight_from_range_tree(metaslab_t *msp) +{ + uint64_t weight = 0; + uint32_t segments = 0; + + ASSERT(msp->ms_loaded); + + for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; + i--) { + uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; + int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; + + segments <<= 1; + segments += msp->ms_tree->rt_histogram[i]; + + /* + * The range tree provides more precision than the space map + * and must be downgraded so that all values fit within the + * space map's histogram. This allows us to compare loaded + * vs. unloaded metaslabs to determine which metaslab is + * considered "best". + */ + if (i > max_idx) + continue; + + if (segments != 0) { + WEIGHT_SET_COUNT(weight, segments); + WEIGHT_SET_INDEX(weight, i); + WEIGHT_SET_ACTIVE(weight, 0); + break; + } + } + return (weight); +} + +/* + * Calculate the weight based on the on-disk histogram. This should only + * be called after a sync pass has completely finished since the on-disk + * information is updated in metaslab_sync(). + */ +static uint64_t +metaslab_weight_from_spacemap(metaslab_t *msp) +{ + uint64_t weight = 0; + + for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { + if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) { + WEIGHT_SET_COUNT(weight, + msp->ms_sm->sm_phys->smp_histogram[i]); + WEIGHT_SET_INDEX(weight, i + + msp->ms_sm->sm_shift); + WEIGHT_SET_ACTIVE(weight, 0); + break; + } + } + return (weight); +} + +/* + * Compute a segment-based weight for the specified metaslab. The weight + * is determined by highest bucket in the histogram. The information + * for the highest bucket is encoded into the weight value. + */ +static uint64_t +metaslab_segment_weight(metaslab_t *msp) +{ + metaslab_group_t *mg = msp->ms_group; + uint64_t weight = 0; + uint8_t shift = mg->mg_vd->vdev_ashift; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + /* + * The metaslab is completely free. + */ + if (space_map_allocated(msp->ms_sm) == 0) { + int idx = highbit64(msp->ms_size) - 1; + int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; + + if (idx < max_idx) { + WEIGHT_SET_COUNT(weight, 1ULL); + WEIGHT_SET_INDEX(weight, idx); + } else { + WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); + WEIGHT_SET_INDEX(weight, max_idx); + } + WEIGHT_SET_ACTIVE(weight, 0); + ASSERT(!WEIGHT_IS_SPACEBASED(weight)); + + return (weight); + } + + ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); + + /* + * If the metaslab is fully allocated then just make the weight 0. + */ + if (space_map_allocated(msp->ms_sm) == msp->ms_size) + return (0); + /* + * If the metaslab is already loaded, then use the range tree to + * determine the weight. Otherwise, we rely on the space map information + * to generate the weight. + */ + if (msp->ms_loaded) { + weight = metaslab_weight_from_range_tree(msp); + } else { + weight = metaslab_weight_from_spacemap(msp); + } + + /* + * If the metaslab was active the last time we calculated its weight + * then keep it active. We want to consume the entire region that + * is associated with this weight. + */ + if (msp->ms_activation_weight != 0 && weight != 0) + WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); + return (weight); +} + +/* + * Determine if we should attempt to allocate from this metaslab. If the + * metaslab has a maximum size then we can quickly determine if the desired + * allocation size can be satisfied. Otherwise, if we're using segment-based + * weighting then we can determine the maximum allocation that this metaslab + * can accommodate based on the index encoded in the weight. If we're using + * space-based weights then rely on the entire weight (excluding the weight + * type bit). + */ +boolean_t +metaslab_should_allocate(metaslab_t *msp, uint64_t asize) +{ + boolean_t should_allocate; + + if (msp->ms_max_size != 0) + return (msp->ms_max_size >= asize); + + if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { + /* + * The metaslab segment weight indicates segments in the + * range [2^i, 2^(i+1)), where i is the index in the weight. + * Since the asize might be in the middle of the range, we + * should attempt the allocation if asize < 2^(i+1). + */ + should_allocate = (asize < + 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); + } else { + should_allocate = (asize <= + (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); + } + return (should_allocate); +} + +static uint64_t +metaslab_weight(metaslab_t *msp) +{ + vdev_t *vd = msp->ms_group->mg_vd; + spa_t *spa = vd->vdev_spa; + uint64_t weight; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + /* + * This vdev is in the process of being removed so there is nothing + * for us to do here. + */ + if (vd->vdev_removing) { + ASSERT0(space_map_allocated(msp->ms_sm)); + ASSERT0(vd->vdev_ms_shift); + return (0); + } + + metaslab_set_fragmentation(msp); + + /* + * Update the maximum size if the metaslab is loaded. This will + * ensure that we get an accurate maximum size if newly freed space + * has been added back into the free tree. + */ + if (msp->ms_loaded) + msp->ms_max_size = metaslab_block_maxsize(msp); + + /* + * Segment-based weighting requires space map histogram support. + */ + if (zfs_metaslab_segment_weight_enabled && + spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && + (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == + sizeof (space_map_phys_t))) { + weight = metaslab_segment_weight(msp); + } else { + weight = metaslab_space_weight(msp); + } return (weight); } @@ -1625,6 +1888,7 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight) } } + msp->ms_activation_weight = msp->ms_weight; metaslab_group_sort(msp->ms_group, msp, msp->ms_weight | activation_weight); } @@ -1635,18 +1899,56 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight) } static void -metaslab_passivate(metaslab_t *msp, uint64_t size) +metaslab_passivate(metaslab_t *msp, uint64_t weight) { + uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; + /* * If size < SPA_MINBLOCKSIZE, then we will not allocate from * this metaslab again. In that case, it had better be empty, * or we would be leaving space on the table. */ - ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0); - metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); + ASSERT(size >= SPA_MINBLOCKSIZE || + range_tree_space(msp->ms_tree) == 0); + ASSERT0(weight & METASLAB_ACTIVE_MASK); + + msp->ms_activation_weight = 0; + metaslab_group_sort(msp->ms_group, msp, weight); ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); } +/* + * Segment-based metaslabs are activated once and remain active until + * we either fail an allocation attempt (similar to space-based metaslabs) + * or have exhausted the free space in zfs_metaslab_switch_threshold + * buckets since the metaslab was activated. This function checks to see + * if we've exhaused the zfs_metaslab_switch_threshold buckets in the + * metaslab and passivates it proactively. This will allow us to select a + * metaslabs with larger contiguous region if any remaining within this + * metaslab group. If we're in sync pass > 1, then we continue using this + * metaslab so that we don't dirty more block and cause more sync passes. + */ +void +metaslab_segment_may_passivate(metaslab_t *msp) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + + if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) + return; + + /* + * Since we are in the middle of a sync pass, the most accurate + * information that is accessible to us is the in-core range tree + * histogram; calculate the new weight based on that information. + */ + uint64_t weight = metaslab_weight_from_range_tree(msp); + int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); + int current_idx = WEIGHT_GET_INDEX(weight); + + if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) + metaslab_passivate(msp, weight); +} + static void metaslab_preload(void *arg) { @@ -1659,11 +1961,7 @@ metaslab_preload(void *arg) metaslab_load_wait(msp); if (!msp->ms_loaded) (void) metaslab_load(msp); - - /* - * Set the ms_access_txg value so that we don't unload it right away. - */ - msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1; + msp->ms_selected_txg = spa_syncing_txg(spa); mutex_exit(&msp->ms_lock); } @@ -1684,10 +1982,7 @@ metaslab_group_preload(metaslab_group_t *mg) /* * Load the next potential metaslabs */ - msp = avl_first(t); - while (msp != NULL) { - metaslab_t *msp_next = AVL_NEXT(t, msp); - + for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { /* * We preload only the maximum number of metaslabs specified * by metaslab_preload_limit. If a metaslab is being forced @@ -1695,27 +1990,11 @@ metaslab_group_preload(metaslab_group_t *mg) * that force condensing happens in the next txg. */ if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { - msp = msp_next; continue; } - /* - * We must drop the metaslab group lock here to preserve - * lock ordering with the ms_lock (when grabbing both - * the mg_lock and the ms_lock, the ms_lock must be taken - * first). As a result, it is possible that the ordering - * of the metaslabs within the avl tree may change before - * we reacquire the lock. The metaslab cannot be removed from - * the tree while we're in syncing context so it is safe to - * drop the mg_lock here. If the metaslabs are reordered - * nothing will break -- we just may end up loading a - * less than optimal one. - */ - mutex_exit(&mg->mg_lock); VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, msp, TQ_SLEEP) != NULL); - mutex_enter(&mg->mg_lock); - msp = msp_next; } mutex_exit(&mg->mg_lock); } @@ -1864,7 +2143,7 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) mutex_enter(&msp->ms_lock); /* - * While we would ideally like to create a space_map representation + * While we would ideally like to create a space map representation * that consists only of allocation records, doing so can be * prohibitively expensive because the in-core free tree can be * large, and therefore computationally expensive to subtract @@ -1927,7 +2206,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * metaslab_sync() is the metaslab's ms_tree. No other thread can * be modifying this txg's alloctree, freetree, freed_tree, or * space_map_phys_t. Therefore, we only hold ms_lock to satify - * space_map ASSERTs. We drop it whenever we call into the DMU, + * space map ASSERTs. We drop it whenever we call into the DMU, * because the DMU can call down to us (e.g. via zio_free()) at * any time. */ @@ -1949,7 +2228,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) mutex_enter(&msp->ms_lock); /* - * Note: metaslab_condense() clears the space_map's histogram. + * Note: metaslab_condense() clears the space map's histogram. * Therefore we must verify and remove this histogram before * condensing. */ @@ -1974,16 +2253,38 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) */ space_map_histogram_clear(msp->ms_sm); space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx); - } else { + /* - * Since the space map is not loaded we simply update the - * exisiting histogram with what was freed in this txg. This - * means that the on-disk histogram may not have an accurate - * view of the free space but it's close enough to allow - * us to make allocation decisions. + * Since we've cleared the histogram we need to add back + * any free space that has already been processed, plus + * any deferred space. This allows the on-disk histogram + * to accurately reflect all free space even if some space + * is not yet available for allocation (i.e. deferred). */ - space_map_histogram_add(msp->ms_sm, *freetree, tx); + space_map_histogram_add(msp->ms_sm, *freed_tree, tx); + + /* + * Add back any deferred free space that has not been + * added back into the in-core free tree yet. This will + * ensure that we don't end up with a space map histogram + * that is completely empty unless the metaslab is fully + * allocated. + */ + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + space_map_histogram_add(msp->ms_sm, + msp->ms_defertree[t], tx); + } } + + /* + * Always add the free space from this sync pass to the space + * map histogram. We want to make sure that the on-disk histogram + * accounts for all free space. If the space map is not loaded, + * then we will lose some accuracy but will correct it the next + * time we load the space map. + */ + space_map_histogram_add(msp->ms_sm, *freetree, tx); + metaslab_group_histogram_add(mg, msp); metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); @@ -2002,6 +2303,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) range_tree_vacate(alloctree, NULL, NULL); ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); mutex_exit(&msp->ms_lock); @@ -2023,9 +2325,11 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; + spa_t *spa = vd->vdev_spa; range_tree_t **freed_tree; range_tree_t **defer_tree; int64_t alloc_delta, defer_delta; + boolean_t defer_allowed = B_TRUE; ASSERT(!vd->vdev_ishole); @@ -2060,9 +2364,20 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE]; + uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - + metaslab_class_get_alloc(spa_normal_class(spa)); + if (free_space <= spa_get_slop_space(spa)) { + defer_allowed = B_FALSE; + } + + defer_delta = 0; alloc_delta = space_map_alloc_delta(msp->ms_sm); - defer_delta = range_tree_space(*freed_tree) - - range_tree_space(*defer_tree); + if (defer_allowed) { + defer_delta = range_tree_space(*freed_tree) - + range_tree_space(*defer_tree); + } else { + defer_delta -= range_tree_space(*defer_tree); + } vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); @@ -2083,7 +2398,12 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) */ range_tree_vacate(*defer_tree, msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); - range_tree_swap(freed_tree, defer_tree); + if (defer_allowed) { + range_tree_swap(freed_tree, defer_tree); + } else { + range_tree_vacate(*freed_tree, + msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); + } space_map_update(msp->ms_sm); @@ -2098,7 +2418,18 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); } - if (msp->ms_loaded && msp->ms_access_txg < txg) { + /* + * Calculate the new weights before unloading any metaslabs. + * This will give us the most accurate weighting. + */ + metaslab_group_sort(mg, msp, metaslab_weight(msp)); + + /* + * If the metaslab is loaded and we've not tried to load or allocate + * from it in 'metaslab_unload_delay' txgs, then unload it. + */ + if (msp->ms_loaded && + msp->ms_selected_txg + metaslab_unload_delay < txg) { for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { VERIFY0(range_tree_space( msp->ms_alloctree[(txg + t) & TXG_MASK])); @@ -2108,7 +2439,6 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) metaslab_unload(msp); } - metaslab_group_sort(mg, msp, metaslab_weight(msp)); mutex_exit(&msp->ms_lock); } @@ -2141,6 +2471,113 @@ metaslab_distance(metaslab_t *msp, dva_t *dva) return (0); } +/* + * ========================================================================== + * Metaslab allocation tracing facility + * ========================================================================== + */ +kstat_t *metaslab_trace_ksp; +kstat_named_t metaslab_trace_over_limit; + +void +metaslab_alloc_trace_init(void) +{ + ASSERT(metaslab_alloc_trace_cache == NULL); + metaslab_alloc_trace_cache = kmem_cache_create( + "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", + "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); + if (metaslab_trace_ksp != NULL) { + metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; + kstat_named_init(&metaslab_trace_over_limit, + "metaslab_trace_over_limit", KSTAT_DATA_UINT64); + kstat_install(metaslab_trace_ksp); + } +} + +void +metaslab_alloc_trace_fini(void) +{ + if (metaslab_trace_ksp != NULL) { + kstat_delete(metaslab_trace_ksp); + metaslab_trace_ksp = NULL; + } + kmem_cache_destroy(metaslab_alloc_trace_cache); + metaslab_alloc_trace_cache = NULL; +} + +/* + * Add an allocation trace element to the allocation tracing list. + */ +static void +metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, + metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset) +{ + if (!metaslab_trace_enabled) + return; + + /* + * When the tracing list reaches its maximum we remove + * the second element in the list before adding a new one. + * By removing the second element we preserve the original + * entry as a clue to what allocations steps have already been + * performed. + */ + if (zal->zal_size == metaslab_trace_max_entries) { + metaslab_alloc_trace_t *mat_next; +#ifdef DEBUG + panic("too many entries in allocation list"); +#endif + atomic_inc_64(&metaslab_trace_over_limit.value.ui64); + zal->zal_size--; + mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); + list_remove(&zal->zal_list, mat_next); + kmem_cache_free(metaslab_alloc_trace_cache, mat_next); + } + + metaslab_alloc_trace_t *mat = + kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); + list_link_init(&mat->mat_list_node); + mat->mat_mg = mg; + mat->mat_msp = msp; + mat->mat_size = psize; + mat->mat_dva_id = dva_id; + mat->mat_offset = offset; + mat->mat_weight = 0; + + if (msp != NULL) + mat->mat_weight = msp->ms_weight; + + /* + * The list is part of the zio so locking is not required. Only + * a single thread will perform allocations for a given zio. + */ + list_insert_tail(&zal->zal_list, mat); + zal->zal_size++; + + ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); +} + +void +metaslab_trace_init(zio_alloc_list_t *zal) +{ + list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), + offsetof(metaslab_alloc_trace_t, mat_list_node)); + zal->zal_size = 0; +} + +void +metaslab_trace_fini(zio_alloc_list_t *zal) +{ + metaslab_alloc_trace_t *mat; + + while ((mat = list_remove_head(&zal->zal_list)) != NULL) + kmem_cache_free(metaslab_alloc_trace_cache, mat); + list_destroy(&zal->zal_list); + zal->zal_size = 0; +} + /* * ========================================================================== * Metaslab block operations @@ -2191,13 +2628,48 @@ metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) } static uint64_t -metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize, - uint64_t txg, uint64_t min_distance, dva_t *dva, int d) +metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) +{ + uint64_t start; + range_tree_t *rt = msp->ms_tree; + metaslab_class_t *mc = msp->ms_group->mg_class; + + VERIFY(!msp->ms_condensing); + + start = mc->mc_ops->msop_alloc(msp, size); + if (start != -1ULL) { + metaslab_group_t *mg = msp->ms_group; + vdev_t *vd = mg->mg_vd; + + VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); + VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); + range_tree_remove(rt, start, size); + + if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) + vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); + + range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size); + + /* Track the last successful allocation */ + msp->ms_alloc_txg = txg; + metaslab_verify_space(msp, txg); + } + + /* + * Now that we've attempted the allocation we need to update the + * metaslab's maximum block size since it may have changed. + */ + msp->ms_max_size = metaslab_block_maxsize(msp); + return (start); +} + +static uint64_t +metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, + uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) { - spa_t *spa = mg->mg_vd->vdev_spa; metaslab_t *msp = NULL; uint64_t offset = -1ULL; - avl_tree_t *t = &mg->mg_metaslab_tree; uint64_t activation_weight; uint64_t target_distance; int i; @@ -2210,20 +2682,39 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize, } } + metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); + search->ms_weight = UINT64_MAX; + search->ms_start = 0; for (;;) { boolean_t was_active; + avl_tree_t *t = &mg->mg_metaslab_tree; + avl_index_t idx; mutex_enter(&mg->mg_lock); - for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { - if (msp->ms_weight < asize) { - spa_dbgmsg(spa, "%s: failed to meet weight " - "requirement: vdev %llu, txg %llu, mg %p, " - "msp %p, asize %llu, " - "weight %llu", spa_name(spa), - mg->mg_vd->vdev_id, txg, - mg, msp, asize, msp->ms_weight); - mutex_exit(&mg->mg_lock); - return (-1ULL); + + /* + * Find the metaslab with the highest weight that is less + * than what we've already tried. In the common case, this + * means that we will examine each metaslab at most once. + * Note that concurrent callers could reorder metaslabs + * by activation/passivation once we have dropped the mg_lock. + * If a metaslab is activated by another thread, and we fail + * to allocate from the metaslab we have selected, we may + * not try the newly-activated metaslab, and instead activate + * another metaslab. This is not optimal, but generally + * does not cause any problems (a possible exception being + * if every metaslab is completely full except for the + * the newly-activated metaslab which we fail to examine). + */ + msp = avl_find(t, search, &idx); + if (msp == NULL) + msp = avl_nearest(t, idx, AVL_AFTER); + for (; msp != NULL; msp = AVL_NEXT(t, msp)) { + + if (!metaslab_should_allocate(msp, asize)) { + metaslab_trace_add(zal, mg, msp, asize, d, + TRACE_TOO_SMALL); + continue; } /* @@ -2240,16 +2731,21 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize, (space_map_allocated(msp->ms_sm) != 0 ? 0 : min_distance >> 1); - for (i = 0; i < d; i++) + for (i = 0; i < d; i++) { if (metaslab_distance(msp, &dva[i]) < target_distance) break; + } if (i == d) break; } mutex_exit(&mg->mg_lock); - if (msp == NULL) + if (msp == NULL) { + kmem_free(search, sizeof (*search)); return (-1ULL); + } + search->ms_weight = msp->ms_weight; + search->ms_start = msp->ms_start + 1; mutex_enter(&msp->ms_lock); @@ -2257,11 +2753,11 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize, * Ensure that the metaslab we have selected is still * capable of handling our request. It's possible that * another thread may have changed the weight while we - * were blocked on the metaslab lock. + * were blocked on the metaslab lock. We check the + * active status first to see if we need to reselect + * a new metaslab. */ - if (msp->ms_weight < asize || (was_active && - !(msp->ms_weight & METASLAB_ACTIVE_MASK) && - activation_weight == METASLAB_WEIGHT_PRIMARY)) { + if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { mutex_exit(&msp->ms_lock); continue; } @@ -2278,6 +2774,21 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize, mutex_exit(&msp->ms_lock); continue; } + msp->ms_selected_txg = txg; + + /* + * Now that we have the lock, recheck to see if we should + * continue to use this metaslab for this allocation. The + * the metaslab is now loaded so metaslab_should_allocate() can + * accurately determine if the allocation attempt should + * proceed. + */ + if (!metaslab_should_allocate(msp, asize)) { + /* Passivate this metaslab and select a new one. */ + metaslab_trace_add(zal, mg, msp, asize, d, + TRACE_TOO_SMALL); + goto next; + } /* * If this metaslab is currently condensing then pick again as @@ -2285,50 +2796,131 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize, * to disk. */ if (msp->ms_condensing) { + metaslab_trace_add(zal, mg, msp, asize, d, + TRACE_CONDENSING); mutex_exit(&msp->ms_lock); continue; } - if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL) - break; + offset = metaslab_block_alloc(msp, asize, txg); + metaslab_trace_add(zal, mg, msp, asize, d, offset); - metaslab_passivate(msp, metaslab_block_maxsize(msp)); + if (offset != -1ULL) { + /* Proactively passivate the metaslab, if needed */ + metaslab_segment_may_passivate(msp); + break; + } +next: + ASSERT(msp->ms_loaded); + + /* + * We were unable to allocate from this metaslab so determine + * a new weight for this metaslab. Now that we have loaded + * the metaslab we can provide a better hint to the metaslab + * selector. + * + * For space-based metaslabs, we use the maximum block size. + * This information is only available when the metaslab + * is loaded and is more accurate than the generic free + * space weight that was calculated by metaslab_weight(). + * This information allows us to quickly compare the maximum + * available allocation in the metaslab to the allocation + * size being requested. + * + * For segment-based metaslabs, determine the new weight + * based on the highest bucket in the range tree. We + * explicitly use the loaded segment weight (i.e. the range + * tree histogram) since it contains the space that is + * currently available for allocation and is accurate + * even within a sync pass. + */ + if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { + uint64_t weight = metaslab_block_maxsize(msp); + WEIGHT_SET_SPACEBASED(weight); + metaslab_passivate(msp, weight); + } else { + metaslab_passivate(msp, + metaslab_weight_from_range_tree(msp)); + } + + /* + * We have just failed an allocation attempt, check + * that metaslab_should_allocate() agrees. Otherwise, + * we may end up in an infinite loop retrying the same + * metaslab. + */ + ASSERT(!metaslab_should_allocate(msp, asize)); mutex_exit(&msp->ms_lock); } - - if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) - vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); - - range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize); - msp->ms_access_txg = txg + metaslab_unload_delay; - mutex_exit(&msp->ms_lock); + kmem_free(search, sizeof (*search)); return (offset); } +static uint64_t +metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, + uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) +{ + uint64_t offset; + ASSERT(mg->mg_initialized); + + offset = metaslab_group_alloc_normal(mg, zal, asize, txg, + min_distance, dva, d); + + mutex_enter(&mg->mg_lock); + if (offset == -1ULL) { + mg->mg_failed_allocations++; + metaslab_trace_add(zal, mg, NULL, asize, d, + TRACE_GROUP_FAILURE); + if (asize == SPA_GANGBLOCKSIZE) { + /* + * This metaslab group was unable to allocate + * the minimum gang block size so it must be out of + * space. We must notify the allocation throttle + * to start skipping allocation attempts to this + * metaslab group until more space becomes available. + * Note: this failure cannot be caused by the + * allocation throttle since the allocation throttle + * is only responsible for skipping devices and + * not failing block allocations. + */ + mg->mg_no_free_space = B_TRUE; + } + } + mg->mg_allocations++; + mutex_exit(&mg->mg_lock); + return (offset); +} + +/* + * If we have to write a ditto block (i.e. more than one DVA for a given BP) + * on the same vdev as an existing DVA of this BP, then try to allocate it + * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the + * existing DVAs. + */ +int ditto_same_vdev_distance_shift = 3; + /* * Allocate a block for the specified i/o. */ static int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, - dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) + dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, + zio_alloc_list_t *zal) { metaslab_group_t *mg, *rotor; vdev_t *vd; - int dshift = 3; - int all_zero; - int zio_lock = B_FALSE; - boolean_t allocatable; - uint64_t asize; - uint64_t distance; + boolean_t try_hard = B_FALSE; ASSERT(!DVA_IS_VALID(&dva[d])); /* * For testing, make some blocks above a certain size be gang blocks. */ - if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) + if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) { + metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG); return (SET_ERROR(ENOSPC)); + } /* * Start at the rotor and loop through all mgs until we find something. @@ -2385,15 +2977,16 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, rotor = mg; top: - all_zero = B_TRUE; do { + boolean_t allocatable; + ASSERT(mg->mg_activation_count == 1); vd = mg->mg_vd; /* * Don't allocate from faulted devices. */ - if (zio_lock) { + if (try_hard) { spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); allocatable = vdev_allocatable(vd); spa_config_exit(spa, SCL_ZIO, FTAG); @@ -2408,63 +3001,54 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * inadvertently return ENOSPC and suspend the pool * even though space is still available. */ - if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) { + if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { allocatable = metaslab_group_allocatable(mg, rotor, psize); } - if (!allocatable) + if (!allocatable) { + metaslab_trace_add(zal, mg, NULL, psize, d, + TRACE_NOT_ALLOCATABLE); goto next; + } ASSERT(mg->mg_initialized); /* - * Avoid writing single-copy data to a failing vdev. + * Avoid writing single-copy data to a failing, + * non-redundant vdev, unless we've already tried all + * other vdevs. */ if ((vd->vdev_stat.vs_write_errors > 0 || vd->vdev_state < VDEV_STATE_HEALTHY) && - d == 0 && dshift == 3 && vd->vdev_children == 0) { - all_zero = B_FALSE; + d == 0 && !try_hard && vd->vdev_children == 0) { + metaslab_trace_add(zal, mg, NULL, psize, d, + TRACE_VDEV_ERROR); goto next; } ASSERT(mg->mg_class == mc); - distance = vd->vdev_asize >> dshift; - if (distance <= (1ULL << vd->vdev_ms_shift)) - distance = 0; - else - all_zero = B_FALSE; + /* + * If we don't need to try hard, then require that the + * block be 1/8th of the device away from any other DVAs + * in this BP. If we are trying hard, allow any offset + * to be used (distance=0). + */ + uint64_t distance = 0; + if (!try_hard) { + distance = vd->vdev_asize >> + ditto_same_vdev_distance_shift; + if (distance <= (1ULL << vd->vdev_ms_shift)) + distance = 0; + } - asize = vdev_psize_to_asize(vd, psize); + uint64_t asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - uint64_t offset = metaslab_group_alloc(mg, asize, txg, + uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, distance, dva, d); - mutex_enter(&mg->mg_lock); - if (offset == -1ULL) { - mg->mg_failed_allocations++; - if (asize == SPA_GANGBLOCKSIZE) { - /* - * This metaslab group was unable to allocate - * the minimum gang block size so it must be - * out of space. We must notify the allocation - * throttle to start skipping allocation - * attempts to this metaslab group until more - * space becomes available. - * - * Note: this failure cannot be caused by the - * allocation throttle since the allocation - * throttle is only responsible for skipping - * devices and not failing block allocations. - */ - mg->mg_no_free_space = B_TRUE; - } - } - mg->mg_allocations++; - mutex_exit(&mg->mg_lock); - if (offset != -1ULL) { /* * If we've just selected this metaslab group, @@ -2516,20 +3100,17 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, mc->mc_aliquot = 0; } while ((mg = mg->mg_next) != rotor); - if (!all_zero) { - dshift++; - ASSERT(dshift < 64); - goto top; - } - - if (!allocatable && !zio_lock) { - dshift = 3; - zio_lock = B_TRUE; + /* + * If we haven't tried hard, do so now. + */ + if (!try_hard) { + try_hard = B_TRUE; goto top; } bzero(&dva[d], sizeof (dva_t)); + metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC); return (SET_ERROR(ENOSPC)); } @@ -2578,6 +3159,7 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); range_tree_add(msp->ms_tree, offset, size); + msp->ms_max_size = metaslab_block_maxsize(msp); } else { if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0) vdev_dirty(vd, VDD_METASLAB, msp, txg); @@ -2695,7 +3277,8 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio) int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, - int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio) + int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, + zio_alloc_list_t *zal, zio_t *zio) { dva_t *dva = bp->blk_dva; dva_t *hintdva = hintbp->blk_dva; @@ -2714,10 +3297,11 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); ASSERT(BP_GET_NDVAS(bp) == 0); ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); + ASSERT3P(zal, !=, NULL); for (int d = 0; d < ndvas; d++) { error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, - txg, flags); + txg, flags, zal); if (error != 0) { for (d--; d >= 0; d--) { metaslab_free_dva(spa, &dva[d], txg, B_TRUE); diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c index 4b26b007a71c..bb1ecaa4cc4a 100644 --- a/uts/common/fs/zfs/spa.c +++ b/uts/common/fs/zfs/spa.c @@ -1240,6 +1240,19 @@ spa_unload(spa_t *spa) spa->spa_sync_on = B_FALSE; } + /* + * Even though vdev_free() also calls vdev_metaslab_fini, we need + * to call it earlier, before we wait for async i/o to complete. + * This ensures that there is no async metaslab prefetching, by + * calling taskq_wait(mg_taskq). + */ + if (spa->spa_root_vdev != NULL) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) + vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); + spa_config_exit(spa, SCL_ALL, FTAG); + } + /* * Wait for any outstanding async I/O to complete. */ diff --git a/uts/common/fs/zfs/spa_misc.c b/uts/common/fs/zfs/spa_misc.c index 7e00d9f42a15..5882d18f41ce 100644 --- a/uts/common/fs/zfs/spa_misc.c +++ b/uts/common/fs/zfs/spa_misc.c @@ -1877,6 +1877,7 @@ spa_init(int mode) refcount_init(); unique_init(); range_tree_init(); + metaslab_alloc_trace_init(); zio_init(); dmu_init(); zil_init(); @@ -1899,6 +1900,7 @@ spa_fini(void) zil_fini(); dmu_fini(); zio_fini(); + metaslab_alloc_trace_fini(); range_tree_fini(); unique_fini(); refcount_fini(); diff --git a/uts/common/fs/zfs/space_map.c b/uts/common/fs/zfs/space_map.c index e0fe6ac2604e..0b3af50a11fd 100644 --- a/uts/common/fs/zfs/space_map.c +++ b/uts/common/fs/zfs/space_map.c @@ -170,7 +170,6 @@ space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx) dmu_buf_will_dirty(sm->sm_dbuf, tx); ASSERT(space_map_histogram_verify(sm, rt)); - /* * Transfer the content of the range tree histogram to the space * map histogram. The space map histogram contains 32 buckets ranging diff --git a/uts/common/fs/zfs/sys/metaslab.h b/uts/common/fs/zfs/sys/metaslab.h index f7271f08ad4e..82ed08c728bb 100644 --- a/uts/common/fs/zfs/sys/metaslab.h +++ b/uts/common/fs/zfs/sys/metaslab.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_H @@ -36,10 +36,12 @@ extern "C" { #endif + typedef struct metaslab_ops { - uint64_t (*msop_alloc)(metaslab_t *msp, uint64_t size); + uint64_t (*msop_alloc)(metaslab_t *, uint64_t); } metaslab_ops_t; + extern metaslab_ops_t *zfs_metaslab_ops; int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t, @@ -63,11 +65,16 @@ uint64_t metaslab_block_maxsize(metaslab_t *); #define METASLAB_DONT_THROTTLE 0x10 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, - blkptr_t *, int, uint64_t, blkptr_t *, int, zio_t *); + blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); void metaslab_check_free(spa_t *, const blkptr_t *); +void metaslab_alloc_trace_init(void); +void metaslab_alloc_trace_fini(void); +void metaslab_trace_init(zio_alloc_list_t *); +void metaslab_trace_fini(zio_alloc_list_t *); + metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *); void metaslab_class_destroy(metaslab_class_t *); int metaslab_class_validate(metaslab_class_t *); diff --git a/uts/common/fs/zfs/sys/metaslab_impl.h b/uts/common/fs/zfs/sys/metaslab_impl.h index 1c8993aca55a..c43f457b9fcd 100644 --- a/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/uts/common/fs/zfs/sys/metaslab_impl.h @@ -41,6 +41,94 @@ extern "C" { #endif +/* + * Metaslab allocation tracing record. + */ +typedef struct metaslab_alloc_trace { + list_node_t mat_list_node; + metaslab_group_t *mat_mg; + metaslab_t *mat_msp; + uint64_t mat_size; + uint64_t mat_weight; + uint32_t mat_dva_id; + uint64_t mat_offset; +} metaslab_alloc_trace_t; + +/* + * Used by the metaslab allocation tracing facility to indicate + * error conditions. These errors are stored to the offset member + * of the metaslab_alloc_trace_t record and displayed by mdb. + */ +typedef enum trace_alloc_type { + TRACE_ALLOC_FAILURE = -1ULL, + TRACE_TOO_SMALL = -2ULL, + TRACE_FORCE_GANG = -3ULL, + TRACE_NOT_ALLOCATABLE = -4ULL, + TRACE_GROUP_FAILURE = -5ULL, + TRACE_ENOSPC = -6ULL, + TRACE_CONDENSING = -7ULL, + TRACE_VDEV_ERROR = -8ULL +} trace_alloc_type_t; + +#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) +#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) +#define METASLAB_WEIGHT_TYPE (1ULL << 61) +#define METASLAB_ACTIVE_MASK \ + (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) + +/* + * The metaslab weight is used to encode the amount of free space in a + * metaslab, such that the "best" metaslab appears first when sorting the + * metaslabs by weight. The weight (and therefore the "best" metaslab) can + * be determined in two different ways: by computing a weighted sum of all + * the free space in the metaslab (a space based weight) or by counting only + * the free segments of the largest size (a segment based weight). We prefer + * the segment based weight because it reflects how the free space is + * comprised, but we cannot always use it -- legacy pools do not have the + * space map histogram information necessary to determine the largest + * contiguous regions. Pools that have the space map histogram determine + * the segment weight by looking at each bucket in the histogram and + * determining the free space whose size in bytes is in the range: + * [2^i, 2^(i+1)) + * We then encode the largest index, i, that contains regions into the + * segment-weighted value. + * + * Space-based weight: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * |PS1| weighted-free space | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * PS - indicates primary and secondary activation + * space - the fragmentation-weighted space + * + * Segment-based weight: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * |PS0| idx| count of segments in region | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * PS - indicates primary and secondary activation + * idx - index for the highest bucket in the histogram + * count - number of segments in the specified bucket + */ +#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 62, 2) +#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 62, 2, x) + +#define WEIGHT_IS_SPACEBASED(weight) \ + ((weight) == 0 || BF64_GET((weight), 61, 1)) +#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 61, 1, 1) + +/* + * These macros are only applicable to segment-based weighting. + */ +#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 55, 6) +#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 55, 6, x) +#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 55) +#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 55, x) + /* * A metaslab class encompasses a category of allocatable top-level vdevs. * Each top-level vdev is associated with a metaslab group which defines @@ -220,7 +308,6 @@ struct metaslab { kmutex_t ms_lock; kcondvar_t ms_load_cv; space_map_t *ms_sm; - metaslab_ops_t *ms_ops; uint64_t ms_id; uint64_t ms_start; uint64_t ms_size; @@ -233,12 +320,27 @@ struct metaslab { boolean_t ms_condensing; /* condensing? */ boolean_t ms_condense_wanted; + + /* + * We must hold both ms_lock and ms_group->mg_lock in order to + * modify ms_loaded. + */ boolean_t ms_loaded; boolean_t ms_loading; int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ - uint64_t ms_access_txg; + uint64_t ms_activation_weight; /* activation weight */ + + /* + * Track of whenever a metaslab is selected for loading or allocation. + * We use this value to determine how long the metaslab should + * stay cached. + */ + uint64_t ms_selected_txg; + + uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ + uint64_t ms_max_size; /* maximum allocatable size */ /* * The metaslab block allocators can optionally use a size-ordered diff --git a/uts/common/fs/zfs/sys/zfs_debug.h b/uts/common/fs/zfs/sys/zfs_debug.h index 3dd992bd2559..06c73f394181 100644 --- a/uts/common/fs/zfs/sys/zfs_debug.h +++ b/uts/common/fs/zfs/sys/zfs_debug.h @@ -50,14 +50,15 @@ extern int zfs_flags; extern boolean_t zfs_recover; extern boolean_t zfs_free_leak_on_eio; -#define ZFS_DEBUG_DPRINTF (1<<0) -#define ZFS_DEBUG_DBUF_VERIFY (1<<1) -#define ZFS_DEBUG_DNODE_VERIFY (1<<2) -#define ZFS_DEBUG_SNAPNAMES (1<<3) -#define ZFS_DEBUG_MODIFY (1<<4) -#define ZFS_DEBUG_SPA (1<<5) -#define ZFS_DEBUG_ZIO_FREE (1<<6) -#define ZFS_DEBUG_HISTOGRAM_VERIFY (1<<7) +#define ZFS_DEBUG_DPRINTF (1 << 0) +#define ZFS_DEBUG_DBUF_VERIFY (1 << 1) +#define ZFS_DEBUG_DNODE_VERIFY (1 << 2) +#define ZFS_DEBUG_SNAPNAMES (1 << 3) +#define ZFS_DEBUG_MODIFY (1 << 4) +#define ZFS_DEBUG_SPA (1 << 5) +#define ZFS_DEBUG_ZIO_FREE (1 << 6) +#define ZFS_DEBUG_HISTOGRAM_VERIFY (1 << 7) +#define ZFS_DEBUG_METASLAB_VERIFY (1 << 8) #ifdef ZFS_DEBUG extern void __dprintf(const char *file, const char *func, diff --git a/uts/common/fs/zfs/sys/zio.h b/uts/common/fs/zfs/sys/zio.h index 88159a5408bf..a110da20c838 100644 --- a/uts/common/fs/zfs/sys/zio.h +++ b/uts/common/fs/zfs/sys/zio.h @@ -382,6 +382,11 @@ typedef int zio_pipe_stage_t(zio_t *zio); #define ZIO_REEXECUTE_NOW 0x01 #define ZIO_REEXECUTE_SUSPEND 0x02 +typedef struct zio_alloc_list { + list_t zal_list; + uint64_t zal_size; +} zio_alloc_list_t; + typedef struct zio_link { zio_t *zl_parent; zio_t *zl_child; @@ -436,6 +441,7 @@ struct zio { avl_node_t io_queue_node; avl_node_t io_offset_node; avl_node_t io_alloc_node; + zio_alloc_list_t io_alloc_list; /* Internal pipeline state */ enum zio_flag io_flags; diff --git a/uts/common/fs/zfs/zio.c b/uts/common/fs/zfs/zio.c index be7087d4db48..0e3c94444ca3 100644 --- a/uts/common/fs/zfs/zio.c +++ b/uts/common/fs/zfs/zio.c @@ -543,6 +543,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, offsetof(zio_link_t, zl_parent_node)); list_create(&zio->io_child_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_child_node)); + metaslab_trace_init(&zio->io_alloc_list); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; @@ -601,6 +602,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, static void zio_destroy(zio_t *zio) { + metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); @@ -2099,7 +2101,8 @@ zio_write_gang_block(zio_t *pio) } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, - bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, pio); + bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, + &pio->io_alloc_list, pio); if (error) { if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); @@ -2752,7 +2755,8 @@ zio_dva_allocate(zio_t *zio) } error = metaslab_alloc(spa, mc, zio->io_size, bp, - zio->io_prop.zp_copies, zio->io_txg, NULL, flags, zio); + zio->io_prop.zp_copies, zio->io_txg, NULL, flags, + &zio->io_alloc_list, zio); if (error != 0) { spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " @@ -2816,18 +2820,24 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t use_slog) { int error = 1; + zio_alloc_list_t io_alloc_list; ASSERT(txg > spa_syncing_txg(spa)); + metaslab_trace_init(&io_alloc_list); + if (use_slog) { error = metaslab_alloc(spa, spa_log_class(spa), size, - new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL); + new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, + &io_alloc_list, NULL); } if (error) { error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL); + new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, + &io_alloc_list, NULL); } + metaslab_trace_fini(&io_alloc_list); if (error == 0) { BP_SET_LSIZE(new_bp, size);