diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index fd0d23502a43..a513a647039c 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -119,6 +119,7 @@ void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
 void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
     boolean_t);
 void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
+void metaslab_recalculate_weight_and_sort(metaslab_t *);
 
 #ifdef __cplusplus
 }
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index 02ce02226b5d..676c5dd46bf3 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -402,6 +402,49 @@ struct metaslab {
 	boolean_t	ms_loaded;
 	boolean_t	ms_loading;
 
+	/*
+	 * The following histograms count entries that are in the
+	 * metaslab's space map (and its histogram) but are not in
+	 * ms_allocatable yet, because they are in ms_freed, ms_freeing,
+	 * or ms_defer[].
+	 *
+	 * When the metaslab is not loaded, its ms_weight needs to
+	 * reflect what is allocatable (i.e. what will be part of
+	 * ms_allocatable if it is loaded). The weight is computed from
+	 * the spacemap histogram, but that includes ranges that are
+	 * not yet allocatable (because they are in ms_freed,
+	 * ms_freeing, or ms_defer[]). Therefore, when calculating the
+	 * weight, we need to remove those ranges.
+	 *
+	 * The ranges in the ms_freed and ms_defer[] range trees are all
+	 * present in the spacemap. However, the spacemap may have
+	 * multiple entries to represent a contiguous range, because it
+	 * is written across multiple sync passes, but the changes of
+	 * all sync passes are consolidated into the range trees.
+	 * Adjacent ranges that are freed in different sync passes of
+	 * one txg will be represented separately (as 2 or more entries)
+	 * in the space map (and its histogram), but these adjacent
+	 * ranges will be consolidated (represented as one entry) in the
+	 * ms_freed/ms_defer[] range trees (and their histograms).
+	 *
+	 * When calculating the weight, we cannot simply subtract the
+	 * range trees' histograms from the spacemap's histogram,
+	 * because the range trees' histograms may have entries in
+	 * higher buckets than the spacemap, due to consolidation.
+	 * Instead we must subtract the exact entries that were added to
+	 * the spacemap's histogram. ms_synchist and ms_deferhist[]
+	 * represent these exact entries, so we can subtract them from
+	 * the spacemap's histogram when calculating ms_weight.
+	 *
+	 * ms_synchist represents the same ranges as ms_freeing +
+	 * ms_freed, but without consolidation across sync passes.
+	 *
+	 * ms_deferhist[i] represents the same ranges as ms_defer[i],
+	 * but without consolidation across sync passes.
+	 */
+	uint64_t	ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
+	uint64_t	ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];
+
 	/*
 	 * Tracks the exact amount of allocated space of this metaslab
 	 * (and specifically the metaslab's space map) up to the most
diff --git a/include/sys/space_map.h b/include/sys/space_map.h
index 52536cccca46..7731a352f15d 100644
--- a/include/sys/space_map.h
+++ b/include/sys/space_map.h
@@ -201,6 +201,7 @@ int space_map_iterate(space_map_t *sm, uint64_t length,
 int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
     dmu_tx_t *tx);
 
+boolean_t space_map_histogram_verify(space_map_t *sm, range_tree_t *rt);
 void space_map_histogram_clear(space_map_t *sm);
 void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
     dmu_tx_t *tx);
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 58c47a0abfb2..9f6f0048fcba 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -856,6 +856,7 @@ metaslab_group_histogram_verify(metaslab_group_t *mg)
 
 	for (int m = 0; m < vd->vdev_ms_count; m++) {
 		metaslab_t *msp = vd->vdev_ms[m];
+		ASSERT(msp != NULL);
 
 		/* skip if not active or not a member */
 		if (msp->ms_sm == NULL || msp->ms_group != mg)
@@ -1416,6 +1417,203 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
  * ==========================================================================
  */
 
+static void
+metaslab_aux_histograms_clear(metaslab_t *msp)
+{
+	/*
+	 * Auxiliary histograms are only cleared when resetting them,
+	 * which can only happen while the metaslab is loaded.
+	 */
+	ASSERT(msp->ms_loaded);
+
+	bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
+	for (int t = 0; t < TXG_DEFER_SIZE; t++)
+		bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
+}
+
+static void
+metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
+    range_tree_t *rt)
+{
+	/*
+	 * This is modeled after space_map_histogram_add(), so refer to that
+	 * function for implementation details. We want this to work like
+	 * the space map histogram, and not the range tree histogram, as we
+	 * are essentially constructing a delta that will be later subtracted
+	 * from the space map histogram.
+	 */
+	int idx = 0;
+	for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+		ASSERT3U(i, >=, idx + shift);
+		histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
+
+		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
+			ASSERT3U(idx + shift, ==, i);
+			idx++;
+			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
+		}
+	}
+}
+
+/*
+ * Called at every sync pass in which the metaslab gets synced.
+ *
+ * The reason is that we want our auxiliary histograms to be updated
+ * wherever the metaslab's space map histogram is updated. This way
+ * we stay consistent on which parts of the metaslab space map's
+ * histogram are currently not available for allocations (e.g. because
+ * they are in the defer, freed, and freeing trees).
+ */
+static void
+metaslab_aux_histograms_update(metaslab_t *msp)
+{
+	space_map_t *sm = msp->ms_sm;
+	ASSERT(sm != NULL);
+
+	/*
+	 * This is similar to the metaslab's space map histogram updates
+	 * that take place in metaslab_sync(). The only difference is that
+	 * we only care about segments that haven't made it into the
+	 * ms_allocatable tree yet.
+	 */
+	if (msp->ms_loaded) {
+		metaslab_aux_histograms_clear(msp);
+
+		metaslab_aux_histogram_add(msp->ms_synchist,
+		    sm->sm_shift, msp->ms_freed);
+
+		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+			metaslab_aux_histogram_add(msp->ms_deferhist[t],
+			    sm->sm_shift, msp->ms_defer[t]);
+		}
+	}
+
+	metaslab_aux_histogram_add(msp->ms_synchist,
+	    sm->sm_shift, msp->ms_freeing);
+}
+
+/*
+ * Called every time we are done syncing (writing to) the metaslab,
+ * i.e. at the end of each sync pass.
+ * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
+ */
+static void
+metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
+{
+	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+	space_map_t *sm = msp->ms_sm;
+
+	if (sm == NULL) {
+		/*
+		 * We came here from metaslab_init() when creating/opening a
+		 * pool, looking at a metaslab that hasn't had any allocations
+		 * yet.
+		 */
+		return;
+	}
+
+	/*
+	 * This is similar to the actions that we take for the ms_freed
+	 * and ms_defer trees in metaslab_sync_done().
+	 */
+	uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
+	if (defer_allowed) {
+		bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
+		    sizeof (msp->ms_synchist));
+	} else {
+		bzero(msp->ms_deferhist[hist_index],
+		    sizeof (msp->ms_deferhist[hist_index]));
+	}
+	bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
+}
+
+/*
+ * Ensure that the metaslab's weight and fragmentation are consistent
+ * with the contents of the histogram (either the range tree's histogram
+ * or the space map's, depending on whether the metaslab is loaded).
+ */
+static void
+metaslab_verify_weight_and_frag(metaslab_t *msp)
+{
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
+		return;
+
+	/* see comment in metaslab_verify_unflushed_changes() */
+	if (msp->ms_group == NULL)
+		return;
+
+	/*
+	 * Devices being removed always return a weight of 0 and leave
+	 * fragmentation and ms_max_size as is - there is nothing for
+	 * us to verify here.
+	 */
+	vdev_t *vd = msp->ms_group->mg_vd;
+	if (vd->vdev_removing)
+		return;
+
+	/*
+	 * If the metaslab is dirty it probably means that we've done
+	 * some allocations or frees that have changed our histograms
+	 * and thus the weight.
+	 */
+	for (int t = 0; t < TXG_SIZE; t++) {
+		if (txg_list_member(&vd->vdev_ms_list, msp, t))
+			return;
+	}
+
+	/*
+	 * This verification checks that our in-memory state is consistent
+	 * with what's on disk. If the pool is read-only then there aren't
+	 * any changes and we just have the initially-loaded state.
+	 */
+	if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
+		return;
+
+	/* some extra verification for the in-core tree if we can */
+	if (msp->ms_loaded) {
+		range_tree_stat_verify(msp->ms_allocatable);
+		VERIFY(space_map_histogram_verify(msp->ms_sm,
+		    msp->ms_allocatable));
+	}
+
+	uint64_t weight = msp->ms_weight;
+	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
+	boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
+	uint64_t frag = msp->ms_fragmentation;
+	uint64_t max_segsize = msp->ms_max_size;
+
+	msp->ms_weight = 0;
+	msp->ms_fragmentation = 0;
+	msp->ms_max_size = 0;
+
+	/*
+	 * This function is used for verification purposes. Regardless of
+	 * whether metaslab_weight() thinks this metaslab should be active or
+	 * not, we want to ensure that the actual weight (and therefore the
+	 * value of ms_weight) would be the same if it was to be recalculated
+	 * at this point.
+	 */
+	msp->ms_weight = metaslab_weight(msp) | was_active;
+
+	VERIFY3U(max_segsize, ==, msp->ms_max_size);
+
+	/*
+	 * If the weight type changed then there is no point in doing
+	 * verification. Revert fields to their original values.
+	 */
+	if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
+	    (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
+		msp->ms_fragmentation = frag;
+		msp->ms_weight = weight;
+		return;
+	}
+
+	VERIFY3U(msp->ms_fragmentation, ==, frag);
+	VERIFY3U(msp->ms_weight, ==, weight);
+}
+
 /*
  * Wait for any in-progress metaslab loads to complete.
  */
@@ -1501,6 +1699,22 @@ metaslab_load_impl(metaslab_t *msp)
 		    range_tree_remove, msp->ms_allocatable);
 	}
 
+	/*
+	 * Call metaslab_recalculate_weight_and_sort() now that the
+	 * metaslab is loaded so we get the metaslab's real weight.
+	 *
+	 * Unless this metaslab was created with older software and
+	 * has not yet been converted to use segment-based weight, we
+	 * expect the new weight to be better than or equal to the weight
+	 * that the metaslab had while it was not loaded. This is
+	 * because the old weight does not take into account the
+	 * consolidation of adjacent segments between TXGs. [see
+	 * comment for ms_synchist and ms_deferhist[] for more info]
+	 */
+	uint64_t weight = msp->ms_weight;
+	metaslab_recalculate_weight_and_sort(msp);
+	if (!WEIGHT_IS_SPACEBASED(weight))
+		ASSERT3U(weight, <=, msp->ms_weight);
 	msp->ms_max_size = metaslab_block_maxsize(msp);
 
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
@@ -1537,10 +1751,29 @@ void
 metaslab_unload(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	metaslab_verify_weight_and_frag(msp);
+
 	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
 	msp->ms_loaded = B_FALSE;
+	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
 	msp->ms_max_size = 0;
+
+	/*
+	 * We explicitly recalculate the metaslab's weight based on its space
+	 * map (as it is now not loaded). We want unloaded metaslabs to always
+	 * have their weights calculated from the space map histograms, while
+	 * loaded ones have it calculated from their in-core range tree
+	 * [see metaslab_load()]. This way, the weight reflects the information
+	 * available in-core, whether the metaslab is loaded or not.
+	 *
+	 * If ms_group == NULL, it means that we came here from
+	 * metaslab_fini(), at which point it doesn't make sense for us to do
+	 * the recalculation and the sorting.
+	 */
+	if (msp->ms_group != NULL)
+		metaslab_recalculate_weight_and_sort(msp);
 }
 
 static void
@@ -1683,6 +1916,9 @@ metaslab_fini(metaslab_t *msp)
 
 	range_tree_destroy(msp->ms_checkpointing);
 
+	for (int t = 0; t < TXG_SIZE; t++)
+		ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
+
 	mutex_exit(&msp->ms_lock);
 	cv_destroy(&msp->ms_load_cv);
 	mutex_destroy(&msp->ms_lock);
@@ -1698,7 +1934,7 @@ metaslab_fini(metaslab_t *msp)
  * This table defines a segment size based fragmentation metric that will
  * allow each metaslab to derive its own fragmentation value. This is done
  * by calculating the space in each bucket of the spacemap histogram and
- * multiplying that by the fragmetation metric in this table. Doing
+ * multiplying that by the fragmentation metric in this table. Doing
 * this for all buckets and dividing it by the total amount of free
 * space in this metaslab (i.e. the total free space in all buckets) gives
 * us the fragmentation metric. This means that a high fragmentation metric
@@ -1933,14 +2169,38 @@ metaslab_weight_from_range_tree(metaslab_t *msp)
 static uint64_t
 metaslab_weight_from_spacemap(metaslab_t *msp)
 {
-	uint64_t weight = 0;
+	space_map_t *sm = msp->ms_sm;
+	ASSERT(!msp->ms_loaded);
+	ASSERT(sm != NULL);
+	ASSERT3U(space_map_object(sm), !=, 0);
+	ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
+	/*
+	 * Create a joint histogram from all the segments that have made
+	 * it to the metaslab's space map histogram, but are not yet
+	 * available for allocation because they are still in the freeing
+	 * pipeline (e.g. freeing, freed, and defer trees). Then subtract
+	 * these segments from the space map's histogram to get a more
+	 * accurate weight.
+	 */
+	uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
+	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
+		deferspace_histogram[i] += msp->ms_synchist[i];
+	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+		for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+			deferspace_histogram[i] += msp->ms_deferhist[t][i];
+		}
+	}
+
+	uint64_t weight = 0;
 	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
-		if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
-			WEIGHT_SET_COUNT(weight,
-			    msp->ms_sm->sm_phys->smp_histogram[i]);
-			WEIGHT_SET_INDEX(weight, i +
-			    msp->ms_sm->sm_shift);
+		ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
+		    deferspace_histogram[i]);
+		uint64_t count =
+		    sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
+		if (count != 0) {
+			WEIGHT_SET_COUNT(weight, count);
+			WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
 			WEIGHT_SET_ACTIVE(weight, 0);
 			break;
 		}
 	}
@@ -2084,6 +2344,15 @@ metaslab_weight(metaslab_t *msp)
 	return (weight);
 }
 
+void
+metaslab_recalculate_weight_and_sort(metaslab_t *msp)
+{
+	/* note: we preserve the mask (e.g. indication of primary, etc.) */
+	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
+	metaslab_group_sort(msp->ms_group, msp,
+	    metaslab_weight(msp) | was_active);
+}
+
 static int
 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
     int allocator, uint64_t activation_weight)
@@ -2613,6 +2882,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 	 * time we load the space map.
 	 */
 	space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
+	metaslab_aux_histograms_update(msp);
 
 	metaslab_group_histogram_add(mg, msp);
 	metaslab_group_histogram_verify(mg);
@@ -2755,6 +3025,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 		 */
 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
 	}
+	metaslab_aux_histograms_update_done(msp, defer_allowed);
 
 	if (msp->ms_new) {
 		msp->ms_new = B_FALSE;
@@ -2762,12 +3033,12 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 		mg->mg_ms_ready++;
 		mutex_exit(&mg->mg_lock);
 	}
+
 	/*
-	 * Calculate the new weights before unloading any metaslabs.
-	 * This will give us the most accurate weighting.
+	 * Re-sort the metaslab within its group now that we've adjusted
+	 * its allocatable space.
 	 */
-	metaslab_group_sort(mg, msp, metaslab_weight(msp) |
-	    (msp->ms_weight & METASLAB_ACTIVE_MASK));
+	metaslab_recalculate_weight_and_sort(msp);
 
 	/*
 	 * If the metaslab is loaded and we've not tried to load or allocate
@@ -4112,7 +4383,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
     zio_alloc_list_t *zal, zio_t *zio, int allocator)
 {
 	dva_t *dva = bp->blk_dva;
-	dva_t *hintdva = hintbp->blk_dva;
+	dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
 	int error = 0;
 
 	ASSERT(bp->blk_birth == 0);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 81c34da074fd..b17682d81c1d 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1346,12 +1346,12 @@ vdev_metaslab_fini(vdev_t *vd)
 	}
 
 	if (vd->vdev_ms != NULL) {
-		uint64_t count = vd->vdev_ms_count;
+		metaslab_group_t *mg = vd->vdev_mg;
+		metaslab_group_passivate(mg);
 
-		metaslab_group_passivate(vd->vdev_mg);
+		uint64_t count = vd->vdev_ms_count;
 		for (uint64_t m = 0; m < count; m++) {
 			metaslab_t *msp = vd->vdev_ms[m];
-
 			if (msp != NULL)
 				metaslab_fini(msp);
 		}
@@ -1359,6 +1359,9 @@ vdev_metaslab_fini(vdev_t *vd)
 
 		vd->vdev_ms = NULL;
 		vd->vdev_ms_count = 0;
+
+		for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+			ASSERT0(mg->mg_histogram[i]);
 	}
 	ASSERT0(vd->vdev_ms_count);
 	ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
@@ -3006,7 +3009,10 @@ vdev_load(vdev_t *vd)
 			    "asize=%llu", (u_longlong_t)vd->vdev_ashift,
 			    (u_longlong_t)vd->vdev_asize);
 			return (SET_ERROR(ENXIO));
-		} else if ((error = vdev_metaslab_init(vd, 0)) != 0) {
+		}
+
+		error = vdev_metaslab_init(vd, 0);
+		if (error != 0) {
 			vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
 			    "[error=%d]", error);
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
@@ -3021,9 +3027,10 @@
 			ASSERT(vd->vdev_asize != 0);
 			ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
 
-			if ((error = space_map_open(&vd->vdev_checkpoint_sm,
+			error = space_map_open(&vd->vdev_checkpoint_sm,
 			    mos, checkpoint_sm_obj, 0, vd->vdev_asize,
-			    vd->vdev_ashift))) {
+			    vd->vdev_ashift);
+			if (error != 0) {
 				vdev_dbgmsg(vd, "vdev_load: space_map_open "
 				    "failed for checkpoint spacemap (obj %llu) "
 				    "[error=%d]",
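
A note on the bucket folding in metaslab_aux_histogram_add() above: it is modeled on space_map_histogram_add() rather than on the range tree histogram, because the delta it builds must later be subtracted bucket-for-bucket from the space map histogram. Range tree buckets are indexed by log2(segment size), space map buckets start at sm_shift and stop at SPACE_MAP_HISTOGRAM_SIZE - 1, so any segment larger than the top bucket is folded into that bucket as a power-of-two count. The standalone sketch below is not part of the patch; RT_HISTOGRAM_SIZE, SM_HISTOGRAM_SIZE, and the shift of 9 are reduced stand-ins for the real RANGE_TREE_HISTOGRAM_SIZE, SPACE_MAP_HISTOGRAM_SIZE, and sm_shift.

#include <stdint.h>
#include <stdio.h>

#define	RT_HISTOGRAM_SIZE	16	/* stand-in for RANGE_TREE_HISTOGRAM_SIZE */
#define	SM_HISTOGRAM_SIZE	4	/* stand-in for SPACE_MAP_HISTOGRAM_SIZE */

static void
aux_histogram_add(uint64_t *histogram, int shift, const uint64_t *rt_histogram)
{
	int idx = 0;

	for (int i = shift; i < RT_HISTOGRAM_SIZE; i++) {
		/* one 2^i byte segment counts as 2^(i - idx - shift) entries */
		histogram[idx] += rt_histogram[i] << (i - idx - shift);
		if (idx < SM_HISTOGRAM_SIZE - 1)
			idx++;
	}
}

int
main(void)
{
	uint64_t sm_hist[SM_HISTOGRAM_SIZE] = { 0 };
	uint64_t rt_hist[RT_HISTOGRAM_SIZE] = { 0 };

	rt_hist[9] = 1;		/* one 512-byte segment, lands in bucket 0 */
	rt_hist[14] = 1;	/* one 16K segment, beyond the last bucket */

	aux_histogram_add(sm_hist, 9, rt_hist);

	for (int i = 0; i < SM_HISTOGRAM_SIZE; i++) {
		printf("bucket %d (2^%d bytes): %llu\n", i, i + 9,
		    (unsigned long long)sm_hist[i]);
	}
	/* bucket 0 gets 1 entry; bucket 3 gets 4 entries (16K = 4 x 4K) */
	return (0);
}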
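
The end-of-sync hand-off in metaslab_aux_histograms_update_done() mirrors what metaslab_sync_done() does with ms_freed and ms_defer[]: this txg's ms_synchist either rotates into the defer slot chosen by spa_syncing_txg() % TXG_DEFER_SIZE, or is dropped when deferred frees are not allowed, and is then cleared for the next txg. A minimal standalone sketch of that hand-off follows (not part of the patch; memcpy/memset stand in for bcopy/bzero and the array sizes are reduced).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	SM_HISTOGRAM_SIZE	8	/* stand-in for SPACE_MAP_HISTOGRAM_SIZE */
#define	DEFER_SIZE		2	/* stand-in for TXG_DEFER_SIZE */

static void
aux_histograms_update_done(uint64_t synchist[SM_HISTOGRAM_SIZE],
    uint64_t deferhist[DEFER_SIZE][SM_HISTOGRAM_SIZE], uint64_t syncing_txg,
    bool defer_allowed)
{
	uint64_t hist_index = syncing_txg % DEFER_SIZE;

	if (defer_allowed) {
		/* this txg's frees become deferred frees */
		memcpy(deferhist[hist_index], synchist,
		    SM_HISTOGRAM_SIZE * sizeof (uint64_t));
	} else {
		/* frees were made available right away; nothing is deferred */
		memset(deferhist[hist_index], 0,
		    SM_HISTOGRAM_SIZE * sizeof (uint64_t));
	}
	memset(synchist, 0, SM_HISTOGRAM_SIZE * sizeof (uint64_t));
}

int
main(void)
{
	uint64_t synchist[SM_HISTOGRAM_SIZE] = { 0 };
	uint64_t deferhist[DEFER_SIZE][SM_HISTOGRAM_SIZE] = {{ 0 }};

	synchist[2] = 3;	/* three segments freed during txg 100 */
	aux_histograms_update_done(synchist, deferhist, 100, true);

	/* txg 100's frees now sit in defer slot 100 % DEFER_SIZE == 0 */
	printf("deferhist[0][2] = %llu, synchist[2] = %llu\n",
	    (unsigned long long)deferhist[0][2],
	    (unsigned long long)synchist[2]);	/* prints 3, 0 */
	return (0);
}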
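
Putting the pieces together, metaslab_weight_from_spacemap() now takes the on-disk space map histogram, subtracts the joint ms_synchist/ms_deferhist[] histogram of segments still in the freeing pipeline, and bases the weight on the highest bucket that still has entries. The standalone sketch below shows the same subtraction under simplified assumptions: reduced histogram sizes, an sm_shift of 9, and a plain (index << 32) | count encoding in place of the WEIGHT_SET_* macros; none of these names or sizes come from the patch itself.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define	SM_HISTOGRAM_SIZE	8	/* stand-in for SPACE_MAP_HISTOGRAM_SIZE */
#define	DEFER_SIZE		2	/* stand-in for TXG_DEFER_SIZE */

static uint64_t
weight_from_spacemap(const uint64_t *smhist, const uint64_t *synchist,
    uint64_t deferhist[DEFER_SIZE][SM_HISTOGRAM_SIZE], uint64_t sm_shift)
{
	uint64_t deferspace[SM_HISTOGRAM_SIZE] = { 0 };

	/* joint histogram of everything still in the freeing pipeline */
	for (int i = 0; i < SM_HISTOGRAM_SIZE; i++)
		deferspace[i] += synchist[i];
	for (int t = 0; t < DEFER_SIZE; t++)
		for (int i = 0; i < SM_HISTOGRAM_SIZE; i++)
			deferspace[i] += deferhist[t][i];

	/* highest bucket that still has allocatable segments wins */
	for (int i = SM_HISTOGRAM_SIZE - 1; i >= 0; i--) {
		assert(smhist[i] >= deferspace[i]);
		uint64_t count = smhist[i] - deferspace[i];
		if (count != 0) {
			/* encode "count segments of size 2^(i + sm_shift)" */
			return (((uint64_t)(i + sm_shift) << 32) | count);
		}
	}
	return (0);
}

int
main(void)
{
	uint64_t smhist[SM_HISTOGRAM_SIZE] = { 0 };
	uint64_t synchist[SM_HISTOGRAM_SIZE] = { 0 };
	uint64_t deferhist[DEFER_SIZE][SM_HISTOGRAM_SIZE] = {{ 0 }};
	uint64_t sm_shift = 9;

	smhist[6] = 1;		/* one 2^(6+9) = 32K segment in the space map */
	smhist[3] = 4;		/* four 4K segments that are allocatable */
	synchist[6] = 1;	/* the 32K segment is still in ms_freed */

	printf("weight index = %llu\n",
	    (unsigned long long)(weight_from_spacemap(smhist, synchist,
	    deferhist, sm_shift) >> 32));
	/* prints 12 (a 4K bucket), not 15 (the 32K segment not yet usable) */
	return (0);
}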