diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 5ef69790d925..3d175dacafb2 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -793,9 +793,9 @@ dump_spacemap(objset_t *os, space_map_t *sm) return; (void) printf("space map object %llu:\n", - (longlong_t)sm->sm_phys->smp_object); - (void) printf(" smp_objsize = 0x%llx\n", - (longlong_t)sm->sm_phys->smp_objsize); + (longlong_t)sm->sm_object); + (void) printf(" smp_length = 0x%llx\n", + (longlong_t)sm->sm_phys->smp_length); (void) printf(" smp_alloc = 0x%llx\n", (longlong_t)sm->sm_phys->smp_alloc); @@ -3697,7 +3697,6 @@ zdb_load_obsolete_counts(vdev_t *vd) space_map_t *prev_obsolete_sm = NULL; VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); - space_map_update(prev_obsolete_sm); vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, prev_obsolete_sm); space_map_close(prev_obsolete_sm); @@ -3833,9 +3832,9 @@ zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb) VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); - space_map_update(checkpoint_sm); VERIFY0(space_map_iterate(checkpoint_sm, + space_map_length(checkpoint_sm), checkpoint_sm_exclude_entry_cb, &cseea)); space_map_close(checkpoint_sm); @@ -4651,7 +4650,6 @@ verify_device_removal_feature_counts(spa_t *spa) spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); - space_map_update(prev_obsolete_sm); dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm); (void) printf("\n"); space_map_close(prev_obsolete_sm); @@ -4933,7 +4931,6 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current), checkpoint_sm_obj, 0, current_vd->vdev_asize, current_vd->vdev_ashift)); - space_map_update(checkpoint_sm); verify_checkpoint_sm_entry_cb_arg_t vcsec; vcsec.vcsec_vd = ckpoint_vd; @@ -4941,6 +4938,7 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) vcsec.vcsec_num_entries = space_map_length(checkpoint_sm) / sizeof (uint64_t); VERIFY0(space_map_iterate(checkpoint_sm, + space_map_length(checkpoint_sm), verify_checkpoint_sm_entry_cb, &vcsec)); if (dump_opt['m'] > 3) dump_spacemap(current->spa_meta_objset, checkpoint_sm); @@ -5100,7 +5098,6 @@ dump_leftover_checkpoint_blocks(spa_t *spa) VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); - space_map_update(checkpoint_sm); dump_spacemap(spa->spa_meta_objset, checkpoint_sm); space_map_close(checkpoint_sm); } diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index f47bc19cfc2b..fd0d23502a43 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -52,6 +52,8 @@ void metaslab_fini(metaslab_t *); int metaslab_load(metaslab_t *); void metaslab_unload(metaslab_t *); +uint64_t metaslab_allocated_space(metaslab_t *); + void metaslab_sync(metaslab_t *, uint64_t); void metaslab_sync_done(metaslab_t *, uint64_t); void metaslab_sync_reassess(metaslab_group_t *); diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 137a8476924a..02ce02226b5d 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -340,8 +340,34 @@ struct metaslab_group { * being written. 
*/ struct metaslab { + /* + * This is the main lock of the metaslab and its purpose is to + * coordinate our allocations and frees [e.g. metaslab_block_alloc(), + * metaslab_free_concrete(), etc.] with our various syncing + * procedures [e.g. metaslab_sync(), metaslab_sync_done(), etc.]. + * + * The lock is also used during some miscellaneous operations like + * using the metaslab's histogram for the metaslab group's histogram + * aggregation, or marking the metaslab for initialization. + */ kmutex_t ms_lock; + + /* + * Acquired together with the ms_lock whenever we expect to + * write to metaslab data on-disk (i.e. flushing entries to + * the metaslab's space map). It helps coordinate readers of + * the metaslab's space map [see spa_vdev_remove_thread()] + * with writers [see metaslab_sync()]. + * + * Note that metaslab_load(), even though a reader, uses + * a completely different mechanism to deal with the reading + * of the metaslab's space map based on ms_synced_length. That + * said, the function still uses the ms_sync_lock after it + * has read the ms_sm [see relevant comment in metaslab_load() + * as to why]. + */ kmutex_t ms_sync_lock; + kcondvar_t ms_load_cv; space_map_t *ms_sm; uint64_t ms_id; @@ -351,6 +377,7 @@ struct metaslab { range_tree_t *ms_allocating[TXG_SIZE]; range_tree_t *ms_allocatable; + uint64_t ms_allocated_this_txg; /* * The following range trees are accessed only from syncing context. @@ -375,6 +402,12 @@ struct metaslab { boolean_t ms_loaded; boolean_t ms_loading; + /* + * Tracks the exact amount of allocated space of this metaslab + * (and specifically the metaslab's space map) up to the most + * recently completed sync pass [see usage in metaslab_sync()]. + */ + uint64_t ms_allocated_space; int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ uint64_t ms_activation_weight; /* activation weight */ @@ -411,6 +444,9 @@ struct metaslab { avl_node_t ms_group_node; /* node in metaslab group tree */ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ + /* updated every time we are done syncing the metaslab's space map */ + uint64_t ms_synced_length; + boolean_t ms_new; }; diff --git a/include/sys/space_map.h b/include/sys/space_map.h index 64c97bb4dd6e..52536cccca46 100644 --- a/include/sys/space_map.h +++ b/include/sys/space_map.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #ifndef _SYS_SPACE_MAP_H @@ -55,10 +55,17 @@ extern "C" { * for backward compatibility. */ typedef struct space_map_phys { - uint64_t smp_object; /* on-disk space map object */ - uint64_t smp_objsize; /* size of the object */ - int64_t smp_alloc; /* space allocated from the map */ - uint64_t smp_pad[5]; /* reserved */ + /* object number: not needed but kept for backwards compatibility */ + uint64_t smp_object; + + /* length of the object in bytes */ + uint64_t smp_length; + + /* space allocated from the map */ + int64_t smp_alloc; + + /* reserved */ + uint64_t smp_pad[5]; /* * The smp_histogram maintains a histogram of free regions.
Each @@ -81,8 +88,6 @@ typedef struct space_map { uint64_t sm_start; /* start of map */ uint64_t sm_size; /* size of map */ uint8_t sm_shift; /* unit shift */ - uint64_t sm_length; /* synced length */ - int64_t sm_alloc; /* synced space allocated */ objset_t *sm_os; /* objset for this map */ uint64_t sm_object; /* object id for this map */ uint32_t sm_blksz; /* block size for space map */ @@ -189,7 +194,10 @@ boolean_t sm_entry_is_double_word(uint64_t e); typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg); int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype); -int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg); +int space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype, + uint64_t length); +int space_map_iterate(space_map_t *sm, uint64_t length, + sm_cb_t callback, void *arg); int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, dmu_tx_t *tx); @@ -197,10 +205,8 @@ void space_map_histogram_clear(space_map_t *sm); void space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx); -void space_map_update(space_map_t *sm); - uint64_t space_map_object(space_map_t *sm); -uint64_t space_map_allocated(space_map_t *sm); +int64_t space_map_allocated(space_map_t *sm); uint64_t space_map_length(space_map_t *sm); void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, @@ -216,8 +222,6 @@ int space_map_open(space_map_t **smp, objset_t *os, uint64_t object, uint64_t start, uint64_t size, uint8_t shift); void space_map_close(space_map_t *sm); -int64_t space_map_alloc_delta(space_map_t *sm); - #ifdef __cplusplus } #endif diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index aeca0ed20faf..58c47a0abfb2 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -496,45 +496,62 @@ metaslab_compare(const void *x1, const void *x2) return (AVL_CMP(m1->ms_start, m2->ms_start)); } +uint64_t +metaslab_allocated_space(metaslab_t *msp) +{ + return (msp->ms_allocated_space); +} + /* * Verify that the space accounting on disk matches the in-core range_trees. */ -void +static void metaslab_verify_space(metaslab_t *msp, uint64_t txg) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - uint64_t allocated = 0; + uint64_t allocating = 0; uint64_t sm_free_space, msp_free_space; ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(!msp->ms_condensing); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; /* * We can only verify the metaslab space when we're called - * from syncing context with a loaded metaslab that has an allocated - * space map. Calling this in non-syncing context does not - * provide a consistent view of the metaslab since we're performing - * allocations in the future. + * from syncing context with a loaded metaslab that has an + * allocated space map. Calling this in non-syncing context + * does not provide a consistent view of the metaslab since + * we're performing allocations in the future. */ if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || !msp->ms_loaded) return; - sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) - - space_map_alloc_delta(msp->ms_sm); + /* + * Even though the smp_alloc field can be negative (e.g. + * see vdev_checkpoint_sm), that should never be the case + * when it comes to a metaslab's space map. + */ + ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); + + sm_free_space = msp->ms_size - metaslab_allocated_space(msp); /* - * Account for future allocations since we would have already - * deducted that space from the ms_freetree.
+ * Account for future allocations since we would have + * already deducted that space from the ms_allocatable. */ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { - allocated += + allocating += range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); } - msp_free_space = range_tree_space(msp->ms_allocatable) + allocated + + ASSERT3U(msp->ms_deferspace, ==, + range_tree_space(msp->ms_defer[0]) + + range_tree_space(msp->ms_defer[1])); + + msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + msp->ms_deferspace + range_tree_space(msp->ms_freed); VERIFY3U(sm_free_space, ==, msp_free_space); @@ -1420,27 +1437,52 @@ metaslab_load_impl(metaslab_t *msp) ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loading); + ASSERT(!msp->ms_condensing); /* - * Nobody else can manipulate a loading metaslab, so it's now safe - * to drop the lock. This way we don't have to hold the lock while - * reading the spacemap from disk. + * We temporarily drop the lock to unblock other operations while we + * are reading the space map. Therefore, metaslab_sync() and + * metaslab_sync_done() can run at the same time as we do. + * + * metaslab_sync() can append to the space map while we are loading. + * Therefore we load only entries that existed when we started the + * load. Additionally, metaslab_sync_done() has to wait for the load + * to complete because there are potential races like metaslab_load() + * loading parts of the space map that are currently being appended + * by metaslab_sync(). If we didn't, the ms_allocatable would have + * entries that metaslab_sync_done() would try to re-add later. + * + * That's why before dropping the lock we remember the synced length + * of the metaslab and read up to that point of the space map, + * ignoring entries appended by metaslab_sync() that happen after we + * drop the lock. */ + uint64_t length = msp->ms_synced_length; mutex_exit(&msp->ms_lock); - /* - * If the space map has not been allocated yet, then treat - * all the space in the metaslab as free and add it to ms_allocatable. - */ if (msp->ms_sm != NULL) { - error = space_map_load(msp->ms_sm, msp->ms_allocatable, - SM_FREE); + error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, + SM_FREE, length); } else { + /* + * The space map has not been allocated yet, so treat + * all the space in the metaslab as free and add it to the + * ms_allocatable tree. + */ range_tree_add(msp->ms_allocatable, msp->ms_start, msp->ms_size); } + /* + * We need to grab the ms_sync_lock to prevent metaslab_sync() from + * changing the ms_sm and the metaslab's range trees while we are + * about to use them and populate the ms_allocatable. The ms_lock + * is insufficient for this because metaslab_sync() doesn't hold + * the ms_lock while writing the ms_checkpointing tree to disk. + */ + mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); + ASSERT(!msp->ms_condensing); if (error != 0) return (error); @@ -1449,18 +1491,22 @@ metaslab_load_impl(metaslab_t *msp) msp->ms_loaded = B_TRUE; /* - * If the metaslab already has a spacemap, then we need to - * remove all segments from the defer tree; otherwise, the - * metaslab is completely empty and we can skip this. + * The ms_allocatable contains the segments that exist in the + * ms_defer trees [see ms_synced_length]. Thus we need to remove + * them from ms_allocatable as they will be added again in + * metaslab_sync_done(). 
*/ - if (msp->ms_sm != NULL) { - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_walk(msp->ms_defer[t], - range_tree_remove, msp->ms_allocatable); - } + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + range_tree_walk(msp->ms_defer[t], + range_tree_remove, msp->ms_allocatable); } + msp->ms_max_size = metaslab_block_maxsize(msp); + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + metaslab_verify_space(msp, spa_syncing_txg(spa)); + mutex_exit(&msp->ms_sync_lock); + return (0); } @@ -1477,6 +1523,7 @@ metaslab_load(metaslab_t *msp) if (msp->ms_loaded) return (0); VERIFY(!msp->ms_loading); + ASSERT(!msp->ms_condensing); msp->ms_loading = B_TRUE; int error = metaslab_load_impl(msp); @@ -1533,6 +1580,13 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, /* * We only open space map objects that already exist. All others * will be opened when we finally allocate an object for it. + * + * Note: + * When called from vdev_expand(), we can't call into the DMU as + * we are holding the spa_config_lock as a writer and we would + * deadlock [see relevant comment in vdev_metaslab_init()]. In + * that case, the object parameter is zero though, so we won't + * call into the DMU. */ if (object != 0) { error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, @@ -1544,14 +1598,16 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, } ASSERT(ms->ms_sm != NULL); + ms->ms_allocated_space = space_map_allocated(ms->ms_sm); } /* - * We create the main range tree here, but we don't create the + * We create the ms_allocatable here, but we don't create the * other range trees until metaslab_sync_done(). This serves * two purposes: it allows metaslab_sync_done() to detect the - * addition of new space; and for debugging, it ensures that we'd - * data fault on any attempt to use this metaslab before it's ready. + * addition of new space; and for debugging, it ensures that + * we'd data fault on any attempt to use this metaslab before + * it's ready. */ ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0); @@ -1568,8 +1624,11 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, * out this txg. This ensures that we don't attempt to allocate * from it before we have initialized it completely. */ - if (txg <= TXG_INITIAL) + if (txg <= TXG_INITIAL) { metaslab_sync_done(ms, 0); + metaslab_space_update(vd, mg->mg_class, + metaslab_allocated_space(ms), 0, 0); + } /* * If metaslab_debug_load is set and we're initializing a metaslab @@ -1603,7 +1662,7 @@ metaslab_fini(metaslab_t *msp) mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); metaslab_space_update(vd, mg->mg_class, - -space_map_allocated(msp->ms_sm), 0, -msp->ms_size); + -metaslab_allocated_space(msp), 0, -msp->ms_size); space_map_close(msp->ms_sm); @@ -1674,10 +1733,10 @@ int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { }; /* - * Calclate the metaslab's fragmentation metric. A return value - * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does - * not support this metric. Otherwise, the return value should be in the - * range [0, 100]. + * Calculate the metaslab's fragmentation metric and set ms_fragmentation. + * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not + * been upgraded and does not support this metric. Otherwise, the metric + * value will be in the range [0, 100].
*/ static void metaslab_set_fragmentation(metaslab_t *msp) @@ -1770,7 +1829,7 @@ metaslab_space_weight(metaslab_t *msp) /* * The baseline weight is the metaslab's free space. */ - space = msp->ms_size - space_map_allocated(msp->ms_sm); + space = msp->ms_size - metaslab_allocated_space(msp); if (metaslab_fragmentation_factor_enabled && msp->ms_fragmentation != ZFS_FRAG_INVALID) { @@ -1906,7 +1965,7 @@ metaslab_segment_weight(metaslab_t *msp) /* * The metaslab is completely free. */ - if (space_map_allocated(msp->ms_sm) == 0) { + if (metaslab_allocated_space(msp) == 0) { int idx = highbit64(msp->ms_size) - 1; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; @@ -1928,7 +1987,7 @@ metaslab_segment_weight(metaslab_t *msp) /* * If the metaslab is fully allocated then just make the weight 0. */ - if (space_map_allocated(msp->ms_sm) == msp->ms_size) + if (metaslab_allocated_space(msp) == msp->ms_size) return (0); /* * If the metaslab is already loaded, then use the range tree to @@ -2008,6 +2067,8 @@ metaslab_weight(metaslab_t *msp) */ if (msp->ms_loaded) msp->ms_max_size = metaslab_block_maxsize(msp); + else + ASSERT0(msp->ms_max_size); /* * Segment-based weighting requires space map histogram support. @@ -2411,17 +2472,17 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) VERIFY(txg <= spa_final_dirty_txg(spa)); /* - * The only state that can actually be changing concurrently with - * metaslab_sync() is the metaslab's ms_allocatable. No other - * thread can be modifying this txg's alloc, freeing, + * The only state that can actually be changing concurrently + * with metaslab_sync() is the metaslab's ms_allocatable. No + * other thread can be modifying this txg's alloc, freeing, * freed, or space_map_phys_t. We drop ms_lock whenever we - * could call into the DMU, because the DMU can call down to us - * (e.g. via zio_free()) at any time. + * could call into the DMU, because the DMU can call down to + * us (e.g. via zio_free()) at any time. * * The spa_vdev_remove_thread() can be reading metaslab state - * concurrently, and it is locked out by the ms_sync_lock. Note - * that the ms_lock is insufficient for this, because it is dropped - * by space_map_write(). + * concurrently, and it is locked out by the ms_sync_lock. + * Note that the ms_lock is insufficient for this, because it + * is dropped by space_map_write(). 
*/ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); @@ -2433,7 +2494,9 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, msp->ms_start, msp->ms_size, vd->vdev_ashift)); + ASSERT(msp->ms_sm != NULL); + ASSERT0(metaslab_allocated_space(msp)); } if (!range_tree_is_empty(msp->ms_checkpointing) && @@ -2481,6 +2544,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) mutex_enter(&msp->ms_lock); } + msp->ms_allocated_space += range_tree_space(alloctree); + ASSERT3U(msp->ms_allocated_space, >=, + range_tree_space(msp->ms_freeing)); + msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); + if (!range_tree_is_empty(msp->ms_checkpointing)) { ASSERT(spa_has_checkpoint(spa)); ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); @@ -2494,14 +2562,13 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) space_map_write(vd->vdev_checkpoint_sm, msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); - space_map_update(vd->vdev_checkpoint_sm); spa->spa_checkpoint_info.sci_dspace += range_tree_space(msp->ms_checkpointing); vd->vdev_stat.vs_checkpoint_space += range_tree_space(msp->ms_checkpointing); ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, - -vd->vdev_checkpoint_sm->sm_alloc); + -space_map_allocated(vd->vdev_checkpoint_sm)); range_tree_vacate(msp->ms_checkpointing, NULL, NULL); } @@ -2553,16 +2620,18 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * For sync pass 1, we avoid traversing this txg's free range tree - * and instead will just swap the pointers for freeing and - * freed. We can safely do this since the freed_tree is - * guaranteed to be empty on the initial pass. + * and instead will just swap the pointers for freeing and freed. + * We can safely do this since the freed_tree is guaranteed to be + * empty on the initial pass. */ if (spa_sync_pass(spa) == 1) { range_tree_swap(&msp->ms_freeing, &msp->ms_freed); + ASSERT0(msp->ms_allocated_this_txg); } else { range_tree_vacate(msp->ms_freeing, range_tree_add, msp->ms_freed); } + msp->ms_allocated_this_txg += range_tree_space(alloctree); range_tree_vacate(alloctree, NULL, NULL); ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); @@ -2640,7 +2709,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) } defer_delta = 0; - alloc_delta = space_map_alloc_delta(msp->ms_sm); + alloc_delta = msp->ms_allocated_this_txg - + range_tree_space(msp->ms_freed); if (defer_allowed) { defer_delta = range_tree_space(msp->ms_freed) - range_tree_space(*defer_tree); @@ -2672,7 +2742,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) msp->ms_loaded ? 
range_tree_add : NULL, msp->ms_allocatable); } - space_map_update(msp->ms_sm); + + msp->ms_synced_length = space_map_length(msp->ms_sm); msp->ms_deferspace += defer_delta; ASSERT3S(msp->ms_deferspace, >=, 0); @@ -2724,6 +2795,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) ASSERT0(range_tree_space(msp->ms_freed)); ASSERT0(range_tree_space(msp->ms_checkpointing)); + msp->ms_allocated_this_txg = 0; mutex_exit(&msp->ms_lock); } diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c index 230ae5785a85..d6f68ceda589 100644 --- a/module/zfs/spa_checkpoint.c +++ b/module/zfs/spa_checkpoint.c @@ -263,7 +263,7 @@ spa_checkpoint_accounting_verify(spa_t *spa) if (vd->vdev_checkpoint_sm != NULL) { ckpoint_sm_space_sum += - -vd->vdev_checkpoint_sm->sm_alloc; + -space_map_allocated(vd->vdev_checkpoint_sm); vs_ckpoint_space_sum += vd->vdev_stat.vs_checkpoint_space; ASSERT3U(ckpoint_sm_space_sum, ==, @@ -349,7 +349,7 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) error, vd->vdev_id); } ASSERT0(words_after); - ASSERT0(vd->vdev_checkpoint_sm->sm_alloc); + ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm)); ASSERT0(space_map_length(vd->vdev_checkpoint_sm)); space_map_free(vd->vdev_checkpoint_sm, tx); diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index 9ba6ff6ff4c2..5cf3feaae108 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #include @@ -81,20 +81,22 @@ sm_entry_is_double_word(uint64_t e) /* * Iterate through the space map, invoking the callback on each (non-debug) - * space map entry. + * space map entry. Stop after reading 'end' bytes of the space map. */ int -space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg) +space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg) { - uint64_t sm_len = space_map_length(sm); - ASSERT3U(sm->sm_blksz, !=, 0); + uint64_t blksz = sm->sm_blksz; - dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len, + ASSERT3U(blksz, !=, 0); + ASSERT3U(end, <=, space_map_length(sm)); + ASSERT0(P2PHASE(end, sizeof (uint64_t))); + + dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end, ZIO_PRIORITY_SYNC_READ); - uint64_t blksz = sm->sm_blksz; int error = 0; - for (uint64_t block_base = 0; block_base < sm_len && error == 0; + for (uint64_t block_base = 0; block_base < end && error == 0; block_base += blksz) { dmu_buf_t *db; error = dmu_buf_hold(sm->sm_os, space_map_object(sm), @@ -103,7 +105,7 @@ space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg) return (error); uint64_t *block_start = db->db_data; - uint64_t block_length = MIN(sm_len - block_base, blksz); + uint64_t block_length = MIN(end - block_base, blksz); uint64_t *block_end = block_start + (block_length / sizeof (uint64_t)); @@ -186,7 +188,7 @@ space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf, * dmu_buf_hold(). 
*/ uint64_t last_word_offset = - sm->sm_phys->smp_objsize - sizeof (uint64_t); + sm->sm_phys->smp_length - sizeof (uint64_t); error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset, FTAG, &db, DMU_READ_NO_PREFETCH); if (error != 0) return (error); @@ -199,7 +201,7 @@ space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf, uint64_t *words = db->db_data; *nwords = - (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t); + (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t); ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t)); @@ -298,8 +300,7 @@ space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, uint64_t e = buf[i]; if (sm_entry_is_debug(e)) { - sm->sm_phys->smp_objsize -= sizeof (uint64_t); - space_map_update(sm); + sm->sm_phys->smp_length -= sizeof (uint64_t); continue; } @@ -354,15 +355,13 @@ space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, sm->sm_phys->smp_alloc -= entry_run; else sm->sm_phys->smp_alloc += entry_run; - sm->sm_phys->smp_objsize -= words * sizeof (uint64_t); - space_map_update(sm); + sm->sm_phys->smp_length -= words * sizeof (uint64_t); } } if (space_map_length(sm) == 0) { ASSERT0(error); - ASSERT0(sm->sm_phys->smp_objsize); - ASSERT0(sm->sm_alloc); + ASSERT0(space_map_allocated(sm)); } zio_buf_free(buf, bufsz); @@ -390,6 +389,33 @@ space_map_load_callback(space_map_entry_t *sme, void *arg) return (0); } +/* + * Load the space map into the given range tree, like space_map_load(), + * but only read the first 'length' bytes of the space map. + */ +int +space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype, + uint64_t length) +{ + space_map_load_arg_t smla; + + VERIFY0(range_tree_space(rt)); + + if (maptype == SM_FREE) + range_tree_add(rt, sm->sm_start, sm->sm_size); + + smla.smla_rt = rt; + smla.smla_sm = sm; + smla.smla_type = maptype; + int err = space_map_iterate(sm, length, + space_map_load_callback, &smla); + + if (err != 0) + range_tree_vacate(rt, NULL, NULL); + + return (err); +} + /* * Load the space map disk into the specified range tree. Segments of maptype * are added to the range tree, other segment types are removed.
@@ -397,30 +423,7 @@ space_map_load_callback(space_map_entry_t *sme, void *arg) int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) { - uint64_t space; - int err; - space_map_load_arg_t smla; - - VERIFY0(range_tree_space(rt)); - space = space_map_allocated(sm); - - if (maptype == SM_FREE) { - range_tree_add(rt, sm->sm_start, sm->sm_size); - space = sm->sm_size - space; - } - - smla.smla_rt = rt; - smla.smla_sm = sm; - smla.smla_type = maptype; - err = space_map_iterate(sm, space_map_load_callback, &smla); - - if (err == 0) { - VERIFY3U(range_tree_space(rt), ==, space); - } else { - range_tree_vacate(rt, NULL, NULL); - } - - return (err); + return (space_map_load_length(sm, rt, maptype, space_map_length(sm))); } void @@ -506,10 +509,10 @@ space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx) SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) | SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); - dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize, + dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length, sizeof (dentry), &dentry, tx); - sm->sm_phys->smp_objsize += sizeof (dentry); + sm->sm_phys->smp_length += sizeof (dentry); } /* @@ -541,7 +544,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, uint64_t *block_base = db->db_data; uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t)); uint64_t *block_cursor = block_base + - (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t); + (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t); ASSERT3P(block_cursor, <=, block_end); @@ -564,7 +567,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, if (block_cursor == block_end) { dmu_buf_rele(db, tag); - uint64_t next_word_offset = sm->sm_phys->smp_objsize; + uint64_t next_word_offset = sm->sm_phys->smp_length; VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm), next_word_offset, tag, &db, DMU_READ_PREFETCH)); @@ -594,7 +597,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, SM_DEBUG_SYNCPASS_ENCODE(0) | SM_DEBUG_TXG_ENCODE(0); block_cursor++; - sm->sm_phys->smp_objsize += sizeof (uint64_t); + sm->sm_phys->smp_length += sizeof (uint64_t); ASSERT3P(block_cursor, ==, block_end); continue; } @@ -625,7 +628,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, words); break; } - sm->sm_phys->smp_objsize += words * sizeof (uint64_t); + sm->sm_phys->smp_length += words * sizeof (uint64_t); start += run_len; size -= run_len; @@ -652,7 +655,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, * We do this right after we write the intro debug entry * because the estimate does not take it into account. */ - uint64_t initial_objsize = sm->sm_phys->smp_objsize; + uint64_t initial_objsize = sm->sm_phys->smp_length; uint64_t estimated_growth = space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID); uint64_t estimated_final_objsize = initial_objsize + estimated_growth; @@ -663,7 +666,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, * and use that to get a hold of the last block, so we can * start appending to it. 
*/ - uint64_t next_word_offset = sm->sm_phys->smp_objsize; + uint64_t next_word_offset = sm->sm_phys->smp_length; VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm), next_word_offset, FTAG, &db, DMU_READ_PREFETCH)); ASSERT3U(db->db_size, ==, sm->sm_blksz); @@ -711,7 +714,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, * Therefore we expect the actual objsize to be equal or less * than whatever we estimated it to be. */ - ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize); + ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length); #endif } @@ -792,8 +795,6 @@ space_map_open(space_map_t **smp, objset_t *os, uint64_t object, sm->sm_shift = shift; sm->sm_os = os; sm->sm_object = object; - sm->sm_length = 0; - sm->sm_alloc = 0; sm->sm_blksz = 0; sm->sm_dbuf = NULL; sm->sm_phys = NULL; @@ -870,23 +871,10 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) } dmu_buf_will_dirty(sm->sm_dbuf, tx); - sm->sm_phys->smp_objsize = 0; + sm->sm_phys->smp_length = 0; sm->sm_phys->smp_alloc = 0; } -/* - * Update the in-core space_map allocation and length values. - */ -void -space_map_update(space_map_t *sm) -{ - if (sm == NULL) - return; - - sm->sm_alloc = sm->sm_phys->smp_alloc; - sm->sm_length = sm->sm_phys->smp_objsize; -} - uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) { @@ -1068,32 +1056,14 @@ space_map_object(space_map_t *sm) return (sm != NULL ? sm->sm_object : 0); } -/* - * Returns the already synced, on-disk allocated space. - */ -uint64_t +int64_t space_map_allocated(space_map_t *sm) { - return (sm != NULL ? sm->sm_alloc : 0); + return (sm != NULL ? sm->sm_phys->smp_alloc : 0); } -/* - * Returns the already synced, on-disk length; - */ uint64_t space_map_length(space_map_t *sm) { - return (sm != NULL ? sm->sm_length : 0); -} - -/* - * Returns the allocated space that is currently syncing. - */ -int64_t -space_map_alloc_delta(space_map_t *sm) -{ - if (sm == NULL) - return (0); - ASSERT(sm->sm_dbuf != NULL); - return (sm->sm_phys->smp_alloc - space_map_allocated(sm)); + return (sm != NULL ? sm->sm_phys->smp_length : 0); } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 7add0d6e6cb1..81c34da074fd 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -2701,13 +2701,6 @@ vdev_dtl_load(vdev_t *vd) ASSERT(vd->vdev_dtl_sm != NULL); mutex_enter(&vd->vdev_dtl_lock); - - /* - * Now that we've opened the space_map we need to update - * the in-core DTL. - */ - space_map_update(vd->vdev_dtl_sm); - error = space_map_load(vd->vdev_dtl_sm, vd->vdev_dtl[DTL_MISSING], SM_ALLOC); mutex_exit(&vd->vdev_dtl_lock); @@ -2867,10 +2860,6 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) } dmu_tx_commit(tx); - - mutex_enter(&vd->vdev_dtl_lock); - space_map_update(vd->vdev_dtl_sm); - mutex_exit(&vd->vdev_dtl_lock); } /* @@ -3042,15 +3031,15 @@ vdev_load(vdev_t *vd) return (error); } ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); - space_map_update(vd->vdev_checkpoint_sm); /* * Since the checkpoint_sm contains free entries - * exclusively we can use sm_alloc to indicate the - * cumulative checkpointed space that has been freed. + * exclusively we can use space_map_allocated() to + * indicate the cumulative checkpointed space that + * has been freed. 
*/ vd->vdev_stat.vs_checkpoint_space = - -vd->vdev_checkpoint_sm->sm_alloc; + -space_map_allocated(vd->vdev_checkpoint_sm); vd->vdev_spa->spa_checkpoint_info.sci_dspace += vd->vdev_stat.vs_checkpoint_space; } else if (error != 0) { @@ -3088,7 +3077,6 @@ vdev_load(vdev_t *vd) (u_longlong_t)obsolete_sm_object, error); return (error); } - space_map_update(vd->vdev_obsolete_sm); } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete " "space map object from vdev ZAP [error=%d]", error); @@ -3519,8 +3507,8 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) */ if (error == 0 && tvd->vdev_checkpoint_sm != NULL) { - ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc, - !=, 0); + ASSERT3U(space_map_allocated( + tvd->vdev_checkpoint_sm), !=, 0); error = ZFS_ERR_CHECKPOINT_EXISTS; } diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 2f8268f0fab6..68dfe83128a7 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -684,7 +684,6 @@ spa_condense_indirect_thread(void *arg, zthr_t *zthr) VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); - space_map_update(prev_obsolete_sm); counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping); if (prev_obsolete_sm != NULL) { vdev_indirect_mapping_load_obsolete_spacemap(old_mapping, @@ -838,7 +837,6 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) VERIFY0(space_map_open(&vd->vdev_obsolete_sm, spa->spa_meta_objset, obsolete_sm_object, 0, vd->vdev_asize, 0)); - space_map_update(vd->vdev_obsolete_sm); } ASSERT(vd->vdev_obsolete_sm != NULL); @@ -847,7 +845,6 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) space_map_write(vd->vdev_obsolete_sm, vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx); - space_map_update(vd->vdev_obsolete_sm); range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); } diff --git a/module/zfs/vdev_indirect_mapping.c b/module/zfs/vdev_indirect_mapping.c index c02a4f5a4ce5..e4d998f09b85 100644 --- a/module/zfs/vdev_indirect_mapping.c +++ b/module/zfs/vdev_indirect_mapping.c @@ -560,6 +560,7 @@ vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim, losma.losma_counts = counts; losma.losma_vim = vim; VERIFY0(space_map_iterate(obsolete_space_sm, + space_map_length(obsolete_space_sm), load_obsolete_sm_callback, &losma)); } diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index e68f23e3f4e9..a69eca354c10 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -452,7 +452,7 @@ vdev_initialize_calculate_progress(vdev_t *vd) mutex_enter(&msp->ms_lock); uint64_t ms_free = msp->ms_size - - space_map_allocated(msp->ms_sm); + metaslab_allocated_space(msp); if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) ms_free /= vd->vdev_top->vdev_children; diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 70620499756c..ff39a0a26d6a 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -291,15 +291,8 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) if (ms->ms_sm == NULL) continue; - /* - * Sync tasks happen before metaslab_sync(), therefore - * smp_alloc and sm_alloc must be the same. 
- */ - ASSERT3U(space_map_allocated(ms->ms_sm), ==, - ms->ms_sm->sm_phys->smp_alloc); - spa->spa_removing_phys.sr_to_copy += - space_map_allocated(ms->ms_sm); + metaslab_allocated_space(ms); /* * Space which we are freeing this txg does not need to @@ -1443,22 +1436,8 @@ spa_vdev_remove_thread(void *arg) * appropriate action (see free_from_removing_vdev()). */ if (msp->ms_sm != NULL) { - space_map_t *sm = NULL; - - /* - * We have to open a new space map here, because - * ms_sm's sm_length and sm_alloc may not reflect - * what's in the object contents, if we are in between - * metaslab_sync() and metaslab_sync_done(). - */ - VERIFY0(space_map_open(&sm, - spa->spa_dsl_pool->dp_meta_objset, - msp->ms_sm->sm_object, msp->ms_sm->sm_start, - msp->ms_sm->sm_size, msp->ms_sm->sm_shift)); - space_map_update(sm); - VERIFY0(space_map_load(sm, svr->svr_allocd_segs, - SM_ALLOC)); - space_map_close(sm); + VERIFY0(space_map_load(msp->ms_sm, + svr->svr_allocd_segs, SM_ALLOC)); range_tree_walk(msp->ms_freeing, range_tree_remove, svr->svr_allocd_segs); @@ -1681,16 +1660,6 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) ASSERT0(range_tree_space(msp->ms_freed)); if (msp->ms_sm != NULL) { - /* - * Assert that the in-core spacemap has the same - * length as the on-disk one, so we can use the - * existing in-core spacemap to load it from disk. - */ - ASSERT3U(msp->ms_sm->sm_alloc, ==, - msp->ms_sm->sm_phys->smp_alloc); - ASSERT3U(msp->ms_sm->sm_length, ==, - msp->ms_sm->sm_phys->smp_objsize); - mutex_enter(&svr->svr_lock); VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); @@ -1789,9 +1758,6 @@ spa_vdev_remove_cancel(spa_t *spa) return (spa_vdev_remove_cancel_impl(spa)); } -/* - * Called every sync pass of every txg if there's a svr. - */ void svr_sync(spa_t *spa, dmu_tx_t *tx) {
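Reviewer note (not part of the patch): the sketch below illustrates the read-up-to-a-known-length pattern that replaces space_map_update() -- a reader records a length first (e.g. ms_synced_length, or space_map_length() under the appropriate lock) and passes it to space_map_iterate(), while totals now come straight from space_map_allocated()/space_map_length(). It assumes the sme_type and sme_run fields of space_map_entry_t from space_map.h; treat it as illustrative only.

/*
 * Illustrative sketch, not part of this patch: tally the net allocated
 * space described by the first 'length' bytes of a space map, mirroring
 * how metaslab_load_impl() bounds its read with ms_synced_length.
 * Assumes space_map_entry_t exposes sme_type and sme_run.
 */
typedef struct sm_tally {
	int64_t smt_alloc;	/* net allocated bytes seen so far */
} sm_tally_t;

static int
sm_tally_cb(space_map_entry_t *sme, void *arg)
{
	sm_tally_t *smt = arg;

	if (sme->sme_type == SM_ALLOC)
		smt->smt_alloc += sme->sme_run;
	else
		smt->smt_alloc -= sme->sme_run;
	return (0);
}

static int64_t
sm_tally_allocated(space_map_t *sm, uint64_t length)
{
	sm_tally_t smt = { 0 };

	/* Never read past what has actually been synced to disk. */
	ASSERT3U(length, <=, space_map_length(sm));
	VERIFY0(space_map_iterate(sm, length, sm_tally_cb, &smt));

	/*
	 * With length == space_map_length(sm), the tally matches the
	 * on-disk smp_alloc returned by space_map_allocated().
	 */
	return (smt.smt_alloc);
}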
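Reviewer note (not part of the patch): a worked example of the new in-core accounting, with hypothetical numbers, showing that the alloc_delta computed in metaslab_sync_done() matches what the removed space_map_alloc_delta() used to derive from the space map.

/*
 * Hypothetical numbers, for illustration only. Suppose a metaslab enters
 * a txg with ms_allocated_space == 1000 and, during sync pass 1, writes
 * 300 bytes of allocations (alloctree) and 50 bytes of frees (ms_freeing):
 *
 *   metaslab_sync():
 *     ms_allocated_space    = 1000 + 300 - 50 = 1250
 *     ms_allocated_this_txg = 300
 *     ms_freed              = 50   (swapped from ms_freeing in pass 1)
 *
 *   metaslab_sync_done():
 *     alloc_delta = ms_allocated_this_txg - range_tree_space(ms_freed)
 *                 = 300 - 50 = 250
 *
 * which equals the change in smp_alloc written out this txg, i.e. the
 * delta that space_map_alloc_delta() would previously have reported.
 */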