diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
index f02c2af5ef5f..2ff94970f26c 100644
--- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c
+++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
@@ -901,11 +901,8 @@ dump_metaslab(metaslab_t *msp)
 
 	if (dump_opt['m'] > 2 && !dump_opt['L']) {
 		mutex_enter(&msp->ms_lock);
-		metaslab_load_wait(msp);
-		if (!msp->ms_loaded) {
-			VERIFY0(metaslab_load(msp));
-			range_tree_stat_verify(msp->ms_allocatable);
-		}
+		VERIFY0(metaslab_load(msp));
+		range_tree_stat_verify(msp->ms_allocatable);
 		dump_metaslab_stats(msp);
 		metaslab_unload(msp);
 		mutex_exit(&msp->ms_lock);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
index cb69664aa2a2..741b47559af6 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -1468,7 +1468,7 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 /*
  * Wait for any in-progress metaslab loads to complete.
  */
-void
+static void
 metaslab_load_wait(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -1479,20 +1479,17 @@ metaslab_load_wait(metaslab_t *msp)
 	}
 }
 
-int
-metaslab_load(metaslab_t *msp)
+static int
+metaslab_load_impl(metaslab_t *msp)
 {
 	int error = 0;
-	boolean_t success = B_FALSE;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
-	ASSERT(!msp->ms_loaded);
-	ASSERT(!msp->ms_loading);
+	ASSERT(msp->ms_loading);
 
-	msp->ms_loading = B_TRUE;
 	/*
 	 * Nobody else can manipulate a loading metaslab, so it's now safe
-	 * to drop the lock. This way we don't have to hold the lock while
+	 * to drop the lock.  This way we don't have to hold the lock while
 	 * reading the spacemap from disk.
 	 */
 	mutex_exit(&msp->ms_lock);
@@ -1509,29 +1506,49 @@
 		    msp->ms_start, msp->ms_size);
 	}
 
-	success = (error == 0);
-
 	mutex_enter(&msp->ms_lock);
-	msp->ms_loading = B_FALSE;
 
-	if (success) {
-		ASSERT3P(msp->ms_group, !=, NULL);
-		msp->ms_loaded = B_TRUE;
+	if (error != 0)
+		return (error);
 
-		/*
-		 * If the metaslab already has a spacemap, then we need to
-		 * remove all segments from the defer tree; otherwise, the
-		 * metaslab is completely empty and we can skip this.
-		 */
-		if (msp->ms_sm != NULL) {
-			for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-				range_tree_walk(msp->ms_defer[t],
-				    range_tree_remove, msp->ms_allocatable);
-			}
+	ASSERT3P(msp->ms_group, !=, NULL);
+	msp->ms_loaded = B_TRUE;
+
+	/*
+	 * If the metaslab already has a spacemap, then we need to
+	 * remove all segments from the defer tree; otherwise, the
+	 * metaslab is completely empty and we can skip this.
+	 */
+	if (msp->ms_sm != NULL) {
+		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+			range_tree_walk(msp->ms_defer[t],
+			    range_tree_remove, msp->ms_allocatable);
 		}
-		msp->ms_max_size = metaslab_block_maxsize(msp);
 	}
+	msp->ms_max_size = metaslab_block_maxsize(msp);
+
+	return (0);
+}
+
+int
+metaslab_load(metaslab_t *msp)
+{
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	/*
+	 * There may be another thread loading the same metaslab; if that's
+	 * the case, just wait until the other thread is done and return.
+	 */
+	metaslab_load_wait(msp);
+	if (msp->ms_loaded)
+		return (0);
+	VERIFY(!msp->ms_loading);
+
+	msp->ms_loading = B_TRUE;
+	int error = metaslab_load_impl(msp);
+	msp->ms_loading = B_FALSE;
 	cv_broadcast(&msp->ms_load_cv);
+
 	return (error);
 }
 
@@ -2091,13 +2108,10 @@ metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
-		int error = 0;
-		metaslab_load_wait(msp);
-		if (!msp->ms_loaded) {
-			if ((error = metaslab_load(msp)) != 0) {
-				metaslab_group_sort(msp->ms_group, msp, 0);
-				return (error);
-			}
+		int error = metaslab_load(msp);
+		if (error != 0) {
+			metaslab_group_sort(msp->ms_group, msp, 0);
+			return (error);
 		}
 
 		if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
 			/*
@@ -2209,9 +2223,7 @@ metaslab_preload(void *arg)
 	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
 
 	mutex_enter(&msp->ms_lock);
-	metaslab_load_wait(msp);
-	if (!msp->ms_loaded)
-		(void) metaslab_load(msp);
+	(void) metaslab_load(msp);
 	msp->ms_selected_txg = spa_syncing_txg(spa);
 	mutex_exit(&msp->ms_lock);
 }
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
index 0509e95b1587..007ac87fa39f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
@@ -48,7 +48,6 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
     metaslab_t **);
 void metaslab_fini(metaslab_t *);
 
-void metaslab_load_wait(metaslab_t *);
 int metaslab_load(metaslab_t *);
 void metaslab_unload(metaslab_t *);
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
index 2b6f8e545207..6cce0614b4a1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -370,8 +370,8 @@ struct metaslab {
 	uint64_t	ms_initializing; /* leaves initializing this ms */
 
 	/*
-	 * We must hold both ms_lock and ms_group->mg_lock in order to
-	 * modify ms_loaded.
+	 * We must always hold the ms_lock when modifying ms_loaded
+	 * and ms_loading.
 	 */
 	boolean_t	ms_loaded;
 	boolean_t	ms_loading;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
index ffcb8a465216..fd694b833782 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -163,34 +163,34 @@ static vdev_ops_t *vdev_ops_table[] = {
 };
 
-/* target number of metaslabs per top-level vdev */
-int vdev_max_ms_count = 200;
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count, CTLFLAG_RWTUN,
-    &vdev_max_ms_count, 0,
+/* default target for number of metaslabs per top-level vdev */
+int zfs_vdev_default_ms_count = 200;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_count, CTLFLAG_RWTUN,
+    &zfs_vdev_default_ms_count, 0,
     "Target number of metaslabs per top-level vdev");
 
 /* minimum number of metaslabs per top-level vdev */
-int vdev_min_ms_count = 16;
+int zfs_vdev_min_ms_count = 16;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, CTLFLAG_RWTUN,
-    &vdev_min_ms_count, 0,
+    &zfs_vdev_min_ms_count, 0,
     "Minimum number of metaslabs per top-level vdev");
 
 /* practical upper limit of total metaslabs per top-level vdev */
-int vdev_ms_count_limit = 1ULL << 17;
+int zfs_vdev_ms_count_limit = 1ULL << 17;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count_limit, CTLFLAG_RWTUN,
-    &vdev_ms_count_limit, 0,
+    &zfs_vdev_ms_count_limit, 0,
     "Maximum number of metaslabs per top-level vdev");
 
 /* lower limit for metaslab size (512M) */
-int vdev_default_ms_shift = 29;
+int zfs_vdev_default_ms_shift = 29;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RWTUN,
-    &vdev_default_ms_shift, 0,
+    &zfs_vdev_default_ms_shift, 0,
     "Default shift between vdev size and number of metaslabs");
 
-/* upper limit for metaslab size (256G) */
-int vdev_max_ms_shift = 38;
+/* upper limit for metaslab size (16G) */
+int zfs_vdev_max_ms_shift = 34;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_shift, CTLFLAG_RWTUN,
-    &vdev_max_ms_shift, 0,
+    &zfs_vdev_max_ms_shift, 0,
     "Maximum shift between vdev size and number of metaslabs");
 
 boolean_t vdev_validate_skip = B_FALSE;
 
@@ -2205,16 +2205,24 @@ void
 vdev_metaslab_set_size(vdev_t *vd)
 {
 	uint64_t asize = vd->vdev_asize;
-	uint64_t ms_count = asize >> vdev_default_ms_shift;
+	uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
 	uint64_t ms_shift;
 
 	/*
 	 * There are two dimensions to the metaslab sizing calculation:
 	 * the size of the metaslab and the count of metaslabs per vdev.
-	 * In general, we aim for vdev_max_ms_count (200) metaslabs. The
-	 * range of the dimensions are as follows:
 	 *
-	 *	2^29 <= ms_size <= 2^38
+	 * The default values used below are a good balance between memory
+	 * usage (larger metaslab size means more memory needed for loaded
+	 * metaslabs; more metaslabs means more memory needed for the
+	 * metaslab_t structs), metaslab load time (larger metaslabs take
+	 * longer to load), and metaslab sync time (more metaslabs means
+	 * more time spent syncing all of them).
+	 *
+	 * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
+	 * The ranges of the dimensions are as follows:
+	 *
+	 *	2^29 <= ms_size <= 2^34
 	 *	  16 <= ms_count <= 131,072
 	 *
 	 * On the lower end of vdev sizes, we aim for metaslabs sizes of
@@ -2223,35 +2231,41 @@ vdev_metaslab_set_size(vdev_t *vd)
 	 * of at least 16 metaslabs will override this minimum size goal.
 	 *
 	 * On the upper end of vdev sizes, we aim for a maximum metaslab
-	 * size of 256GB. However, we will cap the total count to 2^17
-	 * metaslabs to keep our memory footprint in check.
+	 * size of 16GB. However, we will cap the total count to 2^17
+	 * metaslabs to keep our memory footprint in check and let the
+	 * metaslab size grow from there if that limit is hit.
 	 *
 	 * The net effect of applying above constrains is summarized below.
 	 *
-	 *	vdev size	metaslab count
-	 *	-------------|-----------------
-	 *	< 8GB		~16
-	 *	8GB - 100GB	one per 512MB
-	 *	100GB - 50TB	~200
-	 *	50TB - 32PB	one per 256GB
-	 *	> 32PB		~131,072
-	 *	-------------------------------
+	 *	vdev size	metaslab count
+	 *	--------------|-----------------
+	 *	< 8GB		~16
+	 *	8GB - 100GB	one per 512MB
+	 *	100GB - 3TB	~200
+	 *	3TB - 2PB	one per 16GB
+	 *	> 2PB		~131,072
+	 *	--------------------------------
+	 *
+	 * Finally, note that all of the above calculate the initial
+	 * number of metaslabs. Expanding a top-level vdev will result
+	 * in additional metaslabs being allocated, making it possible
+	 * to exceed the zfs_vdev_ms_count_limit.
 	 */
-	if (ms_count < vdev_min_ms_count)
-		ms_shift = highbit64(asize / vdev_min_ms_count);
-	else if (ms_count > vdev_max_ms_count)
-		ms_shift = highbit64(asize / vdev_max_ms_count);
+	if (ms_count < zfs_vdev_min_ms_count)
+		ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
+	else if (ms_count > zfs_vdev_default_ms_count)
+		ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
 	else
-		ms_shift = vdev_default_ms_shift;
+		ms_shift = zfs_vdev_default_ms_shift;
 
 	if (ms_shift < SPA_MAXBLOCKSHIFT) {
 		ms_shift = SPA_MAXBLOCKSHIFT;
-	} else if (ms_shift > vdev_max_ms_shift) {
-		ms_shift = vdev_max_ms_shift;
+	} else if (ms_shift > zfs_vdev_max_ms_shift) {
+		ms_shift = zfs_vdev_max_ms_shift;
 		/* cap the total count to constrain memory footprint */
-		if ((asize >> ms_shift) > vdev_ms_count_limit)
-			ms_shift = highbit64(asize / vdev_ms_count_limit);
+		if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
+			ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
 	}
 
 	vd->vdev_ms_shift = ms_shift;
 
@@ -3611,13 +3625,17 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
 boolean_t
 vdev_is_spacemap_addressable(vdev_t *vd)
 {
+	if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
+		return (B_TRUE);
+
 	/*
-	 * Assuming 47 bits of the space map entry dedicated for the entry's
-	 * offset (see description in space_map.h), we calculate the maximum
-	 * address that can be described by a space map entry for the given
-	 * device.
+	 * If double-word space map entries are not enabled, we assume
+	 * 47 bits of the space map entry are dedicated to the entry's
+	 * offset (see SM_OFFSET_BITS in space_map.h). We then use that
+	 * to calculate the maximum address that can be described by a
+	 * space map entry for the given device.
 	 */
-	uint64_t shift = vd->vdev_ashift + 47;
+	uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
 
 	if (shift >= 63) /* detect potential overflow */
 		return (B_TRUE);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
index a2c39c2868e5..a78fa2643e8e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
@@ -352,16 +352,6 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
 	return (0);
 }
 
-static void
-vdev_initialize_ms_load(metaslab_t *msp)
-{
-	ASSERT(MUTEX_HELD(&msp->ms_lock));
-
-	metaslab_load_wait(msp);
-	if (!msp->ms_loaded)
-		VERIFY0(metaslab_load(msp));
-}
-
 static void
 vdev_initialize_mg_wait(metaslab_group_t *mg)
 {
@@ -484,10 +474,10 @@ vdev_initialize_calculate_progress(vdev_t *vd)
 		 * metaslab. Load it and walk the free tree for more accurate
 		 * progress estimation.
 		 */
-		vdev_initialize_ms_load(msp);
+		VERIFY0(metaslab_load(msp));
 
-		for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs;
-		    rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
+		for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
+		    rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
 			logical_rs.rs_start = rs->rs_start;
 			logical_rs.rs_end = rs->rs_end;
 			vdev_xlate(vd, &logical_rs, &physical_rs);
@@ -615,7 +605,7 @@ vdev_initialize_thread(void *arg)
 
 		vdev_initialize_ms_mark(msp);
 		mutex_enter(&msp->ms_lock);
-		vdev_initialize_ms_load(msp);
+		VERIFY0(metaslab_load(msp));
 
 		range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
 		    vd);
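
Note: with this change metaslab_load() subsumes the old
metaslab_load_wait()/ms_loaded dance, so every caller collapses to the
same shape. A minimal sketch of that pattern under the msp->ms_lock;
example_walk_metaslab() and example_visit_seg() are hypothetical names
for illustration and are not part of this patch:

	/* Hypothetical callback; stands in for whatever work a caller does. */
	static void
	example_visit_seg(void *arg, uint64_t start, uint64_t size)
	{
	}

	static int
	example_walk_metaslab(metaslab_t *msp)
	{
		int error;

		mutex_enter(&msp->ms_lock);
		/*
		 * metaslab_load() now waits out a concurrent loader, returns
		 * immediately if the metaslab is already loaded, and only
		 * otherwise does the real work via metaslab_load_impl().
		 */
		error = metaslab_load(msp);
		if (error == 0)
			range_tree_walk(msp->ms_allocatable,
			    example_visit_seg, NULL);
		mutex_exit(&msp->ms_lock);
		return (error);
	}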
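The new sizing constants can be sanity-checked in isolation. Below is a
self-contained userland sketch of the clamp logic from
vdev_metaslab_set_size(); highbit64() is re-implemented here for
illustration only, and the zfs_vdev_ms_count_limit branch is omitted
since it only matters for vdevs past roughly 2PB:

	#include <stdint.h>
	#include <stdio.h>

	/* Illustrative stand-in for the kernel's highbit64(). */
	static uint64_t
	highbit64(uint64_t i)
	{
		return (i == 0 ? 0 : 64 - __builtin_clzll(i));
	}

	int
	main(void)
	{
		const uint64_t min_ms_count = 16;
		const uint64_t default_ms_count = 200;
		const uint64_t default_ms_shift = 29;	/* 512MB */
		const uint64_t max_ms_shift = 34;	/* 16GB */
		const uint64_t spa_maxblockshift = 24;	/* SPA_MAXBLOCKSHIFT */
		uint64_t asize = 1ULL << 40;		/* a 1TB vdev */

		uint64_t ms_count = asize >> default_ms_shift;
		uint64_t ms_shift;

		if (ms_count < min_ms_count)
			ms_shift = highbit64(asize / min_ms_count);
		else if (ms_count > default_ms_count)
			ms_shift = highbit64(asize / default_ms_count);
		else
			ms_shift = default_ms_shift;

		if (ms_shift < spa_maxblockshift)
			ms_shift = spa_maxblockshift;
		else if (ms_shift > max_ms_shift)
			ms_shift = max_ms_shift;

		/* Prints ms_shift=33 count=128: 128 metaslabs of 8GB each. */
		printf("ms_shift=%ju count=%ju\n", (uintmax_t)ms_shift,
		    (uintmax_t)(asize >> ms_shift));
		return (0);
	}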
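The reworked vdev_is_spacemap_addressable() reads as: if the pool's
spacemap_v2 feature is active, double-word entries can encode any
offset, so the device is always addressable; otherwise a single-word
entry carries SM_OFFSET_BITS (47) bits of offset counted in 2^ashift
units. A stand-alone sketch of the same arithmetic, with the feature
check left out and spacemap_v1_addressable() as a hypothetical name:

	#include <stdbool.h>
	#include <stdint.h>

	#define	SM_OFFSET_BITS	47	/* single-word entry offset width */

	static bool
	spacemap_v1_addressable(uint64_t ashift, uint64_t asize)
	{
		uint64_t shift = ashift + SM_OFFSET_BITS;

		if (shift >= 63)	/* 1ULL << shift would overflow */
			return (true);
		/* E.g. ashift 9 gives 2^56 bytes (64PB) of reach. */
		return (asize < (1ULL << shift));
	}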