MFC r353611: 10330 merge recent ZoL vdev and metaslab changes

illumos/illumos-gate@a0b03b161c
a0b03b161c

https://www.illumos.org/issues/10330
  3 recent ZoL changes in the vdev and metaslab code which we can pull over:
  PR 8324 c853f382db 8324 Change target size of metaslabs from 256GB to 16GB
  PR 8290 b194fab0fb 8290 Factor metaslab_load_wait() in metaslab_load()
  PR 8286 419ba59145 8286 Update vdev_is_spacemap_addressable() for new spacemap
  encoding

Author: Serapheim Dimitropoulos <serapheimd@gmail.com>
Obtained from:	illumos, ZoL
MFC after:	2 weeks
This commit is contained in:
Andriy Gapon 2019-10-16 06:26:51 +00:00
commit 6cb9ab2bad
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=353612
6 changed files with 114 additions and 98 deletions

View File

@ -901,11 +901,8 @@ dump_metaslab(metaslab_t *msp)
if (dump_opt['m'] > 2 && !dump_opt['L']) { if (dump_opt['m'] > 2 && !dump_opt['L']) {
mutex_enter(&msp->ms_lock); mutex_enter(&msp->ms_lock);
metaslab_load_wait(msp); VERIFY0(metaslab_load(msp));
if (!msp->ms_loaded) { range_tree_stat_verify(msp->ms_allocatable);
VERIFY0(metaslab_load(msp));
range_tree_stat_verify(msp->ms_allocatable);
}
dump_metaslab_stats(msp); dump_metaslab_stats(msp);
metaslab_unload(msp); metaslab_unload(msp);
mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_lock);

View File

@ -1468,7 +1468,7 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
/* /*
* Wait for any in-progress metaslab loads to complete. * Wait for any in-progress metaslab loads to complete.
*/ */
void static void
metaslab_load_wait(metaslab_t *msp) metaslab_load_wait(metaslab_t *msp)
{ {
ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(MUTEX_HELD(&msp->ms_lock));
@ -1479,20 +1479,17 @@ metaslab_load_wait(metaslab_t *msp)
} }
} }
int static int
metaslab_load(metaslab_t *msp) metaslab_load_impl(metaslab_t *msp)
{ {
int error = 0; int error = 0;
boolean_t success = B_FALSE;
ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(!msp->ms_loaded); ASSERT(msp->ms_loading);
ASSERT(!msp->ms_loading);
msp->ms_loading = B_TRUE;
/* /*
* Nobody else can manipulate a loading metaslab, so it's now safe * Nobody else can manipulate a loading metaslab, so it's now safe
* to drop the lock. This way we don't have to hold the lock while * to drop the lock. This way we don't have to hold the lock while
* reading the spacemap from disk. * reading the spacemap from disk.
*/ */
mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_lock);
@ -1509,29 +1506,49 @@ metaslab_load(metaslab_t *msp)
msp->ms_start, msp->ms_size); msp->ms_start, msp->ms_size);
} }
success = (error == 0);
mutex_enter(&msp->ms_lock); mutex_enter(&msp->ms_lock);
msp->ms_loading = B_FALSE;
if (success) { if (error != 0)
ASSERT3P(msp->ms_group, !=, NULL); return (error);
msp->ms_loaded = B_TRUE;
/* ASSERT3P(msp->ms_group, !=, NULL);
* If the metaslab already has a spacemap, then we need to msp->ms_loaded = B_TRUE;
* remove all segments from the defer tree; otherwise, the
* metaslab is completely empty and we can skip this. /*
*/ * If the metaslab already has a spacemap, then we need to
if (msp->ms_sm != NULL) { * remove all segments from the defer tree; otherwise, the
for (int t = 0; t < TXG_DEFER_SIZE; t++) { * metaslab is completely empty and we can skip this.
range_tree_walk(msp->ms_defer[t], */
range_tree_remove, msp->ms_allocatable); if (msp->ms_sm != NULL) {
} for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_walk(msp->ms_defer[t],
range_tree_remove, msp->ms_allocatable);
} }
msp->ms_max_size = metaslab_block_maxsize(msp);
} }
msp->ms_max_size = metaslab_block_maxsize(msp);
return (0);
}
int
metaslab_load(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
/*
* There may be another thread loading the same metaslab, if that's
* the case just wait until the other thread is done and return.
*/
metaslab_load_wait(msp);
if (msp->ms_loaded)
return (0);
VERIFY(!msp->ms_loading);
msp->ms_loading = B_TRUE;
int error = metaslab_load_impl(msp);
msp->ms_loading = B_FALSE;
cv_broadcast(&msp->ms_load_cv); cv_broadcast(&msp->ms_load_cv);
return (error); return (error);
} }
@ -2091,13 +2108,10 @@ metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(MUTEX_HELD(&msp->ms_lock));
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
int error = 0; int error = metaslab_load(msp);
metaslab_load_wait(msp); if (error != 0) {
if (!msp->ms_loaded) { metaslab_group_sort(msp->ms_group, msp, 0);
if ((error = metaslab_load(msp)) != 0) { return (error);
metaslab_group_sort(msp->ms_group, msp, 0);
return (error);
}
} }
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
/* /*
@ -2209,9 +2223,7 @@ metaslab_preload(void *arg)
ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
mutex_enter(&msp->ms_lock); mutex_enter(&msp->ms_lock);
metaslab_load_wait(msp); (void) metaslab_load(msp);
if (!msp->ms_loaded)
(void) metaslab_load(msp);
msp->ms_selected_txg = spa_syncing_txg(spa); msp->ms_selected_txg = spa_syncing_txg(spa);
mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_lock);
} }

View File

@ -48,7 +48,6 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
metaslab_t **); metaslab_t **);
void metaslab_fini(metaslab_t *); void metaslab_fini(metaslab_t *);
void metaslab_load_wait(metaslab_t *);
int metaslab_load(metaslab_t *); int metaslab_load(metaslab_t *);
void metaslab_unload(metaslab_t *); void metaslab_unload(metaslab_t *);

View File

@ -370,8 +370,8 @@ struct metaslab {
uint64_t ms_initializing; /* leaves initializing this ms */ uint64_t ms_initializing; /* leaves initializing this ms */
/* /*
* We must hold both ms_lock and ms_group->mg_lock in order to * We must always hold the ms_lock when modifying ms_loaded
* modify ms_loaded. * and ms_loading.
*/ */
boolean_t ms_loaded; boolean_t ms_loaded;
boolean_t ms_loading; boolean_t ms_loading;

View File

@ -163,34 +163,34 @@ static vdev_ops_t *vdev_ops_table[] = {
}; };
/* target number of metaslabs per top-level vdev */ /* default target for number of metaslabs per top-level vdev */
int vdev_max_ms_count = 200; int zfs_vdev_default_ms_count = 200;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count, CTLFLAG_RWTUN, SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_count, CTLFLAG_RWTUN,
&vdev_max_ms_count, 0, &zfs_vdev_default_ms_count, 0,
"Target number of metaslabs per top-level vdev"); "Target number of metaslabs per top-level vdev");
/* minimum number of metaslabs per top-level vdev */ /* minimum number of metaslabs per top-level vdev */
int vdev_min_ms_count = 16; int zfs_vdev_min_ms_count = 16;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, CTLFLAG_RWTUN, SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, CTLFLAG_RWTUN,
&vdev_min_ms_count, 0, &zfs_vdev_min_ms_count, 0,
"Minimum number of metaslabs per top-level vdev"); "Minimum number of metaslabs per top-level vdev");
/* practical upper limit of total metaslabs per top-level vdev */ /* practical upper limit of total metaslabs per top-level vdev */
int vdev_ms_count_limit = 1ULL << 17; int zfs_vdev_ms_count_limit = 1ULL << 17;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count_limit, CTLFLAG_RWTUN, SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count_limit, CTLFLAG_RWTUN,
&vdev_ms_count_limit, 0, &zfs_vdev_ms_count_limit, 0,
"Maximum number of metaslabs per top-level vdev"); "Maximum number of metaslabs per top-level vdev");
/* lower limit for metaslab size (512M) */ /* lower limit for metaslab size (512M) */
int vdev_default_ms_shift = 29; int zfs_vdev_default_ms_shift = 29;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RWTUN, SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RWTUN,
&vdev_default_ms_shift, 0, &zfs_vdev_default_ms_shift, 0,
"Default shift between vdev size and number of metaslabs"); "Default shift between vdev size and number of metaslabs");
/* upper limit for metaslab size (256G) */ /* upper limit for metaslab size (16G) */
int vdev_max_ms_shift = 38; int zfs_vdev_max_ms_shift = 34;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_shift, CTLFLAG_RWTUN, SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_shift, CTLFLAG_RWTUN,
&vdev_max_ms_shift, 0, &zfs_vdev_max_ms_shift, 0,
"Maximum shift between vdev size and number of metaslabs"); "Maximum shift between vdev size and number of metaslabs");
boolean_t vdev_validate_skip = B_FALSE; boolean_t vdev_validate_skip = B_FALSE;
@ -2205,16 +2205,24 @@ void
vdev_metaslab_set_size(vdev_t *vd) vdev_metaslab_set_size(vdev_t *vd)
{ {
uint64_t asize = vd->vdev_asize; uint64_t asize = vd->vdev_asize;
uint64_t ms_count = asize >> vdev_default_ms_shift; uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
uint64_t ms_shift; uint64_t ms_shift;
/* /*
* There are two dimensions to the metaslab sizing calculation: * There are two dimensions to the metaslab sizing calculation:
* the size of the metaslab and the count of metaslabs per vdev. * the size of the metaslab and the count of metaslabs per vdev.
* In general, we aim for vdev_max_ms_count (200) metaslabs. The
* range of the dimensions are as follows:
* *
* 2^29 <= ms_size <= 2^38 * The default values used below are a good balance between memory
* usage (larger metaslab size means more memory needed for loaded
* metaslabs; more metaslabs means more memory needed for the
* metaslab_t structs), metaslab load time (larger metaslabs take
* longer to load), and metaslab sync time (more metaslabs means
* more time spent syncing all of them).
*
* In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
* The range of the dimensions are as follows:
*
* 2^29 <= ms_size <= 2^34
* 16 <= ms_count <= 131,072 * 16 <= ms_count <= 131,072
* *
* On the lower end of vdev sizes, we aim for metaslabs sizes of * On the lower end of vdev sizes, we aim for metaslabs sizes of
@ -2223,35 +2231,41 @@ vdev_metaslab_set_size(vdev_t *vd)
* of at least 16 metaslabs will override this minimum size goal. * of at least 16 metaslabs will override this minimum size goal.
* *
* On the upper end of vdev sizes, we aim for a maximum metaslab * On the upper end of vdev sizes, we aim for a maximum metaslab
* size of 256GB. However, we will cap the total count to 2^17 * size of 16GB. However, we will cap the total count to 2^17
* metaslabs to keep our memory footprint in check. * metaslabs to keep our memory footprint in check and let the
* metaslab size grow from there if that limit is hit.
* *
* The net effect of applying above constraints is summarized below. * The net effect of applying above constraints is summarized below.
* *
* vdev size metaslab count * vdev size metaslab count
* -------------|----------------- * --------------|-----------------
* < 8GB ~16 * < 8GB ~16
* 8GB - 100GB one per 512MB * 8GB - 100GB one per 512MB
* 100GB - 50TB ~200 * 100GB - 3TB ~200
* 50TB - 32PB one per 256GB * 3TB - 2PB one per 16GB
* > 32PB ~131,072 * > 2PB ~131,072
* ------------------------------- * --------------------------------
*
* Finally, note that all of the above calculate the initial
* number of metaslabs. Expanding a top-level vdev will result
* in additional metaslabs being allocated making it possible
* to exceed the zfs_vdev_ms_count_limit.
*/ */
if (ms_count < vdev_min_ms_count) if (ms_count < zfs_vdev_min_ms_count)
ms_shift = highbit64(asize / vdev_min_ms_count); ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
else if (ms_count > vdev_max_ms_count) else if (ms_count > zfs_vdev_default_ms_count)
ms_shift = highbit64(asize / vdev_max_ms_count); ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
else else
ms_shift = vdev_default_ms_shift; ms_shift = zfs_vdev_default_ms_shift;
if (ms_shift < SPA_MAXBLOCKSHIFT) { if (ms_shift < SPA_MAXBLOCKSHIFT) {
ms_shift = SPA_MAXBLOCKSHIFT; ms_shift = SPA_MAXBLOCKSHIFT;
} else if (ms_shift > vdev_max_ms_shift) { } else if (ms_shift > zfs_vdev_max_ms_shift) {
ms_shift = vdev_max_ms_shift; ms_shift = zfs_vdev_max_ms_shift;
/* cap the total count to constrain memory footprint */ /* cap the total count to constrain memory footprint */
if ((asize >> ms_shift) > vdev_ms_count_limit) if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
ms_shift = highbit64(asize / vdev_ms_count_limit); ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
} }
vd->vdev_ms_shift = ms_shift; vd->vdev_ms_shift = ms_shift;
@ -3611,13 +3625,17 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
boolean_t boolean_t
vdev_is_spacemap_addressable(vdev_t *vd) vdev_is_spacemap_addressable(vdev_t *vd)
{ {
if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
return (B_TRUE);
/* /*
* Assuming 47 bits of the space map entry dedicated for the entry's * If double-word space map entries are not enabled we assume
* offset (see description in space_map.h), we calculate the maximum * 47 bits of the space map entry are dedicated to the entry's
* address that can be described by a space map entry for the given * offset (see SM_OFFSET_BITS in space_map.h). We then use that
* device. * to calculate the maximum address that can be described by a
* space map entry for the given device.
*/ */
uint64_t shift = vd->vdev_ashift + 47; uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
if (shift >= 63) /* detect potential overflow */ if (shift >= 63) /* detect potential overflow */
return (B_TRUE); return (B_TRUE);

View File

@ -352,16 +352,6 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
return (0); return (0);
} }
static void
vdev_initialize_ms_load(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
metaslab_load_wait(msp);
if (!msp->ms_loaded)
VERIFY0(metaslab_load(msp));
}
static void static void
vdev_initialize_mg_wait(metaslab_group_t *mg) vdev_initialize_mg_wait(metaslab_group_t *mg)
{ {
@ -484,10 +474,10 @@ vdev_initialize_calculate_progress(vdev_t *vd)
* metaslab. Load it and walk the free tree for more accurate * metaslab. Load it and walk the free tree for more accurate
* progress estimation. * progress estimation.
*/ */
vdev_initialize_ms_load(msp); VERIFY0(metaslab_load(msp));
for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs; for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
logical_rs.rs_start = rs->rs_start; logical_rs.rs_start = rs->rs_start;
logical_rs.rs_end = rs->rs_end; logical_rs.rs_end = rs->rs_end;
vdev_xlate(vd, &logical_rs, &physical_rs); vdev_xlate(vd, &logical_rs, &physical_rs);
@ -615,7 +605,7 @@ vdev_initialize_thread(void *arg)
vdev_initialize_ms_mark(msp); vdev_initialize_ms_mark(msp);
mutex_enter(&msp->ms_lock); mutex_enter(&msp->ms_lock);
vdev_initialize_ms_load(msp); VERIFY0(metaslab_load(msp));
range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add, range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
vd); vd);