MFC r353611: 10330 merge recent ZoL vdev and metaslab changes

illumos/illumos-gate@a0b03b161c

https://www.illumos.org/issues/10330
  3 recent ZoL changes in the vdev and metaslab code which we can pull over:
  PR 8324 c853f382db 8324 Change target size of metaslabs from 256GB to 16GB
  PR 8290 b194fab0fb 8290 Factor metaslab_load_wait() in metaslab_load()
  PR 8286 419ba59145 8286 Update vdev_is_spacemap_addressable() for new spacemap encoding

Author: Serapheim Dimitropoulos <serapheimd@gmail.com>
Obtained from:	illumos, ZoL
MFC after:	2 weeks
Committed by:	avg
Date:	2019-10-16 06:26:51 +00:00
Parent:	3f51508cfe
Commit:	e7993b34fd
6 changed files with 114 additions and 98 deletions

zdb.c

@@ -901,11 +901,8 @@ dump_metaslab(metaslab_t *msp)
 	if (dump_opt['m'] > 2 && !dump_opt['L']) {
 		mutex_enter(&msp->ms_lock);
-		metaslab_load_wait(msp);
-		if (!msp->ms_loaded) {
-			VERIFY0(metaslab_load(msp));
-			range_tree_stat_verify(msp->ms_allocatable);
-		}
+		VERIFY0(metaslab_load(msp));
+		range_tree_stat_verify(msp->ms_allocatable);
 		dump_metaslab_stats(msp);
 		metaslab_unload(msp);
 		mutex_exit(&msp->ms_lock);
 	}

metaslab.c

@@ -1468,7 +1468,7 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 /*
  * Wait for any in-progress metaslab loads to complete.
  */
-void
+static void
 metaslab_load_wait(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -1479,20 +1479,17 @@ metaslab_load_wait(metaslab_t *msp)
 	}
 }
 
-int
-metaslab_load(metaslab_t *msp)
+static int
+metaslab_load_impl(metaslab_t *msp)
 {
 	int error = 0;
-	boolean_t success = B_FALSE;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
-	ASSERT(!msp->ms_loaded);
-	ASSERT(!msp->ms_loading);
+	ASSERT(msp->ms_loading);
 
-	msp->ms_loading = B_TRUE;
 	/*
 	 * Nobody else can manipulate a loading metaslab, so it's now safe
 	 * to drop the lock. This way we don't have to hold the lock while
 	 * reading the spacemap from disk.
 	 */
 	mutex_exit(&msp->ms_lock);
@@ -1509,29 +1506,49 @@ metaslab_load(metaslab_t *msp)
 		    msp->ms_start, msp->ms_size);
 	}
 
-	success = (error == 0);
-
 	mutex_enter(&msp->ms_lock);
-	msp->ms_loading = B_FALSE;
 
-	if (success) {
-		ASSERT3P(msp->ms_group, !=, NULL);
-		msp->ms_loaded = B_TRUE;
+	if (error != 0)
+		return (error);
 
-		/*
-		 * If the metaslab already has a spacemap, then we need to
-		 * remove all segments from the defer tree; otherwise, the
-		 * metaslab is completely empty and we can skip this.
-		 */
-		if (msp->ms_sm != NULL) {
-			for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-				range_tree_walk(msp->ms_defer[t],
-				    range_tree_remove, msp->ms_allocatable);
-			}
-		}
+	ASSERT3P(msp->ms_group, !=, NULL);
+	msp->ms_loaded = B_TRUE;
 
-		msp->ms_max_size = metaslab_block_maxsize(msp);
-	}
-	cv_broadcast(&msp->ms_load_cv);
+	/*
+	 * If the metaslab already has a spacemap, then we need to
+	 * remove all segments from the defer tree; otherwise, the
+	 * metaslab is completely empty and we can skip this.
+	 */
+	if (msp->ms_sm != NULL) {
+		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+			range_tree_walk(msp->ms_defer[t],
+			    range_tree_remove, msp->ms_allocatable);
+		}
+	}
+
+	msp->ms_max_size = metaslab_block_maxsize(msp);
+	return (0);
+}
+
+int
+metaslab_load(metaslab_t *msp)
+{
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	/*
+	 * There may be another thread loading the same metaslab, if that's
+	 * the case just wait until the other thread is done and return.
+	 */
+	metaslab_load_wait(msp);
+	if (msp->ms_loaded)
+		return (0);
+	VERIFY(!msp->ms_loading);
+
+	msp->ms_loading = B_TRUE;
+	int error = metaslab_load_impl(msp);
+	msp->ms_loading = B_FALSE;
+	cv_broadcast(&msp->ms_load_cv);
+
 	return (error);
 }
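
The shape of the PR 8290 refactor above is worth spelling out: callers hold ms_lock, metaslab_load() either waits out a concurrent loader or performs the load itself, and the expensive space map read happens with the lock dropped. The following minimal userland sketch shows the same wait-or-load pattern with POSIX threads standing in for the kernel mutex/cv primitives; the field and function names mirror the code above, but the sketch is illustrative rather than the kernel API.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* Illustrative stand-ins for the relevant metaslab_t fields. */
typedef struct metaslab {
        pthread_mutex_t ms_lock;
        pthread_cond_t  ms_load_cv;
        bool            ms_loaded;
        bool            ms_loading;
} metaslab_t;

/* Wait out an in-progress load; caller holds ms_lock. */
static void
metaslab_load_wait(metaslab_t *msp)
{
        while (msp->ms_loading)
                pthread_cond_wait(&msp->ms_load_cv, &msp->ms_lock);
}

/* The expensive part; runs with ms_lock dropped, one thread at a time. */
static int
metaslab_load_impl(metaslab_t *msp)
{
        pthread_mutex_unlock(&msp->ms_lock);
        sleep(1);               /* pretend to read the space map from disk */
        pthread_mutex_lock(&msp->ms_lock);
        msp->ms_loaded = true;
        return (0);
}

/* Single entry point: wait out a concurrent loader or load ourselves. */
static int
metaslab_load(metaslab_t *msp)
{
        metaslab_load_wait(msp);
        if (msp->ms_loaded)
                return (0);

        msp->ms_loading = true;
        int error = metaslab_load_impl(msp);
        msp->ms_loading = false;
        pthread_cond_broadcast(&msp->ms_load_cv);
        return (error);
}

static void *
loader(void *arg)
{
        metaslab_t *msp = arg;

        pthread_mutex_lock(&msp->ms_lock);
        printf("load returned %d\n", metaslab_load(msp));
        pthread_mutex_unlock(&msp->ms_lock);
        return (NULL);
}

int
main(void)
{
        metaslab_t ms = {
                .ms_lock = PTHREAD_MUTEX_INITIALIZER,
                .ms_load_cv = PTHREAD_COND_INITIALIZER,
        };
        pthread_t t1, t2;

        /* Both threads race; one loads, the other waits and returns 0. */
        pthread_create(&t1, NULL, loader, &ms);
        pthread_create(&t2, NULL, loader, &ms);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        return (0);
}

Built with -pthread, the two racing threads produce a single load: the loser blocks in metaslab_load_wait() and returns 0 once ms_loaded is set. That idempotent contract is what lets every caller below collapse to a bare metaslab_load() call.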
@@ -2091,13 +2108,10 @@ metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
-		int error = 0;
-		metaslab_load_wait(msp);
-		if (!msp->ms_loaded) {
-			if ((error = metaslab_load(msp)) != 0) {
-				metaslab_group_sort(msp->ms_group, msp, 0);
-				return (error);
-			}
+		int error = metaslab_load(msp);
+		if (error != 0) {
+			metaslab_group_sort(msp->ms_group, msp, 0);
+			return (error);
 		}
 
 		if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
 			/*
@@ -2209,9 +2223,7 @@ metaslab_preload(void *arg)
 	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
 
 	mutex_enter(&msp->ms_lock);
-	metaslab_load_wait(msp);
-	if (!msp->ms_loaded)
-		(void) metaslab_load(msp);
+	(void) metaslab_load(msp);
 	msp->ms_selected_txg = spa_syncing_txg(spa);
 	mutex_exit(&msp->ms_lock);
 }

sys/metaslab.h

@@ -48,7 +48,6 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
     metaslab_t **);
 void metaslab_fini(metaslab_t *);
 
-void metaslab_load_wait(metaslab_t *);
 int metaslab_load(metaslab_t *);
 void metaslab_unload(metaslab_t *);

sys/metaslab_impl.h

@@ -370,8 +370,8 @@ struct metaslab {
 	uint64_t	ms_initializing; /* leaves initializing this ms */
 
 	/*
-	 * We must hold both ms_lock and ms_group->mg_lock in order to
-	 * modify ms_loaded.
+	 * We must always hold the ms_lock when modifying ms_loaded
+	 * and ms_loading.
 	 */
 	boolean_t	ms_loaded;
 	boolean_t	ms_loading;

vdev.c

@@ -163,34 +163,34 @@ static vdev_ops_t *vdev_ops_table[] = {
 };
 
-/* target number of metaslabs per top-level vdev */
-int vdev_max_ms_count = 200;
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count, CTLFLAG_RWTUN,
-    &vdev_max_ms_count, 0,
+/* default target for number of metaslabs per top-level vdev */
+int zfs_vdev_default_ms_count = 200;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_count, CTLFLAG_RWTUN,
+    &zfs_vdev_default_ms_count, 0,
     "Target number of metaslabs per top-level vdev");
 
 /* minimum number of metaslabs per top-level vdev */
-int vdev_min_ms_count = 16;
+int zfs_vdev_min_ms_count = 16;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, CTLFLAG_RWTUN,
-    &vdev_min_ms_count, 0,
+    &zfs_vdev_min_ms_count, 0,
     "Minimum number of metaslabs per top-level vdev");
 
 /* practical upper limit of total metaslabs per top-level vdev */
-int vdev_ms_count_limit = 1ULL << 17;
+int zfs_vdev_ms_count_limit = 1ULL << 17;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count_limit, CTLFLAG_RWTUN,
-    &vdev_ms_count_limit, 0,
+    &zfs_vdev_ms_count_limit, 0,
     "Maximum number of metaslabs per top-level vdev");
 
 /* lower limit for metaslab size (512M) */
-int vdev_default_ms_shift = 29;
+int zfs_vdev_default_ms_shift = 29;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RWTUN,
-    &vdev_default_ms_shift, 0,
+    &zfs_vdev_default_ms_shift, 0,
     "Default shift between vdev size and number of metaslabs");
 
-/* upper limit for metaslab size (256G) */
-int vdev_max_ms_shift = 38;
+/* upper limit for metaslab size (16G) */
+int zfs_vdev_max_ms_shift = 34;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_shift, CTLFLAG_RWTUN,
-    &vdev_max_ms_shift, 0,
+    &zfs_vdev_max_ms_shift, 0,
     "Maximum shift between vdev size and number of metaslabs");
 
 boolean_t vdev_validate_skip = B_FALSE;
@@ -2205,16 +2205,24 @@ void
 vdev_metaslab_set_size(vdev_t *vd)
 {
 	uint64_t asize = vd->vdev_asize;
-	uint64_t ms_count = asize >> vdev_default_ms_shift;
+	uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
 	uint64_t ms_shift;
 
 	/*
 	 * There are two dimensions to the metaslab sizing calculation:
 	 * the size of the metaslab and the count of metaslabs per vdev.
-	 * In general, we aim for vdev_max_ms_count (200) metaslabs. The
-	 * range of the dimensions are as follows:
 	 *
-	 * 2^29 <= ms_size <= 2^38
+	 * The default values used below are a good balance between memory
+	 * usage (larger metaslab size means more memory needed for loaded
+	 * metaslabs; more metaslabs means more memory needed for the
+	 * metaslab_t structs), metaslab load time (larger metaslabs take
+	 * longer to load), and metaslab sync time (more metaslabs means
+	 * more time spent syncing all of them).
+	 *
+	 * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
+	 * The range of the dimensions are as follows:
+	 *
+	 * 2^29 <= ms_size <= 2^34
 	 * 16 <= ms_count <= 131,072
 	 *
 	 * On the lower end of vdev sizes, we aim for metaslabs sizes of
@@ -2223,35 +2231,41 @@ vdev_metaslab_set_size(vdev_t *vd)
 	 * of at least 16 metaslabs will override this minimum size goal.
 	 *
 	 * On the upper end of vdev sizes, we aim for a maximum metaslab
-	 * size of 256GB. However, we will cap the total count to 2^17
-	 * metaslabs to keep our memory footprint in check.
+	 * size of 16GB. However, we will cap the total count to 2^17
+	 * metaslabs to keep our memory footprint in check and let the
+	 * metaslab size grow from there if that limit is hit.
 	 *
 	 * The net effect of applying above constrains is summarized below.
 	 *
-	 *  vdev size      metaslab count
-	 * -------------|-----------------
-	 *     < 8GB           ~16
-	 *  8GB - 100GB    one per 512MB
-	 *  100GB - 50TB       ~200
-	 *  50TB - 32PB    one per 256GB
-	 *     > 32PB         ~131,072
-	 * -------------------------------
+	 *   vdev size      metaslab count
+	 * --------------|-----------------
+	 *      < 8GB           ~16
+	 *  8GB  - 100GB    one per 512MB
+	 *  100GB - 3TB         ~200
+	 *  3TB  - 2PB      one per 16GB
+	 *      > 2PB          ~131,072
+	 * --------------------------------
+	 *
+	 * Finally, note that all of the above calculate the initial
+	 * number of metaslabs. Expanding a top-level vdev will result
+	 * in additional metaslabs being allocated making it possible
+	 * to exceed the zfs_vdev_ms_count_limit.
 	 */
 
-	if (ms_count < vdev_min_ms_count)
-		ms_shift = highbit64(asize / vdev_min_ms_count);
-	else if (ms_count > vdev_max_ms_count)
-		ms_shift = highbit64(asize / vdev_max_ms_count);
+	if (ms_count < zfs_vdev_min_ms_count)
+		ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
+	else if (ms_count > zfs_vdev_default_ms_count)
+		ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
 	else
-		ms_shift = vdev_default_ms_shift;
+		ms_shift = zfs_vdev_default_ms_shift;
 
 	if (ms_shift < SPA_MAXBLOCKSHIFT) {
 		ms_shift = SPA_MAXBLOCKSHIFT;
-	} else if (ms_shift > vdev_max_ms_shift) {
-		ms_shift = vdev_max_ms_shift;
+	} else if (ms_shift > zfs_vdev_max_ms_shift) {
+		ms_shift = zfs_vdev_max_ms_shift;
 		/* cap the total count to constrain memory footprint */
-		if ((asize >> ms_shift) > vdev_ms_count_limit)
-			ms_shift = highbit64(asize / vdev_ms_count_limit);
+		if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
+			ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
 	}
 
 	vd->vdev_ms_shift = ms_shift;
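
To see what the 256GB-to-16GB retarget (PR 8324) does in practice, here is a standalone sketch of the sizing logic above using the commit's constants. highbit64() is reimplemented to match the kernel helper (1-based index of the highest set bit), and SPA_MAXBLOCKSHIFT is assumed to be 24, the 16MB large-block shift; the sketch is illustrative, not the kernel code.

#include <stdint.h>
#include <stdio.h>

#define SPA_MAXBLOCKSHIFT       24      /* assumed: 16MB max block size */

static int zfs_vdev_default_ms_count = 200;
static int zfs_vdev_min_ms_count = 16;
static int zfs_vdev_ms_count_limit = 1 << 17;
static int zfs_vdev_default_ms_shift = 29;     /* 512MB */
static int zfs_vdev_max_ms_shift = 34;         /* 16GB */

/* 1-based index of the highest set bit, like the kernel's highbit64(). */
static uint64_t
highbit64(uint64_t v)
{
        uint64_t b = 0;
        while (v != 0) {
                v >>= 1;
                b++;
        }
        return (b);
}

/* Same clamping sequence as vdev_metaslab_set_size() above. */
static uint64_t
metaslab_shift(uint64_t asize)
{
        uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
        uint64_t ms_shift;

        if (ms_count < (uint64_t)zfs_vdev_min_ms_count)
                ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
        else if (ms_count > (uint64_t)zfs_vdev_default_ms_count)
                ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
        else
                ms_shift = zfs_vdev_default_ms_shift;

        if (ms_shift < SPA_MAXBLOCKSHIFT) {
                ms_shift = SPA_MAXBLOCKSHIFT;
        } else if (ms_shift > (uint64_t)zfs_vdev_max_ms_shift) {
                ms_shift = zfs_vdev_max_ms_shift;
                /* cap the total count to constrain memory footprint */
                if ((asize >> ms_shift) > (uint64_t)zfs_vdev_ms_count_limit)
                        ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
        }
        return (ms_shift);
}

int
main(void)
{
        /* 4GB, 40GB, 1TB, and 100TB vdevs */
        uint64_t sizes[] = { 4ULL << 30, 40ULL << 30, 1ULL << 40,
            100ULL << 40 };

        for (int i = 0; i < 4; i++) {
                uint64_t s = metaslab_shift(sizes[i]);
                printf("asize %llu: ms_shift %llu, ~%llu metaslabs\n",
                    (unsigned long long)sizes[i], (unsigned long long)s,
                    (unsigned long long)(sizes[i] >> s));
        }
        return (0);
}

Running the sketch: a 4GB vdev gets 8 metaslabs of 512MB, a 40GB vdev 80 of 512MB, a 1TB vdev 128 of 8GB, and a 100TB vdev hits the 16GB cap at 6,400 metaslabs. The counts land under the nominal 200 because highbit64() rounds the metaslab size up to a power of two.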
@@ -3611,13 +3625,17 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
 boolean_t
 vdev_is_spacemap_addressable(vdev_t *vd)
 {
+	if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
+		return (B_TRUE);
+
 	/*
-	 * Assuming 47 bits of the space map entry dedicated for the entry's
-	 * offset (see description in space_map.h), we calculate the maximum
-	 * address that can be described by a space map entry for the given
-	 * device.
+	 * If double-word space map entries are not enabled we assume
+	 * 47 bits of the space map entry are dedicated to the entry's
+	 * offset (see SM_OFFSET_BITS in space_map.h). We then use that
+	 * to calculate the maximum address that can be described by a
+	 * space map entry for the given device.
 	 */
-	uint64_t shift = vd->vdev_ashift + 47;
+	uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
 
 	if (shift >= 63) /* detect potential overflow */
 		return (B_TRUE);
vdev_initialize.c

@@ -352,16 +352,6 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
 	return (0);
 }
 
-static void
-vdev_initialize_ms_load(metaslab_t *msp)
-{
-	ASSERT(MUTEX_HELD(&msp->ms_lock));
-
-	metaslab_load_wait(msp);
-	if (!msp->ms_loaded)
-		VERIFY0(metaslab_load(msp));
-}
-
 static void
 vdev_initialize_mg_wait(metaslab_group_t *mg)
 {
@@ -484,10 +474,10 @@ vdev_initialize_calculate_progress(vdev_t *vd)
 	 * metaslab. Load it and walk the free tree for more accurate
 	 * progress estimation.
 	 */
-	vdev_initialize_ms_load(msp);
+	VERIFY0(metaslab_load(msp));
 
-	for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs;
-	    rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
+	for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
+	    rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
 		logical_rs.rs_start = rs->rs_start;
 		logical_rs.rs_end = rs->rs_end;
 		vdev_xlate(vd, &logical_rs, &physical_rs);
@@ -615,7 +605,7 @@ vdev_initialize_thread(void *arg)
 		vdev_initialize_ms_mark(msp);
 
 		mutex_enter(&msp->ms_lock);
-		vdev_initialize_ms_load(msp);
+		VERIFY0(metaslab_load(msp));
 
 		range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
 		    vd);