MFC r353611: 10330 merge recent ZoL vdev and metaslab changes
illumos/illumos-gate@a0b03b161c
a0b03b161c
https://www.illumos.org/issues/10330
Three recent ZoL changes in the vdev and metaslab code that we can pull over:
PR 8324 c853f382db 8324 Change target size of metaslabs from 256GB to 16GB
PR 8290 b194fab0fb 8290 Factor metaslab_load_wait() in metaslab_load()
PR 8286 419ba59145 8286 Update vdev_is_spacemap_addressable() for new spacemap
encoding
Author: Serapheim Dimitropoulos <serapheimd@gmail.com>
Obtained from: illumos, ZoL
MFC after: 2 weeks
This commit is contained in:
parent
3f51508cfe
commit
e7993b34fd
@ -901,11 +901,8 @@ dump_metaslab(metaslab_t *msp)
|
|||||||
|
|
||||||
if (dump_opt['m'] > 2 && !dump_opt['L']) {
|
if (dump_opt['m'] > 2 && !dump_opt['L']) {
|
||||||
mutex_enter(&msp->ms_lock);
|
mutex_enter(&msp->ms_lock);
|
||||||
metaslab_load_wait(msp);
|
VERIFY0(metaslab_load(msp));
|
||||||
if (!msp->ms_loaded) {
|
range_tree_stat_verify(msp->ms_allocatable);
|
||||||
VERIFY0(metaslab_load(msp));
|
|
||||||
range_tree_stat_verify(msp->ms_allocatable);
|
|
||||||
}
|
|
||||||
dump_metaslab_stats(msp);
|
dump_metaslab_stats(msp);
|
||||||
metaslab_unload(msp);
|
metaslab_unload(msp);
|
||||||
mutex_exit(&msp->ms_lock);
|
mutex_exit(&msp->ms_lock);
|
||||||
|
@ -1468,7 +1468,7 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
|
|||||||
/*
|
/*
|
||||||
* Wait for any in-progress metaslab loads to complete.
|
* Wait for any in-progress metaslab loads to complete.
|
||||||
*/
|
*/
|
||||||
void
|
static void
|
||||||
metaslab_load_wait(metaslab_t *msp)
|
metaslab_load_wait(metaslab_t *msp)
|
||||||
{
|
{
|
||||||
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
||||||
@ -1479,20 +1479,17 @@ metaslab_load_wait(metaslab_t *msp)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
static int
|
||||||
metaslab_load(metaslab_t *msp)
|
metaslab_load_impl(metaslab_t *msp)
|
||||||
{
|
{
|
||||||
int error = 0;
|
int error = 0;
|
||||||
boolean_t success = B_FALSE;
|
|
||||||
|
|
||||||
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
||||||
ASSERT(!msp->ms_loaded);
|
ASSERT(msp->ms_loading);
|
||||||
ASSERT(!msp->ms_loading);
|
|
||||||
|
|
||||||
msp->ms_loading = B_TRUE;
|
|
||||||
/*
|
/*
|
||||||
* Nobody else can manipulate a loading metaslab, so it's now safe
|
* Nobody else can manipulate a loading metaslab, so it's now safe
|
||||||
* to drop the lock. This way we don't have to hold the lock while
|
* to drop the lock. This way we don't have to hold the lock while
|
||||||
* reading the spacemap from disk.
|
* reading the spacemap from disk.
|
||||||
*/
|
*/
|
||||||
mutex_exit(&msp->ms_lock);
|
mutex_exit(&msp->ms_lock);
|
||||||
@ -1509,29 +1506,49 @@ metaslab_load(metaslab_t *msp)
|
|||||||
msp->ms_start, msp->ms_size);
|
msp->ms_start, msp->ms_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
success = (error == 0);
|
|
||||||
|
|
||||||
mutex_enter(&msp->ms_lock);
|
mutex_enter(&msp->ms_lock);
|
||||||
msp->ms_loading = B_FALSE;
|
|
||||||
|
|
||||||
if (success) {
|
if (error != 0)
|
||||||
ASSERT3P(msp->ms_group, !=, NULL);
|
return (error);
|
||||||
msp->ms_loaded = B_TRUE;
|
|
||||||
|
|
||||||
/*
|
ASSERT3P(msp->ms_group, !=, NULL);
|
||||||
* If the metaslab already has a spacemap, then we need to
|
msp->ms_loaded = B_TRUE;
|
||||||
* remove all segments from the defer tree; otherwise, the
|
|
||||||
* metaslab is completely empty and we can skip this.
|
/*
|
||||||
*/
|
* If the metaslab already has a spacemap, then we need to
|
||||||
if (msp->ms_sm != NULL) {
|
* remove all segments from the defer tree; otherwise, the
|
||||||
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
|
* metaslab is completely empty and we can skip this.
|
||||||
range_tree_walk(msp->ms_defer[t],
|
*/
|
||||||
range_tree_remove, msp->ms_allocatable);
|
if (msp->ms_sm != NULL) {
|
||||||
}
|
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
|
||||||
|
range_tree_walk(msp->ms_defer[t],
|
||||||
|
range_tree_remove, msp->ms_allocatable);
|
||||||
}
|
}
|
||||||
msp->ms_max_size = metaslab_block_maxsize(msp);
|
|
||||||
}
|
}
|
||||||
|
msp->ms_max_size = metaslab_block_maxsize(msp);
|
||||||
|
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
metaslab_load(metaslab_t *msp)
|
||||||
|
{
|
||||||
|
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* There may be another thread loading the same metaslab, if that's
|
||||||
|
* the case just wait until the other thread is done and return.
|
||||||
|
*/
|
||||||
|
metaslab_load_wait(msp);
|
||||||
|
if (msp->ms_loaded)
|
||||||
|
return (0);
|
||||||
|
VERIFY(!msp->ms_loading);
|
||||||
|
|
||||||
|
msp->ms_loading = B_TRUE;
|
||||||
|
int error = metaslab_load_impl(msp);
|
||||||
|
msp->ms_loading = B_FALSE;
|
||||||
cv_broadcast(&msp->ms_load_cv);
|
cv_broadcast(&msp->ms_load_cv);
|
||||||
|
|
||||||
return (error);
|
return (error);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2091,13 +2108,10 @@ metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
|
|||||||
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
||||||
|
|
||||||
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
|
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
|
||||||
int error = 0;
|
int error = metaslab_load(msp);
|
||||||
metaslab_load_wait(msp);
|
if (error != 0) {
|
||||||
if (!msp->ms_loaded) {
|
metaslab_group_sort(msp->ms_group, msp, 0);
|
||||||
if ((error = metaslab_load(msp)) != 0) {
|
return (error);
|
||||||
metaslab_group_sort(msp->ms_group, msp, 0);
|
|
||||||
return (error);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
|
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
|
||||||
/*
|
/*
|
||||||
@ -2209,9 +2223,7 @@ metaslab_preload(void *arg)
|
|||||||
ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
|
ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
|
||||||
|
|
||||||
mutex_enter(&msp->ms_lock);
|
mutex_enter(&msp->ms_lock);
|
||||||
metaslab_load_wait(msp);
|
(void) metaslab_load(msp);
|
||||||
if (!msp->ms_loaded)
|
|
||||||
(void) metaslab_load(msp);
|
|
||||||
msp->ms_selected_txg = spa_syncing_txg(spa);
|
msp->ms_selected_txg = spa_syncing_txg(spa);
|
||||||
mutex_exit(&msp->ms_lock);
|
mutex_exit(&msp->ms_lock);
|
||||||
}
|
}
|
||||||
|
@ -48,7 +48,6 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
|
|||||||
metaslab_t **);
|
metaslab_t **);
|
||||||
void metaslab_fini(metaslab_t *);
|
void metaslab_fini(metaslab_t *);
|
||||||
|
|
||||||
void metaslab_load_wait(metaslab_t *);
|
|
||||||
int metaslab_load(metaslab_t *);
|
int metaslab_load(metaslab_t *);
|
||||||
void metaslab_unload(metaslab_t *);
|
void metaslab_unload(metaslab_t *);
|
||||||
|
|
||||||
|
@ -370,8 +370,8 @@ struct metaslab {
|
|||||||
uint64_t ms_initializing; /* leaves initializing this ms */
|
uint64_t ms_initializing; /* leaves initializing this ms */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We must hold both ms_lock and ms_group->mg_lock in order to
|
* We must always hold the ms_lock when modifying ms_loaded
|
||||||
* modify ms_loaded.
|
* and ms_loading.
|
||||||
*/
|
*/
|
||||||
boolean_t ms_loaded;
|
boolean_t ms_loaded;
|
||||||
boolean_t ms_loading;
|
boolean_t ms_loading;
|
||||||
|
@ -163,34 +163,34 @@ static vdev_ops_t *vdev_ops_table[] = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/* target number of metaslabs per top-level vdev */
|
/* default target for number of metaslabs per top-level vdev */
|
||||||
int vdev_max_ms_count = 200;
|
int zfs_vdev_default_ms_count = 200;
|
||||||
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count, CTLFLAG_RWTUN,
|
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_count, CTLFLAG_RWTUN,
|
||||||
&vdev_max_ms_count, 0,
|
&zfs_vdev_default_ms_count, 0,
|
||||||
"Target number of metaslabs per top-level vdev");
|
"Target number of metaslabs per top-level vdev");
|
||||||
|
|
||||||
/* minimum number of metaslabs per top-level vdev */
|
/* minimum number of metaslabs per top-level vdev */
|
||||||
int vdev_min_ms_count = 16;
|
int zfs_vdev_min_ms_count = 16;
|
||||||
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, CTLFLAG_RWTUN,
|
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, CTLFLAG_RWTUN,
|
||||||
&vdev_min_ms_count, 0,
|
&zfs_vdev_min_ms_count, 0,
|
||||||
"Minimum number of metaslabs per top-level vdev");
|
"Minimum number of metaslabs per top-level vdev");
|
||||||
|
|
||||||
/* practical upper limit of total metaslabs per top-level vdev */
|
/* practical upper limit of total metaslabs per top-level vdev */
|
||||||
int vdev_ms_count_limit = 1ULL << 17;
|
int zfs_vdev_ms_count_limit = 1ULL << 17;
|
||||||
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count_limit, CTLFLAG_RWTUN,
|
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count_limit, CTLFLAG_RWTUN,
|
||||||
&vdev_ms_count_limit, 0,
|
&zfs_vdev_ms_count_limit, 0,
|
||||||
"Maximum number of metaslabs per top-level vdev");
|
"Maximum number of metaslabs per top-level vdev");
|
||||||
|
|
||||||
/* lower limit for metaslab size (512M) */
|
/* lower limit for metaslab size (512M) */
|
||||||
int vdev_default_ms_shift = 29;
|
int zfs_vdev_default_ms_shift = 29;
|
||||||
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RWTUN,
|
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RWTUN,
|
||||||
&vdev_default_ms_shift, 0,
|
&zfs_vdev_default_ms_shift, 0,
|
||||||
"Default shift between vdev size and number of metaslabs");
|
"Default shift between vdev size and number of metaslabs");
|
||||||
|
|
||||||
/* upper limit for metaslab size (256G) */
|
/* upper limit for metaslab size (16G) */
|
||||||
int vdev_max_ms_shift = 38;
|
int zfs_vdev_max_ms_shift = 34;
|
||||||
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_shift, CTLFLAG_RWTUN,
|
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_shift, CTLFLAG_RWTUN,
|
||||||
&vdev_max_ms_shift, 0,
|
&zfs_vdev_max_ms_shift, 0,
|
||||||
"Maximum shift between vdev size and number of metaslabs");
|
"Maximum shift between vdev size and number of metaslabs");
|
||||||
|
|
||||||
boolean_t vdev_validate_skip = B_FALSE;
|
boolean_t vdev_validate_skip = B_FALSE;
|
||||||
@ -2205,16 +2205,24 @@ void
|
|||||||
vdev_metaslab_set_size(vdev_t *vd)
|
vdev_metaslab_set_size(vdev_t *vd)
|
||||||
{
|
{
|
||||||
uint64_t asize = vd->vdev_asize;
|
uint64_t asize = vd->vdev_asize;
|
||||||
uint64_t ms_count = asize >> vdev_default_ms_shift;
|
uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
|
||||||
uint64_t ms_shift;
|
uint64_t ms_shift;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* There are two dimensions to the metaslab sizing calculation:
|
* There are two dimensions to the metaslab sizing calculation:
|
||||||
* the size of the metaslab and the count of metaslabs per vdev.
|
* the size of the metaslab and the count of metaslabs per vdev.
|
||||||
* In general, we aim for vdev_max_ms_count (200) metaslabs. The
|
|
||||||
* range of the dimensions are as follows:
|
|
||||||
*
|
*
|
||||||
* 2^29 <= ms_size <= 2^38
|
* The default values used below are a good balance between memory
|
||||||
|
* usage (larger metaslab size means more memory needed for loaded
|
||||||
|
* metaslabs; more metaslabs means more memory needed for the
|
||||||
|
* metaslab_t structs), metaslab load time (larger metaslabs take
|
||||||
|
* longer to load), and metaslab sync time (more metaslabs means
|
||||||
|
* more time spent syncing all of them).
|
||||||
|
*
|
||||||
|
* In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
|
||||||
|
* The range of the dimensions are as follows:
|
||||||
|
*
|
||||||
|
* 2^29 <= ms_size <= 2^34
|
||||||
* 16 <= ms_count <= 131,072
|
* 16 <= ms_count <= 131,072
|
||||||
*
|
*
|
||||||
* On the lower end of vdev sizes, we aim for metaslabs sizes of
|
* On the lower end of vdev sizes, we aim for metaslabs sizes of
|
||||||
@ -2223,35 +2231,41 @@ vdev_metaslab_set_size(vdev_t *vd)
|
|||||||
* of at least 16 metaslabs will override this minimum size goal.
|
* of at least 16 metaslabs will override this minimum size goal.
|
||||||
*
|
*
|
||||||
* On the upper end of vdev sizes, we aim for a maximum metaslab
|
* On the upper end of vdev sizes, we aim for a maximum metaslab
|
||||||
* size of 256GB. However, we will cap the total count to 2^17
|
* size of 16GB. However, we will cap the total count to 2^17
|
||||||
* metaslabs to keep our memory footprint in check.
|
* metaslabs to keep our memory footprint in check and let the
|
||||||
|
* metaslab size grow from there if that limit is hit.
|
||||||
*
|
*
|
||||||
* The net effect of applying the above constraints is summarized below.
|
* The net effect of applying the above constraints is summarized below.
|
||||||
*
|
*
|
||||||
* vdev size metaslab count
|
* vdev size metaslab count
|
||||||
* -------------|-----------------
|
* --------------|-----------------
|
||||||
* < 8GB ~16
|
* < 8GB ~16
|
||||||
* 8GB - 100GB one per 512MB
|
* 8GB - 100GB one per 512MB
|
||||||
* 100GB - 50TB ~200
|
* 100GB - 3TB ~200
|
||||||
* 50TB - 32PB one per 256GB
|
* 3TB - 2PB one per 16GB
|
||||||
* > 32PB ~131,072
|
* > 2PB ~131,072
|
||||||
* -------------------------------
|
* --------------------------------
|
||||||
|
*
|
||||||
|
* Finally, note that all of the above calculate the initial
|
||||||
|
* number of metaslabs. Expanding a top-level vdev will result
|
||||||
|
* in additional metaslabs being allocated making it possible
|
||||||
|
* to exceed the zfs_vdev_ms_count_limit.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
if (ms_count < vdev_min_ms_count)
|
if (ms_count < zfs_vdev_min_ms_count)
|
||||||
ms_shift = highbit64(asize / vdev_min_ms_count);
|
ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
|
||||||
else if (ms_count > vdev_max_ms_count)
|
else if (ms_count > zfs_vdev_default_ms_count)
|
||||||
ms_shift = highbit64(asize / vdev_max_ms_count);
|
ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
|
||||||
else
|
else
|
||||||
ms_shift = vdev_default_ms_shift;
|
ms_shift = zfs_vdev_default_ms_shift;
|
||||||
|
|
||||||
if (ms_shift < SPA_MAXBLOCKSHIFT) {
|
if (ms_shift < SPA_MAXBLOCKSHIFT) {
|
||||||
ms_shift = SPA_MAXBLOCKSHIFT;
|
ms_shift = SPA_MAXBLOCKSHIFT;
|
||||||
} else if (ms_shift > vdev_max_ms_shift) {
|
} else if (ms_shift > zfs_vdev_max_ms_shift) {
|
||||||
ms_shift = vdev_max_ms_shift;
|
ms_shift = zfs_vdev_max_ms_shift;
|
||||||
/* cap the total count to constrain memory footprint */
|
/* cap the total count to constrain memory footprint */
|
||||||
if ((asize >> ms_shift) > vdev_ms_count_limit)
|
if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
|
||||||
ms_shift = highbit64(asize / vdev_ms_count_limit);
|
ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
|
||||||
}
|
}
|
||||||
|
|
||||||
vd->vdev_ms_shift = ms_shift;
|
vd->vdev_ms_shift = ms_shift;
|
||||||
@ -3611,13 +3625,17 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
|
|||||||
boolean_t
|
boolean_t
|
||||||
vdev_is_spacemap_addressable(vdev_t *vd)
|
vdev_is_spacemap_addressable(vdev_t *vd)
|
||||||
{
|
{
|
||||||
|
if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
|
||||||
|
return (B_TRUE);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Assuming 47 bits of the space map entry dedicated for the entry's
|
* If double-word space map entries are not enabled we assume
|
||||||
* offset (see description in space_map.h), we calculate the maximum
|
* 47 bits of the space map entry are dedicated to the entry's
|
||||||
* address that can be described by a space map entry for the given
|
* offset (see SM_OFFSET_BITS in space_map.h). We then use that
|
||||||
* device.
|
* to calculate the maximum address that can be described by a
|
||||||
|
* space map entry for the given device.
|
||||||
*/
|
*/
|
||||||
uint64_t shift = vd->vdev_ashift + 47;
|
uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
|
||||||
|
|
||||||
if (shift >= 63) /* detect potential overflow */
|
if (shift >= 63) /* detect potential overflow */
|
||||||
return (B_TRUE);
|
return (B_TRUE);
|
||||||
|
@ -352,16 +352,6 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
|
|||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
|
||||||
vdev_initialize_ms_load(metaslab_t *msp)
|
|
||||||
{
|
|
||||||
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
|
||||||
|
|
||||||
metaslab_load_wait(msp);
|
|
||||||
if (!msp->ms_loaded)
|
|
||||||
VERIFY0(metaslab_load(msp));
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
vdev_initialize_mg_wait(metaslab_group_t *mg)
|
vdev_initialize_mg_wait(metaslab_group_t *mg)
|
||||||
{
|
{
|
||||||
@ -484,10 +474,10 @@ vdev_initialize_calculate_progress(vdev_t *vd)
|
|||||||
* metaslab. Load it and walk the free tree for more accurate
|
* metaslab. Load it and walk the free tree for more accurate
|
||||||
* progress estimation.
|
* progress estimation.
|
||||||
*/
|
*/
|
||||||
vdev_initialize_ms_load(msp);
|
VERIFY0(metaslab_load(msp));
|
||||||
|
|
||||||
for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs;
|
for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
|
||||||
rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
|
rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
|
||||||
logical_rs.rs_start = rs->rs_start;
|
logical_rs.rs_start = rs->rs_start;
|
||||||
logical_rs.rs_end = rs->rs_end;
|
logical_rs.rs_end = rs->rs_end;
|
||||||
vdev_xlate(vd, &logical_rs, &physical_rs);
|
vdev_xlate(vd, &logical_rs, &physical_rs);
|
||||||
@ -615,7 +605,7 @@ vdev_initialize_thread(void *arg)
|
|||||||
|
|
||||||
vdev_initialize_ms_mark(msp);
|
vdev_initialize_ms_mark(msp);
|
||||||
mutex_enter(&msp->ms_lock);
|
mutex_enter(&msp->ms_lock);
|
||||||
vdev_initialize_ms_load(msp);
|
VERIFY0(metaslab_load(msp));
|
||||||
|
|
||||||
range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
|
range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
|
||||||
vd);
|
vd);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user