7303 dynamic metaslab selection

illumos/illumos-gate@8363e80ae7
https://github.com/illumos/illumos-gate/commit/8363e80ae72609660f6090766ca8c2c18aa53f0

https://www.illumos.org/issues/7303

  This change introduces a new weighting algorithm to improve metaslab selection.
  The new weighting algorithm relies on the SPACEMAP_HISTOGRAM feature. As a result,
  the metaslab weight now encodes the type of weighting algorithm used
  (size-based vs segment-based).

  This also introduces a new allocation tracing facility and two new dcmds to help
  debug allocation problems. Each zio now contains a zio_alloc_list_t structure
  that is populated as the zio goes through the allocation stage. Here's an
  example of how to use the tracing facility:

> c5ec000::print zio_t io_alloc_list | ::walk list | ::metaslab_trace
  MSID    DVA    ASIZE      WEIGHT             RESULT               VDEV
     -      0      400           0    NOT_ALLOCATABLE           ztest.0a
     -      0      400           0    NOT_ALLOCATABLE           ztest.0a
     -      0      400           0             ENOSPC           ztest.0a
     -      0      200           0    NOT_ALLOCATABLE           ztest.0a
     -      0      200           0    NOT_ALLOCATABLE           ztest.0a
     -      0      200           0             ENOSPC           ztest.0a
     1      0      400      1 x 8M            17b1a00           ztest.0a

> 1ff2400::print zio_t io_alloc_list | ::walk list | ::metaslab_trace
  MSID    DVA    ASIZE      WEIGHT             RESULT               VDEV
     -      0      200           0    NOT_ALLOCATABLE           mirror-2
     -      0      200           0    NOT_ALLOCATABLE           mirror-0
     1      0      200      1 x 4M            112ae00           mirror-1
     -      1      200           0    NOT_ALLOCATABLE           mirror-2
     -      1      200           0    NOT_ALLOCATABLE           mirror-0
     1      1      200      1 x 4M            112b000           mirror-1
     -      2      200           0    NOT_ALLOCATABLE           mirror-2

  If the metaslab is using segment-based weighting then the WEIGHT column will
  display the number of segments available in the bucket where the allocation
  attempt was made.

Author: George Wilson <george.wilson@delphix.com>
Reviewed by: Alex Reece <alex@delphix.com>
Reviewed by: Chris Siden <christopher.siden@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <paul.dagnelie@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Don Brady <don.brady@intel.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
This commit is contained in:
Josh Paetzel 2017-03-15 04:16:08 +00:00
parent b060bbc16a
commit 137146f48c
4 changed files with 28 additions and 5 deletions

View File

@ -2562,10 +2562,21 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
if (!dump_opt['L']) {
vdev_t *rvd = spa->spa_root_vdev;
/*
* We are going to be changing the meaning of the metaslab's
* ms_tree. Ensure that the allocator doesn't try to
* use the tree.
*/
spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
for (uint64_t c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
metaslab_group_t *mg = vd->vdev_mg;
for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
ASSERT3P(msp->ms_group, ==, mg);
mutex_enter(&msp->ms_lock);
metaslab_unload(msp);
@ -2586,8 +2597,6 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
(longlong_t)m,
(longlong_t)vd->vdev_ms_count);
msp->ms_ops = &zdb_metaslab_ops;
/*
* We don't want to spend the CPU
* manipulating the size-ordered
@ -2597,7 +2606,10 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
msp->ms_tree->rt_ops = NULL;
VERIFY0(space_map_load(msp->ms_sm,
msp->ms_tree, SM_ALLOC));
msp->ms_loaded = B_TRUE;
if (!msp->ms_loaded) {
msp->ms_loaded = B_TRUE;
}
}
mutex_exit(&msp->ms_lock);
}
@ -2619,8 +2631,10 @@ zdb_leak_fini(spa_t *spa)
vdev_t *rvd = spa->spa_root_vdev;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
metaslab_group_t *mg = vd->vdev_mg;
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
ASSERT3P(mg, ==, msp->ms_group);
mutex_enter(&msp->ms_lock);
/*
@ -2634,7 +2648,10 @@ zdb_leak_fini(spa_t *spa)
* from the ms_tree.
*/
range_tree_vacate(msp->ms_tree, zdb_leak, vd);
msp->ms_loaded = B_FALSE;
if (msp->ms_loaded) {
msp->ms_loaded = B_FALSE;
}
mutex_exit(&msp->ms_lock);
}

View File

@ -171,7 +171,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = {
.zo_mirrors = 2,
.zo_raidz = 4,
.zo_raidz_parity = 1,
.zo_vdev_size = SPA_MINDEVSIZE * 2,
.zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */
.zo_datasets = 7,
.zo_threads = 23,
.zo_passtime = 60, /* 60 seconds */

View File

@ -92,6 +92,11 @@ kstat_create(const char *module, int instance, const char *name,
return (NULL);
}
/*
 * No-op stub: the body is empty, so knp, name and type are all ignored
 * (which is why the function is marked ARGSUSED).
 */
/*ARGSUSED*/
void
kstat_named_init(kstat_named_t *knp, const char *name, uchar_t type)
{}
/*ARGSUSED*/
void
kstat_install(kstat_t *ksp)

View File

@ -301,6 +301,7 @@ extern void cv_broadcast(kcondvar_t *cv);
*/
extern kstat_t *kstat_create(const char *, int,
const char *, const char *, uchar_t, ulong_t, uchar_t);
extern void kstat_named_init(kstat_named_t *, const char *, uchar_t);
extern void kstat_install(kstat_t *);
extern void kstat_delete(kstat_t *);
extern void kstat_waitq_enter(kstat_io_t *);