From 431b557a046211b10745d256bca9432b92f92604 Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Wed, 6 Nov 2019 08:47:09 +0000 Subject: [PATCH 1/7] 10701 Correct lock ASSERTs in vdev_label_read/write illumos/illumos-gate@58447f688d5e308373ab16a3b129bc0ba0fbc154 https://github.com/illumos/illumos-gate/commit/58447f688d5e308373ab16a3b129bc0ba0fbc154 https://www.illumos.org/issues/10701 Port of ZoL commit: 0091d66f4e Correct lock ASSERTs in vdev_label_read/write At a minimum, this fixes a blown assert during an MMP test run when running on a DEBUG build. Portions contributed by: Jerry Jelinek Author: Olaf Faaland --- uts/common/fs/zfs/vdev_label.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/uts/common/fs/zfs/vdev_label.c b/uts/common/fs/zfs/vdev_label.c index 9f68b2af3e3e..55c3060042d9 100644 --- a/uts/common/fs/zfs/vdev_label.c +++ b/uts/common/fs/zfs/vdev_label.c @@ -184,8 +184,9 @@ static void vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { - ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) == - SCL_STATE_ALL); + ASSERT( + spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE || + spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE); ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); zio_nowait(zio_read_phys(zio, vd, @@ -198,17 +199,9 @@ void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { -#ifdef _KERNEL - /* - * This assert is invalid in the user-level ztest MMP code because - * the ztest thread is not in dsl_pool_sync_context. ZoL does not - * build the user-level code with DEBUG so this is not an issue there. - */ - ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL || - (spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) == - (SCL_CONFIG | SCL_STATE) && - dsl_pool_sync_context(spa_get_dsl(zio->io_spa)))); -#endif + ASSERT( + spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE || + spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE); ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); zio_nowait(zio_write_phys(zio, vd, From 6532d07b7c52767faee7fa1ca23a1e8d0af1a935 Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Wed, 6 Nov 2019 08:53:52 +0000 Subject: [PATCH 2/7] 8899 zpool list property documentation doesn't match actual behaviour illumos/illumos-gate@b0e142e57dfda6f02bf1fa66973c610f5fefe8cb https://github.com/illumos/illumos-gate/commit/b0e142e57dfda6f02bf1fa66973c610f5fefe8cb https://www.illumos.org/issues/8899 https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=221795: The `zpool list` section of the zpool(8) manpage says that the default list of displayed properties is as below: The default list is name, size, used, available, fragmentation, expandsize, capacity, health, altroot. This does not seem to be the case. In fact: # zpool list NAME SIZE ALLOC FREE EXPANDSZ FRAG CAP DEDUP HEALTH ALTROOT tank 10.9T 6.05T 4.83T - 29% 55% 1.00x ONLINE - The Properties section of the same manpage includes the `used` property, but this seems not to be recognised: # zpool list -o used bad property list: invalid property 'used' usage: [...] # The usage message produced here indeed doesn't include `used`. 
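For contrast, the property names the implementation does accept are `allocated` and `free`; an illustrative session against the same pool as above (output sketched, not captured): # zpool list -o name,allocated,free NAME ALLOC FREE tank 6.05T 4.83T The manpage edits below bring the documented property list and the `zpool list` default columns in line with this long-standing behaviour.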
Author: Yuri Pankov (yuripv) --- man/man1m/zpool.1m | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/man/man1m/zpool.1m b/man/man1m/zpool.1m index 38bc74f76ac6..30876beb2ee3 100644 --- a/man/man1m/zpool.1m +++ b/man/man1m/zpool.1m @@ -25,7 +25,7 @@ .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2017 George Melikov. All Rights Reserved. .\" -.Dd August 23, 2017 +.Dd December 6, 2017 .Dt ZPOOL 1M .Os .Sh NAME @@ -521,10 +521,8 @@ change the behavior of the pool. .Pp The following are read-only properties: .Bl -tag -width Ds -.It Sy available -Amount of storage available within the pool. -This property can also be referred to by its shortened column name, -.Sy avail . +.It Cm allocated +Amount of storage space used within the pool. .It Sy bootsize The size of the system boot partition. This property can only be set at pool creation time and is read-only once pool @@ -572,8 +570,6 @@ Information about unsupported features that are enabled on the pool. See .Xr zpool-features 5 for details. -.It Sy used -Amount of storage space used within the pool. .El .Pp The space usage properties report actual physical space available to the @@ -1532,8 +1528,8 @@ See the .Sx Properties section for a list of valid properties. The default list is -.Sy name , size , used , available , fragmentation , expandsize , capacity , -.Sy dedupratio , health , altroot . +.Cm name , size , allocated , free , expandsize , fragmentation , capacity , +.Cm dedupratio , health , altroot . .It Fl p Display numbers in parsable .Pq exact From db46f5eb51be9602d298a84b1d77d50be3336fea Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Wed, 6 Nov 2019 08:55:23 +0000 Subject: [PATCH 3/7] 9521 Add checkpoint field in the default list of the zpool-list man page illumos/illumos-gate@f41179bd376293096297cdc1f32e610d44f65c8b https://github.com/illumos/illumos-gate/commit/f41179bd376293096297cdc1f32e610d44f65c8b https://www.illumos.org/issues/9521 The default list of fields in the zpool list man page is missing checkpoint. Author: Eitan Adler (eadler) --- man/man1m/zpool.1m | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/man1m/zpool.1m b/man/man1m/zpool.1m index 30876beb2ee3..29935e5d77cf 100644 --- a/man/man1m/zpool.1m +++ b/man/man1m/zpool.1m @@ -25,7 +25,7 @@ .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2017 George Melikov. All Rights Reserved. .\" -.Dd December 6, 2017 +.Dd April 27, 2018 .Dt ZPOOL 1M .Os .Sh NAME @@ -1528,7 +1528,7 @@ See the .Sx Properties section for a list of valid properties. The default list is -.Cm name , size , allocated , free , expandsize , fragmentation , capacity , +.Cm name , size , allocated , free , checkpoint, expandsize , fragmentation , capacity , .Cm dedupratio , health , altroot . .It Fl p Display numbers in parsable From 41e8562cdc86ecf5f8d662c3db12b84596a07ff6 Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Wed, 6 Nov 2019 08:58:03 +0000 Subject: [PATCH 4/7] 10601 10757 Pool allocation classes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit illumos/illumos-gate@663207adb1669640c01c5ec6949ce78fd806efae https://github.com/illumos/illumos-gate/commit/663207adb1669640c01c5ec6949ce78fd806efae 10601 Pool allocation classes https://www.illumos.org/issues/10601 illumos port of ZoL Pool allocation classes. 
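As background for the port (a usage sketch, not text from the original commits; device names are hypothetical): 'special' and 'dedup' become vdev keywords that dedicate top-level vdevs to metadata and dedup-table allocations, e.g.: # zpool create tank raidz c0t0d0 c0t1d0 c0t2d0 special mirror c1t0d0 c1t1d0 # zpool add tank dedup mirror c2t0d0 c2t1d0 Both examples satisfy the redundancy rule added below in get_replication()/check_replication(): a class mirror is accepted alongside raidz when the raidz parity equals the mirror's way count minus one (here 1 == 2 - 1).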
The port includes at least these two commits: 441709695 Pool allocation classes misplacing small file blocks cc99f275a Pool allocation classes 10757 Add -gLp to zpool subcommands for alt vdev names https://www.illumos.org/issues/10757 Port from ZoL of d2f3e292d Add -gLp to zpool subcommands for alt vdev names Note that a subsequent ZoL commit changed -p to -P a77f29f93 Change full path subcommand flag from -p to -P Portions contributed by: Jerry Jelinek Portions contributed by: Håkan Johansson Portions contributed by: Richard Yao Portions contributed by: Chunwei Chen Portions contributed by: loli10K Author: Don Brady --- cmd/zdb/zdb.c | 137 +++++-- cmd/zpool/zpool_main.c | 556 ++++++++++++++++++++--------- cmd/zpool/zpool_vdev.c | 173 ++++++++- cmd/ztest/ztest.c | 191 +++++++++- common/zfs/zfeature_common.c | 10 +- common/zfs/zfeature_common.h | 2 + common/zfs/zfs_prop.c | 3 + lib/libzfs/common/libzfs.h | 10 +- lib/libzfs/common/libzfs_dataset.c | 30 ++ lib/libzfs/common/libzfs_pool.c | 91 ++++- lib/libzpool/common/util.c | 23 +- man/man1m/zfs.1m | 14 + man/man1m/zpool.1m | 167 ++++++++- man/man5/zpool-features.5 | 22 ++ uts/common/fs/zfs/dmu.c | 2 + uts/common/fs/zfs/dmu_objset.c | 20 ++ uts/common/fs/zfs/metaslab.c | 143 ++++---- uts/common/fs/zfs/spa.c | 65 +++- uts/common/fs/zfs/spa_misc.c | 99 +++++ uts/common/fs/zfs/sys/dmu.h | 14 +- uts/common/fs/zfs/sys/dmu_objset.h | 7 +- uts/common/fs/zfs/sys/metaslab.h | 8 +- uts/common/fs/zfs/sys/spa.h | 7 + uts/common/fs/zfs/sys/spa_impl.h | 3 + uts/common/fs/zfs/sys/vdev.h | 3 + uts/common/fs/zfs/sys/vdev_impl.h | 10 + uts/common/fs/zfs/sys/zio.h | 2 + uts/common/fs/zfs/vdev.c | 223 ++++++++++-- uts/common/fs/zfs/vdev_label.c | 23 ++ uts/common/fs/zfs/vdev_removal.c | 39 +- uts/common/fs/zfs/zfs_ioctl.c | 9 + uts/common/fs/zfs/zio.c | 104 ++++-- uts/common/sys/fs/zfs.h | 12 + 33 files changed, 1819 insertions(+), 403 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 1a3dbb115538..acfe7ca5f7a8 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC. @@ -931,13 +931,23 @@ dump_metaslab(metaslab_t *msp) static void print_vdev_metaslab_header(vdev_t *vd) { - (void) printf("\tvdev %10llu\n\t%-10s%5llu %-19s %-15s %-10s\n", - (u_longlong_t)vd->vdev_id, + vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; + const char *bias_str; + + bias_str = (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) ? + VDEV_ALLOC_BIAS_LOG : + (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : + (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : + vd->vdev_islog ?
"log" : ""; + + (void) printf("\tvdev %10llu %s\n" + "\t%-10s%5llu %-19s %-15s %-12s\n", + (u_longlong_t)vd->vdev_id, bias_str, "metaslabs", (u_longlong_t)vd->vdev_ms_count, "offset", "spacemap", "free"); - (void) printf("\t%15s %19s %15s %10s\n", + (void) printf("\t%15s %19s %15s %12s\n", "---------------", "-------------------", - "---------------", "-------------"); + "---------------", "------------"); } static void @@ -953,7 +963,7 @@ dump_metaslab_groups(spa_t *spa) vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; - if (mg->mg_class != mc) + if (mg == NULL || mg->mg_class != mc) continue; metaslab_group_histogram_verify(mg); @@ -2767,6 +2777,7 @@ typedef struct zdb_blkstats { uint64_t zb_count; uint64_t zb_gangs; uint64_t zb_ditto_samevdev; + uint64_t zb_ditto_same_ms; uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; } zdb_blkstats_t; @@ -2806,6 +2817,16 @@ typedef struct zdb_cb { uint32_t **zcb_vd_obsolete_counts; } zdb_cb_t; +/* test if two DVA offsets from same vdev are within the same metaslab */ +static boolean_t +same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2) +{ + vdev_t *vd = vdev_lookup_top(spa, vdev); + uint64_t ms_shift = vd->vdev_ms_shift; + + return ((off1 >> ms_shift) == (off2 >> ms_shift)); +} + static void zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) @@ -2817,6 +2838,8 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, if (zilog && zil_bp_tree_add(zilog, bp) != 0) return; + spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); + for (int i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? type : ZDB_OT_TOTAL; @@ -2842,8 +2865,15 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, switch (BP_GET_NDVAS(bp)) { case 2: if (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[1])) + DVA_GET_VDEV(&bp->blk_dva[1])) { zb->zb_ditto_samevdev++; + + if (same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[1]))) + zb->zb_ditto_same_ms++; + } break; case 3: equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == @@ -2852,13 +2882,37 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, DVA_GET_VDEV(&bp->blk_dva[2])) + (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2])); - if (equal != 0) + if (equal != 0) { zb->zb_ditto_samevdev++; + + if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1]) && + same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[1]))) + zb->zb_ditto_same_ms++; + else if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[2]) && + same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[2]))) + zb->zb_ditto_same_ms++; + else if (DVA_GET_VDEV(&bp->blk_dva[1]) == + DVA_GET_VDEV(&bp->blk_dva[2]) && + same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[1]), + DVA_GET_OFFSET(&bp->blk_dva[1]), + DVA_GET_OFFSET(&bp->blk_dva[2]))) + zb->zb_ditto_same_ms++; + } break; } - } + spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG); + if (BP_IS_EMBEDDED(bp)) { zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] @@ -3665,6 +3719,7 @@ dump_block_stats(spa_t *spa) uint64_t norm_alloc, norm_space, total_alloc, total_found; int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD; boolean_t leaks = B_FALSE; 
+ int err; bzero(&zcb, sizeof (zcb)); (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", @@ -3707,8 +3762,10 @@ dump_block_stats(spa_t *spa) flags |= TRAVERSE_PREFETCH_DATA; zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); + zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); + zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); - zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); + err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); /* * If we've traversed the data blocks then we need to wait for those @@ -3724,6 +3781,12 @@ dump_block_stats(spa_t *spa) } } + /* + * Done after zio_wait() since zcb_haderrors is modified in + * zdb_blkptr_done() + */ + zcb.zcb_haderrors |= err; + if (zcb.zcb_haderrors) { (void) printf("\nError counts:\n\n"); (void) printf("\t%5s %s\n", "errno", "count"); @@ -3745,7 +3808,10 @@ dump_block_stats(spa_t *spa) norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); norm_space = metaslab_class_get_space(spa_normal_class(spa)); - total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)); + total_alloc = norm_alloc + + metaslab_class_get_alloc(spa_log_class(spa)) + + metaslab_class_get_alloc(spa_special_class(spa)) + + metaslab_class_get_alloc(spa_dedup_class(spa)); total_found = tzb->zb_asize - zcb.zcb_dedup_asize + zcb.zcb_removing_size + zcb.zcb_checkpoint_size; @@ -3767,31 +3833,50 @@ dump_block_stats(spa_t *spa) return (2); (void) printf("\n"); - (void) printf("\tbp count: %10llu\n", + (void) printf("\t%-16s %14llu\n", "bp count:", (u_longlong_t)tzb->zb_count); - (void) printf("\tganged count: %10llu\n", + (void) printf("\t%-16s %14llu\n", "ganged count:", (longlong_t)tzb->zb_gangs); - (void) printf("\tbp logical: %10llu avg: %6llu\n", + (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:", (u_longlong_t)tzb->zb_lsize, (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); - (void) printf("\tbp physical: %10llu avg:" - " %6llu compression: %6.2f\n", - (u_longlong_t)tzb->zb_psize, + (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", + "bp physical:", (u_longlong_t)tzb->zb_psize, (u_longlong_t)(tzb->zb_psize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_psize); - (void) printf("\tbp allocated: %10llu avg:" - " %6llu compression: %6.2f\n", - (u_longlong_t)tzb->zb_asize, + (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", + "bp allocated:", (u_longlong_t)tzb->zb_asize, (u_longlong_t)(tzb->zb_asize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_asize); - (void) printf("\tbp deduped: %10llu ref>1:" - " %6llu deduplication: %6.2f\n", - (u_longlong_t)zcb.zcb_dedup_asize, + (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n", + "bp deduped:", (u_longlong_t)zcb.zcb_dedup_asize, (u_longlong_t)zcb.zcb_dedup_blocks, (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0); - (void) printf("\tSPA allocated: %10llu used: %5.2f%%\n", + (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); + if (spa_special_class(spa)->mc_rotor != NULL) { + uint64_t alloc = metaslab_class_get_alloc( + spa_special_class(spa)); + uint64_t space = metaslab_class_get_space( + spa_special_class(spa)); + + (void) printf("\t%-16s %14llu used: %5.2f%%\n", + "Special class", (u_longlong_t)alloc, + 100.0 * alloc / space); + } + + if (spa_dedup_class(spa)->mc_rotor != NULL) { + uint64_t alloc = metaslab_class_get_alloc( + 
spa_dedup_class(spa)); + uint64_t space = metaslab_class_get_space( + spa_dedup_class(spa)); + + (void) printf("\t%-16s %14llu used: %5.2f%%\n", + "Dedup class", (u_longlong_t)alloc, + 100.0 * alloc / space); + } + for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { if (zcb.zcb_embedded_blocks[i] == 0) continue; @@ -3813,6 +3898,10 @@ dump_block_stats(spa_t *spa) (void) printf("\tDittoed blocks on same vdev: %llu\n", (longlong_t)tzb->zb_ditto_samevdev); } + if (tzb->zb_ditto_same_ms != 0) { + (void) printf("\tDittoed blocks in same metaslab: %llu\n", + (longlong_t)tzb->zb_ditto_same_ms); + } for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[v]; diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index f5dba9c32c50..39df98edff7f 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -27,6 +27,7 @@ * Copyright 2016 Igor Kozhukhov . * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, Intel Corporation. */ #include @@ -206,6 +207,8 @@ static zpool_command_t command_table[] = { #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) +#define VDEV_ALLOC_CLASS_LOGS "logs" + static zpool_command_t *current_command; static char history_str[HIS_MAX_RECORD_LEN]; static boolean_t log_history = B_TRUE; @@ -216,7 +219,7 @@ get_usage(zpool_help_t idx) { switch (idx) { case HELP_ADD: - return (gettext("\tadd [-fn] ...\n")); + return (gettext("\tadd [-fgLnP] ...\n")); case HELP_ATTACH: return (gettext("\tattach [-f] " "\n")); @@ -248,12 +251,12 @@ get_usage(zpool_help_t idx) "[-R root] [-F [-n]] [-t]\n" "\t [--rewind-to-checkpoint] [newpool]\n")); case HELP_IOSTAT: - return (gettext("\tiostat [-v] [-T d|u] [pool] ... [interval " - "[count]]\n")); + return (gettext("\tiostat [-gLPv] [-T d|u] [pool] ... " + "[interval [count]]\n")); case HELP_LABELCLEAR: return (gettext("\tlabelclear [-f] \n")); case HELP_LIST: - return (gettext("\tlist [-Hp] [-o property[,...]] " + return (gettext("\tlist [-gHLpPv] [-o property[,...]] " "[-T d|u] [pool] ... [interval [count]]\n")); case HELP_OFFLINE: return (gettext("\toffline [-t] ...\n")); @@ -271,8 +274,8 @@ get_usage(zpool_help_t idx) case HELP_SCRUB: return (gettext("\tscrub [-s | -p] ...\n")); case HELP_STATUS: - return (gettext("\tstatus [-vx] [-T d|u] [pool] ... [interval " - "[count]]\n")); + return (gettext("\tstatus [-DgLPvx] [-T d|u] [pool] ... 
" + "[interval [count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" "\tupgrade -v\n" @@ -283,7 +286,7 @@ get_usage(zpool_help_t idx) case HELP_SET: return (gettext("\tset \n")); case HELP_SPLIT: - return (gettext("\tsplit [-n] [-R altroot] [-o mntopts]\n" + return (gettext("\tsplit [-gLnP] [-R altroot] [-o mntopts]\n" "\t [-o property=value] " "[ ...]\n")); case HELP_REGUID: @@ -305,7 +308,7 @@ print_prop_cb(int prop, void *cb) { FILE *fp = cb; - (void) fprintf(fp, "\t%-15s ", zpool_prop_to_name(prop)); + (void) fprintf(fp, "\t%-19s ", zpool_prop_to_name(prop)); if (zpool_prop_readonly(prop)) (void) fprintf(fp, " NO "); @@ -357,14 +360,14 @@ usage(boolean_t requested) (void) fprintf(fp, gettext("\nthe following properties are supported:\n")); - (void) fprintf(fp, "\n\t%-15s %s %s\n\n", + (void) fprintf(fp, "\n\t%-19s %s %s\n\n", "PROPERTY", "EDIT", "VALUES"); /* Iterate over all properties */ (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_POOL); - (void) fprintf(fp, "\t%-15s ", "feature@..."); + (void) fprintf(fp, "\t%-19s ", "feature@..."); (void) fprintf(fp, "YES disabled | enabled | active\n"); (void) fprintf(fp, gettext("\nThe feature@ properties must be " @@ -382,32 +385,45 @@ usage(boolean_t requested) exit(requested ? 0 : 2); } -void +/* + * print a pool vdev config for dry runs + */ +static void print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, - boolean_t print_logs) + const char *match, int name_flags) { nvlist_t **child; uint_t c, children; char *vname; - - if (name != NULL) - (void) printf("\t%*s%s\n", indent, "", name); + boolean_t printed = B_FALSE; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) + &child, &children) != 0) { + if (name != NULL) + (void) printf("\t%*s%s\n", indent, "", name); return; + } for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; + char *class = ""; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); - if ((is_log && !print_logs) || (!is_log && print_logs)) + if (is_log) + class = VDEV_ALLOC_BIAS_LOG; + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &class); + if (strcmp(match, class) != 0) continue; - vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE); - print_vdev_tree(zhp, vname, child[c], indent + 2, - B_FALSE); + if (!printed && name != NULL) { + (void) printf("\t%*s%s\n", indent, "", name); + printed = B_TRUE; + } + vname = zpool_vdev_name(g_zfs, zhp, child[c], name_flags); + print_vdev_tree(zhp, vname, child[c], indent + 2, "", + name_flags); free(vname); } } @@ -515,11 +531,14 @@ add_prop_list_default(const char *propname, char *propval, nvlist_t **props, } /* - * zpool add [-fn] ... + * zpool add [-fgLnP] [-o property=value] ... * * -f Force addition of devices, even if they appear in use + * -g Display guid for individual vdev name. + * -L Follow links when resolving vdev path name. * -n Do not add the devices, but display the resulting layout if * they were to be added. + * -P Display full path for vdev name. * * Adds the given vdevs to 'pool'. 
As with create, the bulk of this work is * handled by get_vdev_spec(), which constructs the nvlist needed to pass to @@ -530,6 +549,7 @@ zpool_do_add(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; + int name_flags = 0; int c; nvlist_t *nvroot; char *poolname; @@ -540,14 +560,23 @@ zpool_do_add(int argc, char **argv) nvlist_t *config; /* check options */ - while ((c = getopt(argc, argv, "fn")) != -1) { + while ((c = getopt(argc, argv, "fgLnP")) != -1) { switch (c) { case 'f': force = B_TRUE; break; + case 'g': + name_flags |= VDEV_NAME_GUID; + break; + case 'L': + name_flags |= VDEV_NAME_FOLLOW_LINKS; + break; case 'n': dryrun = B_TRUE; break; + case 'P': + name_flags |= VDEV_NAME_PATH; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -607,16 +636,25 @@ zpool_do_add(int argc, char **argv) "configuration:\n"), zpool_get_name(zhp)); /* print original main pool and new tree */ - print_vdev_tree(zhp, poolname, poolnvroot, 0, B_FALSE); - print_vdev_tree(zhp, NULL, nvroot, 0, B_FALSE); + print_vdev_tree(zhp, poolname, poolnvroot, 0, "", + name_flags | VDEV_NAME_TYPE_ID); + print_vdev_tree(zhp, NULL, nvroot, 0, "", name_flags); - /* Do the same for the logs */ - if (num_logs(poolnvroot) > 0) { - print_vdev_tree(zhp, "logs", poolnvroot, 0, B_TRUE); - print_vdev_tree(zhp, NULL, nvroot, 0, B_TRUE); - } else if (num_logs(nvroot) > 0) { - print_vdev_tree(zhp, "logs", nvroot, 0, B_TRUE); - } + /* print other classes: 'dedup', 'special', and 'log' */ + print_vdev_tree(zhp, "dedup", poolnvroot, 0, + VDEV_ALLOC_BIAS_DEDUP, name_flags); + print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, + name_flags); + + print_vdev_tree(zhp, "special", poolnvroot, 0, + VDEV_ALLOC_BIAS_SPECIAL, name_flags); + print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, + name_flags); + + print_vdev_tree(zhp, "logs", poolnvroot, 0, VDEV_ALLOC_BIAS_LOG, + name_flags); + print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_LOG, + name_flags); ret = 0; } else { @@ -1203,9 +1241,13 @@ zpool_do_create(int argc, char **argv) (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), poolname); - print_vdev_tree(NULL, poolname, nvroot, 0, B_FALSE); - if (num_logs(nvroot) > 0) - print_vdev_tree(NULL, "logs", nvroot, 0, B_TRUE); + print_vdev_tree(NULL, poolname, nvroot, 0, "", 0); + print_vdev_tree(NULL, "dedup", nvroot, 0, + VDEV_ALLOC_BIAS_DEDUP, 0); + print_vdev_tree(NULL, "special", nvroot, 0, + VDEV_ALLOC_BIAS_SPECIAL, 0); + print_vdev_tree(NULL, "logs", nvroot, 0, + VDEV_ALLOC_BIAS_LOG, 0); ret = 0; } else { @@ -1412,13 +1454,15 @@ zpool_do_export(int argc, char **argv) * name column. 
*/ static int -max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max) +max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max, + int name_flags) { - char *name = zpool_vdev_name(g_zfs, zhp, nv, B_TRUE); + char *name; nvlist_t **child; uint_t c, children; int ret; + name = zpool_vdev_name(g_zfs, zhp, nv, name_flags | VDEV_NAME_TYPE_ID); if (strlen(name) + depth > max) max = strlen(name) + depth; @@ -1428,7 +1472,7 @@ max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max) &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, - max)) > max) + max, name_flags)) > max) max = ret; } @@ -1436,7 +1480,7 @@ max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max) &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, - max)) > max) + max, name_flags)) > max) max = ret; } @@ -1444,11 +1488,10 @@ max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max) &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, - max)) > max) + max, name_flags)) > max) max = ret; } - return (max); } @@ -1497,12 +1540,24 @@ find_spare(zpool_handle_t *zhp, void *data) return (0); } +typedef struct status_cbdata { + int cb_count; + int cb_name_flags; + int cb_namewidth; + boolean_t cb_allpools; + boolean_t cb_verbose; + boolean_t cb_explain; + boolean_t cb_first; + boolean_t cb_dedup_stats; + boolean_t cb_print_status; +} status_cbdata_t; + /* * Print out configuration state as requested by status_callback. */ -void -print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, - int namewidth, int depth, boolean_t isspare) +static void +print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, + nvlist_t *nv, int depth, boolean_t isspare) { nvlist_t **child; uint_t c, children; @@ -1511,7 +1566,7 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, char rbuf[6], wbuf[6], cbuf[6]; char *vname; uint64_t notpresent; - spare_cbdata_t cb; + spare_cbdata_t spare_cb; const char *state; char *type; @@ -1539,7 +1594,7 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, state = "AVAIL"; } - (void) printf("\t%*s%-*s %-8s", depth, "", namewidth - depth, + (void) printf("\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, name, state); if (!isspare) { @@ -1580,17 +1635,17 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, case VDEV_AUX_SPARED: verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, - &cb.cb_guid) == 0); - if (zpool_iter(g_zfs, find_spare, &cb) == 1) { - if (strcmp(zpool_get_name(cb.cb_zhp), + &spare_cb.cb_guid) == 0); + if (zpool_iter(g_zfs, find_spare, &spare_cb) == 1) { + if (strcmp(zpool_get_name(spare_cb.cb_zhp), zpool_get_name(zhp)) == 0) (void) printf(gettext("currently in " "use")); else (void) printf(gettext("in use by " "pool '%s'"), - zpool_get_name(cb.cb_zhp)); - zpool_close(cb.cb_zhp); + zpool_get_name(spare_cb.cb_zhp)); + zpool_close(spare_cb.cb_zhp); } else { (void) printf(gettext("currently in use")); } @@ -1689,20 +1744,25 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, &ishole); if (islog || ishole) continue; - vname = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE); - print_status_config(zhp, vname, child[c], - namewidth, depth + 2, isspare); + /* Only print normal classes here */ + if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) + continue; + + vname = 
zpool_vdev_name(g_zfs, zhp, child[c], + cb->cb_name_flags | VDEV_NAME_TYPE_ID); + print_status_config(zhp, cb, vname, child[c], depth + 2, + isspare); free(vname); } } - /* * Print the configuration of an exported pool. Iterate over all vdevs in the * pool, printing out the name and status for each one. */ -void -print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth) +static void +print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, + int depth) { nvlist_t **child; uint_t c, children; @@ -1717,7 +1777,7 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth) verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); - (void) printf("\t%*s%-*s", depth, "", namewidth - depth, name); + (void) printf("\t%*s%-*s", depth, "", cb->cb_namewidth - depth, name); (void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux)); if (vs->vs_aux != 0) { @@ -1774,9 +1834,12 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth) &is_log); if (is_log) continue; + if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) + continue; - vname = zpool_vdev_name(g_zfs, NULL, child[c], B_TRUE); - print_import_config(vname, child[c], namewidth, depth + 2); + vname = zpool_vdev_name(g_zfs, NULL, child[c], + cb->cb_name_flags | VDEV_NAME_TYPE_ID); + print_import_config(cb, vname, child[c], depth + 2); free(vname); } @@ -1784,7 +1847,8 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth) &child, &children) == 0) { (void) printf(gettext("\tcache\n")); for (c = 0; c < children; c++) { - vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE); + vname = zpool_vdev_name(g_zfs, NULL, child[c], + cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } @@ -1794,7 +1858,8 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth) &child, &children) == 0) { (void) printf(gettext("\tspares\n")); for (c = 0; c < children; c++) { - vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE); + vname = zpool_vdev_name(g_zfs, NULL, child[c], + cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } @@ -1802,39 +1867,62 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth) } /* - * Print log vdevs. - * Logs are recorded as top level vdevs in the main pool child array - * but with "is_log" set to 1. We use either print_status_config() or - * print_import_config() to print the top level logs then any log - * children (eg mirrored slogs) are printed recursively - which - * works because only the top level vdev is marked "is_log" + * Print specialized class vdevs. + * + * These are recorded as top level vdevs in the main pool child array + * but with "is_log" set to 1 or an "alloc_bias" string. We use either + * print_status_config() or print_import_config() to print the top level + * class vdevs then any of their children (eg mirrored slogs) are printed + * recursively - which works because only the top level vdev is marked. 
*/ static void -print_logs(zpool_handle_t *zhp, nvlist_t *nv, int namewidth, boolean_t verbose) +print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, + const char *class) { uint_t c, children; nvlist_t **child; + boolean_t printed = B_FALSE; + + assert(zhp != NULL || !cb->cb_verbose); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; - (void) printf(gettext("\tlogs\n")); - for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; - char *name; + char *bias = NULL; + char *type = NULL; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); - if (!is_log) + + if (is_log) { + bias = VDEV_ALLOC_CLASS_LOGS; + } else { + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_TYPE, &type); + } + + if (bias == NULL || strcmp(bias, class) != 0) continue; - name = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE); - if (verbose) - print_status_config(zhp, name, child[c], namewidth, - 2, B_FALSE); + if (!is_log && strcmp(type, VDEV_TYPE_INDIRECT) == 0) + continue; + + if (!printed) { + (void) printf("\t%s\t\n", gettext(class)); + printed = B_TRUE; + } + + char *name = zpool_vdev_name(g_zfs, zhp, child[c], + cb->cb_name_flags | VDEV_NAME_TYPE_ID); + if (cb->cb_print_status) + print_status_config(zhp, cb, name, child[c], 2, + B_FALSE); else - print_import_config(name, child[c], namewidth, 2); + print_import_config(cb, name, child[c], 2); free(name); } } @@ -1856,8 +1944,8 @@ show_import(nvlist_t *config) int reason; const char *health; uint_t vsc; - int namewidth; char *comment; + status_cbdata_t cb = { 0 }; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); @@ -2083,13 +2171,15 @@ show_import(nvlist_t *config) (void) printf(gettext(" config:\n\n")); - namewidth = max_width(NULL, nvroot, 0, 0); - if (namewidth < 10) - namewidth = 10; + cb.cb_namewidth = max_width(NULL, nvroot, 0, 0, 0); + if (cb.cb_namewidth < 10) + cb.cb_namewidth = 10; - print_import_config(name, nvroot, namewidth, 0); - if (num_logs(nvroot) > 0) - print_logs(NULL, nvroot, namewidth, B_FALSE); + print_import_config(&cb, name, nvroot, 0); + + print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_DEDUP); + print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_SPECIAL); + print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS); if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { (void) printf(gettext("\n\tAdditional devices are known to " @@ -2751,8 +2841,10 @@ zpool_do_sync(int argc, char **argv) typedef struct iostat_cbdata { boolean_t cb_verbose; + int cb_name_flags; int cb_namewidth; int cb_iteration; + boolean_t cb_scripted; zpool_list_t *cb_list; } iostat_cbdata_t; @@ -2788,12 +2880,20 @@ print_one_stat(uint64_t value) (void) printf(" %5s", buf); } +static const char *class_name[] = { + VDEV_ALLOC_BIAS_DEDUP, + VDEV_ALLOC_BIAS_SPECIAL, + VDEV_ALLOC_CLASS_LOGS +}; + /* * Print out all the statistics for the given vdev. This can either be the * toplevel configuration, or called recursively. If 'name' is NULL, then this * is a verbose output, and we don't want to display the toplevel pool stats. + * + * Returns the number of stat lines printed. 
*/ -void +static unsigned int print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, nvlist_t *newnv, iostat_cbdata_t *cb, int depth) { @@ -2801,12 +2901,13 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, uint_t c, children; vdev_stat_t *oldvs, *newvs; vdev_stat_t zerovs = { 0 }; + char *vname; + int ret = 0; uint64_t tdelta; double scale; - char *vname; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) - return; + return (ret); if (oldnv != NULL) { verify(nvlist_lookup_uint64_array(oldnv, @@ -2854,16 +2955,19 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, (void) printf("\n"); if (!cb->cb_verbose) - return; + return (ret); if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN, &newchild, &children) != 0) - return; + return (ret); if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, &oldchild, &c) != 0) - return; + return (ret); + /* + * print normal top-level devices + */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE, islog = B_FALSE; @@ -2876,33 +2980,45 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, if (ishole || islog) continue; - vname = zpool_vdev_name(g_zfs, zhp, newchild[c], B_FALSE); + if (nvlist_exists(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) + continue; + + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], + cb->cb_name_flags); print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } /* - * Log device section + * print all other top-level devices */ - - if (num_logs(newnv) > 0) { - (void) printf("%-*s - - - - - " - "-\n", cb->cb_namewidth, "logs"); - + for (uint_t n = 0; n < 3; n++) { for (c = 0; c < children; c++) { uint64_t islog = B_FALSE; + char *bias = NULL; + char *type = NULL; + (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); - if (islog) { - vname = zpool_vdev_name(g_zfs, zhp, newchild[c], - B_FALSE); - print_vdev_stats(zhp, vname, oldnv ? - oldchild[c] : NULL, newchild[c], - cb, depth + 2); - free(vname); + bias = VDEV_ALLOC_CLASS_LOGS; + } else { + (void) nvlist_lookup_string(newchild[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); + (void) nvlist_lookup_string(newchild[c], + ZPOOL_CONFIG_TYPE, &type); } + if (bias == NULL || strcmp(bias, class_name[n]) != 0) + continue; + if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) + continue; + + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], + cb->cb_name_flags); + ret += print_vdev_stats(zhp, vname, oldnv ? + oldchild[c] : NULL, newchild[c], cb, depth + 2); + free(vname); } } @@ -2912,23 +3028,25 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, */ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, &newchild, &children) != 0) - return; + return (ret); if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, &oldchild, &c) != 0) - return; + return (ret); if (children > 0) { (void) printf("%-*s - - - - - " "-\n", cb->cb_namewidth, "cache"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, newchild[c], - B_FALSE); + cb->cb_name_flags); print_vdev_stats(zhp, vname, oldnv ? 
oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } + + return (ret); } static int @@ -2997,7 +3115,7 @@ get_namewidth(zpool_handle_t *zhp, void *data) cb->cb_namewidth = strlen(zpool_get_name(zhp)); else cb->cb_namewidth = max_width(zhp, nvroot, 0, - cb->cb_namewidth); + cb->cb_namewidth, cb->cb_name_flags); } /* @@ -3095,8 +3213,11 @@ get_timestamp_arg(char c) } /* - * zpool iostat [-v] [-T d|u] [pool] ... [interval [count]] + * zpool iostat [-gLPv] [-T d|u] [pool] ... [interval [count]] * + * -g Display guid for individual vdev name. + * -L Follow links when resolving vdev path name. + * -P Display full path for vdev name. * -v Display statistics for individual vdevs * -T Display a timestamp in date(1) or Unix format * @@ -3115,11 +3236,23 @@ zpool_do_iostat(int argc, char **argv) unsigned long interval = 0, count = 0; zpool_list_t *list; boolean_t verbose = B_FALSE; - iostat_cbdata_t cb; + boolean_t guid = B_FALSE; + boolean_t follow_links = B_FALSE; + boolean_t full_name = B_FALSE; + iostat_cbdata_t cb = { 0 }; /* check options */ - while ((c = getopt(argc, argv, "T:v")) != -1) { + while ((c = getopt(argc, argv, "gLPT:v")) != -1) { switch (c) { + case 'g': + guid = B_TRUE; + break; + case 'L': + follow_links = B_TRUE; + break; + case 'P': + full_name = B_TRUE; + break; case 'T': get_timestamp_arg(*optarg); break; @@ -3161,6 +3294,12 @@ zpool_do_iostat(int argc, char **argv) */ cb.cb_list = list; cb.cb_verbose = verbose; + if (guid) + cb.cb_name_flags |= VDEV_NAME_GUID; + if (follow_links) + cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; + if (full_name) + cb.cb_name_flags |= VDEV_NAME_PATH; cb.cb_iteration = 0; cb.cb_namewidth = 0; @@ -3227,12 +3366,14 @@ zpool_do_iostat(int argc, char **argv) typedef struct list_cbdata { boolean_t cb_verbose; + int cb_name_flags; int cb_namewidth; boolean_t cb_scripted; zprop_list_t *cb_proplist; boolean_t cb_literal; } list_cbdata_t; + /* * Given a list of columns to display, output appropriate headers for each one. */ @@ -3288,7 +3429,7 @@ print_header(list_cbdata_t *cb) /* * Given a pool and a list of properties, print out all the properties according - * to the described layout. + * to the described layout. Used by zpool_do_list(). */ static void print_pool(zpool_handle_t *zhp, list_cbdata_t *cb) @@ -3380,7 +3521,9 @@ print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted, } break; case ZPOOL_PROP_CAPACITY: - (void) snprintf(propval, sizeof (propval), "%llu%%", value); + (void) snprintf(propval, sizeof (propval), + value < 1000 ? "%1.2f%%" : value < 10000 ? + "%2.1f%%" : "%3.0f%%", value / 100.0); break; default: zfs_nicenum(value, propval, sizeof (propval)); @@ -3395,6 +3538,9 @@ print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted, (void) printf(" %*s", width, propval); } +/* + * print static default line per vdev + */ void print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, list_cbdata_t *cb, int depth) @@ -3405,7 +3551,6 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, char *vname; boolean_t scripted = cb->cb_scripted; uint64_t islog = B_FALSE; - boolean_t haslog = B_FALSE; char *dashes = "%-*s - - - - - -\n"; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, @@ -3446,7 +3591,7 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, vs->vs_fragmentation, scripted, (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel)); cap = (vs->vs_space == 0) ? 
0 : - (vs->vs_alloc * 100 / vs->vs_space); + (vs->vs_alloc * 10000 / vs->vs_space); print_one_column(ZPOOL_PROP_CAPACITY, cap, scripted, toplevel); (void) printf("\n"); } @@ -3455,6 +3600,7 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, &child, &children) != 0) return; + /* list the normal vdevs first */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE; @@ -3463,24 +3609,48 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, continue; if (nvlist_lookup_uint64(child[c], - ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) { - haslog = B_TRUE; + ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) continue; - } - vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE); + if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) + continue; + + vname = zpool_vdev_name(g_zfs, zhp, child[c], + cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2); free(vname); } - if (haslog == B_TRUE) { - /* LINTED E_SEC_PRINTF_VAR_FMT */ - (void) printf(dashes, cb->cb_namewidth, "log"); + /* list the classes: 'logs', 'dedup', and 'special' */ + for (uint_t n = 0; n < 3; n++) { + boolean_t printed = B_FALSE; + for (c = 0; c < children; c++) { + char *bias = NULL; + char *type = NULL; + if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, - &islog) != 0 || !islog) + &islog) == 0 && islog) { + bias = VDEV_ALLOC_CLASS_LOGS; + } else { + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_TYPE, &type); + } + if (bias == NULL || strcmp(bias, class_name[n]) != 0) continue; - vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE); + if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) + continue; + + if (!printed) { + /* LINTED E_SEC_PRINTF_VAR_FMT */ + (void) printf(dashes, cb->cb_namewidth, + class_name[n]); + printed = B_TRUE; + } + vname = zpool_vdev_name(g_zfs, zhp, child[c], + cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2); free(vname); } @@ -3491,7 +3661,8 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "cache"); for (c = 0; c < children; c++) { - vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE); + vname = zpool_vdev_name(g_zfs, zhp, child[c], + cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2); free(vname); } @@ -3502,14 +3673,14 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "spare"); for (c = 0; c < children; c++) { - vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE); + vname = zpool_vdev_name(g_zfs, zhp, child[c], + cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2); free(vname); } } } - /* * Generic callback function to list a pool. 
*/ @@ -3522,26 +3693,37 @@ list_callback(zpool_handle_t *zhp, void *data) config = zpool_get_config(zhp, NULL); - print_pool(zhp, cbp); - if (!cbp->cb_verbose) - return (0); + if (cbp->cb_verbose) { + config = zpool_get_config(zhp, NULL); - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - print_list_stats(zhp, NULL, nvroot, cbp, 0); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + } + + if (cbp->cb_verbose) + cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, + cbp->cb_name_flags); + + print_pool(zhp, cbp); + + if (cbp->cb_verbose) + print_list_stats(zhp, NULL, nvroot, cbp, 0); return (0); } /* - * zpool list [-Hp] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] + * zpool list [-gHLP] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] * + * -g Display guid for individual vdev name. * -H Scripted mode. Don't display headers, and separate properties * by a single tab. + * -L Follow links when resolving vdev path name. * -o List of properties to display. Defaults to * "name,size,allocated,free,expandsize,fragmentation,capacity," * "dedupratio,health,altroot" * -p Diplay values in parsable (exact) format. + * -P Display full path for vdev name. * -T Display a timestamp in date(1) or Unix format * * List all pools in the system, whether or not they're healthy. Output space @@ -3562,14 +3744,23 @@ zpool_do_list(int argc, char **argv) boolean_t first = B_TRUE; /* check options */ - while ((c = getopt(argc, argv, ":Ho:pT:v")) != -1) { + while ((c = getopt(argc, argv, ":gHLo:pPT:v")) != -1) { switch (c) { + case 'g': + cb.cb_name_flags |= VDEV_NAME_GUID; + break; case 'H': cb.cb_scripted = B_TRUE; break; + case 'L': + cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; + break; case 'o': props = optarg; break; + case 'P': + cb.cb_name_flags |= VDEV_NAME_PATH; + break; case 'p': cb.cb_literal = B_TRUE; break; @@ -3578,6 +3769,7 @@ zpool_do_list(int argc, char **argv) break; case 'v': cb.cb_verbose = B_TRUE; + cb.cb_namewidth = 8; /* 8 until precalc is avail */ break; case ':': (void) fprintf(stderr, gettext("missing argument for " @@ -3823,13 +4015,16 @@ zpool_do_detach(int argc, char **argv) } /* - * zpool split [-n] [-o prop=val] ... + * zpool split [-gLnP] [-o prop=val] ... * [-o mntopt] ... * [-R altroot] [ ...] * + * -g Display guid for individual vdev name. + * -L Follow links when resolving vdev path name. * -n Do not split the pool, but display the resulting layout if * it were to be split. * -o Set property=value, or set mount options. + * -P Display full path for vdev name. * -R Mount the split-off pool under an alternate root. * * Splits the named pool and gives it the new pool name. 
Devices to be split @@ -3853,10 +4048,17 @@ zpool_do_split(int argc, char **argv) flags.dryrun = B_FALSE; flags.import = B_FALSE; + flags.name_flags = 0; /* check options */ - while ((c = getopt(argc, argv, ":R:no:")) != -1) { + while ((c = getopt(argc, argv, ":gLR:no:P")) != -1) { switch (c) { + case 'g': + flags.name_flags |= VDEV_NAME_GUID; + break; + case 'L': + flags.name_flags |= VDEV_NAME_FOLLOW_LINKS; + break; case 'R': flags.import = B_TRUE; if (add_prop_list( @@ -3882,6 +4084,9 @@ zpool_do_split(int argc, char **argv) mntopts = optarg; } break; + case 'P': + flags.name_flags |= VDEV_NAME_PATH; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); @@ -3929,7 +4134,8 @@ zpool_do_split(int argc, char **argv) if (flags.dryrun) { (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), newpool); - print_vdev_tree(NULL, newpool, config, 0, B_FALSE); + print_vdev_tree(NULL, newpool, config, 0, "", + flags.name_flags); } nvlist_free(config); } @@ -4494,15 +4700,6 @@ zpool_do_initialize(int argc, char **argv) return (err); } -typedef struct status_cbdata { - int cb_count; - boolean_t cb_allpools; - boolean_t cb_verbose; - boolean_t cb_explain; - boolean_t cb_first; - boolean_t cb_dedup_stats; -} status_cbdata_t; - /* * Print out detailed scrub status. */ @@ -4840,8 +5037,8 @@ print_error_log(zpool_handle_t *zhp) } static void -print_spares(zpool_handle_t *zhp, nvlist_t **spares, uint_t nspares, - int namewidth) +print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares, + uint_t nspares) { uint_t i; char *name; @@ -4852,16 +5049,16 @@ print_spares(zpool_handle_t *zhp, nvlist_t **spares, uint_t nspares, (void) printf(gettext("\tspares\n")); for (i = 0; i < nspares; i++) { - name = zpool_vdev_name(g_zfs, zhp, spares[i], B_FALSE); - print_status_config(zhp, name, spares[i], - namewidth, 2, B_TRUE); + name = zpool_vdev_name(g_zfs, zhp, spares[i], + cb->cb_name_flags); + print_status_config(zhp, cb, name, spares[i], 2, B_TRUE); free(name); } } static void -print_l2cache(zpool_handle_t *zhp, nvlist_t **l2cache, uint_t nl2cache, - int namewidth) +print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache, + uint_t nl2cache) { uint_t i; char *name; @@ -4872,9 +5069,9 @@ print_l2cache(zpool_handle_t *zhp, nvlist_t **l2cache, uint_t nl2cache, (void) printf(gettext("\tcache\n")); for (i = 0; i < nl2cache; i++) { - name = zpool_vdev_name(g_zfs, zhp, l2cache[i], B_FALSE); - print_status_config(zhp, name, l2cache[i], - namewidth, 2, B_FALSE); + name = zpool_vdev_name(g_zfs, zhp, l2cache[i], + cb->cb_name_flags); + print_status_config(zhp, cb, name, l2cache[i], 2, B_FALSE); free(name); } } @@ -5180,7 +5377,6 @@ status_callback(zpool_handle_t *zhp, void *data) msgid); if (config != NULL) { - int namewidth; uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; @@ -5200,25 +5396,30 @@ status_callback(zpool_handle_t *zhp, void *data) print_removal_status(zhp, prs); print_checkpoint_status(pcs); - namewidth = max_width(zhp, nvroot, 0, 0); - if (namewidth < 10) - namewidth = 10; + cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, + cbp->cb_name_flags); + if (cbp->cb_namewidth < 10) + cbp->cb_namewidth = 10; (void) printf(gettext("config:\n\n")); - (void) printf(gettext("\t%-*s %-8s %5s %5s %5s\n"), namewidth, - "NAME", "STATE", "READ", "WRITE", "CKSUM"); - print_status_config(zhp, zpool_get_name(zhp), nvroot, - namewidth, 0, B_FALSE); + (void) printf(gettext("\t%-*s %-8s %5s %5s %5s\n"), + 
cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE", + "CKSUM"); + + print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0, + B_FALSE); + + print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP); + print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL); + print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_CLASS_LOGS); - if (num_logs(nvroot) > 0) - print_logs(zhp, nvroot, namewidth, B_TRUE); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) - print_l2cache(zhp, l2cache, nl2cache, namewidth); + print_l2cache(zhp, cbp, l2cache, nl2cache); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) - print_spares(zhp, spares, nspares, namewidth); + print_spares(zhp, cbp, spares, nspares); if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, &nerr) == 0) { @@ -5266,8 +5467,11 @@ status_callback(zpool_handle_t *zhp, void *data) } /* - * zpool status [-vx] [-T d|u] [pool] ... [interval [count]] + * zpool status [-gLPvx] [-T d|u] [pool] ... [interval [count]] * + * -g Display guid for individual vdev name. + * -L Follow links when resolving vdev path name. + * -P Display full path for vdev name. * -v Display complete error logs * -x Display only pools with potential problems * -D Display dedup status (undocumented) @@ -5284,8 +5488,17 @@ zpool_do_status(int argc, char **argv) status_cbdata_t cb = { 0 }; /* check options */ - while ((c = getopt(argc, argv, "vxDT:")) != -1) { + while ((c = getopt(argc, argv, "gLPvxDT:")) != -1) { switch (c) { + case 'g': + cb.cb_name_flags |= VDEV_NAME_GUID; + break; + case 'L': + cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; + break; + case 'P': + cb.cb_name_flags |= VDEV_NAME_PATH; + break; case 'v': cb.cb_verbose = B_TRUE; break; @@ -5314,6 +5527,7 @@ zpool_do_status(int argc, char **argv) cb.cb_allpools = B_TRUE; cb.cb_first = B_TRUE; + cb.cb_print_status = B_TRUE; for (;;) { if (timestamp_fmt != NODATE) diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 28d81b949301..1690355bd120 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -21,7 +21,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + * Copyright (c) 2016, 2017 Intel Corporation. * Copyright 2016 Igor Kozhukhov . */ @@ -471,6 +472,9 @@ make_leaf_vdev(const char *arg, uint64_t is_log) verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); + if (is_log) + verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_LOG) == 0); if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, (uint64_t)wholedisk) == 0); @@ -525,6 +529,9 @@ make_leaf_vdev(const char *arg, uint64_t is_log) * * Otherwise, make sure that the current spec (if there is one) and the new * spec have consistent replication levels. + * + * If there is no current spec (create), make sure new spec has at least + * one general purpose vdev. 
*/ typedef struct replication_level { char *zprl_type; @@ -534,6 +541,19 @@ typedef struct replication_level { #define ZPOOL_FUZZ (16 * 1024 * 1024) +static boolean_t +is_raidz_mirror(replication_level_t *a, replication_level_t *b, + replication_level_t **raidz, replication_level_t **mirror) +{ + if (strcmp(a->zprl_type, "raidz") == 0 && + strcmp(b->zprl_type, "mirror") == 0) { + *raidz = a; + *mirror = b; + return (B_TRUE); + } + return (B_FALSE); +} + /* * Given a list of toplevel vdevs, return the current replication level. If * the config is inconsistent, then NULL is returned. If 'fatal' is set, then @@ -551,6 +571,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) replication_level_t lastrep = {0}; replication_level_t rep; replication_level_t *ret; + replication_level_t *raidz, *mirror; boolean_t dontreport; ret = safe_malloc(sizeof (replication_level_t)); @@ -731,11 +752,39 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) /* * At this point, we have the replication of the last toplevel - * vdev in 'rep'. Compare it to 'lastrep' to see if its + * vdev in 'rep'. Compare it to 'lastrep' to see if it is * different. */ if (lastrep.zprl_type != NULL) { - if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { + if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) || + is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) { + /* + * Accepted raidz and mirror when they can + * handle the same number of disk failures. + */ + if (raidz->zprl_parity != + mirror->zprl_children - 1) { + if (ret != NULL) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication " + "level: " + "%s and %s vdevs with " + "different redundancy, " + "%llu vs. %llu (%llu-way) " + "are present\n"), + raidz->zprl_type, + mirror->zprl_type, + raidz->zprl_parity, + mirror->zprl_children - 1, + mirror->zprl_children); + else + return (NULL); + } + } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != + 0) { if (ret != NULL) free(ret); ret = NULL; @@ -798,6 +847,7 @@ check_replication(nvlist_t *config, nvlist_t *newroot) nvlist_t **child; uint_t children; replication_level_t *current = NULL, *new; + replication_level_t *raidz, *mirror; int ret; /* @@ -845,7 +895,21 @@ check_replication(nvlist_t *config, nvlist_t *newroot) */ ret = 0; if (current != NULL) { - if (strcmp(current->zprl_type, new->zprl_type) != 0) { + if (is_raidz_mirror(current, new, &raidz, &mirror) || + is_raidz_mirror(new, current, &raidz, &mirror)) { + if (raidz->zprl_parity != mirror->zprl_children - 1) { + vdev_error(gettext( + "mismatched replication level: pool and " + "new vdev with different redundancy, %s " + "and %s vdevs, %llu vs. 
%llu (%llu-way)\n"), + raidz->zprl_type, + mirror->zprl_type, + raidz->zprl_parity, + mirror->zprl_children - 1, + mirror->zprl_children); + ret = -1; + } + } else if (strcmp(current->zprl_type, new->zprl_type) != 0) { vdev_error(gettext( "mismatched replication level: pool uses %s " "and new vdev is %s\n"), @@ -1177,6 +1241,13 @@ is_grouping(const char *type, int *mindev, int *maxdev) return (VDEV_TYPE_LOG); } + if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 || + strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { + if (mindev != NULL) + *mindev = 1; + return (type); + } + if (strcmp(type, "cache") == 0) { if (mindev != NULL) *mindev = 1; @@ -1198,7 +1269,7 @@ construct_spec(int argc, char **argv) nvlist_t *nvroot, *nv, **top, **spares, **l2cache; int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; const char *type; - uint64_t is_log; + uint64_t is_log, is_special, is_dedup; boolean_t seen_logs; top = NULL; @@ -1208,7 +1279,7 @@ construct_spec(int argc, char **argv) nspares = 0; nlogs = 0; nl2cache = 0; - is_log = B_FALSE; + is_log = is_special = is_dedup = B_FALSE; seen_logs = B_FALSE; while (argc > 0) { @@ -1230,7 +1301,7 @@ construct_spec(int argc, char **argv) "specified only once\n")); return (NULL); } - is_log = B_FALSE; + is_log = is_special = is_dedup = B_FALSE; } if (strcmp(type, VDEV_TYPE_LOG) == 0) { @@ -1243,6 +1314,8 @@ construct_spec(int argc, char **argv) } seen_logs = B_TRUE; is_log = B_TRUE; + is_special = B_FALSE; + is_dedup = B_FALSE; argc--; argv++; /* @@ -1252,6 +1325,24 @@ construct_spec(int argc, char **argv) continue; } + if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { + is_special = B_TRUE; + is_log = B_FALSE; + is_dedup = B_FALSE; + argc--; + argv++; + continue; + } + + if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { + is_dedup = B_TRUE; + is_log = B_FALSE; + is_special = B_FALSE; + argc--; + argv++; + continue; + } + if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { if (l2cache != NULL) { (void) fprintf(stderr, @@ -1260,15 +1351,16 @@ construct_spec(int argc, char **argv) "specified only once\n")); return (NULL); } - is_log = B_FALSE; + is_log = is_special = is_dedup = B_FALSE; } - if (is_log) { + if (is_log || is_special || is_dedup) { if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { (void) fprintf(stderr, gettext("invalid vdev " - "specification: unsupported 'log' " - "device: %s\n"), type); + "specification: unsupported '%s' " + "device: %s\n"), is_log ? 
"log" : + "special", type); return (NULL); } nlogs++; @@ -1314,12 +1406,27 @@ construct_spec(int argc, char **argv) nl2cache = children; continue; } else { + /* create a top-level vdev with children */ verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, type) == 0); verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, is_log) == 0); + if (is_log) + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_LOG) == 0); + if (is_special) { + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_SPECIAL) == 0); + } + if (is_dedup) { + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_DEDUP) == 0); + } if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, @@ -1342,6 +1449,16 @@ construct_spec(int argc, char **argv) return (NULL); if (is_log) nlogs++; + if (is_special) { + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_SPECIAL) == 0); + } + if (is_dedup) { + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_DEDUP) == 0); + } argc--; argv++; } @@ -1449,6 +1566,30 @@ split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, return (newroot); } +static int +num_normal_vdevs(nvlist_t *nvroot) +{ + nvlist_t **top; + uint_t t, toplevels, normal = 0; + + verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &top, &toplevels) == 0); + + for (t = 0; t < toplevels; t++) { + uint64_t log = B_FALSE; + + (void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log); + if (log) + continue; + if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS)) + continue; + + normal++; + } + + return (normal); +} + /* * Get and validate the contents of the given vdev specification. This ensures * that the nvlist returned is well-formed, that all the devices exist, and that @@ -1500,6 +1641,16 @@ make_root_vdev(zpool_handle_t *zhp, int force, int check_rep, return (NULL); } + /* + * On pool create the new vdev spec must have one normal vdev. + */ + if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) { + vdev_error(gettext("at least one general top-level vdev must " + "be specified\n")); + nvlist_free(newroot); + return (NULL); + } + /* * Run through the vdev specification and label any whole disks found. */ diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index a0d952191054..7369fc8a0ef2 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -20,11 +20,12 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017, Intel Corporation. * Copyright 2017 RackTop Systems. 
*/ @@ -144,6 +145,12 @@ typedef struct ztest_shared_hdr { static ztest_shared_hdr_t *ztest_shared_hdr; +enum ztest_class_state { + ZTEST_VDEV_CLASS_OFF, + ZTEST_VDEV_CLASS_ON, + ZTEST_VDEV_CLASS_RND +}; + typedef struct ztest_shared_opts { char zo_pool[ZFS_MAX_DATASET_NAME_LEN]; char zo_dir[ZFS_MAX_DATASET_NAME_LEN]; @@ -166,6 +173,7 @@ typedef struct ztest_shared_opts { uint64_t zo_maxloops; uint64_t zo_metaslab_force_ganging; int zo_mmp_test; + int zo_special_vdevs; } ztest_shared_opts_t; static const ztest_shared_opts_t ztest_opts_defaults = { @@ -188,7 +196,8 @@ static const ztest_shared_opts_t ztest_opts_defaults = { .zo_init = 1, .zo_time = 300, /* 5 minutes */ .zo_maxloops = 50, /* max loops during spa_freeze() */ - .zo_metaslab_force_ganging = 32 << 10 + .zo_metaslab_force_ganging = 32 << 10, + .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, }; extern uint64_t metaslab_force_ganging; @@ -350,6 +359,7 @@ ztest_func_t ztest_dsl_dataset_promote_busy; ztest_func_t ztest_vdev_attach_detach; ztest_func_t ztest_vdev_LUN_growth; ztest_func_t ztest_vdev_add_remove; +ztest_func_t ztest_vdev_class_add; ztest_func_t ztest_vdev_aux_add_remove; ztest_func_t ztest_split_pool; ztest_func_t ztest_reguid; @@ -399,6 +409,8 @@ ztest_info_t ztest_info[] = { { ztest_vdev_LUN_growth, 1, &zopt_rarely }, { ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime }, + { ztest_vdev_class_add, 1, + &ztest_opts.zo_vdevtime }, { ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime }, { ztest_device_removal, 1, &zopt_sometimes }, @@ -611,6 +623,7 @@ usage(boolean_t requested) "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n" "\t[-P passtime (default: %llu sec)] time per pass\n" "\t[-B alt_ztest (default: )] alternate ztest path\n" + "\t[-C vdev class state (default: random)] special=on|off|random\n" "\t[-o variable=value] ... set global variable to an unsigned\n" "\t 32-bit integer value\n" "\t[-h] (print help)\n" @@ -635,6 +648,46 @@ usage(boolean_t requested) exit(requested ? 
0 : 1); } + +static void +ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) +{ + char name[32]; + char *value; + int state = ZTEST_VDEV_CLASS_RND; + + (void) strlcpy(name, input, sizeof (name)); + + value = strchr(name, '='); + if (value == NULL) { + (void) fprintf(stderr, "missing value in property=value " + "'-C' argument (%s)\n", input); + usage(B_FALSE); + } + *(value) = '\0'; + value++; + + if (strcmp(value, "on") == 0) { + state = ZTEST_VDEV_CLASS_ON; + } else if (strcmp(value, "off") == 0) { + state = ZTEST_VDEV_CLASS_OFF; + } else if (strcmp(value, "random") == 0) { + state = ZTEST_VDEV_CLASS_RND; + } else { + (void) fprintf(stderr, "invalid property value '%s'\n", value); + usage(B_FALSE); + } + + if (strcmp(name, "special") == 0) { + zo->zo_special_vdevs = state; + } else { + (void) fprintf(stderr, "invalid property name '%s'\n", name); + usage(B_FALSE); + } + if (zo->zo_verbose >= 3) + (void) printf("%s vdev state is '%s'\n", name, value); +} + static void process_options(int argc, char **argv) { @@ -648,7 +701,7 @@ process_options(int argc, char **argv) bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); while ((opt = getopt(argc, argv, - "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:o:")) != EOF) { + "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:")) != EOF) { value = 0; switch (opt) { case 'v': @@ -738,6 +791,9 @@ process_options(int argc, char **argv) case 'B': (void) strlcpy(altdir, optarg, sizeof (altdir)); break; + case 'C': + ztest_parse_name_value(optarg, zo); + break; case 'o': if (set_global_var(optarg) != 0) usage(B_FALSE); @@ -960,13 +1016,16 @@ make_vdev_mirror(char *path, char *aux, char *pool, size_t size, static nvlist_t * make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, - int log, int r, int m, int t) + const char *class, int r, int m, int t) { nvlist_t *root, **child; int c; + boolean_t log; ASSERT(t > 0); + log = (class != NULL && strcmp(class, "log") == 0); + child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < t; c++) { @@ -974,6 +1033,12 @@ make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, r, m); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log) == 0); + + if (class != NULL && class[0] != '\0') { + ASSERT(m > 1 || log); /* expecting a mirror */ + VERIFY(nvlist_add_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, class) == 0); + } } VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0); @@ -1014,6 +1079,9 @@ static int ztest_random_blocksize(void) { uint64_t block_shift; + + ASSERT(ztest_spa->spa_max_ashift != 0); + /* * Choose a block size >= the ashift. * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. @@ -2493,7 +2561,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) /* * Attempt to create using a bad file. */ - nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1); + nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); VERIFY3U(ENOENT, ==, spa_create("ztest_bad_file", nvroot, NULL, NULL)); nvlist_free(nvroot); @@ -2501,7 +2569,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) /* * Attempt to create using a bad mirror. */ - nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 2, 1); + nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); VERIFY3U(ENOENT, ==, spa_create("ztest_bad_mirror", nvroot, NULL, NULL)); nvlist_free(nvroot); @@ -2511,7 +2579,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) * what's in the nvroot; we should fail with EEXIST. 
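The -C handling added to ztest above reduces to splitting a property=value pair and mapping the value onto the ztest_class_state enum. A stand-alone sketch of the same parse; the helper name, buffer size, and main() harness are illustrative and not part of the patch:

#include <stdio.h>
#include <string.h>

enum class_state { CLASS_OFF, CLASS_ON, CLASS_RND };

/*
 * Parse "special=on|off|random" the way ztest_parse_name_value() does:
 * split at '=', then validate both halves. Returns 0 on success.
 */
static int
parse_class_option(const char *input, enum class_state *state)
{
	char buf[32];
	char *value;

	(void) snprintf(buf, sizeof (buf), "%s", input);
	value = strchr(buf, '=');
	if (value == NULL)
		return (-1);		/* missing '=' */
	*value++ = '\0';

	if (strcmp(buf, "special") != 0)
		return (-1);		/* unknown property name */

	if (strcmp(value, "on") == 0)
		*state = CLASS_ON;
	else if (strcmp(value, "off") == 0)
		*state = CLASS_OFF;
	else if (strcmp(value, "random") == 0)
		*state = CLASS_RND;
	else
		return (-1);		/* unknown property value */
	return (0);
}

int
main(void)
{
	enum class_state s;

	if (parse_class_option("special=on", &s) == 0)
		(void) printf("state = %d\n", s);	/* prints 1 */
	return (0);
}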
*/ rw_enter(&ztest_name_lock, RW_READER); - nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1); + nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL)); nvlist_free(nvroot); VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG)); @@ -2593,7 +2661,7 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) (void) spa_destroy(name); nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, - 0, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1); + NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1); /* * If we're configuring a RAIDZ device then make sure that the @@ -2766,10 +2834,16 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) * If we have slogs then remove them 1/4 of the time. */ if (spa_has_slogs(spa) && ztest_random(4) == 0) { + metaslab_group_t *mg; + /* - * Grab the guid from the head of the log class rotor. + * find the first real slog in log allocation class */ - guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid; + mg = spa_log_class(spa)->mc_rotor; + while (!mg->mg_vd->vdev_islog) + mg = mg->mg_next; + + guid = mg->mg_vd->vdev_guid; spa_config_exit(spa, SCL_VDEV, FTAG); @@ -2798,12 +2872,11 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) spa_config_exit(spa, SCL_VDEV, FTAG); /* - * Make 1/4 of the devices be log devices. + * Make 1/4 of the devices be log devices */ nvroot = make_vdev_root(NULL, NULL, NULL, - ztest_opts.zo_vdev_size, 0, - ztest_random(4) == 0, ztest_opts.zo_raidz, - zs->zs_mirrors, 1); + ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? + "log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); nvlist_free(nvroot); @@ -2822,6 +2895,83 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) mutex_exit(&ztest_vdev_lock); } +/* ARGSUSED */ +void +ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = ztest_spa; + uint64_t leaves; + nvlist_t *nvroot; + const char *class = (ztest_random(2) == 0) ? 
+ VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; + int error; + + /* + * By default add a special vdev 50% of the time + */ + if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || + (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && + ztest_random(2) == 0)) { + return; + } + + mutex_enter(&ztest_vdev_lock); + + /* Only test with mirrors */ + if (zs->zs_mirrors < 2) { + mutex_exit(&ztest_vdev_lock); + return; + } + + /* requires feature@allocation_classes */ + if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { + mutex_exit(&ztest_vdev_lock); + return; + } + + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; + spa_config_exit(spa, SCL_VDEV, FTAG); + + nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, + class, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + + error = spa_vdev_add(spa, nvroot); + nvlist_free(nvroot); + + if (error == ENOSPC) + ztest_record_enospc("spa_vdev_add"); + else if (error != 0) + fatal(0, "spa_vdev_add() = %d", error); + + /* + * 50% of the time allow small blocks in the special class + */ + if (error == 0 && + spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { + if (ztest_opts.zo_verbose >= 3) + (void) printf("Enabling special VDEV small blocks\n"); + (void) ztest_dsl_prop_set_uint64(zd->zd_name, + ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); + } + + mutex_exit(&ztest_vdev_lock); + + if (ztest_opts.zo_verbose >= 3) { + metaslab_class_t *mc; + + if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) + mc = spa_special_class(spa); + else + mc = spa_dedup_class(spa); + (void) printf("Added a %s mirrored vdev (of %d)\n", + class, (int)mc->mc_groups); + } +} + /* * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. */ @@ -2886,7 +3036,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) * Add a new device. */ nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, - (ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1); + (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); error = spa_vdev_add(spa, nvroot); switch (error) { @@ -3083,11 +3233,15 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) * Locate this vdev. */ oldvd = rvd->vdev_child[top]; + + /* pick a child from the mirror */ if (zs->zs_mirrors >= 1) { ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); ASSERT(oldvd->vdev_children >= zs->zs_mirrors); oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz]; } + + /* pick a child out of the raidz group */ if (ztest_opts.zo_raidz > 1) { ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz); @@ -3190,7 +3344,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) * Build the nvlist describing newpath. */ root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? 
newsize : 0, - ashift, 0, 0, 0, 1); + ashift, NULL, 0, 0, 1); error = spa_vdev_attach(spa, oldguid, root, replacing); @@ -3451,7 +3605,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) return; } ASSERT(psize > 0); - newsize = psize + psize / 8; + newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); ASSERT3U(newsize, >, psize); if (ztest_opts.zo_verbose >= 6) { @@ -6468,6 +6622,7 @@ make_random_props() nvlist_t *props; VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); + if (ztest_random(2) == 0) return (props); VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0); @@ -6549,7 +6704,7 @@ ztest_init(ztest_shared_t *zs) zs->zs_splits = 0; zs->zs_mirrors = ztest_opts.zo_mirrors; nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, - 0, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); props = make_random_props(); for (int i = 0; i < SPA_FEATURES; i++) { char buf[1024]; diff --git a/common/zfs/zfeature_common.c b/common/zfs/zfeature_common.c index ae7a06802796..b4e80758eee6 100644 --- a/common/zfs/zfeature_common.c +++ b/common/zfs/zfeature_common.c @@ -20,11 +20,12 @@ */ /* - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] + * Copyright (c) 2017, Intel Corporation. */ #ifdef _KERNEL @@ -298,4 +299,11 @@ zpool_feature_init(void) "Reduce memory used by removed devices when their blocks are " "freed or remapped.", ZFEATURE_FLAG_READONLY_COMPAT, obsolete_counts_deps); + + { + zfeature_register(SPA_FEATURE_ALLOCATION_CLASSES, + "org.zfsonlinux:allocation_classes", "allocation_classes", + "Support for separate allocation classes.", + ZFEATURE_FLAG_READONLY_COMPAT, NULL); + } } diff --git a/common/zfs/zfeature_common.h b/common/zfs/zfeature_common.h index 3376b9921bac..af29560ae902 100644 --- a/common/zfs/zfeature_common.h +++ b/common/zfs/zfeature_common.h @@ -24,6 +24,7 @@ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] + * Copyright (c) 2017, Intel Corporation. 
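The zfeature_register() call above is what exposes allocation-class support as a pool feature: read-only compatible, no dependencies, and the GUID org.zfsonlinux:allocation_classes kept from the ZoL port. A minimal model of what such a registration records; the types and names below are invented for illustration only:

#include <stdio.h>
#include <stddef.h>

#define	FLAG_READONLY_COMPAT	0x1	/* pool stays readable by old code */

typedef int feature_id_t;
#define	FEATURE_ALLOCATION_CLASSES	0

typedef struct feature {
	const char	*f_guid;
	const char	*f_name;
	const char	*f_desc;
	int		f_flags;
	const feature_id_t *f_depends;	/* dependency list, or NULL */
} feature_t;

static feature_t feature_table[1];

static void
feature_register(feature_id_t id, const char *guid, const char *name,
    const char *desc, int flags, const feature_id_t *deps)
{
	feature_table[id] = (feature_t){ guid, name, desc, flags, deps };
}

int
main(void)
{
	/* Mirrors the shape of the zfeature_register() call in the hunk. */
	feature_register(FEATURE_ALLOCATION_CLASSES,
	    "org.zfsonlinux:allocation_classes", "allocation_classes",
	    "Support for separate allocation classes.",
	    FLAG_READONLY_COMPAT, NULL);

	(void) printf("%s -> %s\n",
	    feature_table[FEATURE_ALLOCATION_CLASSES].f_name,
	    feature_table[FEATURE_ALLOCATION_CLASSES].f_guid);
	return (0);
}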
*/ #ifndef _ZFEATURE_COMMON_H @@ -61,6 +62,7 @@ typedef enum spa_feature { SPA_FEATURE_OBSOLETE_COUNTS, SPA_FEATURE_POOL_CHECKPOINT, SPA_FEATURE_SPACEMAP_V2, + SPA_FEATURE_ALLOCATION_CLASSES, SPA_FEATURES } spa_feature_t; diff --git a/common/zfs/zfs_prop.c b/common/zfs/zfs_prop.c index ee792afc0afd..2e1a5421f535 100644 --- a/common/zfs/zfs_prop.c +++ b/common/zfs/zfs_prop.c @@ -435,6 +435,9 @@ zfs_prop_init(void) zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize", SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE"); + zprop_register_number(ZFS_PROP_SPECIAL_SMALL_BLOCKS, + "special_small_blocks", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, + "zero or 512 to 128K, power of 2", "SPECIAL_SMALL_BLOCKS"); /* hidden properties */ zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER, diff --git a/lib/libzfs/common/libzfs.h b/lib/libzfs/common/libzfs.h index ef26165d3066..fa005259c228 100644 --- a/lib/libzfs/common/libzfs.h +++ b/lib/libzfs/common/libzfs.h @@ -258,6 +258,7 @@ typedef struct splitflags { /* after splitting, import the pool */ int import : 1; + int name_flags; } splitflags_t; /* @@ -425,8 +426,15 @@ struct zfs_cmd; extern const char *zfs_history_event_names[]; +typedef enum { + VDEV_NAME_PATH = 1 << 0, + VDEV_NAME_GUID = 1 << 1, + VDEV_NAME_FOLLOW_LINKS = 1 << 2, + VDEV_NAME_TYPE_ID = 1 << 3, +} vdev_name_t; + extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, - boolean_t verbose); + int name_flags); extern int zpool_upgrade(zpool_handle_t *, uint64_t); extern int zpool_get_history(zpool_handle_t *, nvlist_t **); extern int zpool_history_unpack(char *, uint64_t, uint64_t *, diff --git a/lib/libzfs/common/libzfs_dataset.c b/lib/libzfs/common/libzfs_dataset.c index b48da7de8490..1b2bf860e2da 100644 --- a/lib/libzfs/common/libzfs_dataset.c +++ b/lib/libzfs/common/libzfs_dataset.c @@ -1171,6 +1171,36 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, } break; } + + case ZFS_PROP_SPECIAL_SMALL_BLOCKS: + if (zpool_hdl != NULL) { + char state[64] = ""; + + /* + * Issue a warning but do not fail so that + * tests for setable properties succeed. + */ + if (zpool_prop_get_feature(zpool_hdl, + "feature@allocation_classes", state, + sizeof (state)) != 0 || + strcmp(state, ZFS_FEATURE_ACTIVE) != 0) { + (void) fprintf(stderr, gettext( + "%s: property requires a special " + "device in the pool\n"), propname); + } + } + if (intval != 0 && + (intval < SPA_MINBLOCKSIZE || + intval > SPA_OLD_MAXBLOCKSIZE || !ISP2(intval))) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid '%s=%d' property: must be zero or " + "a power of 2 from 512B to 128K"), propname, + intval); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + break; + case ZFS_PROP_MLSLABEL: { /* diff --git a/lib/libzfs/common/libzfs_pool.c b/lib/libzfs/common/libzfs_pool.c index 46e7044d6210..0596fa62ee57 100644 --- a/lib/libzfs/common/libzfs_pool.c +++ b/lib/libzfs/common/libzfs_pool.c @@ -26,6 +26,7 @@ * Copyright 2016 Nexenta Systems, Inc. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, Intel Corporation. 
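The zfs_valid_proplist() hunk above accepts special_small_blocks only when it is zero or a power of two between SPA_MINBLOCKSIZE and SPA_OLD_MAXBLOCKSIZE. The same predicate as a stand-alone check; the function name and test harness are illustrative:

#include <stdio.h>
#include <stdint.h>

#define	SPA_MINBLOCKSIZE	512
#define	SPA_OLD_MAXBLOCKSIZE	(128 * 1024)
#define	ISP2(x)			(((x) & ((x) - 1)) == 0)

/*
 * Zero disables the feature; anything else must be a power of two
 * in [512B, 128K], exactly as the libzfs hunk enforces.
 */
static int
valid_special_small_blocks(uint64_t intval)
{
	if (intval == 0)
		return (1);
	return (intval >= SPA_MINBLOCKSIZE &&
	    intval <= SPA_OLD_MAXBLOCKSIZE && ISP2(intval));
}

int
main(void)
{
	uint64_t vals[] = { 0, 512, 32768, 131072, 100, 131073 };

	for (int i = 0; i < 6; i++)
		(void) printf("%llu -> %s\n", (unsigned long long)vals[i],
		    valid_special_small_blocks(vals[i]) ? "ok" : "invalid");
	return (0);
}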
*/ #include @@ -1124,6 +1125,30 @@ zpool_get_state(zpool_handle_t *zhp) return (zhp->zpool_state); } +/* + * Check if vdev list contains a special vdev + */ +static boolean_t +zpool_has_special_vdev(nvlist_t *nvroot) +{ + nvlist_t **child; + uint_t children; + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, + &children) == 0) { + for (uint_t c = 0; c < children; c++) { + char *bias; + + if (nvlist_lookup_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0 && + strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) { + return (B_TRUE); + } + } + } + return (B_FALSE); +} + /* * Create the named pool, using the provided vdev list. It is assumed * that the consumer has already validated the contents of the nvlist, so we @@ -1169,6 +1194,17 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, fsprops, zoned, NULL, NULL, msg)) == NULL) { goto create_failed; } + + if (nvlist_exists(zc_fsprops, + zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)) && + !zpool_has_special_vdev(nvroot)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "%s property requires a special vdev"), + zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)); + (void) zfs_error(hdl, EZFS_BADPROP, msg); + goto create_failed; + } + if (!zc_props && (nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) { goto create_failed; @@ -1693,7 +1729,7 @@ print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv, return; for (c = 0; c < children; c++) { - vname = zpool_vdev_name(hdl, NULL, child[c], B_TRUE); + vname = zpool_vdev_name(hdl, NULL, child[c], VDEV_NAME_TYPE_ID); print_vdev_tree(hdl, vname, child[c], indent + 2); free(vname); } @@ -2884,7 +2920,7 @@ zpool_vdev_attach(zpool_handle_t *zhp, verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE, &config_root) == 0); - if ((newname = zpool_vdev_name(NULL, NULL, child[0], B_FALSE)) == NULL) + if ((newname = zpool_vdev_name(NULL, NULL, child[0], 0)) == NULL) return (-1); /* @@ -3075,11 +3111,11 @@ find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren, for (mc = 0; mc < mchildren; mc++) { uint_t sc; char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp, - mchild[mc], B_FALSE); + mchild[mc], 0); for (sc = 0; sc < schildren; sc++) { char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp, - schild[sc], B_FALSE); + schild[sc], 0); boolean_t result = (strcmp(mpath, spath) == 0); free(spath); @@ -3670,20 +3706,33 @@ set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path) */ char * zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, - boolean_t verbose) + int name_flags) { - char *path, *devid; + char *path, *devid, *env; uint64_t value; char buf[64]; vdev_stat_t *vs; uint_t vsc; - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, - &value) == 0) { - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, - &value) == 0); - (void) snprintf(buf, sizeof (buf), "%llu", - (u_longlong_t)value); + env = getenv("ZPOOL_VDEV_NAME_PATH"); + if (env && (strtoul(env, NULL, 0) > 0 || + !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) + name_flags |= VDEV_NAME_PATH; + + env = getenv("ZPOOL_VDEV_NAME_GUID"); + if (env && (strtoul(env, NULL, 0) > 0 || + !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) + name_flags |= VDEV_NAME_GUID; + + env = getenv("ZPOOL_VDEV_NAME_FOLLOW_LINKS"); + if (env && (strtoul(env, NULL, 0) > 0 || + !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) + name_flags |= VDEV_NAME_FOLLOW_LINKS; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 
&value) == 0 || + name_flags & VDEV_NAME_GUID) { + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value); + (void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)value); path = buf; } else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { @@ -3725,11 +3774,23 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, devid_str_free(newdevid); } + if (name_flags & VDEV_NAME_FOLLOW_LINKS) { + char *rp = realpath(path, NULL); + if (rp) { + strlcpy(buf, rp, sizeof (buf)); + path = buf; + free(rp); + } + } + if (strncmp(path, ZFS_DISK_ROOTD, strlen(ZFS_DISK_ROOTD)) == 0) path += strlen(ZFS_DISK_ROOTD); - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, - &value) == 0 && value) { + /* + * Remove the partition from the path it this is a whole disk. + */ + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) + == 0 && value && !(name_flags & VDEV_NAME_PATH)) { int pathlen = strlen(path); char *tmp = zfs_strdup(hdl, path); @@ -3769,7 +3830,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, * We identify each top-level vdev by using a * naming convention. */ - if (verbose) { + if (name_flags & VDEV_NAME_TYPE_ID) { uint64_t id; verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, diff --git a/lib/libzpool/common/util.c b/lib/libzpool/common/util.c index c9da9adca788..38b2e9e45807 100644 --- a/lib/libzpool/common/util.c +++ b/lib/libzpool/common/util.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright 2017 RackTop Systems. + * Copyright (c) 2017, Intel Corporation. */ #include @@ -52,7 +53,6 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent) uint_t c, children; char used[6], avail[6]; char rops[6], wops[6], rbytes[6], wbytes[6], rerr[6], werr[6], cerr[6]; - char *prefix = ""; if (indent == 0 && desc != NULL) { (void) printf(" " @@ -62,15 +62,24 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent) } if (desc != NULL) { + char *suffix = "", *bias = NULL; + char bias_suffix[32]; + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); - - if (is_log) - prefix = "log "; - + (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, + &bias); if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) != 0) vs = &v0; + if (bias != NULL) { + (void) snprintf(bias_suffix, sizeof (bias_suffix), + " (%s)", bias); + suffix = bias_suffix; + } else if (is_log) { + suffix = " (log)"; + } + sec = MAX(1, vs->vs_timestamp / NANOSEC); nicenum(vs->vs_alloc, used, sizeof (used)); @@ -87,9 +96,9 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent) (void) printf("%*s%s%*s%*s%*s %5s %5s %5s %5s %5s %5s %5s\n", indent, "", - prefix, - indent + strlen(prefix) - 25 - (vs->vs_space ? 0 : 12), desc, + (int)(indent+strlen(desc)-25-(vs->vs_space ? 0 : 12)), + suffix, vs->vs_space ? 6 : 0, vs->vs_space ? used : "", vs->vs_space ? 6 : 0, vs->vs_space ? avail : "", rops, wops, rbytes, wbytes, rerr, werr, cerr); diff --git a/man/man1m/zfs.1m b/man/man1m/zfs.1m index c1c3336d6181..142a0bf6c0d7 100644 --- a/man/man1m/zfs.1m +++ b/man/man1m/zfs.1m @@ -1132,6 +1132,20 @@ This feature must be enabled to be used .Po see .Xr zpool-features 5 .Pc . +.It Sy special_small_blocks Ns = Ns Em size +This value represents the threshold block size for including small file +blocks into the special allocation class. 
+Blocks smaller than or equal to this value will be assigned to the special +allocation class while greater blocks will be assigned to the regular class. +Valid values are zero or a power of two from 512B up to 128K. +The default size is 0 which means no small file blocks will be allocated in +the special class. +.Pp +Before setting this property, a special class vdev must be added to the +pool. +See +.Xr zpool 1M +for more details on the special allocation class. .It Sy mountpoint Ns = Ns Pa path Ns | Ns Sy none Ns | Ns Sy legacy Controls the mount point used for this file system. See the diff --git a/man/man1m/zpool.1m b/man/man1m/zpool.1m index 29935e5d77cf..5abbbcc018bf 100644 --- a/man/man1m/zpool.1m +++ b/man/man1m/zpool.1m @@ -24,6 +24,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2017 George Melikov. All Rights Reserved. +.\" Copyright 2019 Joyent, Inc. .\" .Dd April 27, 2018 .Dt ZPOOL 1M @@ -36,7 +37,7 @@ .Fl \? .Nm .Cm add -.Op Fl fn +.Op Fl fgLnP .Ar pool vdev Ns ... .Nm .Cm attach @@ -114,6 +115,7 @@ .Cm iostat .Op Fl v .Op Fl T Sy u Ns | Ns Sy d +.Op Fl gLP .Oo Ar pool Oc Ns ... .Op Ar interval Op Ar count .Nm @@ -122,7 +124,7 @@ .Ar device .Nm .Cm list -.Op Fl Hpv +.Op Fl HgLpPv .Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... .Op Fl T Sy u Ns | Ns Sy d .Oo Ar pool Oc Ns ... @@ -163,13 +165,13 @@ .Ar pool .Nm .Cm split -.Op Fl n +.Op Fl gLnP .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Op Fl R Ar root .Ar pool newpool .Nm .Cm status -.Op Fl Dvx +.Op Fl DgLPvx .Op Fl T Sy u Ns | Ns Sy d .Oo Ar pool Oc Ns ... .Op Ar interval Op Ar count @@ -268,6 +270,23 @@ However, raidz vdev types are not supported for the intent log. For more information, see the .Sx Intent Log section. +.It Sy dedup +A device dedicated solely for allocating dedup data. +The redundancy of this device should match the redundancy of the other normal +devices in the pool. +If more than one dedup device is specified, then allocations are load-balanced +between devices. +.It Sy special +A device dedicated solely for allocating various kinds of internal metadata, +and optionally small file data. +The redundancy of this device should match the redundancy of the other normal +devices in the pool. +If more than one special device is specified, then allocations are +load-balanced between devices. +.Pp +For more information on special allocations, see the +.Sx Special Allocation Class +section. .It Sy cache A device used to cache storage pool data. A cache device cannot be configured as a mirror or raidz group. @@ -514,6 +533,31 @@ zfs properties) may be unenforceable while a checkpoint exists, because the checkpoint is allowed to consume the dataset's reservation. Finally, data that is part of the checkpoint but has been freed in the current state of the pool won't be scanned during a scrub. +.Ss Special Allocation Class +The allocations in the special class are dedicated to specific block types. +By default this includes all metadata, the indirect blocks of user data, and +any dedup data. +The class can also be provisioned to accept a limited percentage of small file +data blocks. +.Pp +A pool must always have at least one general (non-specified) vdev before +other devices can be assigned to the special class. +If the special class becomes full, then allocations intended for it will spill +back into the normal class. +.Pp +Dedup data can be excluded from the special class by setting the +.Sy zfs_ddt_data_is_special +zfs kernel variable to false (0). 
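To make the spill-back behavior described above concrete, here is a toy allocator that prefers the special class for metadata and for sufficiently small file blocks, and otherwise uses the normal class. It compresses the real mechanism, where spilling happens when a metaslab allocation in the special class fails, into a simple capacity check; all names are illustrative:

#include <stdio.h>
#include <stdint.h>

typedef struct class {
	const char	*c_name;
	uint64_t	c_space;	/* total bytes */
	uint64_t	c_alloc;	/* allocated bytes */
} class_t;

static class_t *
pick_class(class_t *normal, class_t *special, uint64_t size,
    int is_metadata, uint64_t small_blocks)
{
	int wants_special = is_metadata ||
	    (small_blocks != 0 && size <= small_blocks);

	/* spill back to the normal class when special cannot hold it */
	if (wants_special && special->c_alloc + size <= special->c_space)
		return (special);
	return (normal);
}

int
main(void)
{
	class_t normal = { "normal", 1ULL << 40, 0 };
	class_t special = { "special", 1ULL << 30, 0 };
	class_t *c;

	c = pick_class(&normal, &special, 16384, 0, 32768);
	(void) printf("16K file block -> %s\n", c->c_name);	/* special */

	special.c_alloc = special.c_space;	/* class is now full */
	c = pick_class(&normal, &special, 16384, 0, 32768);
	(void) printf("after fill     -> %s\n", c->c_name);	/* normal */
	return (0);
}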
+.Pp +Inclusion of small file blocks in the special class is opt-in. +Each dataset can control the size of small file blocks allowed in the special +class by setting the +.Sy special_small_blocks +dataset property. +It defaults to zero so you must opt-in by setting it to a non-zero value. +See +.Xr zfs 1M +for more info on setting this property. .Ss Properties Each pool has several properties associated with it. Some properties are read-only statistics while others are configurable and @@ -789,7 +833,7 @@ Displays a help message. .It Xo .Nm .Cm add -.Op Fl fn +.Op Fl fgLnP .Ar pool vdev Ns ... .Xc Adds the specified virtual devices to the given pool. @@ -809,11 +853,30 @@ Forces use of .Ar vdev Ns s , even if they appear in use or specify a conflicting replication level. Not all devices can be overridden in this manner. +.It Fl g +Display +.Ar vdev , +GUIDs instead of the normal device names. +These GUIDs can be used in place of +device names for the zpool detach/offline/remove/replace commands. +.It Fl L +Display real paths for +.Ar vdev Ns s +resolving all symbolic links. +This can be used to look up the current block +device name regardless of the /dev/disk/ path used to open it. .It Fl n Displays the configuration that would be used without actually adding the .Ar vdev Ns s . The actual pool creation can still fail due to insufficient privileges or device sharing. +.It Fl P +Display real paths for +.Ar vdev Ns s +instead of only the last component of the path. +This can be used in conjunction with the +.Fl L +flag. .El .It Xo .Nm @@ -1443,8 +1506,8 @@ with no flags on the relevant target devices. .It Xo .Nm .Cm iostat -.Op Fl v .Op Fl T Sy u Ns | Ns Sy d +.Op Fl gLPv .Oo Ar pool Oc Ns ... .Op Ar interval Op Ar count .Xc @@ -1475,6 +1538,21 @@ Specify for standard date format. See .Xr date 1 . +.It Fl g +Display vdev GUIDs instead of the normal device names. +These GUIDs can be used in place of device names for the zpool +detach/offline/remove/replace commands. +.It Fl L +Display real paths for vdevs resolving all symbolic links. +This can be used to look up the current block device name regardless of the +.Pa /dev/disk/ +path used to open it. +.It Fl P +Display full paths for vdevs instead of only the last component of +the path. +This can be used in conjunction with the +.Fl L +flag. .It Fl v Verbose statistics Reports usage statistics for individual vdevs within the pool, in addition to the pool-wide statistics. @@ -1497,7 +1575,7 @@ Treat exported or foreign devices as inactive. .It Xo .Nm .Cm list -.Op Fl Hpv +.Op Fl HgLpPv .Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... .Op Fl T Sy u Ns | Ns Sy d .Oo Ar pool Oc Ns ... @@ -1518,6 +1596,10 @@ is specified, the command exits after .Ar count reports are printed. .Bl -tag -width Ds +.It Fl g +Display vdev GUIDs instead of the normal device names. +These GUIDs can be used in place of device names for the zpool +detach/offline/remove/replace commands. .It Fl H Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary @@ -1530,10 +1612,20 @@ section for a list of valid properties. The default list is .Cm name , size , allocated , free , checkpoint, expandsize , fragmentation , capacity , .Cm dedupratio , health , altroot . +.It Fl L +Display real paths for vdevs resolving all symbolic links. +This can be used to look up the current block device name regardless of the +/dev/disk/ path used to open it. .It Fl p Display numbers in parsable .Pq exact values. 
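The -g, -L and -P options here correspond to the vdev_name_t bits added to libzfs.h earlier in this patch, and the ZPOOL_VDEV_NAME_* environment variables documented later force the same bits on. A sketch of the environment handling as zpool_vdev_name() now performs it; env_is_set() and the main() harness are illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

/* Mirrors the vdev_name_t bits added to libzfs.h. */
#define	VDEV_NAME_PATH		(1 << 0)
#define	VDEV_NAME_GUID		(1 << 1)
#define	VDEV_NAME_FOLLOW_LINKS	(1 << 2)
#define	VDEV_NAME_TYPE_ID	(1 << 3)

/* A positive number, "YES" or "ON" (any case) counts as set. */
static int
env_is_set(const char *name)
{
	char *env = getenv(name);

	return (env != NULL && (strtoul(env, NULL, 0) > 0 ||
	    strncasecmp(env, "YES", 3) == 0 ||
	    strncasecmp(env, "ON", 2) == 0));
}

static int
name_flags_from_env(int flags)
{
	if (env_is_set("ZPOOL_VDEV_NAME_PATH"))
		flags |= VDEV_NAME_PATH;
	if (env_is_set("ZPOOL_VDEV_NAME_GUID"))
		flags |= VDEV_NAME_GUID;
	if (env_is_set("ZPOOL_VDEV_NAME_FOLLOW_LINKS"))
		flags |= VDEV_NAME_FOLLOW_LINKS;
	return (flags);
}

int
main(void)
{
	(void) printf("flags = 0x%x\n", name_flags_from_env(0));
	return (0);
}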
+.It Fl P +Display full paths for vdevs instead of only the last component of +the path. +This can be used in conjunction with the +.Fl L +flag. .It Fl T Sy u Ns | Ns Sy d Display a time stamp. Specify @@ -1746,7 +1838,7 @@ values. .It Xo .Nm .Cm split -.Op Fl n +.Op Fl gLnP .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Op Fl R Ar root .Ar pool newpool @@ -1763,10 +1855,25 @@ At the time of the split, will be a replica of .Ar pool . .Bl -tag -width Ds +.It Fl g +Display vdev GUIDs instead of the normal device names. +These GUIDs can be used in place of device names for the zpool +detach/offline/remove/replace commands. +.It Fl L +Display real paths for vdevs resolving all symbolic links. +This can be used to look up the current block device name regardless of the +.Pa /dev/disk/ +path used to open it. .It Fl n Do dry run, do not actually perform the split. Print out the expected configuration of .Ar newpool . +.It Fl P +Display full paths for vdevs instead of only the last component of +the path. +This can be used in conjunction with the +.Fl L +flag. .It Fl o Ar property Ns = Ns Ar value Sets the specified property for .Ar newpool . @@ -1785,7 +1892,7 @@ and automatically import it. .It Xo .Nm .Cm status -.Op Fl Dvx +.Op Fl DgLPvx .Op Fl T Sy u Ns | Ns Sy d .Oo Ar pool Oc Ns ... .Op Ar interval Op Ar count @@ -1809,6 +1916,21 @@ Display a histogram of deduplication statistics, showing the allocated and referenced .Pq logically referenced in the pool block counts and sizes by reference count. +.It Fl g +Display vdev GUIDs instead of the normal device names. +These GUIDs can be used in place of device names for the zpool +detach/offline/remove/replace commands. +.It Fl L +Display real paths for vdevs resolving all symbolic links. +This can be used to look up the current block device name regardless of the +.Pa /dev/disk/ +path used to open it. +.It Fl P +Display full paths for vdevs instead of only the last component of +the path. +This can be used in conjunction with the +.Fl L +flag. .It Fl T Sy u Ns | Ns Sy d Display a time stamp. Specify @@ -2084,6 +2206,33 @@ data 23.9G 14.6G 9.30G 48% - 61% 1.00x ONLINE - c1t3d0 - - - - - .Ed .El +.Sh ENVIRONMENT VARIABLES +.Bl -tag -width "ZPOOL_VDEV_NAME_GUID" +.It Ev ZPOOL_VDEV_NAME_GUID +Cause +.Nm zpool subcommands to output vdev guids by default. +This behavior is identical to the +.Nm zpool status -g +command line option. +.El +.Bl -tag -width "ZPOOL_VDEV_NAME_FOLLOW_LINKS" +.It Ev ZPOOL_VDEV_NAME_FOLLOW_LINKS +Cause +.Nm zpool +subcommands to follow links for vdev names by default. +This behavior is identical to the +.Nm zpool status -L +command line option. +.El +.Bl -tag -width "ZPOOL_VDEV_NAME_PATH" +.It Ev ZPOOL_VDEV_NAME_PATH +Cause +.Nm zpool +subcommands to output full vdev path names by default. +This behavior is identical to the +.Nm zpool status -P +command line option. +.El .Sh INTERFACE STABILITY .Sy Evolving .Sh SEE ALSO diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 index fda83e5482f3..57617184ba05 100644 --- a/man/man5/zpool-features.5 +++ b/man/man5/zpool-features.5 @@ -661,5 +661,27 @@ ever had their checksum set to \fBedonr\fR are destroyed. Booting off of pools using \fBedonr\fR is supported. +.sp +.ne 2 +.na +\fB\fBallocation_classes\fR\fR +.ad +.RS 4n +.TS +l l . +GUID com.intel:allocation_classes +READ\-ONLY COMPATIBLE yes +DEPENDENCIES none +.TE + +This feature enables support for separate allocation classes. 
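As the feature entry below goes on to describe, allocation_classes sits at enabled until a class vdev exists and returns there when the last one is removed. A toy model of that refcount-driven state, mirroring how spa_activate_allocation_classes() bumps the feature once per special vdev added; names are illustrative:

#include <stdio.h>

typedef struct feature_state {
	int	fs_enabled;
	int	fs_refcount;	/* feature is active when > 0 */
} feature_state_t;

static const char *
feature_status(const feature_state_t *fs)
{
	if (!fs->fs_enabled)
		return ("disabled");
	return (fs->fs_refcount > 0 ? "active" : "enabled");
}

int
main(void)
{
	feature_state_t alloc_classes = { 1, 0 };

	(void) printf("%s\n", feature_status(&alloc_classes)); /* enabled */

	alloc_classes.fs_refcount++;	/* zpool add pool special mirror .. */
	(void) printf("%s\n", feature_status(&alloc_classes)); /* active */

	alloc_classes.fs_refcount--;	/* device removal of that vdev */
	(void) printf("%s\n", feature_status(&alloc_classes)); /* enabled */
	return (0);
}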
+ +This feature becomes \fBactive\fR when a dedicated allocation class vdev +(dedup or special) is created with zpool create or zpool add. With device +removal, it can be returned to the \fBenabled\fR state if all the top-level +vdevs from an allocation class are removed. + +.RE + .SH "SEE ALSO" \fBzpool\fR(1M) diff --git a/uts/common/fs/zfs/dmu.c b/uts/common/fs/zfs/dmu.c index 24786ce9cdfc..95ca9f76aab8 100644 --- a/uts/common/fs/zfs/dmu.c +++ b/uts/common/fs/zfs/dmu.c @@ -2201,6 +2201,8 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; zp->zp_nopwrite = nopwrite; + zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ? + os->os_zpl_special_smallblock : 0; } int diff --git a/uts/common/fs/zfs/dmu_objset.c b/uts/common/fs/zfs/dmu_objset.c index 36e6391d5718..d83153935f18 100644 --- a/uts/common/fs/zfs/dmu_objset.c +++ b/uts/common/fs/zfs/dmu_objset.c @@ -303,6 +303,20 @@ dnodesize_changed_cb(void *arg, uint64_t newval) } } +static void +smallblk_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval <= SPA_OLD_MAXBLOCKSIZE); + ASSERT(ISP2(newval)); + + os->os_zpl_special_smallblock = newval; +} + static void logbias_changed_cb(void *arg, uint64_t newval) { @@ -516,6 +530,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, zfs_prop_to_name(ZFS_PROP_DNODESIZE), dnodesize_changed_cb, os); } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name( + ZFS_PROP_SPECIAL_SMALL_BLOCKS), + smallblk_changed_cb, os); + } } if (needlock) dsl_pool_config_exit(dmu_objset_pool(os), FTAG); diff --git a/uts/common/fs/zfs/metaslab.c b/uts/common/fs/zfs/metaslab.c index d5a3e7232ac2..c92297c0fd2a 100644 --- a/uts/common/fs/zfs/metaslab.c +++ b/uts/common/fs/zfs/metaslab.c @@ -23,6 +23,7 @@ * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] + * Copyright (c) 2017, Intel Corporation. */ #include @@ -288,7 +289,7 @@ metaslab_class_validate(metaslab_class_t *mc) return (0); } -void +static void metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) { @@ -325,7 +326,8 @@ metaslab_class_get_dspace(metaslab_class_t *mc) void metaslab_class_histogram_verify(metaslab_class_t *mc) { - vdev_t *rvd = mc->mc_spa->spa_root_vdev; + spa_t *spa = mc->mc_spa; + vdev_t *rvd = spa->spa_root_vdev; uint64_t *mc_hist; int i; @@ -831,7 +833,8 @@ metaslab_group_histogram_verify(metaslab_group_t *mg) for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; - if (msp->ms_sm == NULL) + /* skip if not active or not a member */ + if (msp->ms_sm == NULL || msp->ms_group != mg) continue; for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) @@ -964,12 +967,14 @@ metaslab_group_fragmentation(metaslab_group_t *mg) if (msp->ms_fragmentation == ZFS_FRAG_INVALID) continue; + if (msp->ms_group != mg) + continue; valid_ms++; fragmentation += msp->ms_fragmentation; } - if (valid_ms <= vd->vdev_ms_count / 2) + if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) return (ZFS_FRAG_INVALID); fragmentation /= valid_ms; @@ -1000,7 +1005,10 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, * groups to select from. Otherwise, we always consider it eligible * for allocations. 
*/ - if (mc != spa_normal_class(spa) || mc->mc_groups <= 1) + if ((mc != spa_normal_class(spa) && + mc != spa_special_class(spa) && + mc != spa_dedup_class(spa)) || + mc->mc_groups <= 1) return (B_TRUE); /* @@ -1534,12 +1542,26 @@ metaslab_unload(metaslab_t *msp) msp->ms_max_size = 0; } +static void +metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, + int64_t defer_delta, int64_t space_delta) +{ + vdev_space_update(vd, alloc_delta, defer_delta, space_delta); + + ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); + ASSERT(vd->vdev_ms_count != 0); + + metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, + vdev_deflated_space(vd, space_delta)); +} + int metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, metaslab_t **msp) { vdev_t *vd = mg->mg_vd; - objset_t *mos = vd->vdev_spa->spa_meta_objset; + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; metaslab_t *ms; int error; @@ -1596,8 +1618,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, /* * If metaslab_debug_load is set and we're initializing a metaslab - * that has an allocated space map object then load the its space - * map so that can verify frees. + * that has an allocated space map object then load the space map + * so that we can verify frees. */ if (metaslab_debug_load && ms->ms_sm != NULL) { mutex_enter(&ms->ms_lock); @@ -1619,16 +1641,19 @@ void metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; + vdev_t *vd = mg->mg_vd; metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); - vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm), - 0, -msp->ms_size); + metaslab_space_update(vd, mg->mg_class, + -space_map_allocated(msp->ms_sm), 0, -msp->ms_size); + space_map_close(msp->ms_sm); metaslab_unload(msp); + range_tree_destroy(msp->ms_allocatable); range_tree_destroy(msp->ms_freeing); range_tree_destroy(msp->ms_freed); @@ -2643,7 +2668,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) ASSERT3P(msp->ms_checkpointing, ==, NULL); msp->ms_checkpointing = range_tree_create(NULL, NULL); - vdev_space_update(vd, 0, 0, msp->ms_size); + metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); } ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_checkpointing)); @@ -2665,7 +2690,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) defer_delta -= range_tree_space(*defer_tree); } - vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); + metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, + defer_delta, 0); /* * If there's a metaslab_load() in progress, wait for it to complete @@ -2764,21 +2790,25 @@ metaslab_sync_reassess(metaslab_group_t *mg) spa_config_exit(spa, SCL_ALLOC, FTAG); } -static uint64_t -metaslab_distance(metaslab_t *msp, dva_t *dva) +/* + * When writing a ditto block (i.e. more than one DVA for a given BP) on + * the same vdev as an existing DVA of this BP, then try to allocate it + * on a different metaslab than existing DVAs (i.e. a unique metaslab). 
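The metaslab_is_unique() helper introduced just below replaces the old distance arithmetic with a simple yes/no test. A stand-alone model that flattens the DVA_GET_* macros and vdev pointers into plain struct fields, purely for illustration:

#include <stdio.h>
#include <stdint.h>

typedef struct dva {
	uint64_t	d_vdev;
	uint64_t	d_offset;
	uint64_t	d_asize;
} dva_t;

typedef struct metaslab {
	uint64_t	ms_id;
	uint64_t	ms_vdev;
	uint64_t	ms_shift;	/* log2 of metaslab size */
} metaslab_t;

/*
 * A candidate metaslab is acceptable for a ditto copy unless it is the
 * very metaslab that already holds an earlier DVA of the same BP.
 */
static int
metaslab_is_unique(const metaslab_t *msp, const dva_t *dva)
{
	if (dva->d_asize == 0)		/* empty DVA slot */
		return (1);
	if (msp->ms_vdev != dva->d_vdev)
		return (1);
	return (msp->ms_id != (dva->d_offset >> msp->ms_shift));
}

int
main(void)
{
	metaslab_t ms = { .ms_id = 7, .ms_vdev = 0, .ms_shift = 30 };
	dva_t same = { 0, 7ULL << 30, 4096 };
	dva_t other = { 0, 9ULL << 30, 4096 };

	(void) printf("same slab : %d\n", metaslab_is_unique(&ms, &same));
	(void) printf("other slab: %d\n", metaslab_is_unique(&ms, &other));
	return (0);
}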
+ */ +static boolean_t +metaslab_is_unique(metaslab_t *msp, dva_t *dva) { - uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; - uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; - uint64_t start = msp->ms_id; + uint64_t dva_ms_id; + + if (DVA_GET_ASIZE(dva) == 0) + return (B_TRUE); if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) - return (1ULL << 63); + return (B_TRUE); - if (offset < start) - return ((start - offset) << ms_shift); - if (offset > start) - return ((offset - start) << ms_shift); - return (0); + dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; + + return (msp->ms_id != dva_ms_id); } /* @@ -3012,7 +3042,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) */ static metaslab_t * find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, - dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator, + dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) { avl_index_t idx; @@ -3047,13 +3077,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) break; - uint64_t target_distance = min_distance - + (space_map_allocated(msp->ms_sm) != 0 ? 0 : - min_distance >> 1); - for (i = 0; i < d; i++) { - if (metaslab_distance(msp, &dva[i]) < target_distance) - break; + if (want_unique && + !metaslab_is_unique(msp, &dva[i])) + break; /* try another metaslab */ } if (i == d) break; @@ -3071,8 +3098,8 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, /* ARGSUSED */ static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, - int allocator) + uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, + int d, int allocator) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; @@ -3126,7 +3153,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, was_active = B_TRUE; } else { msp = find_valid_metaslab(mg, activation_weight, dva, d, - min_distance, asize, allocator, zal, search, + want_unique, asize, allocator, zal, search, &was_active); } @@ -3264,6 +3291,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, * metaslab. */ ASSERT(!metaslab_should_allocate(msp, asize)); + mutex_exit(&msp->ms_lock); } mutex_exit(&msp->ms_lock); @@ -3273,14 +3301,14 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, - int allocator) + uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, + int d, int allocator) { uint64_t offset; ASSERT(mg->mg_initialized); - offset = metaslab_group_alloc_normal(mg, zal, asize, txg, - min_distance, dva, d, allocator); + offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, + dva, d, allocator); mutex_enter(&mg->mg_lock); if (offset == -1ULL) { @@ -3307,14 +3335,6 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, return (offset); } -/* - * If we have to write a ditto block (i.e. more than one DVA for a given BP) - * on the same vdev as an existing DVA of this BP, then try to allocate it - * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the - * existing DVAs. 
- */ -int ditto_same_vdev_distance_shift = 3; - /* * Allocate a block for the specified i/o. */ @@ -3331,6 +3351,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, /* * For testing, make some blocks above a certain size be gang blocks. + * This will also test spilling from special to normal. */ if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, @@ -3382,6 +3403,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; } else { + ASSERT(mc->mc_rotor != NULL); mg = mc->mc_rotor; } @@ -3446,25 +3468,17 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, ASSERT(mg->mg_class == mc); - /* - * If we don't need to try hard, then require that the - * block be 1/8th of the device away from any other DVAs - * in this BP. If we are trying hard, allow any offset - * to be used (distance=0). - */ - uint64_t distance = 0; - if (!try_hard) { - distance = vd->vdev_asize >> - ditto_same_vdev_distance_shift; - if (distance <= (1ULL << vd->vdev_ms_shift)) - distance = 0; - } - uint64_t asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); + /* + * If we don't need to try hard, then require that the + * block be on an different metaslab from any other DVAs + * in this BP (unique=true). If we are trying hard, then + * allow any metaslab to be used (unique=false). + */ uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - distance, dva, d, allocator); + !try_hard, dva, d, allocator); if (offset != -1ULL) { /* @@ -3843,7 +3857,8 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, if (reserved_slots < max) available_slots = max - reserved_slots; - if (slots <= available_slots || GANG_ALLOCATION(flags)) { + if (slots <= available_slots || GANG_ALLOCATION(flags) || + flags & METASLAB_MUST_RESERVE) { /* * We reserve the slots individually so that we can unreserve * them individually when an I/O completes. @@ -4126,9 +4141,11 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); - for (int d = 0; d < ndvas; d++) - if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) + for (int d = 0; d < ndvas; d++) { + error = metaslab_claim_dva(spa, &dva[d], txg); + if (error != 0) break; + } spa_config_exit(spa, SCL_ALLOC, FTAG); diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c index 82fe913b8bfd..403ace2d9da0 100644 --- a/uts/common/fs/zfs/spa.c +++ b/uts/common/fs/zfs/spa.c @@ -28,6 +28,7 @@ * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2018 Joyent, Inc. + * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2017 Datto Inc. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. 
*/ @@ -274,8 +275,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) ASSERT(MUTEX_HELD(&spa->spa_props_lock)); if (rvd != NULL) { - alloc = metaslab_class_get_alloc(spa_normal_class(spa)); - size = metaslab_class_get_space(spa_normal_class(spa)); + alloc = metaslab_class_get_alloc(mc); + alloc += metaslab_class_get_alloc(spa_special_class(spa)); + alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); + + size = metaslab_class_get_space(mc); + size += metaslab_class_get_space(spa_special_class(spa)); + size += metaslab_class_get_space(spa_dedup_class(spa)); + spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); @@ -1137,6 +1144,8 @@ spa_activate(spa_t *spa, int mode) spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); + spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops); + spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops); /* Try to create a covering process */ mutex_enter(&spa->spa_proc_lock); @@ -1232,6 +1241,12 @@ spa_deactivate(spa_t *spa) metaslab_class_destroy(spa->spa_log_class); spa->spa_log_class = NULL; + metaslab_class_destroy(spa->spa_special_class); + spa->spa_special_class = NULL; + + metaslab_class_destroy(spa->spa_dedup_class); + spa->spa_dedup_class = NULL; + /* * If this was part of an import or the open otherwise failed, we may * still have errors left in the queues. Empty them just in case. @@ -4834,7 +4849,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, char *poolname; nvlist_t *nvl; - if (nvlist_lookup_string(props, + if (props == NULL || + nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0) poolname = (char *)pool; @@ -4922,9 +4938,15 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_metaslab_set_size(rvd->vdev_child[c]); - vdev_expand(rvd->vdev_child[c], txg); + /* + * instantiate the metaslab groups (this will dirty the vdevs) + * we can no longer error exit past this point + */ + for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + vdev_metaslab_set_size(vd); + vdev_expand(vd, txg); } } @@ -7082,8 +7104,14 @@ spa_async_thread(void *arg) mutex_enter(&spa_namespace_lock); old_space = metaslab_class_get_space(spa_normal_class(spa)); + old_space += metaslab_class_get_space(spa_special_class(spa)); + old_space += metaslab_class_get_space(spa_dedup_class(spa)); + spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + new_space = metaslab_class_get_space(spa_normal_class(spa)); + new_space += metaslab_class_get_space(spa_special_class(spa)); + new_space += metaslab_class_get_space(spa_dedup_class(spa)); mutex_exit(&spa_namespace_lock); /* @@ -7780,6 +7808,9 @@ spa_sync(spa_t *spa, uint64_t txg) dsl_pool_t *dp = spa->spa_dsl_pool; objset_t *mos = spa->spa_meta_objset; bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; + metaslab_class_t *normal = spa_normal_class(spa); + metaslab_class_t *special = spa_special_class(spa); + metaslab_class_t *dedup = spa_dedup_class(spa); vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; dmu_tx_t *tx; @@ -7872,9 +7903,13 @@ spa_sync(spa_t *spa, uint64_t txg) for (int c = 0; c < rvd->vdev_children; 
c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; + metaslab_class_t *mc; - if (mg == NULL || mg->mg_class != spa_normal_class(spa) || - !metaslab_group_initialized(mg)) + if (mg == NULL || !metaslab_group_initialized(mg)) + continue; + + mc = mg->mg_class; + if (mc != normal && mc != special && mc != dedup) continue; /* @@ -7893,12 +7928,18 @@ spa_sync(spa_t *spa, uint64_t txg) } slots_per_allocator += zfs_vdev_def_queue_depth; } - metaslab_class_t *mc = spa_normal_class(spa); + for (int i = 0; i < spa->spa_alloc_count; i++) { - ASSERT0(zfs_refcount_count(&mc->mc_alloc_slots[i])); - mc->mc_alloc_max_slots[i] = slots_per_allocator; + ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i])); + ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i])); + ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i])); + normal->mc_alloc_max_slots[i] = slots_per_allocator; + special->mc_alloc_max_slots[i] = slots_per_allocator; + dedup->mc_alloc_max_slots[i] = slots_per_allocator; } - mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; + normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; + special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; + dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; diff --git a/uts/common/fs/zfs/spa_misc.c b/uts/common/fs/zfs/spa_misc.c index bddd4fe858bb..9a80f89a8ac3 100644 --- a/uts/common/fs/zfs/spa_misc.c +++ b/uts/common/fs/zfs/spa_misc.c @@ -26,6 +26,7 @@ * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, Intel Corporation. */ #include @@ -387,6 +388,19 @@ spa_load_note(spa_t *spa, const char *fmt, ...) spa->spa_trust_config ? "trusted" : "untrusted", buf); } +/* + * By default dedup and user data indirects land in the special class + */ +int zfs_ddt_data_is_special = B_TRUE; +int zfs_user_indirect_is_special = B_TRUE; + +/* + * The percentage of special class final space reserved for metadata only. + * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only + * let metadata into the class. 
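The reserve percentage defined just below is applied as plain integer arithmetic in spa_preferred_class(): small file blocks are admitted while allocated space stays under (100 - pct)% of the class. A worked example with the default of 25, so a 100 GiB special class stops taking small file blocks at 75 GiB; the harness is illustrative:

#include <stdio.h>
#include <stdint.h>

int zfs_special_class_metadata_reserve_pct = 25;

/*
 * Leave the reserved tail of the special class exclusively for
 * metadata, as the comment above describes.
 */
static int
small_block_allowed(uint64_t alloc, uint64_t space)
{
	uint64_t limit =
	    (space * (100 - zfs_special_class_metadata_reserve_pct)) / 100;

	return (alloc < limit);
}

int
main(void)
{
	uint64_t space = 100ULL << 30;	/* 100 GiB special class */

	/* limit is 75 GiB with the default 25% reserve */
	(void) printf("%d\n", small_block_allowed(70ULL << 30, space)); /* 1 */
	(void) printf("%d\n", small_block_allowed(80ULL << 30, space)); /* 0 */
	return (0);
}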
+ */ +int zfs_special_class_metadata_reserve_pct = 25; + /* * ========================================================================== * SPA config locking @@ -1173,6 +1187,8 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) */ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); + ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0); + ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0); spa_config_exit(spa, SCL_ALL, spa); @@ -1516,6 +1532,16 @@ zfs_strtonum(const char *str, char **nptr) return (val); } +void +spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx) +{ + /* + * We bump the feature refcount for each special vdev added to the pool + */ + ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)); + spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx); +} + /* * ========================================================================== * Accessor functions @@ -1765,6 +1791,79 @@ spa_log_class(spa_t *spa) return (spa->spa_log_class); } +metaslab_class_t * +spa_special_class(spa_t *spa) +{ + return (spa->spa_special_class); +} + +metaslab_class_t * +spa_dedup_class(spa_t *spa) +{ + return (spa->spa_dedup_class); +} + +/* + * Locate an appropriate allocation class + */ +metaslab_class_t * +spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, + uint_t level, uint_t special_smallblk) +{ + if (DMU_OT_IS_ZIL(objtype)) { + if (spa->spa_log_class->mc_groups != 0) + return (spa_log_class(spa)); + else + return (spa_normal_class(spa)); + } + + boolean_t has_special_class = spa->spa_special_class->mc_groups != 0; + + if (DMU_OT_IS_DDT(objtype)) { + if (spa->spa_dedup_class->mc_groups != 0) + return (spa_dedup_class(spa)); + else if (has_special_class && zfs_ddt_data_is_special) + return (spa_special_class(spa)); + else + return (spa_normal_class(spa)); + } + + /* Indirect blocks for user data can land in special if allowed */ + if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) { + if (has_special_class && zfs_user_indirect_is_special) + return (spa_special_class(spa)); + else + return (spa_normal_class(spa)); + } + + if (DMU_OT_IS_METADATA(objtype) || level > 0) { + if (has_special_class) + return (spa_special_class(spa)); + else + return (spa_normal_class(spa)); + } + + /* + * Allow small file blocks in special class in some cases (like + * for the dRAID vdev feature). But always leave a reserve of + * zfs_special_class_metadata_reserve_pct exclusively for metadata. + */ + if (DMU_OT_IS_FILE(objtype) && + has_special_class && size <= special_smallblk) { + metaslab_class_t *special = spa_special_class(spa); + uint64_t alloc = metaslab_class_get_alloc(special); + uint64_t space = metaslab_class_get_space(special); + uint64_t limit = + (space * (100 - zfs_special_class_metadata_reserve_pct)) + / 100; + + if (alloc < limit) + return (special); + } + + return (spa_normal_class(spa)); +} + void spa_evicting_os_register(spa_t *spa, objset_t *os) { diff --git a/uts/common/fs/zfs/sys/dmu.h b/uts/common/fs/zfs/sys/dmu.h index 535c13fe05c1..954977578b3a 100644 --- a/uts/common/fs/zfs/sys/dmu.h +++ b/uts/common/fs/zfs/sys/dmu.h @@ -21,13 +21,14 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
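Stripped of the SPA plumbing, spa_preferred_class() above is a short decision list: ZIL blocks go to the log class, DDT blocks to dedup (or to special, per zfs_ddt_data_is_special), indirects and metadata to special when one exists, and small file blocks to special subject to the reserve. A distilled, self-contained rendering; the enums and has_* booleans stand in for the real object types and mc_groups checks, and the reserve test is omitted for brevity:

#include <stdio.h>

enum objtype { OT_INTENT_LOG, OT_DDT_ZAP, OT_PLAIN_FILE, OT_METADATA };
enum mclass { MC_NORMAL, MC_LOG, MC_SPECIAL, MC_DEDUP };

static int zfs_ddt_data_is_special = 1;
static int zfs_user_indirect_is_special = 1;

static enum mclass
preferred_class(enum objtype ot, unsigned level, unsigned size,
    unsigned smallblk, int has_log, int has_special, int has_dedup)
{
	if (ot == OT_INTENT_LOG)
		return (has_log ? MC_LOG : MC_NORMAL);
	if (ot == OT_DDT_ZAP) {
		if (has_dedup)
			return (MC_DEDUP);
		if (has_special && zfs_ddt_data_is_special)
			return (MC_SPECIAL);
		return (MC_NORMAL);
	}
	if (level > 0 && ot == OT_PLAIN_FILE)	/* user-data indirects */
		return (has_special && zfs_user_indirect_is_special ?
		    MC_SPECIAL : MC_NORMAL);
	if (ot == OT_METADATA || level > 0)
		return (has_special ? MC_SPECIAL : MC_NORMAL);
	if (ot == OT_PLAIN_FILE && has_special && smallblk != 0 &&
	    size <= smallblk)
		return (MC_SPECIAL);	/* subject to the 25% reserve */
	return (MC_NORMAL);
}

int
main(void)
{
	/* no dedup vdev, so DDT data lands in the special class (2) */
	(void) printf("%d\n",
	    preferred_class(OT_DDT_ZAP, 0, 4096, 0, 1, 1, 0));
	return (0);
}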
* Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2013 DEY Storage Systems, Inc. * Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2014 Integros [integros.com] */ @@ -126,6 +127,16 @@ typedef enum dmu_object_byteswap { ((ot) & DMU_OT_METADATA) : \ dmu_ot[(ot)].ot_metadata) +#define DMU_OT_IS_DDT(ot) \ + ((ot) == DMU_OT_DDT_ZAP) + +#define DMU_OT_IS_ZIL(ot) \ + ((ot) == DMU_OT_INTENT_LOG) + +/* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */ +#define DMU_OT_IS_FILE(ot) \ + ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER) + #define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache) @@ -216,6 +227,7 @@ typedef enum dmu_object_type { * * The DMU_OTN_* types do not have entries in the dmu_ot table, * use the DMU_OT_IS_METDATA() and DMU_OT_BYTESWAP() macros instead + * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead * of indexing into dmu_ot directly (this works for both DMU_OT_* types * and DMU_OTN_* types). */ diff --git a/uts/common/fs/zfs/sys/dmu_objset.h b/uts/common/fs/zfs/sys/dmu_objset.h index 3028f0436566..cae1c7719a83 100644 --- a/uts/common/fs/zfs/sys/dmu_objset.h +++ b/uts/common/fs/zfs/sys/dmu_objset.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -113,6 +113,11 @@ struct objset { uint64_t os_normalization; uint64_t os_utf8only; uint64_t os_casesensitivity; + /* + * The largest zpl file block allowed in special class. + * cached here instead of zfsvfs for easier access. + */ + int os_zpl_special_smallblock; /* * Pointer is constant; the blkptr it points to is protected by diff --git a/uts/common/fs/zfs/sys/metaslab.h b/uts/common/fs/zfs/sys/metaslab.h index f0c68c77fc06..08fe3955b688 100644 --- a/uts/common/fs/zfs/sys/metaslab.h +++ b/uts/common/fs/zfs/sys/metaslab.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2017, Intel Corporation. 
*/ #ifndef _SYS_METASLAB_H @@ -56,12 +57,17 @@ void metaslab_sync_done(metaslab_t *, uint64_t); void metaslab_sync_reassess(metaslab_group_t *); uint64_t metaslab_block_maxsize(metaslab_t *); +/* + * metaslab alloc flags + */ #define METASLAB_HINTBP_FAVOR 0x0 #define METASLAB_HINTBP_AVOID 0x1 #define METASLAB_GANG_HEADER 0x2 #define METASLAB_GANG_CHILD 0x4 #define METASLAB_ASYNC_ALLOC 0x8 #define METASLAB_DONT_THROTTLE 0x10 +#define METASLAB_MUST_RESERVE 0x20 +#define METASLAB_FASTWRITE 0x40 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *, @@ -92,8 +98,6 @@ boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int, zio_t *, int); void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *); -void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t, - int64_t, int64_t); uint64_t metaslab_class_get_alloc(metaslab_class_t *); uint64_t metaslab_class_get_space(metaslab_class_t *); uint64_t metaslab_class_get_dspace(metaslab_class_t *); diff --git a/uts/common/fs/zfs/sys/spa.h b/uts/common/fs/zfs/sys/spa.h index dfda4eeabe6f..4ff552447e2c 100644 --- a/uts/common/fs/zfs/sys/spa.h +++ b/uts/common/fs/zfs/sys/spa.h @@ -27,6 +27,7 @@ * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, Intel Corporation. */ #ifndef _SYS_SPA_H @@ -801,6 +802,11 @@ extern uint64_t spa_version(spa_t *spa); extern boolean_t spa_deflate(spa_t *spa); extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); +extern metaslab_class_t *spa_special_class(spa_t *spa); +extern metaslab_class_t *spa_dedup_class(spa_t *spa); +extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size, + dmu_object_type_t objtype, uint_t level, uint_t special_smallblk); + extern void spa_evicting_os_register(spa_t *, objset_t *os); extern void spa_evicting_os_deregister(spa_t *, objset_t *os); extern void spa_evicting_os_wait(spa_t *spa); @@ -862,6 +868,7 @@ extern boolean_t spa_trust_config(spa_t *spa); extern uint64_t spa_missing_tvds_allowed(spa_t *spa); extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); +extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); extern int spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); diff --git a/uts/common/fs/zfs/sys/spa_impl.h b/uts/common/fs/zfs/sys/spa_impl.h index cf9c32f6245e..dcb6cc9f19ef 100644 --- a/uts/common/fs/zfs/sys/spa_impl.h +++ b/uts/common/fs/zfs/sys/spa_impl.h @@ -25,6 +25,7 @@ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, Intel Corporation. 
*/ #ifndef _SYS_SPA_IMPL_H @@ -218,6 +219,8 @@ struct spa { boolean_t spa_is_initializing; /* true while opening pool */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ + metaslab_class_t *spa_special_class; /* special allocation class */ + metaslab_class_t *spa_dedup_class; /* dedup allocation class */ uint64_t spa_first_txg; /* first txg after spa_open() */ uint64_t spa_final_txg; /* txg of export/destroy */ uint64_t spa_freeze_txg; /* freeze pool at this txg */ diff --git a/uts/common/fs/zfs/sys/vdev.h b/uts/common/fs/zfs/sys/vdev.h index b45f0a2ca9a1..ef3bc5dd09ee 100644 --- a/uts/common/fs/zfs/sys/vdev.h +++ b/uts/common/fs/zfs/sys/vdev.h @@ -22,6 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2017, Intel Corporation. */ #ifndef _SYS_VDEV_H @@ -107,6 +108,8 @@ extern boolean_t vdev_children_are_offline(vdev_t *vd); extern void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta); +extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space); + extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); diff --git a/uts/common/fs/zfs/sys/vdev_impl.h b/uts/common/fs/zfs/sys/vdev_impl.h index bbad77831721..c0bdeffb6451 100644 --- a/uts/common/fs/zfs/sys/vdev_impl.h +++ b/uts/common/fs/zfs/sys/vdev_impl.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2017, Intel Corporation. */ #ifndef _SYS_VDEV_IMPL_H @@ -146,6 +147,14 @@ struct vdev_queue { kmutex_t vq_lock; }; +typedef enum vdev_alloc_bias { + VDEV_BIAS_NONE, + VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */ + VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */ + VDEV_BIAS_DEDUP /* dedicated to dedup metadata */ +} vdev_alloc_bias_t; + + /* * On-disk indirect vdev state. * @@ -239,6 +248,7 @@ struct vdev { boolean_t vdev_ishole; /* is a hole in the namespace */ kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */ uint64_t vdev_top_zap; + vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */ /* pool checkpoint related */ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ diff --git a/uts/common/fs/zfs/sys/zio.h b/uts/common/fs/zfs/sys/zio.h index e14baad67c4a..517764f1cec4 100644 --- a/uts/common/fs/zfs/sys/zio.h +++ b/uts/common/fs/zfs/sys/zio.h @@ -313,6 +313,7 @@ typedef struct zio_prop { boolean_t zp_dedup; boolean_t zp_dedup_verify; boolean_t zp_nopwrite; + uint32_t zp_zpl_smallblk; } zio_prop_t; typedef struct zio_cksum_report zio_cksum_report_t; @@ -433,6 +434,7 @@ struct zio { vdev_t *io_vd; void *io_vsd; const zio_vsd_ops_t *io_vsd_ops; + metaslab_class_t *io_metaslab_class; /* dva throttle class */ uint64_t io_offset; hrtime_t io_timestamp; diff --git a/uts/common/fs/zfs/vdev.c b/uts/common/fs/zfs/vdev.c index afc1c1dcf4e8..c72aebe87722 100644 --- a/uts/common/fs/zfs/vdev.c +++ b/uts/common/fs/zfs/vdev.c @@ -26,6 +26,7 @@ * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017, Intel Corporation. */ #include @@ -192,6 +193,25 @@ vdev_getops(const char *type) return (ops); } +/* + * Derive the enumerated allocation bias from string input.
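The enum this function (continued just below) derives is the in-core mirror of the VDEV_ALLOC_BIAS_LOG/SPECIAL/DEDUP string literals defined later in this patch in zfs.h. A stand-alone sketch of the round trip between the two representations (reduced, hypothetical names):

	#include <string.h>
	#include <stdio.h>

	typedef enum { BIAS_NONE, BIAS_LOG, BIAS_SPECIAL, BIAS_DEDUP } bias_t;

	/* String to enum, as on the load/import path. */
	static bias_t
	derive_bias(const char *s)
	{
		if (strcmp(s, "log") == 0)
			return (BIAS_LOG);
		else if (strcmp(s, "special") == 0)
			return (BIAS_SPECIAL);
		else if (strcmp(s, "dedup") == 0)
			return (BIAS_DEDUP);
		return (BIAS_NONE);	/* unknown strings are simply ignored */
	}

	/* Enum back to string, as on the config-generate path. */
	static const char *
	bias_name(bias_t b)
	{
		switch (b) {
		case BIAS_LOG:
			return ("log");
		case BIAS_SPECIAL:
			return ("special");
		case BIAS_DEDUP:
			return ("dedup");
		default:
			return (NULL);	/* BIAS_NONE is never written out */
		}
	}

	int
	main(void)
	{
		(void) printf("%d\n",
		    derive_bias(bias_name(BIAS_DEDUP)) == BIAS_DEDUP);
		return (0);
	}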
+ * String origin is either the per-vdev zap or zpool(1M). + */ +static vdev_alloc_bias_t +vdev_derive_alloc_bias(const char *bias) +{ + vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; + + if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) + alloc_bias = VDEV_BIAS_LOG; + else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) + alloc_bias = VDEV_BIAS_SPECIAL; + else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) + alloc_bias = VDEV_BIAS_DEDUP; + + return (alloc_bias); +} + /* ARGSUSED */ void vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res) @@ -515,6 +535,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, uint64_t guid = 0, islog, nparity; vdev_t *vd; vdev_indirect_config_t *vic; + vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; + boolean_t top_level = (parent && !parent->vdev_parent); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); @@ -601,11 +623,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } ASSERT(nparity != -1ULL); + /* + * If creating a top-level vdev, check for allocation classes input + */ + if (top_level && alloctype == VDEV_ALLOC_ADD) { + char *bias; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, + &bias) == 0) { + alloc_bias = vdev_derive_alloc_bias(bias); + + /* spa_vdev_add() expects feature to be enabled */ + if (spa->spa_load_state != SPA_LOAD_CREATE && + !spa_feature_is_enabled(spa, + SPA_FEATURE_ALLOCATION_CLASSES)) { + return (SET_ERROR(ENOTSUP)); + } + } + } + vd = vdev_alloc_common(spa, id, guid, ops); vic = &vd->vdev_indirect_config; vd->vdev_islog = islog; vd->vdev_nparity = nparity; + if (top_level && alloc_bias != VDEV_BIAS_NONE) + vd->vdev_alloc_bias = alloc_bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); @@ -656,7 +699,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, /* * If we're a top-level vdev, try to load the allocation parameters. */ - if (parent && !parent->vdev_parent && + if (top_level && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); @@ -672,14 +715,12 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, ASSERT0(vd->vdev_top_zap); } - if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { + if (top_level && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); - vd->vdev_mg = metaslab_group_create(islog ? 
- spa_log_class(spa) : spa_normal_class(spa), vd, - spa->spa_alloc_count); + /* Note: metaslab_group_create() is now deferred */ } if (vd->vdev_ops->vdev_op_leaf && @@ -900,6 +941,9 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; + tvd->vdev_alloc_bias = svd->vdev_alloc_bias; + svd->vdev_alloc_bias = VDEV_BIAS_NONE; + tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; @@ -1049,6 +1093,55 @@ vdev_remove_parent(vdev_t *cvd) vdev_free(mvd); } +static void +vdev_metaslab_group_create(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + /* + * metaslab_group_create was delayed until allocation bias was available + */ + if (vd->vdev_mg == NULL) { + metaslab_class_t *mc; + + if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) + vd->vdev_alloc_bias = VDEV_BIAS_LOG; + + ASSERT3U(vd->vdev_islog, ==, + (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); + + switch (vd->vdev_alloc_bias) { + case VDEV_BIAS_LOG: + mc = spa_log_class(spa); + break; + case VDEV_BIAS_SPECIAL: + mc = spa_special_class(spa); + break; + case VDEV_BIAS_DEDUP: + mc = spa_dedup_class(spa); + break; + default: + mc = spa_normal_class(spa); + } + + vd->vdev_mg = metaslab_group_create(mc, vd, + spa->spa_alloc_count); + + /* + * The spa ashift values currently only reflect the + * general vdev classes. Class destination is late + * binding so ashift checking had to wait until now + */ + if (vd->vdev_top == vd && vd->vdev_ashift != 0 && + mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { + if (vd->vdev_ashift > spa->spa_max_ashift) + spa->spa_max_ashift = vd->vdev_ashift; + if (vd->vdev_ashift < spa->spa_min_ashift) + spa->spa_min_ashift = vd->vdev_ashift; + } + } +} + int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { @@ -1059,6 +1152,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; + boolean_t expanding = (oldc != 0); ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); @@ -1074,7 +1168,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); - if (oldc != 0) { + if (expanding) { bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } @@ -1100,6 +1194,17 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) } } +#ifndef _KERNEL + /* + * To accomodate zdb_leak_init() fake indirect + * metaslabs, we allocate a metaslab group for + * indirect vdevs which normally don't have one. + */ + if (vd->vdev_mg == NULL) { + ASSERT0(vdev_is_concrete(vd)); + vdev_metaslab_group_create(vd); + } +#endif error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { @@ -1117,8 +1222,9 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) * the metaslabs since we want to ensure that no new * allocations are performed on this device. */ - if (oldc == 0 && !vd->vdev_removing) + if (!expanding && !vd->vdev_removing) { metaslab_group_activate(vd->vdev_mg); + } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); @@ -1582,9 +1688,13 @@ vdev_open(vdev_t *vd) /* * Track the min and max ashift values for normal data devices. + * + * DJB - TBD these should perhaps be tracked per allocation class + * (e.g. 
spa_min_ashift is used to round up post compression buffers) */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && - !vd->vdev_islog && vd->vdev_aux == NULL) { + vd->vdev_alloc_bias == VDEV_BIAS_NONE && + vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) @@ -2079,13 +2189,13 @@ vdev_metaslab_set_size(vdev_t *vd) * * The net effect of applying above constrains is summarized below. * - * vdev size metaslab count + * vdev size metaslab count * --------------|----------------- - * < 8GB ~16 - * 8GB - 100GB one per 512MB - * 100GB - 3TB ~200 - * 3TB - 2PB one per 16GB - * > 2PB ~131,072 + * < 8GB ~16 + * 8GB - 100GB one per 512MB + * 100GB - 3TB ~200 + * 3TB - 2PB one per 16GB + * > 2PB ~131,072 * -------------------------------- * * Finally, note that all of the above calculate the initial @@ -2462,6 +2572,30 @@ vdev_dtl_load(vdev_t *vd) return (error); } +static void +vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; + const char *string; + + ASSERT(alloc_bias != VDEV_BIAS_NONE); + + string = + (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : + (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : + (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL; + + ASSERT(string != NULL); + VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, + 1, strlen(string) + 1, string, tx)); + + if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { + spa_activate_allocation_classes(spa, tx); + } +} + void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { @@ -2498,8 +2632,11 @@ vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); + if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) + vdev_zap_allocation_data(vd, tx); } } + for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } @@ -2696,10 +2833,27 @@ vdev_load(vdev_t *vd) vdev_set_deflate_ratio(vd); + /* + * On spa_load path, grab the allocation bias from our zap + */ + if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { + spa_t *spa = vd->vdev_spa; + char bias_str[64]; + + if (zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), + bias_str) == 0) { + ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); + vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); + } + } + /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { + vdev_metaslab_group_create(vd); + if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); @@ -2894,6 +3048,7 @@ vdev_remove_empty_log(vdev_t *vd, uint64_t txg) metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) ASSERT0(mg->mg_histogram[i]); } @@ -3471,7 +3626,8 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) } if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { - vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; + vs->vs_fragmentation = (vd->vdev_mg != NULL) ? 
+ vd->vdev_mg->mg_fragmentation : 0; } /* @@ -3657,19 +3813,25 @@ vdev_stat_update(zio_t *zio, uint64_t psize) } } +int64_t +vdev_deflated_space(vdev_t *vd, int64_t space) +{ + ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); + ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); + + return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); +} + /* - * Update the in-core space usage stats for this vdev, its metaslab class, - * and the root vdev. + * Update the in-core space usage stats for this vdev and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { - int64_t dspace_delta = space_delta; + int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; - metaslab_group_t *mg = vd->vdev_mg; - metaslab_class_t *mc = mg ? mg->mg_class : NULL; ASSERT(vd == vd->vdev_top); @@ -3679,10 +3841,7 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, * because the root vdev's psize-to-asize is simply the max of its * childrens', thus not accurate enough for us. */ - ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); - ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); - dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * - vd->vdev_deflate_ratio; + dspace_delta = vdev_deflated_space(vd, space_delta); mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_alloc += alloc_delta; @@ -3690,21 +3849,15 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); - if (mc == spa_normal_class(spa)) { + /* every class but log contributes to root space stats */ + if (vd->vdev_mg != NULL && !vd->vdev_islog) { mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } - - if (mc != NULL) { - ASSERT(rvd == vd->vdev_parent); - ASSERT(vd->vdev_ms_count != 0); - - metaslab_class_space_update(mc, - alloc_delta, defer_delta, space_delta, dspace_delta); - } + /* Note: metaslab_class_space_update moved to metaslab_space_update */ } /* @@ -4119,7 +4272,9 @@ vdev_expand(vdev_t *vd, uint64_t txg) vdev_set_deflate_ratio(vd); - if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { + if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && + vdev_is_concrete(vd)) { + vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } diff --git a/uts/common/fs/zfs/vdev_label.c b/uts/common/fs/zfs/vdev_label.c index 55c3060042d9..3c469ffe4362 100644 --- a/uts/common/fs/zfs/vdev_label.c +++ b/uts/common/fs/zfs/vdev_label.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2017, Intel Corporation. * Copyright 2019 Joyent, Inc. 
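The vdev_deflated_space() helper factored out above is pure arithmetic: a raw byte delta is converted to 512-byte units and scaled by the vdev's deflate ratio. A user-level model, assuming the shift is 9 as SPA_MINBLOCKSHIFT is in spa.h, and a ratio as computed by vdev_set_deflate_ratio() (512 when psize equals asize):

	#include <stdint.h>
	#include <stdio.h>

	#define	MINBLOCKSHIFT	9	/* stands in for SPA_MINBLOCKSHIFT */

	/* Model of vdev_deflated_space(): raw bytes -> deflated space. */
	static int64_t
	deflated_space(int64_t space, uint64_t deflate_ratio)
	{
		return ((space >> MINBLOCKSHIFT) * (int64_t)deflate_ratio);
	}

	int
	main(void)
	{
		/*
		 * A ratio of 512 means psize == asize, so the delta comes
		 * back unchanged; smaller ratios (e.g. raidz parity
		 * overhead) shrink it proportionally.
		 */
		(void) printf("%lld\n", (long long)deflated_space(1 << 20, 512));
		(void) printf("%lld\n", (long long)deflated_space(1 << 20, 341));
		return (0);
	}

Factoring this out lets the rewritten vdev_space_update() above stay focused on the behavioral change: every class except the intent log now contributes to the root vdev's space stats.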
*/ @@ -317,6 +318,28 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, vd->vdev_removing); } + + /* zpool command expects alloc class data */ + if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) { + const char *bias = NULL; + + switch (vd->vdev_alloc_bias) { + case VDEV_BIAS_LOG: + bias = VDEV_ALLOC_BIAS_LOG; + break; + case VDEV_BIAS_SPECIAL: + bias = VDEV_ALLOC_BIAS_SPECIAL; + break; + case VDEV_BIAS_DEDUP: + bias = VDEV_ALLOC_BIAS_DEDUP; + break; + default: + ASSERT3U(vd->vdev_alloc_bias, ==, + VDEV_BIAS_NONE); + } + fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, + bias); + } } if (vd->vdev_dtl_sm != NULL) { diff --git a/uts/common/fs/zfs/vdev_removal.c b/uts/common/fs/zfs/vdev_removal.c index c9af0e0729d4..c3c6672ccf1a 100644 --- a/uts/common/fs/zfs/vdev_removal.c +++ b/uts/common/fs/zfs/vdev_removal.c @@ -950,14 +950,17 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, ASSERT3U(size, <=, maxalloc); /* - * We use allocator 0 for this I/O because we don't expect device remap - * to be the steady state of the system, so parallelizing is not as - * critical as it is for other allocation types. We also want to ensure - * that the IOs are allocated together as much as possible, to reduce - * mapping sizes. + * An allocation class might not have any remaining vdevs or space */ - int error = metaslab_alloc_dva(spa, mg->mg_class, size, - &dst, 0, NULL, txg, 0, zal, 0); + metaslab_class_t *mc = mg->mg_class; + if (mc != spa_normal_class(spa) && mc->mc_groups <= 1) + mc = spa_normal_class(spa); + int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0, + zal, 0); + if (error == ENOSPC && mc != spa_normal_class(spa)) { + error = metaslab_alloc_dva(spa, spa_normal_class(spa), size, + &dst, 0, NULL, txg, 0, zal, 0); + } if (error != 0) return (error); @@ -1868,15 +1871,31 @@ spa_vdev_remove_top_check(vdev_t *vd) if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) return (SET_ERROR(ENOTSUP)); + /* available space in the pool's normal class */ + uint64_t available = dsl_dir_space_available( + spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE); + + metaslab_class_t *mc = vd->vdev_mg->mg_class; + + /* + * When removing a vdev from an allocation class that has + * remaining vdevs, include available space from the class. + */ + if (mc != spa_normal_class(spa) && mc->mc_groups > 1) { + uint64_t class_avail = metaslab_class_get_space(mc) - + metaslab_class_get_alloc(mc); + + /* add class space, adjusted for overhead */ + available += (class_avail * 94) / 100; + } + /* * There has to be enough free space to remove the * device and leave double the "slop" space (i.e. we * must leave at least 3% of the pool free, in addition to * the normal slop space). */ - if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir, - NULL, 0, B_TRUE) < - vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { + if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { return (SET_ERROR(ENOSPC)); } diff --git a/uts/common/fs/zfs/zfs_ioctl.c b/uts/common/fs/zfs/zfs_ioctl.c index de213a31c212..9ecaf457042d 100644 --- a/uts/common/fs/zfs/zfs_ioctl.c +++ b/uts/common/fs/zfs/zfs_ioctl.c @@ -4073,6 +4073,15 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) } break; + case ZFS_PROP_SPECIAL_SMALL_BLOCKS: + /* + * This property could require the allocation classes + * feature to be active for setting, however we allow + * it so that tests of settable properties succeed. 
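The (class_avail * 94) / 100 scaling in spa_vdev_remove_top_check() above deliberately understates the class's free space, keeping roughly 6% of headroom for allocation overhead when deciding whether a top-level vdev can be evacuated. A worked model of the computation (hypothetical names):

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Model of the removal headroom check: free space in the normal
	 * class plus the evacuated class's free space, discounted by 6%.
	 */
	static uint64_t
	removal_available(uint64_t normal_avail, uint64_t class_space,
	    uint64_t class_alloc)
	{
		uint64_t class_avail = class_space - class_alloc;

		return (normal_avail + (class_avail * 94) / 100);
	}

	int
	main(void)
	{
		uint64_t gib = 1ULL << 30;

		/* 10 GiB free in normal, 4 GiB free in the class: ~13.76 GiB */
		(void) printf("%llu\n", (unsigned long long)
		    removal_available(10 * gib, 6 * gib, 2 * gib));
		return (0);
	}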
+ * The CLI will issue a warning in this case. + */ + break; + case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); diff --git a/uts/common/fs/zfs/zio.c b/uts/common/fs/zfs/zio.c index 7d54bc0046bd..619dad47f340 100644 --- a/uts/common/fs/zfs/zio.c +++ b/uts/common/fs/zfs/zio.c @@ -23,6 +23,7 @@ * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] + * Copyright (c) 2017, Intel Corporation. */ #include @@ -620,6 +621,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_bookmark = *zb; if (pio != NULL) { + if (zio->io_metaslab_class == NULL) + zio->io_metaslab_class = pio->io_metaslab_class; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) @@ -1103,9 +1106,8 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, */ if (flags & ZIO_FLAG_IO_ALLOCATING && (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { - metaslab_class_t *mc = spa_normal_class(pio->io_spa); - - ASSERT(mc->mc_alloc_throttle_enabled); + ASSERT(pio->io_metaslab_class != NULL); + ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled); ASSERT(type == ZIO_TYPE_WRITE); ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); @@ -1409,8 +1411,9 @@ zio_write_compress(zio_t *zio) if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { - ASSERT(psize != 0); + VERIFY3U(psize, !=, 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; + zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { @@ -2825,7 +2828,7 @@ zio_io_to_allocate(spa_t *spa, int allocator) * reserve then we throttle. 
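Propagating io_metaslab_class from parent to child in zio_create() above is what lets the throttle asserts that follow consult the class actually chosen at allocation time instead of assuming the normal class. The inheritance rule, as a reduced sketch (the struct below is hypothetical):

	#include <stddef.h>

	typedef struct zio_model {
		struct zio_model	*parent;
		const void	*metaslab_class;	/* class picked at alloc time */
	} zio_model_t;

	/*
	 * Children inherit the allocation class chosen by their parent
	 * unless they already picked one themselves.
	 */
	static void
	inherit_class(zio_model_t *zio)
	{
		if (zio->metaslab_class == NULL && zio->parent != NULL)
			zio->metaslab_class = zio->parent->metaslab_class;
	}

	int
	main(void)
	{
		int special_class;	/* stands in for a metaslab_class_t */
		zio_model_t parent = { NULL, &special_class };
		zio_model_t child = { &parent, NULL };

		inherit_class(&child);
		return (child.metaslab_class == &special_class ? 0 : 1);
	}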
*/ ASSERT3U(zio->io_allocator, ==, allocator); - if (!metaslab_class_throttle_reserve(spa_normal_class(spa), + if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) { return (NULL); } @@ -2841,9 +2844,14 @@ zio_dva_throttle(zio_t *zio) { spa_t *spa = zio->io_spa; zio_t *nio; + metaslab_class_t *mc; + + /* locate an appropriate allocation class */ + mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, + zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || - !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled || + !mc->mc_alloc_throttle_enabled || zio->io_child_type == ZIO_CHILD_GANG || zio->io_flags & ZIO_FLAG_NODATA) { return (ZIO_PIPELINE_CONTINUE); @@ -2865,11 +2873,10 @@ zio_dva_throttle(zio_t *zio) zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]); - ASSERT(zio->io_type == ZIO_TYPE_WRITE); + zio->io_metaslab_class = mc; avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio); - - nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator); + nio = zio_io_to_allocate(spa, zio->io_allocator); mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); if (nio == zio) @@ -2890,7 +2897,7 @@ zio_dva_throttle(zio_t *zio) return (ZIO_PIPELINE_STOP); } -void +static void zio_allocate_dispatch(spa_t *spa, int allocator) { zio_t *zio; @@ -2910,7 +2917,7 @@ static int zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; - metaslab_class_t *mc = spa_normal_class(spa); + metaslab_class_t *mc; blkptr_t *bp = zio->io_bp; int error; int flags = 0; @@ -2926,20 +2933,57 @@ zio_dva_allocate(zio_t *zio) ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); - if (zio->io_flags & ZIO_FLAG_NODATA) { + if (zio->io_flags & ZIO_FLAG_NODATA) flags |= METASLAB_DONT_THROTTLE; - } - if (zio->io_flags & ZIO_FLAG_GANG_CHILD) { + if (zio->io_flags & ZIO_FLAG_GANG_CHILD) flags |= METASLAB_GANG_CHILD; - } - if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) { + if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) flags |= METASLAB_ASYNC_ALLOC; + + /* + * if not already chosen, locate an appropriate allocation class + */ + mc = zio->io_metaslab_class; + if (mc == NULL) { + mc = spa_preferred_class(spa, zio->io_size, + zio->io_prop.zp_type, zio->io_prop.zp_level, + zio->io_prop.zp_zpl_smallblk); + zio->io_metaslab_class = mc; } error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); + /* + * Fallback to normal class when an alloc class is full + */ + if (error == ENOSPC && mc != spa_normal_class(spa)) { + /* + * If throttling, transfer reservation over to normal class. + * The io_allocator slot can remain the same even though we + * are switching classes. 
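The fallback implemented just below follows a simple shape: try the preferred class, and on ENOSPC retry exactly once against the normal class, transferring any throttle reservation first. A stand-alone model of the control flow (stub allocator and hypothetical names):

	#include <errno.h>
	#include <stdio.h>

	typedef enum { CLASS_NORMAL, CLASS_LOG, CLASS_SPECIAL, CLASS_DEDUP } class_t;

	/* Stub allocator: pretend only the normal class has space left. */
	static int
	class_alloc(class_t c)
	{
		return (c == CLASS_NORMAL ? 0 : ENOSPC);
	}

	/*
	 * Shape of the fallback in zio_dva_allocate(): retry once against
	 * the normal class when the preferred class is out of space.
	 */
	static int
	alloc_with_fallback(class_t preferred)
	{
		int error = class_alloc(preferred);

		if (error == ENOSPC && preferred != CLASS_NORMAL)
			error = class_alloc(CLASS_NORMAL);
		return (error);
	}

	int
	main(void)
	{
		(void) printf("%d\n", alloc_with_fallback(CLASS_SPECIAL));	/* 0 */
		return (0);
	}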
+ */ + if (mc->mc_alloc_throttle_enabled && + (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) { + metaslab_class_throttle_unreserve(mc, + zio->io_prop.zp_copies, zio->io_allocator, zio); + zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; + + mc = spa_normal_class(spa); + VERIFY(metaslab_class_throttle_reserve(mc, + zio->io_prop.zp_copies, zio->io_allocator, zio, + flags | METASLAB_MUST_RESERVE)); + } else { + mc = spa_normal_class(spa); + } + zio->io_metaslab_class = mc; + + error = metaslab_alloc(spa, mc, zio->io_size, bp, + zio->io_prop.zp_copies, zio->io_txg, NULL, flags, + &zio->io_alloc_list, zio, zio->io_allocator); + } + if (error != 0) { zfs_dbgmsg("%s: metaslab allocation failure: zio %p, " "size %llu, error %d", spa_name(spa), zio, zio->io_size, @@ -3007,6 +3051,15 @@ zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, blkptr_t *new_bp, ASSERT(txg > spa_syncing_txg(spa)); metaslab_trace_init(&io_alloc_list); + + /* + * Block pointer fields are useful to metaslabs for stats and debugging. + * Fill in the obvious ones before calling into metaslab_alloc(). + */ + BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); + BP_SET_PSIZE(new_bp, size); + BP_SET_LEVEL(new_bp, 0); + /* * When allocating a zil block, we don't have information about * the final destination of the block except the objset it's part @@ -3529,13 +3582,15 @@ zio_ready(zio_t *zio) if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(zio->io_metaslab_class != NULL); + /* * We were unable to allocate anything, unreserve and * issue the next I/O to allocate. */ metaslab_class_throttle_unreserve( - spa_normal_class(zio->io_spa), - zio->io_prop.zp_copies, zio->io_allocator, zio); + zio->io_metaslab_class, zio->io_prop.zp_copies, + zio->io_allocator, zio); zio_allocate_dispatch(zio->io_spa, zio->io_allocator); } } @@ -3617,14 +3672,15 @@ zio_dva_throttle_done(zio_t *zio) ASSERT(zio->io_logical != NULL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); + ASSERT(zio->io_metaslab_class != NULL); mutex_enter(&pio->io_lock); metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, pio->io_allocator, B_TRUE); mutex_exit(&pio->io_lock); - metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa), - 1, pio->io_allocator, pio); + metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, + pio->io_allocator, pio); /* * Call into the pipeline to see if there is more work that @@ -3643,7 +3699,6 @@ zio_done(zio_t *zio) vdev_t *vd = zio->io_vd; uint64_t psize = zio->io_size; zio_t *pio, *pio_next; - metaslab_class_t *mc = spa_normal_class(spa); zio_link_t *zl = NULL; /* @@ -3662,7 +3717,8 @@ zio_done(zio_t *zio) */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && zio->io_child_type == ZIO_CHILD_VDEV) { - ASSERT(mc->mc_alloc_throttle_enabled); + ASSERT(zio->io_metaslab_class != NULL); + ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); zio_dva_throttle_done(zio); } @@ -3674,10 +3730,12 @@ zio_done(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(bp != NULL); + metaslab_group_alloc_verify(spa, zio->io_bp, zio, zio->io_allocator); VERIFY(zfs_refcount_not_held( - &mc->mc_alloc_slots[zio->io_allocator], zio)); + &zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator], + zio)); } for (int c = 0; c < ZIO_CHILD_TYPES; c++) diff --git a/uts/common/sys/fs/zfs.h b/uts/common/sys/fs/zfs.h index cf7c46631424..ed80492dd583 100644 --- 
a/uts/common/sys/fs/zfs.h +++ b/uts/common/sys/fs/zfs.h @@ -26,6 +26,7 @@ * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, Intel Corporation. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -163,6 +164,7 @@ typedef enum { ZFS_PROP_PREV_SNAP, ZFS_PROP_RECEIVE_RESUME_TOKEN, ZFS_PROP_REMAPTXG, /* not exposed to the user */ + ZFS_PROP_SPECIAL_SMALL_BLOCKS, ZFS_NUM_PROPS } zfs_prop_t; @@ -600,6 +602,8 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */ +#define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */ + /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. This is because a device can be in multiple states, such @@ -645,6 +649,14 @@ typedef struct zpool_load_policy { #define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ "com.delphix:pool_checkpoint_sm" +#define VDEV_TOP_ZAP_ALLOCATION_BIAS \ + "org.zfsonlinux:allocation_bias" + +/* vdev metaslab allocation bias */ +#define VDEV_ALLOC_BIAS_LOG "log" +#define VDEV_ALLOC_BIAS_SPECIAL "special" +#define VDEV_ALLOC_BIAS_DEDUP "dedup" + #define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \ "com.delphix:next_offset_to_initialize" #define VDEV_LEAF_ZAP_INITIALIZE_STATE \ From 4ca571aba060489dbd628e880af130c26c80b269 Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Wed, 6 Nov 2019 09:00:06 +0000 Subject: [PATCH 5/7] 10592 misc. metaslab and vdev related ZoL bug fixes illumos/illumos-gate@555d674d5d4b8191dc83723188349d28278b2431 https://github.com/illumos/illumos-gate/commit/555d674d5d4b8191dc83723188349d28278b2431 https://www.illumos.org/issues/10592 This is a collection of recent fixes from ZoL: 8eef997679b Error path in metaslab_load_impl() forgets to drop ms_sync_lock 928e8ad47d3 Introduce auxiliary metaslab histograms 425d3237ee8 Get rid of space_map_update() for ms_synced_length 6c926f426a2 Simplify log vdev removal code 21e7cf5da89 zdb -L should skip leak detection altogether df72b8bebe0 Rename range_tree_verify to range_tree_verify_not_present 75058f33034 Remove unused vdev_t fields Portions contributed by: Jerry Jelinek Author: Serapheim Dimitropoulos --- cmd/zdb/zdb.c | 277 ++++++------ man/man1m/zdb.1m | 4 +- uts/common/fs/zfs/metaslab.c | 498 ++++++++++++++++++---- uts/common/fs/zfs/range_tree.c | 8 +- uts/common/fs/zfs/spa_checkpoint.c | 6 +- uts/common/fs/zfs/space_map.c | 142 +++--- uts/common/fs/zfs/sys/metaslab.h | 3 + uts/common/fs/zfs/sys/metaslab_impl.h | 79 ++++ uts/common/fs/zfs/sys/range_tree.h | 3 +- uts/common/fs/zfs/sys/space_map.h | 31 +- uts/common/fs/zfs/sys/vdev_impl.h | 11 - uts/common/fs/zfs/vdev.c | 95 +---- uts/common/fs/zfs/vdev_indirect.c | 3 - uts/common/fs/zfs/vdev_indirect_mapping.c | 1 + uts/common/fs/zfs/vdev_initialize.c | 2 +- uts/common/fs/zfs/vdev_removal.c | 56 +-- 16 files changed, 772 insertions(+), 447 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index acfe7ca5f7a8..57c39cf05bd3 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -785,18 +785,21 @@ dump_spacemap(objset_t *os, space_map_t *sm) return; (void) printf("space map object %llu:\n", - (longlong_t)sm->sm_phys->smp_object); - (void) printf(" smp_objsize = 0x%llx\n", - (longlong_t)sm->sm_phys->smp_objsize); + (longlong_t)sm->sm_object); + (void) printf(" smp_length = 0x%llx\n", + 
(longlong_t)sm->sm_phys->smp_length); (void) printf(" smp_alloc = 0x%llx\n", (longlong_t)sm->sm_phys->smp_alloc); + if (dump_opt['d'] < 6 && dump_opt['m'] < 4) + return; + /* * Print out the freelist entries in both encoded and decoded form. */ uint8_t mapshift = sm->sm_shift; int64_t alloc = 0; - uint64_t word; + uint64_t word, entry_id = 0; for (uint64_t offset = 0; offset < space_map_length(sm); offset += sizeof (word)) { @@ -804,11 +807,12 @@ dump_spacemap(objset_t *os, space_map_t *sm) sizeof (word), &word, DMU_READ_PREFETCH)); if (sm_entry_is_debug(word)) { - (void) printf("\t [%6llu] %s: txg %llu, pass %llu\n", - (u_longlong_t)(offset / sizeof (word)), + (void) printf("\t [%6llu] %s: txg %llu pass %llu\n", + (u_longlong_t)entry_id, ddata[SM_DEBUG_ACTION_DECODE(word)], (u_longlong_t)SM_DEBUG_TXG_DECODE(word), (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word)); + entry_id++; continue; } @@ -846,7 +850,7 @@ dump_spacemap(objset_t *os, space_map_t *sm) (void) printf("\t [%6llu] %c range:" " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n", - (u_longlong_t)(offset / sizeof (word)), + (u_longlong_t)entry_id, entry_type, (u_longlong_t)entry_off, (u_longlong_t)(entry_off + entry_run), (u_longlong_t)entry_run, @@ -856,8 +860,9 @@ dump_spacemap(objset_t *os, space_map_t *sm) alloc += entry_run; else alloc -= entry_run; + entry_id++; } - if ((uint64_t)alloc != space_map_allocated(sm)) { + if (alloc != space_map_allocated(sm)) { (void) printf("space_map_object alloc (%lld) INCONSISTENT " "with space map summary (%lld)\n", (longlong_t)space_map_allocated(sm), (longlong_t)alloc); @@ -921,11 +926,8 @@ dump_metaslab(metaslab_t *msp) SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } - if (dump_opt['d'] > 5 || dump_opt['m'] > 3) { - ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); - - dump_spacemap(spa->spa_meta_objset, msp->ms_sm); - } + ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); + dump_spacemap(spa->spa_meta_objset, msp->ms_sm); } static void @@ -3096,6 +3098,8 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) ddt_entry_t dde; int error; + ASSERT(!dump_opt['L']); + bzero(&ddb, sizeof (ddb)); while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { blkptr_t blk; @@ -3119,12 +3123,10 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) zcb->zcb_dedup_blocks++; } } - if (!dump_opt['L']) { - ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; - ddt_enter(ddt); - VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); - ddt_exit(ddt); - } + ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; + ddt_enter(ddt); + VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); + ddt_exit(ddt); } ASSERT(error == ENOENT); @@ -3166,6 +3168,9 @@ claim_segment_cb(void *arg, uint64_t offset, uint64_t size) static void zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) { + if (dump_opt['L']) + return; + if (spa->spa_vdev_removal == NULL) return; @@ -3257,7 +3262,6 @@ zdb_load_obsolete_counts(vdev_t *vd) space_map_t *prev_obsolete_sm = NULL; VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); - space_map_update(prev_obsolete_sm); vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, prev_obsolete_sm); space_map_close(prev_obsolete_sm); @@ -3351,9 +3355,9 @@ zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb) VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); - space_map_update(checkpoint_sm); VERIFY0(space_map_iterate(checkpoint_sm, + space_map_length(checkpoint_sm), checkpoint_sm_exclude_entry_cb, &cseea)); 
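Switching dump_spacemap() above from an offset-derived index to an explicit entry_id matters once two-word (extended) entries exist: dividing the byte offset by the word size skips an index for every two-word entry. A small model of the drift (made-up entry layout):

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Model: entries are 1 word (debug/single) or 2 words (extended).
	 * Counting entries explicitly keeps the printed index equal to the
	 * logical entry number; offset / sizeof (word) does not.
	 */
	int
	main(void)
	{
		int words_per_entry[] = { 1, 2, 1, 2, 2 };
		uint64_t offset = 0, entry_id = 0;

		for (int i = 0; i < 5; i++) {
			(void) printf("entry %llu at word offset %llu\n",
			    (unsigned long long)entry_id,
			    (unsigned long long)(offset / sizeof (uint64_t)));
			offset += words_per_entry[i] * sizeof (uint64_t);
			entry_id++;
		}
		return (0);
	}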
space_map_close(checkpoint_sm); @@ -3363,6 +3367,8 @@ zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb) static void zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb) { + ASSERT(!dump_opt['L']); + vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id); @@ -3459,6 +3465,8 @@ load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, static void zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb) { + ASSERT(!dump_opt['L']); + vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; @@ -3505,67 +3513,63 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) { zcb->zcb_spa = spa; - if (!dump_opt['L']) { - dsl_pool_t *dp = spa->spa_dsl_pool; - vdev_t *rvd = spa->spa_root_vdev; + if (dump_opt['L']) + return; - /* - * We are going to be changing the meaning of the metaslab's - * ms_allocatable. Ensure that the allocator doesn't try to - * use the tree. - */ - spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; - spa->spa_log_class->mc_ops = &zdb_metaslab_ops; + dsl_pool_t *dp = spa->spa_dsl_pool; + vdev_t *rvd = spa->spa_root_vdev; - zcb->zcb_vd_obsolete_counts = - umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), - UMEM_NOFAIL); + /* + * We are going to be changing the meaning of the metaslab's + * ms_allocatable. Ensure that the allocator doesn't try to + * use the tree. + */ + spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; + spa->spa_log_class->mc_ops = &zdb_metaslab_ops; - /* - * For leak detection, we overload the ms_allocatable trees - * to contain allocated segments instead of free segments. - * As a result, we can't use the normal metaslab_load/unload - * interfaces. - */ - zdb_leak_init_prepare_indirect_vdevs(spa, zcb); - load_concrete_ms_allocatable_trees(spa, SM_ALLOC); + zcb->zcb_vd_obsolete_counts = + umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), + UMEM_NOFAIL); - /* - * On load_concrete_ms_allocatable_trees() we loaded all the - * allocated entries from the ms_sm to the ms_allocatable for - * each metaslab. If the pool has a checkpoint or is in the - * middle of discarding a checkpoint, some of these blocks - * may have been freed but their ms_sm may not have been - * updated because they are referenced by the checkpoint. In - * order to avoid false-positives during leak-detection, we - * go through the vdev's checkpoint space map and exclude all - * its entries from their relevant ms_allocatable. - * - * We also aggregate the space held by the checkpoint and add - * it to zcb_checkpoint_size. - * - * Note that at this point we are also verifying that all the - * entries on the checkpoint_sm are marked as allocated in - * the ms_sm of their relevant metaslab. - * [see comment in checkpoint_sm_exclude_entry_cb()] - */ - zdb_leak_init_exclude_checkpoint(spa, zcb); + /* + * For leak detection, we overload the ms_allocatable trees + * to contain allocated segments instead of free segments. + * As a result, we can't use the normal metaslab_load/unload + * interfaces. + */ + zdb_leak_init_prepare_indirect_vdevs(spa, zcb); + load_concrete_ms_allocatable_trees(spa, SM_ALLOC); - /* for cleaner progress output */ - (void) fprintf(stderr, "\n"); + /* + * On load_concrete_ms_allocatable_trees() we loaded all the + * allocated entries from the ms_sm to the ms_allocatable for + * each metaslab. 
If the pool has a checkpoint or is in the + * middle of discarding a checkpoint, some of these blocks + * may have been freed but their ms_sm may not have been + * updated because they are referenced by the checkpoint. In + * order to avoid false-positives during leak-detection, we + * go through the vdev's checkpoint space map and exclude all + * its entries from their relevant ms_allocatable. + * + * We also aggregate the space held by the checkpoint and add + * it to zcb_checkpoint_size. + * + * Note that at this point we are also verifying that all the + * entries on the checkpoint_sm are marked as allocated in + * the ms_sm of their relevant metaslab. + * [see comment in checkpoint_sm_exclude_entry_cb()] + */ + zdb_leak_init_exclude_checkpoint(spa, zcb); + ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa)); - if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { - ASSERT(spa_feature_is_enabled(spa, - SPA_FEATURE_DEVICE_REMOVAL)); - (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, - increment_indirect_mapping_cb, zcb, NULL); - } - } else { - /* - * If leak tracing is disabled, we still need to consider - * any checkpointed space in our space verification. - */ - zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa); + /* for cleaner progress output */ + (void) fprintf(stderr, "\n"); + + if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_DEVICE_REMOVAL)); + (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, + increment_indirect_mapping_cb, zcb, NULL); } spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -3646,52 +3650,58 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) static boolean_t zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) { - boolean_t leaks = B_FALSE; - if (!dump_opt['L']) { - vdev_t *rvd = spa->spa_root_vdev; - for (unsigned c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - metaslab_group_t *mg = vd->vdev_mg; + if (dump_opt['L']) + return (B_FALSE); - if (zcb->zcb_vd_obsolete_counts[c] != NULL) { - leaks |= zdb_check_for_obsolete_leaks(vd, zcb); + boolean_t leaks = B_FALSE; + + vdev_t *rvd = spa->spa_root_vdev; + for (unsigned c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; +#if DEBUG + metaslab_group_t *mg = vd->vdev_mg; +#endif + + if (zcb->zcb_vd_obsolete_counts[c] != NULL) { + leaks |= zdb_check_for_obsolete_leaks(vd, zcb); + } + + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + ASSERT3P(mg, ==, msp->ms_group); + + /* + * ms_allocatable has been overloaded + * to contain allocated segments. Now that + * we finished traversing all blocks, any + * block that remains in the ms_allocatable + * represents an allocated block that we + * did not claim during the traversal. + * Claimed blocks would have been removed + * from the ms_allocatable. For indirect + * vdevs, space remaining in the tree + * represents parts of the mapping that are + * not referenced, which is not a bug. + */ + if (vd->vdev_ops == &vdev_indirect_ops) { + range_tree_vacate(msp->ms_allocatable, + NULL, NULL); + } else { + range_tree_vacate(msp->ms_allocatable, + zdb_leak, vd); } - for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - ASSERT3P(mg, ==, msp->ms_group); - - /* - * ms_allocatable has been overloaded - * to contain allocated segments. 
Now that - * we finished traversing all blocks, any - * block that remains in the ms_allocatable - * represents an allocated block that we - * did not claim during the traversal. - * Claimed blocks would have been removed - * from the ms_allocatable. For indirect - * vdevs, space remaining in the tree - * represents parts of the mapping that are - * not referenced, which is not a bug. - */ - if (vd->vdev_ops == &vdev_indirect_ops) { - range_tree_vacate(msp->ms_allocatable, - NULL, NULL); - } else { - range_tree_vacate(msp->ms_allocatable, - zdb_leak, vd); - } - - if (msp->ms_loaded) { - msp->ms_loaded = B_FALSE; - } + if (msp->ms_loaded) { + msp->ms_loaded = B_FALSE; } } - umem_free(zcb->zcb_vd_obsolete_counts, - rvd->vdev_children * sizeof (uint32_t *)); - zcb->zcb_vd_obsolete_counts = NULL; } + + umem_free(zcb->zcb_vd_obsolete_counts, + rvd->vdev_children * sizeof (uint32_t *)); + zcb->zcb_vd_obsolete_counts = NULL; + return (leaks); } @@ -3730,13 +3740,18 @@ dump_block_stats(spa_t *spa) !dump_opt['L'] ? "nothing leaked " : ""); /* - * Load all space maps as SM_ALLOC maps, then traverse the pool - * claiming each block we discover. If the pool is perfectly - * consistent, the space maps will be empty when we're done. - * Anything left over is a leak; any block we can't claim (because - * it's not part of any space map) is a double allocation, - * reference to a freed block, or an unclaimed log block. + * When leak detection is enabled we load all space maps as SM_ALLOC + * maps, then traverse the pool claiming each block we discover. If + * the pool is perfectly consistent, the segment trees will be empty + * when we're done. Anything left over is a leak; any block we can't + * claim (because it's not part of any space map) is a double + * allocation, reference to a freed block, or an unclaimed log block. + * + * When leak detection is disabled (-L option) we still traverse the + * pool claiming each block we discover, but we skip opening any space + * maps. */ + bzero(&zcb, sizeof (zdb_cb_t)); zdb_leak_init(spa, &zcb); /* @@ -3815,11 +3830,10 @@ dump_block_stats(spa_t *spa) total_found = tzb->zb_asize - zcb.zcb_dedup_asize + zcb.zcb_removing_size + zcb.zcb_checkpoint_size; - if (total_found == total_alloc) { - if (!dump_opt['L']) - (void) printf("\n\tNo leaks (block sum matches space" - " maps exactly)\n"); - } else { + if (total_found == total_alloc && !dump_opt['L']) { + (void) printf("\n\tNo leaks (block sum matches space" + " maps exactly)\n"); + } else if (!dump_opt['L']) { (void) printf("block traversal size %llu != alloc %llu " "(%s %lld)\n", (u_longlong_t)total_found, @@ -4159,7 +4173,6 @@ verify_device_removal_feature_counts(spa_t *spa) spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); - space_map_update(prev_obsolete_sm); dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm); (void) printf("\n"); space_map_close(prev_obsolete_sm); @@ -4365,7 +4378,8 @@ verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg) * their respective ms_allocateable trees should not contain them. 
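The leak check above amounts to a set difference: load everything the space maps call allocated, remove everything the traversal claims, and report whatever survives. A minimal model of the idea (made-up data):

	#include <stdio.h>

	/*
	 * Model of zdb leak detection: start from the set of allocated
	 * blocks, subtract every block the traversal claims, and whatever
	 * remains is a leak.
	 */
	int
	main(void)
	{
		int allocated[8] = { 1, 1, 0, 1, 0, 1, 0, 1 };	/* space maps */
		int claimed[8]   = { 1, 1, 0, 0, 0, 1, 0, 1 };	/* traversal */

		for (int b = 0; b < 8; b++) {
			if (allocated[b] && !claimed[b])
				(void) printf("leaked block %d\n", b);	/* 3 */
		}
		return (0);
	}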
*/ mutex_enter(&ms->ms_lock); - range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run); + range_tree_verify_not_present(ms->ms_allocatable, + sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); return (0); @@ -4428,7 +4442,6 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current), checkpoint_sm_obj, 0, current_vd->vdev_asize, current_vd->vdev_ashift)); - space_map_update(checkpoint_sm); verify_checkpoint_sm_entry_cb_arg_t vcsec; vcsec.vcsec_vd = ckpoint_vd; @@ -4436,6 +4449,7 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) vcsec.vcsec_num_entries = space_map_length(checkpoint_sm) / sizeof (uint64_t); VERIFY0(space_map_iterate(checkpoint_sm, + space_map_length(checkpoint_sm), verify_checkpoint_sm_entry_cb, &vcsec)); dump_spacemap(current->spa_meta_objset, checkpoint_sm); space_map_close(checkpoint_sm); @@ -4515,7 +4529,7 @@ verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current) * are part of the checkpoint were freed by mistake. */ range_tree_walk(ckpoint_msp->ms_allocatable, - (range_tree_func_t *)range_tree_verify, + (range_tree_func_t *)range_tree_verify_not_present, current_msp->ms_allocatable); } } @@ -4527,6 +4541,8 @@ verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current) static void verify_checkpoint_blocks(spa_t *spa) { + ASSERT(!dump_opt['L']); + spa_t *checkpoint_spa; char *checkpoint_pool; nvlist_t *config = NULL; @@ -4592,7 +4608,6 @@ dump_leftover_checkpoint_blocks(spa_t *spa) VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); - space_map_update(checkpoint_sm); dump_spacemap(spa->spa_meta_objset, checkpoint_sm); space_map_close(checkpoint_sm); } diff --git a/man/man1m/zdb.1m b/man/man1m/zdb.1m index 63cfc5d7f1b8..ca771c24d787 100644 --- a/man/man1m/zdb.1m +++ b/man/man1m/zdb.1m @@ -10,7 +10,7 @@ .\" .\" .\" Copyright 2012, Richard Lowe. -.\" Copyright (c) 2012, 2017 by Delphix. All rights reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" .Dd April 14, 2017 @@ -187,7 +187,7 @@ If the .Fl u option is also specified, also display the uberblocks on this device. .It Fl L -Disable leak tracing and the loading of space maps. +Disable leak detection and the loading of space maps. By default, .Nm verifies that all non-free blocks are referenced, which can be very expensive. diff --git a/uts/common/fs/zfs/metaslab.c b/uts/common/fs/zfs/metaslab.c index c92297c0fd2a..4552b809ed35 100644 --- a/uts/common/fs/zfs/metaslab.c +++ b/uts/common/fs/zfs/metaslab.c @@ -489,45 +489,62 @@ metaslab_compare(const void *x1, const void *x2) return (AVL_CMP(m1->ms_start, m2->ms_start)); } +uint64_t +metaslab_allocated_space(metaslab_t *msp) +{ + return (msp->ms_allocated_space); +} + /* * Verify that the space accounting on disk matches the in-core range_trees. */ -void +static void metaslab_verify_space(metaslab_t *msp, uint64_t txg) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - uint64_t allocated = 0; + uint64_t allocating = 0; uint64_t sm_free_space, msp_free_space; ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(!msp->ms_condensing); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; /* * We can only verify the metaslab space when we're called - * from syncing context with a loaded metaslab that has an allocated - * space map. 
Calling this in non-syncing context does not - * provide a consistent view of the metaslab since we're performing - * allocations in the future. + * from syncing context with a loaded metaslab that has an + * allocated space map. Calling this in non-syncing context + * does not provide a consistent view of the metaslab since + * we're performing allocations in the future. */ if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || !msp->ms_loaded) return; - sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) - - space_map_alloc_delta(msp->ms_sm); + /* + * Even though the smp_alloc field can get negative (e.g. + * see vdev_checkpoint_sm), that should never be the case + * when it come's to a metaslab's space map. + */ + ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); + + sm_free_space = msp->ms_size - metaslab_allocated_space(msp); /* - * Account for future allocations since we would have already - * deducted that space from the ms_freetree. + * Account for future allocations since we would have + * already deducted that space from the ms_allocatable. */ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { - allocated += + allocating += range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); } - msp_free_space = range_tree_space(msp->ms_allocatable) + allocated + + ASSERT3U(msp->ms_deferspace, ==, + range_tree_space(msp->ms_defer[0]) + + range_tree_space(msp->ms_defer[1])); + + msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + msp->ms_deferspace + range_tree_space(msp->ms_freed); VERIFY3U(sm_free_space, ==, msp_free_space); @@ -832,6 +849,7 @@ metaslab_group_histogram_verify(metaslab_group_t *mg) for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; + ASSERT(msp != NULL); /* skip if not active or not a member */ if (msp->ms_sm == NULL || msp->ms_group != mg) @@ -1445,6 +1463,203 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; * ========================================================================== */ +static void +metaslab_aux_histograms_clear(metaslab_t *msp) +{ + /* + * Auxiliary histograms are only cleared when resetting them, + * which can only happen while the metaslab is loaded. + */ + ASSERT(msp->ms_loaded); + + bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); + for (int t = 0; t < TXG_DEFER_SIZE; t++) + bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t])); +} + +static void +metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, + range_tree_t *rt) +{ + /* + * This is modeled after space_map_histogram_add(), so refer to that + * function for implementation details. We want this to work like + * the space map histogram, and not the range tree histogram, as we + * are essentially constructing a delta that will be later subtracted + * from the space map histogram. + */ + int idx = 0; + for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + ASSERT3U(i, >=, idx + shift); + histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); + + if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { + ASSERT3U(idx + shift, ==, i); + idx++; + ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); + } + } +} + +/* + * Called at every sync pass that the metaslab gets synced. + * + * The reason is that we want our auxiliary histograms to be updated + * wherever the metaslab's space map histogram is updated. This way + * we stay consistent on which parts of the metaslab space map's + * histogram are currently not available for allocations (e.g because + * they are in the defer, freed, and freeing trees). 
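The check in metaslab_verify_space() above is a conservation law: the space the map calls free must equal the free tree plus everything transiently held back (open-txg allocations, deferred frees, and this txg's frees). A worked example with made-up figures:

	#include <stdint.h>
	#include <assert.h>

	int
	main(void)
	{
		/* All figures in bytes; arbitrary example values. */
		uint64_t ms_size = 1000, allocated = 400;	/* space map view */
		uint64_t allocatable = 450;	/* ms_allocatable (free tree) */
		uint64_t allocating = 50;	/* open txgs, not yet synced */
		uint64_t deferspace = 60;	/* ms_defer[0] + ms_defer[1] */
		uint64_t freed = 40;		/* freed this txg */

		uint64_t sm_free = ms_size - allocated;	/* 600 */
		assert(sm_free == allocatable + allocating + deferspace + freed);
		return (0);
	}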
+ */ +static void +metaslab_aux_histograms_update(metaslab_t *msp) +{ + space_map_t *sm = msp->ms_sm; + ASSERT(sm != NULL); + + /* + * This is similar to the metaslab's space map histogram updates + * that take place in metaslab_sync(). The only difference is that + * we only care about segments that haven't made it into the + * ms_allocatable tree yet. + */ + if (msp->ms_loaded) { + metaslab_aux_histograms_clear(msp); + + metaslab_aux_histogram_add(msp->ms_synchist, + sm->sm_shift, msp->ms_freed); + + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + metaslab_aux_histogram_add(msp->ms_deferhist[t], + sm->sm_shift, msp->ms_defer[t]); + } + } + + metaslab_aux_histogram_add(msp->ms_synchist, + sm->sm_shift, msp->ms_freeing); +} + +/* + * Called every time we are done syncing (writing to) the metaslab, + * i.e. at the end of each sync pass. + * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist] + */ +static void +metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + space_map_t *sm = msp->ms_sm; + + if (sm == NULL) { + /* + * We came here from metaslab_init() when creating/opening a + * pool, looking at a metaslab that hasn't had any allocations + * yet. + */ + return; + } + + /* + * This is similar to the actions that we take for the ms_freed + * and ms_defer trees in metaslab_sync_done(). + */ + uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; + if (defer_allowed) { + bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index], + sizeof (msp->ms_synchist)); + } else { + bzero(msp->ms_deferhist[hist_index], + sizeof (msp->ms_deferhist[hist_index])); + } + bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); +} + +/* + * Ensure that the metaslab's weight and fragmentation are consistent + * with the contents of the histogram (either the range tree's histogram + * or the space map's depending whether the metaslab is loaded). + */ +static void +metaslab_verify_weight_and_frag(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) + return; + + /* see comment in metaslab_verify_unflushed_changes() */ + if (msp->ms_group == NULL) + return; + + /* + * Devices being removed always return a weight of 0 and leave + * fragmentation and ms_max_size as is - there is nothing for + * us to verify here. + */ + vdev_t *vd = msp->ms_group->mg_vd; + if (vd->vdev_removing) + return; + + /* + * If the metaslab is dirty it probably means that we've done + * some allocations or frees that have changed our histograms + * and thus the weight. + */ + for (int t = 0; t < TXG_SIZE; t++) { + if (txg_list_member(&vd->vdev_ms_list, msp, t)) + return; + } + + /* + * This verification checks that our in-memory state is consistent + * with what's on disk. If the pool is read-only then there aren't + * any changes and we just have the initially-loaded state. 
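metaslab_verify_weight_and_frag() is an instance of a save/zero/recompute/compare check. A minimal model of the pattern, with ms_t and derive_weight() as illustrative stand-ins rather than ZFS interfaces:

/* Model of the save/zero/recompute/compare verification pattern. */
#include <assert.h>
#include <stdint.h>

typedef struct ms {
	uint64_t weight;	/* cached, derived from free_space */
	uint64_t free_space;
} ms_t;

static uint64_t
derive_weight(const ms_t *ms)
{
	return (ms->free_space >> 1);	/* placeholder derivation */
}

static void
verify_weight(ms_t *ms)
{
	uint64_t saved = ms->weight;

	ms->weight = 0;				/* drop the cache ... */
	ms->weight = derive_weight(ms);		/* ... and rebuild it */
	assert(ms->weight == saved);		/* must round-trip */
}

int
main(void)
{
	ms_t ms = { .free_space = 1 << 20 };

	ms.weight = derive_weight(&ms);
	/*
	 * Allocations and frees would update weight and free_space
	 * together; verify_weight() catches any divergence.
	 */
	verify_weight(&ms);
	return (0);
}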
+ */ + if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) + return; + + /* some extra verification for in-core tree if you can */ + if (msp->ms_loaded) { + range_tree_stat_verify(msp->ms_allocatable); + VERIFY(space_map_histogram_verify(msp->ms_sm, + msp->ms_allocatable)); + } + + uint64_t weight = msp->ms_weight; + uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; + boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); + uint64_t frag = msp->ms_fragmentation; + uint64_t max_segsize = msp->ms_max_size; + + msp->ms_weight = 0; + msp->ms_fragmentation = 0; + msp->ms_max_size = 0; + + /* + * This function is used for verification purposes. Regardless of + * whether metaslab_weight() thinks this metaslab should be active or + * not, we want to ensure that the actual weight (and therefore the + * value of ms_weight) would be the same if it was to be recalculated + * at this point. + */ + msp->ms_weight = metaslab_weight(msp) | was_active; + + VERIFY3U(max_segsize, ==, msp->ms_max_size); + + /* + * If the weight type changed then there is no point in doing + * verification. Revert fields to their original values. + */ + if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || + (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { + msp->ms_fragmentation = frag; + msp->ms_weight = weight; + return; + } + + VERIFY3U(msp->ms_fragmentation, ==, frag); + VERIFY3U(msp->ms_weight, ==, weight); +} + /* * Wait for any in-progress metaslab loads to complete. */ @@ -1466,47 +1681,94 @@ metaslab_load_impl(metaslab_t *msp) ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loading); + ASSERT(!msp->ms_condensing); /* - * Nobody else can manipulate a loading metaslab, so it's now safe - * to drop the lock. This way we don't have to hold the lock while - * reading the spacemap from disk. + * We temporarily drop the lock to unblock other operations while we + * are reading the space map. Therefore, metaslab_sync() and + * metaslab_sync_done() can run at the same time as we do. + * + * metaslab_sync() can append to the space map while we are loading. + * Therefore we load only entries that existed when we started the + * load. Additionally, metaslab_sync_done() has to wait for the load + * to complete because there are potential races like metaslab_load() + * loading parts of the space map that are currently being appended + * by metaslab_sync(). If we didn't, the ms_allocatable would have + * entries that metaslab_sync_done() would try to re-add later. + * + * That's why before dropping the lock we remember the synced length + * of the metaslab and read up to that point of the space map, + * ignoring entries appended by metaslab_sync() that happen after we + * drop the lock. */ + uint64_t length = msp->ms_synced_length; mutex_exit(&msp->ms_lock); - /* - * If the space map has not been allocated yet, then treat - * all the space in the metaslab as free and add it to ms_allocatable. - */ if (msp->ms_sm != NULL) { - error = space_map_load(msp->ms_sm, msp->ms_allocatable, - SM_FREE); + error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, + SM_FREE, length); } else { + /* + * The space map has not been allocated yet, so treat + * all the space in the metaslab as free and add it to the + * ms_allocatable tree. + */ range_tree_add(msp->ms_allocatable, msp->ms_start, msp->ms_size); } + /* + * We need to grab the ms_sync_lock to prevent metaslab_sync() from + * changing the ms_sm and the metaslab's range trees while we are + * about to use them and populate the ms_allocatable. 
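The load path above takes a snapshot of ms_synced_length under the lock and then reads only that prefix of the space map unlocked, so concurrent appends by metaslab_sync() are never observed. A reduced pthreads sketch of that pattern, with log_t and read_stable_prefix() as hypothetical stand-ins:

/* Sketch of "snapshot the synced length, then read unlocked". */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

typedef struct log {
	pthread_mutex_t lock;
	uint64_t synced_length;	/* bytes known to be stable on disk */
} log_t;

static void
read_stable_prefix(log_t *lg, uint64_t length)
{
	/*
	 * Reads bytes [0, length); an appender may grow the log past
	 * 'length' concurrently without ever being observed here.
	 */
	(void) lg;
	printf("reading %llu stable bytes\n", (unsigned long long)length);
}

static void
load(log_t *lg)
{
	pthread_mutex_lock(&lg->lock);
	uint64_t length = lg->synced_length;	/* snapshot under the lock */
	pthread_mutex_unlock(&lg->lock);

	/* No lock held: only the snapshotted prefix is consumed. */
	read_stable_prefix(lg, length);
}

int
main(void)
{
	log_t lg = { PTHREAD_MUTEX_INITIALIZER, 4096 };

	load(&lg);
	return (0);
}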
The ms_lock
+	 * is insufficient for this because metaslab_sync() doesn't hold
+	 * the ms_lock while writing the ms_checkpointing tree to disk.
+	 */
+	mutex_enter(&msp->ms_sync_lock);
 	mutex_enter(&msp->ms_lock);
+	ASSERT(!msp->ms_condensing);
 
-	if (error != 0)
+	if (error != 0) {
+		mutex_exit(&msp->ms_sync_lock);
 		return (error);
+	}
 
 	ASSERT3P(msp->ms_group, !=, NULL);
 	msp->ms_loaded = B_TRUE;
 
 	/*
-	 * If the metaslab already has a spacemap, then we need to
-	 * remove all segments from the defer tree; otherwise, the
-	 * metaslab is completely empty and we can skip this.
+	 * The ms_allocatable contains the segments that exist in the
+	 * ms_defer trees [see ms_synced_length]. Thus we need to remove
+	 * them from ms_allocatable as they will be added again in
+	 * metaslab_sync_done().
 	 */
-	if (msp->ms_sm != NULL) {
-		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-			range_tree_walk(msp->ms_defer[t],
-			    range_tree_remove, msp->ms_allocatable);
-		}
+	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+		range_tree_walk(msp->ms_defer[t],
+		    range_tree_remove, msp->ms_allocatable);
 	}
+
+	/*
+	 * Call metaslab_recalculate_weight_and_sort() now that the
+	 * metaslab is loaded so we get the metaslab's real weight.
+	 *
+	 * Unless this metaslab was created with older software and
+	 * has not yet been converted to use segment-based weight, we
+	 * expect the new weight to be better or equal to the weight
+	 * that the metaslab had while it was not loaded. This is
+	 * because the old weight does not take into account the
+	 * consolidation of adjacent segments between TXGs. [see
+	 * comment for ms_synchist and ms_deferhist[] for more info]
+	 */
+	uint64_t weight = msp->ms_weight;
+	metaslab_recalculate_weight_and_sort(msp);
+	if (!WEIGHT_IS_SPACEBASED(weight))
+		ASSERT3U(weight, <=, msp->ms_weight);
 	msp->ms_max_size = metaslab_block_maxsize(msp);
 
+	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+	metaslab_verify_space(msp, spa_syncing_txg(spa));
+	mutex_exit(&msp->ms_sync_lock);
+
 	return (0);
 }
@@ -1523,6 +1785,7 @@ metaslab_load(metaslab_t *msp)
 	if (msp->ms_loaded)
 		return (0);
 	VERIFY(!msp->ms_loading);
+	ASSERT(!msp->ms_condensing);
 
 	msp->ms_loading = B_TRUE;
 	int error = metaslab_load_impl(msp);
@@ -1536,10 +1799,29 @@ void
 metaslab_unload(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	metaslab_verify_weight_and_frag(msp);
+
 	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
 	msp->ms_loaded = B_FALSE;
+
 	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
 	msp->ms_max_size = 0;
+
+	/*
+	 * We explicitly recalculate the metaslab's weight based on its space
+	 * map (as it is now not loaded). We want unloaded metaslabs to always
+	 * have their weights calculated from the space map histograms, while
+	 * loaded ones have it calculated from their in-core range tree
+	 * [see metaslab_load()]. This way, the weight reflects the information
+	 * available in-core, whether it is loaded or not.
+	 *
+	 * ms_group == NULL means that we came here from metaslab_fini(),
+	 * at which point it doesn't make sense for us to do the recalculation
+	 * and the sorting.
+	 */
+	if (msp->ms_group != NULL)
+		metaslab_recalculate_weight_and_sort(msp);
 }
 
 static void
@@ -1579,6 +1861,13 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
 	/*
 	 * We only open space map objects that already exist. All others
 	 * will be opened when we finally allocate an object for it.
+ * + * Note: + * When called from vdev_expand(), we can't call into the DMU as + * we are holding the spa_config_lock as a writer and we would + * deadlock [see relevant comment in vdev_metaslab_init()]. in + * that case, the object parameter is zero though, so we won't + * call into the DMU. */ if (object != 0) { error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, @@ -1590,14 +1879,17 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, } ASSERT(ms->ms_sm != NULL); + ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0); + ms->ms_allocated_space = space_map_allocated(ms->ms_sm); } /* - * We create the main range tree here, but we don't create the + * We create the ms_allocatable here, but we don't create the * other range trees until metaslab_sync_done(). This serves * two purposes: it allows metaslab_sync_done() to detect the - * addition of new space; and for debugging, it ensures that we'd - * data fault on any attempt to use this metaslab before it's ready. + * addition of new space; and for debugging, it ensures that + * we'd data fault on any attempt to use this metaslab before + * it's ready. */ ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms); metaslab_group_add(mg, ms); @@ -1613,8 +1905,11 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, * out this txg. This ensures that we don't attempt to allocate * from it before we have initialized it completely. */ - if (txg <= TXG_INITIAL) + if (txg <= TXG_INITIAL) { metaslab_sync_done(ms, 0); + metaslab_space_update(vd, mg->mg_class, + metaslab_allocated_space(ms), 0, 0); + } /* * If metaslab_debug_load is set and we're initializing a metaslab @@ -1648,7 +1943,7 @@ metaslab_fini(metaslab_t *msp) mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); metaslab_space_update(vd, mg->mg_class, - -space_map_allocated(msp->ms_sm), 0, -msp->ms_size); + -metaslab_allocated_space(msp), 0, -msp->ms_size); space_map_close(msp->ms_sm); @@ -1669,6 +1964,9 @@ metaslab_fini(metaslab_t *msp) range_tree_destroy(msp->ms_checkpointing); + for (int t = 0; t < TXG_SIZE; t++) + ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); + mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); mutex_destroy(&msp->ms_lock); @@ -1684,7 +1982,7 @@ metaslab_fini(metaslab_t *msp) * This table defines a segment size based fragmentation metric that will * allow each metaslab to derive its own fragmentation value. This is done * by calculating the space in each bucket of the spacemap histogram and - * multiplying that by the fragmetation metric in this table. Doing + * multiplying that by the fragmentation metric in this table. Doing * this for all buckets and dividing it by the total amount of free * space in this metaslab (i.e. the total free space in all buckets) gives * us the fragmentation metric. This means that a high fragmentation metric @@ -1719,10 +2017,10 @@ int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { }; /* - * Calclate the metaslab's fragmentation metric. A return value - * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does - * not support this metric. Otherwise, the return value should be in the - * range [0, 100]. + * Calculate the metaslab's fragmentation metric and set ms_fragmentation. + * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not + * been upgraded and does not support this metric. Otherwise, the return + * value should be in the range [0, 100]. 
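The fragmentation computation that metaslab_set_fragmentation() performs reduces to weighting each histogram bucket's free space by the corresponding zfs_frag_table[] percentage and dividing by the total free space. A standalone sketch with hypothetical bucket contents (the table below copies the shape of the one in this file; treat the exact values as illustrative):

/* Standalone arithmetic behind the fragmentation metric. */
#include <stdint.h>
#include <stdio.h>

#define	FRAGMENTATION_TABLE_SIZE	17

/* fragmentation percentage per bucket, smallest segments first */
static const int frag_table[FRAGMENTATION_TABLE_SIZE] = {
	100, 100, 98, 95, 90, 80, 70, 60, 50, 40, 30, 20, 15, 10, 5, 0, 0
};

int
main(void)
{
	/* free space per histogram bucket, in bytes (hypothetical) */
	uint64_t space[FRAGMENTATION_TABLE_SIZE] = { 0 };
	space[3] = 1 << 20;	/* 1 MiB in small segments: 95% */
	space[14] = 3 << 20;	/* 3 MiB in large segments: 5% */

	uint64_t total = 0, fragmented = 0;
	for (int i = 0; i < FRAGMENTATION_TABLE_SIZE; i++) {
		total += space[i];
		fragmented += space[i] * frag_table[i];
	}

	/* (1M*95 + 3M*5) / 4M = 27.5, truncated to 27 */
	printf("fragmentation: %llu%%\n",
	    (unsigned long long)(total ? fragmented / total : 0));
	return (0);
}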
*/ static void metaslab_set_fragmentation(metaslab_t *msp) @@ -1815,7 +2113,7 @@ metaslab_space_weight(metaslab_t *msp) /* * The baseline weight is the metaslab's free space. */ - space = msp->ms_size - space_map_allocated(msp->ms_sm); + space = msp->ms_size - metaslab_allocated_space(msp); if (metaslab_fragmentation_factor_enabled && msp->ms_fragmentation != ZFS_FRAG_INVALID) { @@ -1919,14 +2217,38 @@ metaslab_weight_from_range_tree(metaslab_t *msp) static uint64_t metaslab_weight_from_spacemap(metaslab_t *msp) { - uint64_t weight = 0; + space_map_t *sm = msp->ms_sm; + ASSERT(!msp->ms_loaded); + ASSERT(sm != NULL); + ASSERT3U(space_map_object(sm), !=, 0); + ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); + /* + * Create a joint histogram from all the segments that have made + * it to the metaslab's space map histogram, that are not yet + * available for allocation because they are still in the freeing + * pipeline (e.g. freeing, freed, and defer trees). Then subtract + * these segments from the space map's histogram to get a more + * accurate weight. + */ + uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) + deferspace_histogram[i] += msp->ms_synchist[i]; + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + deferspace_histogram[i] += msp->ms_deferhist[t][i]; + } + } + + uint64_t weight = 0; for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { - if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) { - WEIGHT_SET_COUNT(weight, - msp->ms_sm->sm_phys->smp_histogram[i]); - WEIGHT_SET_INDEX(weight, i + - msp->ms_sm->sm_shift); + ASSERT3U(sm->sm_phys->smp_histogram[i], >=, + deferspace_histogram[i]); + uint64_t count = + sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; + if (count != 0) { + WEIGHT_SET_COUNT(weight, count); + WEIGHT_SET_INDEX(weight, i + sm->sm_shift); WEIGHT_SET_ACTIVE(weight, 0); break; } @@ -1951,7 +2273,7 @@ metaslab_segment_weight(metaslab_t *msp) /* * The metaslab is completely free. */ - if (space_map_allocated(msp->ms_sm) == 0) { + if (metaslab_allocated_space(msp) == 0) { int idx = highbit64(msp->ms_size) - 1; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; @@ -1973,7 +2295,7 @@ metaslab_segment_weight(metaslab_t *msp) /* * If the metaslab is fully allocated then just make the weight 0. */ - if (space_map_allocated(msp->ms_sm) == msp->ms_size) + if (metaslab_allocated_space(msp) == msp->ms_size) return (0); /* * If the metaslab is already loaded, then use the range tree to @@ -2054,6 +2376,8 @@ metaslab_weight(metaslab_t *msp) */ if (msp->ms_loaded) msp->ms_max_size = metaslab_block_maxsize(msp); + else + ASSERT0(msp->ms_max_size); /* * Segment-based weighting requires space map histogram support. @@ -2069,6 +2393,15 @@ metaslab_weight(metaslab_t *msp) return (weight); } +void +metaslab_recalculate_weight_and_sort(metaslab_t *msp) +{ + /* note: we preserve the mask (e.g. indication of primary, etc..) */ + uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; + metaslab_group_sort(msp->ms_group, msp, + metaslab_weight(msp) | was_active); +} + static int metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, int allocator, uint64_t activation_weight) @@ -2453,17 +2786,17 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) VERIFY(txg <= spa_final_dirty_txg(spa)); /* - * The only state that can actually be changing concurrently with - * metaslab_sync() is the metaslab's ms_allocatable. 
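The deferred-histogram subtraction in metaslab_weight_from_spacemap() above can be seen with concrete numbers. In this standalone sketch, four on-disk segments in bucket 10 are all still deferred, so the weight falls back to bucket 6; without the subtraction the loop would stop at bucket 10 and overstate the weight:

/* Model of the deferspace_histogram subtraction. */
#include <stdint.h>
#include <stdio.h>

#define	SPACE_MAP_HISTOGRAM_SIZE	32

int
main(void)
{
	uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE] = { 0 };
	uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = { 0 };
	uint64_t sm_shift = 9;

	smp_histogram[10] = 4;		/* four segments on disk ... */
	deferspace_histogram[10] = 4;	/* ... all still deferred */
	smp_histogram[6] = 7;		/* seven allocatable segments */

	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
		uint64_t count =
		    smp_histogram[i] - deferspace_histogram[i];
		if (count != 0) {
			/* the real code packs count+index into ms_weight */
			printf("weight bucket %llu, count %llu\n",
			    (unsigned long long)(i + sm_shift),
			    (unsigned long long)count);
			break;
		}
	}
	return (0);
}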
No other - * thread can be modifying this txg's alloc, freeing, + * The only state that can actually be changing concurrently + * with metaslab_sync() is the metaslab's ms_allocatable. No + * other thread can be modifying this txg's alloc, freeing, * freed, or space_map_phys_t. We drop ms_lock whenever we - * could call into the DMU, because the DMU can call down to us - * (e.g. via zio_free()) at any time. + * could call into the DMU, because the DMU can call down to + * us (e.g. via zio_free()) at any time. * * The spa_vdev_remove_thread() can be reading metaslab state - * concurrently, and it is locked out by the ms_sync_lock. Note - * that the ms_lock is insufficient for this, because it is dropped - * by space_map_write(). + * concurrently, and it is locked out by the ms_sync_lock. + * Note that the ms_lock is insufficient for this, because it + * is dropped by space_map_write(). */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); @@ -2475,7 +2808,9 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, msp->ms_start, msp->ms_size, vd->vdev_ashift)); + ASSERT(msp->ms_sm != NULL); + ASSERT0(metaslab_allocated_space(msp)); } if (!range_tree_is_empty(msp->ms_checkpointing) && @@ -2523,6 +2858,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) mutex_enter(&msp->ms_lock); } + msp->ms_allocated_space += range_tree_space(alloctree); + ASSERT3U(msp->ms_allocated_space, >=, + range_tree_space(msp->ms_freeing)); + msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); + if (!range_tree_is_empty(msp->ms_checkpointing)) { ASSERT(spa_has_checkpoint(spa)); ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); @@ -2536,14 +2876,13 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) space_map_write(vd->vdev_checkpoint_sm, msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); - space_map_update(vd->vdev_checkpoint_sm); spa->spa_checkpoint_info.sci_dspace += range_tree_space(msp->ms_checkpointing); vd->vdev_stat.vs_checkpoint_space += range_tree_space(msp->ms_checkpointing); ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, - -vd->vdev_checkpoint_sm->sm_alloc); + -space_map_allocated(vd->vdev_checkpoint_sm)); range_tree_vacate(msp->ms_checkpointing, NULL, NULL); } @@ -2588,6 +2927,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * time we load the space map. */ space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); + metaslab_aux_histograms_update(msp); metaslab_group_histogram_add(mg, msp); metaslab_group_histogram_verify(mg); @@ -2595,16 +2935,18 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * For sync pass 1, we avoid traversing this txg's free range tree - * and instead will just swap the pointers for freeing and - * freed. We can safely do this since the freed_tree is - * guaranteed to be empty on the initial pass. + * and instead will just swap the pointers for freeing and freed. + * We can safely do this since the freed_tree is guaranteed to be + * empty on the initial pass. 
*/ if (spa_sync_pass(spa) == 1) { range_tree_swap(&msp->ms_freeing, &msp->ms_freed); + ASSERT0(msp->ms_allocated_this_txg); } else { range_tree_vacate(msp->ms_freeing, range_tree_add, msp->ms_freed); } + msp->ms_allocated_this_txg += range_tree_space(alloctree); range_tree_vacate(alloctree, NULL, NULL); ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); @@ -2682,7 +3024,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) } defer_delta = 0; - alloc_delta = space_map_alloc_delta(msp->ms_sm); + alloc_delta = msp->ms_allocated_this_txg - + range_tree_space(msp->ms_freed); if (defer_allowed) { defer_delta = range_tree_space(msp->ms_freed) - range_tree_space(*defer_tree); @@ -2714,7 +3057,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); } - space_map_update(msp->ms_sm); + + msp->ms_synced_length = space_map_length(msp->ms_sm); msp->ms_deferspace += defer_delta; ASSERT3S(msp->ms_deferspace, >=, 0); @@ -2726,6 +3070,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) */ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); } + metaslab_aux_histograms_update_done(msp, defer_allowed); if (msp->ms_new) { msp->ms_new = B_FALSE; @@ -2733,12 +3078,12 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) mg->mg_ms_ready++; mutex_exit(&mg->mg_lock); } + /* - * Calculate the new weights before unloading any metaslabs. - * This will give us the most accurate weighting. + * Re-sort metaslab within its group now that we've adjusted + * its allocatable space. */ - metaslab_group_sort(mg, msp, metaslab_weight(msp) | - (msp->ms_weight & METASLAB_ACTIVE_MASK)); + metaslab_recalculate_weight_and_sort(msp); /* * If the metaslab is loaded and we've not tried to load or allocate @@ -2765,6 +3110,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) ASSERT0(range_tree_space(msp->ms_freed)); ASSERT0(range_tree_space(msp->ms_checkpointing)); + msp->ms_allocated_this_txg = 0; mutex_exit(&msp->ms_lock); } @@ -4020,7 +4366,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, zio_alloc_list_t *zal, zio_t *zio, int allocator) { dva_t *dva = bp->blk_dva; - dva_t *hintdva = hintbp->blk_dva; + dva_t *hintdva = (hintbp != NULL) ? 
hintbp->blk_dva : NULL; int error = 0; ASSERT(bp->blk_birth == 0); @@ -4187,14 +4533,16 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); - if (msp->ms_loaded) - range_tree_verify(msp->ms_allocatable, offset, size); + if (msp->ms_loaded) { + range_tree_verify_not_present(msp->ms_allocatable, + offset, size); + } - range_tree_verify(msp->ms_freeing, offset, size); - range_tree_verify(msp->ms_checkpointing, offset, size); - range_tree_verify(msp->ms_freed, offset, size); + range_tree_verify_not_present(msp->ms_freeing, offset, size); + range_tree_verify_not_present(msp->ms_checkpointing, offset, size); + range_tree_verify_not_present(msp->ms_freed, offset, size); for (int j = 0; j < TXG_DEFER_SIZE; j++) - range_tree_verify(msp->ms_defer[j], offset, size); + range_tree_verify_not_present(msp->ms_defer[j], offset, size); mutex_exit(&msp->ms_lock); } diff --git a/uts/common/fs/zfs/range_tree.c b/uts/common/fs/zfs/range_tree.c index 99bdacb87deb..0a852a9c8da7 100644 --- a/uts/common/fs/zfs/range_tree.c +++ b/uts/common/fs/zfs/range_tree.c @@ -311,13 +311,11 @@ range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) } void -range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size) +range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size) { - range_seg_t *rs; - - rs = range_tree_find(rt, off, size); + range_seg_t *rs = range_tree_find(rt, off, size); if (rs != NULL) - panic("freeing free block; rs=%p", (void *)rs); + panic("segment already in tree; rs=%p", (void *)rs); } boolean_t diff --git a/uts/common/fs/zfs/spa_checkpoint.c b/uts/common/fs/zfs/spa_checkpoint.c index 12d50366455c..62c3137cd590 100644 --- a/uts/common/fs/zfs/spa_checkpoint.c +++ b/uts/common/fs/zfs/spa_checkpoint.c @@ -129,7 +129,7 @@ * uberblock would reference data in the removed device. For this reason * and others of similar nature, we disallow the following operations that * can change the config: - * vdev removal and attach/detach, mirror splitting, and pool reguid. + * vdev removal and attach/detach, mirror splitting, and pool reguid. * * - As most of the checkpoint logic is implemented in the SPA and doesn't * distinguish datasets when it comes to space accounting, having a @@ -262,7 +262,7 @@ spa_checkpoint_accounting_verify(spa_t *spa) if (vd->vdev_checkpoint_sm != NULL) { ckpoint_sm_space_sum += - -vd->vdev_checkpoint_sm->sm_alloc; + -space_map_allocated(vd->vdev_checkpoint_sm); vs_ckpoint_space_sum += vd->vdev_stat.vs_checkpoint_space; ASSERT3U(ckpoint_sm_space_sum, ==, @@ -347,7 +347,7 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) error, vd->vdev_id); } ASSERT0(words_after); - ASSERT0(vd->vdev_checkpoint_sm->sm_alloc); + ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm)); ASSERT0(space_map_length(vd->vdev_checkpoint_sm)); space_map_free(vd->vdev_checkpoint_sm, tx); diff --git a/uts/common/fs/zfs/space_map.c b/uts/common/fs/zfs/space_map.c index e85a85f91346..71e1e8cabc96 100644 --- a/uts/common/fs/zfs/space_map.c +++ b/uts/common/fs/zfs/space_map.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #include @@ -81,20 +81,22 @@ sm_entry_is_double_word(uint64_t e) /* * Iterate through the space map, invoking the callback on each (non-debug) - * space map entry. + * space map entry. Stop after reading 'end' bytes of the space map. 
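A space map that only ever records frees, like vdev_checkpoint_sm, drives smp_alloc negative, which is why space_map_allocated() now returns int64_t and why vs_checkpoint_space is computed as its negation. A minimal model:

/* Why space_map_allocated() can be negative for a free-only map. */
#include <assert.h>
#include <stdint.h>

typedef struct space_map {
	int64_t smp_alloc;	/* signed on purpose */
} space_map_t;

static void
append_free(space_map_t *sm, uint64_t run)
{
	sm->smp_alloc -= run;	/* every entry decrements smp_alloc */
}

int
main(void)
{
	space_map_t ckpoint_sm = { 0 };

	append_free(&ckpoint_sm, 4096);
	append_free(&ckpoint_sm, 8192);

	/* mirrors: vs_checkpoint_space == -space_map_allocated(sm) */
	assert(-ckpoint_sm.smp_alloc == 12288);
	return (0);
}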
*/ int -space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg) +space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg) { - uint64_t sm_len = space_map_length(sm); - ASSERT3U(sm->sm_blksz, !=, 0); + uint64_t blksz = sm->sm_blksz; - dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len, + ASSERT3U(blksz, !=, 0); + ASSERT3U(end, <=, space_map_length(sm)); + ASSERT0(P2PHASE(end, sizeof (uint64_t))); + + dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end, ZIO_PRIORITY_SYNC_READ); - uint64_t blksz = sm->sm_blksz; int error = 0; - for (uint64_t block_base = 0; block_base < sm_len && error == 0; + for (uint64_t block_base = 0; block_base < end && error == 0; block_base += blksz) { dmu_buf_t *db; error = dmu_buf_hold(sm->sm_os, space_map_object(sm), @@ -103,7 +105,7 @@ space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg) return (error); uint64_t *block_start = db->db_data; - uint64_t block_length = MIN(sm_len - block_base, blksz); + uint64_t block_length = MIN(end - block_base, blksz); uint64_t *block_end = block_start + (block_length / sizeof (uint64_t)); @@ -186,7 +188,7 @@ space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf, * dmu_buf_hold(). */ uint64_t last_word_offset = - sm->sm_phys->smp_objsize - sizeof (uint64_t); + sm->sm_phys->smp_length - sizeof (uint64_t); error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset, FTAG, &db, DMU_READ_NO_PREFETCH); if (error != 0) @@ -199,7 +201,7 @@ space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf, uint64_t *words = db->db_data; *nwords = - (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t); + (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t); ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t)); @@ -298,8 +300,7 @@ space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, uint64_t e = buf[i]; if (sm_entry_is_debug(e)) { - sm->sm_phys->smp_objsize -= sizeof (uint64_t); - space_map_update(sm); + sm->sm_phys->smp_length -= sizeof (uint64_t); continue; } @@ -354,15 +355,13 @@ space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, sm->sm_phys->smp_alloc -= entry_run; else sm->sm_phys->smp_alloc += entry_run; - sm->sm_phys->smp_objsize -= words * sizeof (uint64_t); - space_map_update(sm); + sm->sm_phys->smp_length -= words * sizeof (uint64_t); } } if (space_map_length(sm) == 0) { ASSERT0(error); - ASSERT0(sm->sm_phys->smp_objsize); - ASSERT0(sm->sm_alloc); + ASSERT0(space_map_allocated(sm)); } zio_buf_free(buf, bufsz); @@ -390,6 +389,33 @@ space_map_load_callback(space_map_entry_t *sme, void *arg) return (0); } +/* + * Load the spacemap into the rangetree, like space_map_load. But only + * read the first 'length' bytes of the spacemap. + */ +int +space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype, + uint64_t length) +{ + space_map_load_arg_t smla; + + VERIFY0(range_tree_space(rt)); + + if (maptype == SM_FREE) + range_tree_add(rt, sm->sm_start, sm->sm_size); + + smla.smla_rt = rt; + smla.smla_sm = sm; + smla.smla_type = maptype; + int err = space_map_iterate(sm, length, + space_map_load_callback, &smla); + + if (err != 0) + range_tree_vacate(rt, NULL, NULL); + + return (err); +} + /* * Load the space map disk into the specified range tree. Segments of maptype * are added to the range tree, other segment types are removed. 
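space_map_load_length() relies on the space map being a log: loading SM_FREE starts from an all-free range tree and replays ALLOC/FREE entries in order. A tiny replay model, with a bitmap standing in for the range tree:

/* Minimal replay model of space map loading. */
#include <assert.h>
#include <stdbool.h>

#define	NBLKS	8

typedef enum { SM_ALLOC, SM_FREE } maptype_t;
typedef struct { maptype_t type; int blk; } entry_t;

int
main(void)
{
	bool free_map[NBLKS];
	entry_t log[] = {
		{ SM_ALLOC, 2 }, { SM_ALLOC, 5 }, { SM_FREE, 2 },
	};

	for (int i = 0; i < NBLKS; i++)	/* prefill: everything free */
		free_map[i] = true;

	/* replay: FREE entries add back, ALLOC entries remove */
	for (unsigned i = 0; i < sizeof (log) / sizeof (log[0]); i++)
		free_map[log[i].blk] = (log[i].type == SM_FREE);

	assert(free_map[2] && !free_map[5] && free_map[0]);
	return (0);
}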
@@ -397,30 +423,7 @@ space_map_load_callback(space_map_entry_t *sme, void *arg) int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) { - uint64_t space; - int err; - space_map_load_arg_t smla; - - VERIFY0(range_tree_space(rt)); - space = space_map_allocated(sm); - - if (maptype == SM_FREE) { - range_tree_add(rt, sm->sm_start, sm->sm_size); - space = sm->sm_size - space; - } - - smla.smla_rt = rt; - smla.smla_sm = sm; - smla.smla_type = maptype; - err = space_map_iterate(sm, space_map_load_callback, &smla); - - if (err == 0) { - VERIFY3U(range_tree_space(rt), ==, space); - } else { - range_tree_vacate(rt, NULL, NULL); - } - - return (err); + return (space_map_load_length(sm, rt, maptype, space_map_length(sm))); } void @@ -506,10 +509,10 @@ space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx) SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) | SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); - dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize, + dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length, sizeof (dentry), &dentry, tx); - sm->sm_phys->smp_objsize += sizeof (dentry); + sm->sm_phys->smp_length += sizeof (dentry); } /* @@ -541,7 +544,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, uint64_t *block_base = db->db_data; uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t)); uint64_t *block_cursor = block_base + - (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t); + (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t); ASSERT3P(block_cursor, <=, block_end); @@ -564,7 +567,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, if (block_cursor == block_end) { dmu_buf_rele(db, tag); - uint64_t next_word_offset = sm->sm_phys->smp_objsize; + uint64_t next_word_offset = sm->sm_phys->smp_length; VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm), next_word_offset, tag, &db, DMU_READ_PREFETCH)); @@ -594,7 +597,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, SM_DEBUG_SYNCPASS_ENCODE(0) | SM_DEBUG_TXG_ENCODE(0); block_cursor++; - sm->sm_phys->smp_objsize += sizeof (uint64_t); + sm->sm_phys->smp_length += sizeof (uint64_t); ASSERT3P(block_cursor, ==, block_end); continue; } @@ -625,7 +628,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, words); break; } - sm->sm_phys->smp_objsize += words * sizeof (uint64_t); + sm->sm_phys->smp_length += words * sizeof (uint64_t); start += run_len; size -= run_len; @@ -652,7 +655,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, * We do this right after we write the intro debug entry * because the estimate does not take it into account. */ - uint64_t initial_objsize = sm->sm_phys->smp_objsize; + uint64_t initial_objsize = sm->sm_phys->smp_length; uint64_t estimated_growth = space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID); uint64_t estimated_final_objsize = initial_objsize + estimated_growth; @@ -663,7 +666,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, * and use that to get a hold of the last block, so we can * start appending to it. 
*/ - uint64_t next_word_offset = sm->sm_phys->smp_objsize; + uint64_t next_word_offset = sm->sm_phys->smp_length; VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm), next_word_offset, FTAG, &db, DMU_READ_PREFETCH)); ASSERT3U(db->db_size, ==, sm->sm_blksz); @@ -711,7 +714,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, * Therefore we expect the actual objsize to be equal or less * than whatever we estimated it to be. */ - ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize); + ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length); #endif } @@ -867,23 +870,10 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) } dmu_buf_will_dirty(sm->sm_dbuf, tx); - sm->sm_phys->smp_objsize = 0; + sm->sm_phys->smp_length = 0; sm->sm_phys->smp_alloc = 0; } -/* - * Update the in-core space_map allocation and length values. - */ -void -space_map_update(space_map_t *sm) -{ - if (sm == NULL) - return; - - sm->sm_alloc = sm->sm_phys->smp_alloc; - sm->sm_length = sm->sm_phys->smp_objsize; -} - uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) { @@ -1065,32 +1055,14 @@ space_map_object(space_map_t *sm) return (sm != NULL ? sm->sm_object : 0); } -/* - * Returns the already synced, on-disk allocated space. - */ -uint64_t +int64_t space_map_allocated(space_map_t *sm) { - return (sm != NULL ? sm->sm_alloc : 0); + return (sm != NULL ? sm->sm_phys->smp_alloc : 0); } -/* - * Returns the already synced, on-disk length; - */ uint64_t space_map_length(space_map_t *sm) { - return (sm != NULL ? sm->sm_length : 0); -} - -/* - * Returns the allocated space that is currently syncing. - */ -int64_t -space_map_alloc_delta(space_map_t *sm) -{ - if (sm == NULL) - return (0); - ASSERT(sm->sm_dbuf != NULL); - return (sm->sm_phys->smp_alloc - space_map_allocated(sm)); + return (sm != NULL ? sm->sm_phys->smp_length : 0); } diff --git a/uts/common/fs/zfs/sys/metaslab.h b/uts/common/fs/zfs/sys/metaslab.h index 08fe3955b688..d26b095d14ef 100644 --- a/uts/common/fs/zfs/sys/metaslab.h +++ b/uts/common/fs/zfs/sys/metaslab.h @@ -52,6 +52,8 @@ void metaslab_fini(metaslab_t *); int metaslab_load(metaslab_t *); void metaslab_unload(metaslab_t *); +uint64_t metaslab_allocated_space(metaslab_t *); + void metaslab_sync(metaslab_t *, uint64_t); void metaslab_sync_done(metaslab_t *, uint64_t); void metaslab_sync_reassess(metaslab_group_t *); @@ -115,6 +117,7 @@ void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int, boolean_t); void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int); +void metaslab_recalculate_weight_and_sort(metaslab_t *); #ifdef __cplusplus } diff --git a/uts/common/fs/zfs/sys/metaslab_impl.h b/uts/common/fs/zfs/sys/metaslab_impl.h index a2c8e6051772..f8d36f38f7b7 100644 --- a/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/uts/common/fs/zfs/sys/metaslab_impl.h @@ -340,8 +340,34 @@ struct metaslab_group { * being written. */ struct metaslab { + /* + * This is the main lock of the metaslab and its purpose is to + * coordinate our allocations and frees [e.g metaslab_block_alloc(), + * metaslab_free_concrete(), ..etc] with our various syncing + * procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc]. + * + * The lock is also used during some miscellaneous operations like + * using the metaslab's histogram for the metaslab group's histogram + * aggregation, or marking the metaslab for initialization. 
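With space_map_update() gone, there is a single copy of the allocation and length state: the dbuf-backed space_map_phys_t. A reduced model of the accessor shape this patch settles on (types trimmed down to the two fields involved):

/* Accessors reading the phys structure directly, no in-core copy. */
#include <stddef.h>
#include <stdint.h>

typedef struct space_map_phys {
	uint64_t smp_length;	/* length of the object in bytes */
	int64_t smp_alloc;	/* space allocated from the map */
} space_map_phys_t;

typedef struct space_map {
	space_map_phys_t *sm_phys;	/* the only copy of the state */
} space_map_t;

static int64_t
space_map_allocated(space_map_t *sm)
{
	return (sm != NULL ? sm->sm_phys->smp_alloc : 0);
}

static uint64_t
space_map_length(space_map_t *sm)
{
	return (sm != NULL ? sm->sm_phys->smp_length : 0);
}

int
main(void)
{
	space_map_phys_t phys = { .smp_length = 24, .smp_alloc = -4096 };
	space_map_t sm = { .sm_phys = &phys };

	return (space_map_allocated(&sm) < 0 &&
	    space_map_length(&sm) == 24 ? 0 : 1);
}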
+ */ kmutex_t ms_lock; + + /* + * Acquired together with the ms_lock whenever we expect to + * write to metaslab data on-disk (i.e flushing entries to + * the metaslab's space map). It helps coordinate readers of + * the metaslab's space map [see spa_vdev_remove_thread()] + * with writers [see metaslab_sync()]. + * + * Note that metaslab_load(), even though a reader, uses + * a completely different mechanism to deal with the reading + * of the metaslab's space map based on ms_synced_length. That + * said, the function still uses the ms_sync_lock after it + * has read the ms_sm [see relevant comment in metaslab_load() + * as to why]. + */ kmutex_t ms_sync_lock; + kcondvar_t ms_load_cv; space_map_t *ms_sm; uint64_t ms_id; @@ -351,6 +377,7 @@ struct metaslab { range_tree_t *ms_allocating[TXG_SIZE]; range_tree_t *ms_allocatable; + uint64_t ms_allocated_this_txg; /* * The following range trees are accessed only from syncing context. @@ -375,6 +402,55 @@ struct metaslab { boolean_t ms_loaded; boolean_t ms_loading; + /* + * The following histograms count entries that are in the + * metaslab's space map (and its histogram) but are not in + * ms_allocatable yet, because they are in ms_freed, ms_freeing, + * or ms_defer[]. + * + * When the metaslab is not loaded, its ms_weight needs to + * reflect what is allocatable (i.e. what will be part of + * ms_allocatable if it is loaded). The weight is computed from + * the spacemap histogram, but that includes ranges that are + * not yet allocatable (because they are in ms_freed, + * ms_freeing, or ms_defer[]). Therefore, when calculating the + * weight, we need to remove those ranges. + * + * The ranges in the ms_freed and ms_defer[] range trees are all + * present in the spacemap. However, the spacemap may have + * multiple entries to represent a contiguous range, because it + * is written across multiple sync passes, but the changes of + * all sync passes are consolidated into the range trees. + * Adjacent ranges that are freed in different sync passes of + * one txg will be represented separately (as 2 or more entries) + * in the space map (and its histogram), but these adjacent + * ranges will be consolidated (represented as one entry) in the + * ms_freed/ms_defer[] range trees (and their histograms). + * + * When calculating the weight, we can not simply subtract the + * range trees' histograms from the spacemap's histogram, + * because the range trees' histograms may have entries in + * higher buckets than the spacemap, due to consolidation. + * Instead we must subtract the exact entries that were added to + * the spacemap's histogram. ms_synchist and ms_deferhist[] + * represent these exact entries, so we can subtract them from + * the spacemap's histogram when calculating ms_weight. + * + * ms_synchist represents the same ranges as ms_freeing + + * ms_freed, but without consolidation across sync passes. + * + * ms_deferhist[i] represents the same ranges as ms_defer[i], + * but without consolidation across sync passes. + */ + uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE]; + uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE]; + + /* + * Tracks the exact amount of allocated space of this metaslab + * (and specifically the metaslab's space map) up to the most + * recently completed sync pass [see usage in metaslab_sync()]. + */ + uint64_t ms_allocated_space; int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. 
others in group */ uint64_t ms_activation_weight; /* activation weight */ @@ -411,6 +487,9 @@ struct metaslab { avl_node_t ms_group_node; /* node in metaslab group tree */ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ + /* updated every time we are done syncing the metaslab's space map */ + uint64_t ms_synced_length; + boolean_t ms_new; }; diff --git a/uts/common/fs/zfs/sys/range_tree.h b/uts/common/fs/zfs/sys/range_tree.h index 9360e0150933..3816dabf7c1c 100644 --- a/uts/common/fs/zfs/sys/range_tree.h +++ b/uts/common/fs/zfs/sys/range_tree.h @@ -81,9 +81,10 @@ void range_tree_fini(void); range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg); void range_tree_destroy(range_tree_t *rt); boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); +void range_tree_verify_not_present(range_tree_t *rt, + uint64_t start, uint64_t size); uint64_t range_tree_space(range_tree_t *rt); boolean_t range_tree_is_empty(range_tree_t *rt); -void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst); void range_tree_stat_verify(range_tree_t *rt); uint64_t range_tree_min(range_tree_t *rt); diff --git a/uts/common/fs/zfs/sys/space_map.h b/uts/common/fs/zfs/sys/space_map.h index d3d852978a57..2bce20b48ba5 100644 --- a/uts/common/fs/zfs/sys/space_map.h +++ b/uts/common/fs/zfs/sys/space_map.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #ifndef _SYS_SPACE_MAP_H @@ -55,10 +55,17 @@ extern "C" { * for backward compatibility. */ typedef struct space_map_phys { - uint64_t smp_object; /* on-disk space map object */ - uint64_t smp_objsize; /* size of the object */ - int64_t smp_alloc; /* space allocated from the map */ - uint64_t smp_pad[5]; /* reserved */ + /* object number: not needed but kept for backwards compatibility */ + uint64_t smp_object; + + /* length of the object in bytes */ + uint64_t smp_length; + + /* space allocated from the map */ + int64_t smp_alloc; + + /* reserved */ + uint64_t smp_pad[5]; /* * The smp_histogram maintains a histogram of free regions. 
Each @@ -81,8 +88,6 @@ typedef struct space_map { uint64_t sm_start; /* start of map */ uint64_t sm_size; /* size of map */ uint8_t sm_shift; /* unit shift */ - uint64_t sm_length; /* synced length */ - int64_t sm_alloc; /* synced space allocated */ objset_t *sm_os; /* objset for this map */ uint64_t sm_object; /* object id for this map */ uint32_t sm_blksz; /* block size for space map */ @@ -189,18 +194,20 @@ boolean_t sm_entry_is_double_word(uint64_t e); typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg); int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype); -int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg); +int space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype, + uint64_t length); +int space_map_iterate(space_map_t *sm, uint64_t length, + sm_cb_t callback, void *arg); int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, dmu_tx_t *tx); +boolean_t space_map_histogram_verify(space_map_t *sm, range_tree_t *rt); void space_map_histogram_clear(space_map_t *sm); void space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx); -void space_map_update(space_map_t *sm); - uint64_t space_map_object(space_map_t *sm); -uint64_t space_map_allocated(space_map_t *sm); +int64_t space_map_allocated(space_map_t *sm); uint64_t space_map_length(space_map_t *sm); void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, @@ -216,8 +223,6 @@ int space_map_open(space_map_t **smp, objset_t *os, uint64_t object, uint64_t start, uint64_t size, uint8_t shift); void space_map_close(space_map_t *sm); -int64_t space_map_alloc_delta(space_map_t *sm); - #ifdef __cplusplus } #endif diff --git a/uts/common/fs/zfs/sys/vdev_impl.h b/uts/common/fs/zfs/sys/vdev_impl.h index c0bdeffb6451..6ddbe55a0c4b 100644 --- a/uts/common/fs/zfs/sys/vdev_impl.h +++ b/uts/common/fs/zfs/sys/vdev_impl.h @@ -246,7 +246,6 @@ struct vdev { uint64_t vdev_islog; /* is an intent log device */ uint64_t vdev_removing; /* device is being removed? */ boolean_t vdev_ishole; /* is a hole in the namespace */ - kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */ uint64_t vdev_top_zap; vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */ @@ -305,16 +304,6 @@ struct vdev { range_tree_t *vdev_obsolete_segments; space_map_t *vdev_obsolete_sm; - /* - * The queue depth parameters determine how many async writes are - * still pending (i.e. allocated but not yet issued to disk) per - * top-level (vdev_async_write_queue_depth) and the maximum allowed - * (vdev_max_async_write_queue_depth). These values only apply to - * top-level vdevs. - */ - uint64_t vdev_async_write_queue_depth; - uint64_t vdev_max_async_write_queue_depth; - /* * Leaf vdev state. 
*/ diff --git a/uts/common/fs/zfs/vdev.c b/uts/common/fs/zfs/vdev.c index c72aebe87722..11767fdcad59 100644 --- a/uts/common/fs/zfs/vdev.c +++ b/uts/common/fs/zfs/vdev.c @@ -501,7 +501,6 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); @@ -889,7 +888,6 @@ vdev_free(vdev_t *vd) rw_destroy(&vd->vdev_indirect_rwlock); mutex_destroy(&vd->vdev_obsolete_lock); - mutex_destroy(&vd->vdev_queue_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); @@ -1251,12 +1249,12 @@ vdev_metaslab_fini(vdev_t *vd) } if (vd->vdev_ms != NULL) { - uint64_t count = vd->vdev_ms_count; + metaslab_group_t *mg = vd->vdev_mg; + metaslab_group_passivate(mg); - metaslab_group_passivate(vd->vdev_mg); + uint64_t count = vd->vdev_ms_count; for (uint64_t m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; - if (msp != NULL) metaslab_fini(msp); } @@ -1264,6 +1262,9 @@ vdev_metaslab_fini(vdev_t *vd) vd->vdev_ms = NULL; vd->vdev_ms_count = 0; + + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + ASSERT0(mg->mg_histogram[i]); } ASSERT0(vd->vdev_ms_count); } @@ -2549,13 +2550,6 @@ vdev_dtl_load(vdev_t *vd) ASSERT(vd->vdev_dtl_sm != NULL); mutex_enter(&vd->vdev_dtl_lock); - - /* - * Now that we've opened the space_map we need to update - * the in-core DTL. - */ - space_map_update(vd->vdev_dtl_sm); - error = space_map_load(vd->vdev_dtl_sm, vd->vdev_dtl[DTL_MISSING], SM_ALLOC); mutex_exit(&vd->vdev_dtl_lock); @@ -2715,10 +2709,6 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) } dmu_tx_commit(tx); - - mutex_enter(&vd->vdev_dtl_lock); - space_map_update(vd->vdev_dtl_sm); - mutex_exit(&vd->vdev_dtl_lock); } /* @@ -2861,7 +2851,10 @@ vdev_load(vdev_t *vd) "asize=%llu", (u_longlong_t)vd->vdev_ashift, (u_longlong_t)vd->vdev_asize); return (SET_ERROR(ENXIO)); - } else if ((error = vdev_metaslab_init(vd, 0)) != 0) { + } + + error = vdev_metaslab_init(vd, 0); + if (error != 0) { vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " "[error=%d]", error); vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, @@ -2875,9 +2868,10 @@ vdev_load(vdev_t *vd) ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); - if ((error = space_map_open(&vd->vdev_checkpoint_sm, + error = space_map_open(&vd->vdev_checkpoint_sm, mos, checkpoint_sm_obj, 0, vd->vdev_asize, - vd->vdev_ashift))) { + vd->vdev_ashift); + if (error != 0) { vdev_dbgmsg(vd, "vdev_load: space_map_open " "failed for checkpoint spacemap (obj %llu) " "[error=%d]", @@ -2885,15 +2879,15 @@ vdev_load(vdev_t *vd) return (error); } ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); - space_map_update(vd->vdev_checkpoint_sm); /* * Since the checkpoint_sm contains free entries - * exclusively we can use sm_alloc to indicate the - * culmulative checkpointed space that has been freed. + * exclusively we can use space_map_allocated() to + * indicate the cumulative checkpointed space that + * has been freed. 
*/ vd->vdev_stat.vs_checkpoint_space = - -vd->vdev_checkpoint_sm->sm_alloc; + -space_map_allocated(vd->vdev_checkpoint_sm); vd->vdev_spa->spa_checkpoint_info.sci_dspace += vd->vdev_stat.vs_checkpoint_space; } @@ -2925,7 +2919,6 @@ vdev_load(vdev_t *vd) (u_longlong_t)obsolete_sm_object, error); return (error); } - space_map_update(vd->vdev_obsolete_sm); } return (0); @@ -3012,47 +3005,6 @@ vdev_remove_empty_log(vdev_t *vd, uint64_t txg) ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); - if (vd->vdev_ms != NULL) { - metaslab_group_t *mg = vd->vdev_mg; - - metaslab_group_histogram_verify(mg); - metaslab_class_histogram_verify(mg->mg_class); - - for (int m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - - if (msp == NULL || msp->ms_sm == NULL) - continue; - - mutex_enter(&msp->ms_lock); - /* - * If the metaslab was not loaded when the vdev - * was removed then the histogram accounting may - * not be accurate. Update the histogram information - * here so that we ensure that the metaslab group - * and metaslab class are up-to-date. - */ - metaslab_group_histogram_remove(mg, msp); - - VERIFY0(space_map_allocated(msp->ms_sm)); - space_map_close(msp->ms_sm); - msp->ms_sm = NULL; - mutex_exit(&msp->ms_lock); - } - - if (vd->vdev_checkpoint_sm != NULL) { - ASSERT(spa_has_checkpoint(spa)); - space_map_close(vd->vdev_checkpoint_sm); - vd->vdev_checkpoint_sm = NULL; - } - - metaslab_group_histogram_verify(mg); - metaslab_class_histogram_verify(mg->mg_class); - - for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) - ASSERT0(mg->mg_histogram[i]); - } - dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); vdev_destroy_spacemaps(vd, tx); @@ -3086,17 +3038,14 @@ vdev_sync(vdev_t *vd, uint64_t txg) spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; - dmu_tx_t *tx; + ASSERT3U(txg, ==, spa->spa_syncing_txg); + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (range_tree_space(vd->vdev_obsolete_segments) > 0) { - dmu_tx_t *tx; - ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); vdev_indirect_sync_obsolete(vd, tx); - dmu_tx_commit(tx); /* * If the vdev is indirect, it can't have dirty @@ -3105,6 +3054,7 @@ vdev_sync(vdev_t *vd, uint64_t txg) if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); + dmu_tx_commit(tx); return; } } @@ -3115,12 +3065,10 @@ vdev_sync(vdev_t *vd, uint64_t txg) !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); - dmu_tx_commit(tx); } while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { @@ -3139,6 +3087,7 @@ vdev_sync(vdev_t *vd, uint64_t txg) vdev_remove_empty_log(vd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); + dmu_tx_commit(tx); } uint64_t @@ -3368,8 +3317,6 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) */ if (error == 0 && tvd->vdev_checkpoint_sm != NULL) { - ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc, - !=, 0); error = ZFS_ERR_CHECKPOINT_EXISTS; } diff --git a/uts/common/fs/zfs/vdev_indirect.c b/uts/common/fs/zfs/vdev_indirect.c index 75c038311004..5b6415937f61 100644 --- a/uts/common/fs/zfs/vdev_indirect.c +++ 
b/uts/common/fs/zfs/vdev_indirect.c @@ -680,7 +680,6 @@ spa_condense_indirect_thread(void *arg, zthr_t *zthr) VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); - space_map_update(prev_obsolete_sm); counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping); if (prev_obsolete_sm != NULL) { vdev_indirect_mapping_load_obsolete_spacemap(old_mapping, @@ -831,7 +830,6 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) VERIFY0(space_map_open(&vd->vdev_obsolete_sm, spa->spa_meta_objset, obsolete_sm_object, 0, vd->vdev_asize, 0)); - space_map_update(vd->vdev_obsolete_sm); } ASSERT(vd->vdev_obsolete_sm != NULL); @@ -840,7 +838,6 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) space_map_write(vd->vdev_obsolete_sm, vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx); - space_map_update(vd->vdev_obsolete_sm); range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); } diff --git a/uts/common/fs/zfs/vdev_indirect_mapping.c b/uts/common/fs/zfs/vdev_indirect_mapping.c index 1da101733e4c..3d0f1344dd88 100644 --- a/uts/common/fs/zfs/vdev_indirect_mapping.c +++ b/uts/common/fs/zfs/vdev_indirect_mapping.c @@ -557,6 +557,7 @@ vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim, losma.losma_counts = counts; losma.losma_vim = vim; VERIFY0(space_map_iterate(obsolete_space_sm, + space_map_length(obsolete_space_sm), load_obsolete_sm_callback, &losma)); } diff --git a/uts/common/fs/zfs/vdev_initialize.c b/uts/common/fs/zfs/vdev_initialize.c index bf246cd8ddcf..e1aa4e9523b4 100644 --- a/uts/common/fs/zfs/vdev_initialize.c +++ b/uts/common/fs/zfs/vdev_initialize.c @@ -442,7 +442,7 @@ vdev_initialize_calculate_progress(vdev_t *vd) mutex_enter(&msp->ms_lock); uint64_t ms_free = msp->ms_size - - space_map_allocated(msp->ms_sm); + metaslab_allocated_space(msp); if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) ms_free /= vd->vdev_top->vdev_children; diff --git a/uts/common/fs/zfs/vdev_removal.c b/uts/common/fs/zfs/vdev_removal.c index c3c6672ccf1a..e4d0224333ff 100644 --- a/uts/common/fs/zfs/vdev_removal.c +++ b/uts/common/fs/zfs/vdev_removal.c @@ -283,15 +283,8 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) if (ms->ms_sm == NULL) continue; - /* - * Sync tasks happen before metaslab_sync(), therefore - * smp_alloc and sm_alloc must be the same. - */ - ASSERT3U(space_map_allocated(ms->ms_sm), ==, - ms->ms_sm->sm_phys->smp_alloc); - spa->spa_removing_phys.sr_to_copy += - space_map_allocated(ms->ms_sm); + metaslab_allocated_space(ms); /* * Space which we are freeing this txg does not need to @@ -1401,22 +1394,8 @@ spa_vdev_remove_thread(void *arg) * appropriate action (see free_from_removing_vdev()). */ if (msp->ms_sm != NULL) { - space_map_t *sm = NULL; - - /* - * We have to open a new space map here, because - * ms_sm's sm_length and sm_alloc may not reflect - * what's in the object contents, if we are in between - * metaslab_sync() and metaslab_sync_done(). 
-	 */
-	VERIFY0(space_map_open(&sm,
-	    spa->spa_dsl_pool->dp_meta_objset,
-	    msp->ms_sm->sm_object, msp->ms_sm->sm_start,
-	    msp->ms_sm->sm_size, msp->ms_sm->sm_shift));
-	space_map_update(sm);
-	VERIFY0(space_map_load(sm, svr->svr_allocd_segs,
-	    SM_ALLOC));
-	space_map_close(sm);
+	VERIFY0(space_map_load(msp->ms_sm,
+	    svr->svr_allocd_segs, SM_ALLOC));
 
 	range_tree_walk(msp->ms_freeing,
 	    range_tree_remove, svr->svr_allocd_segs);
@@ -1611,16 +1590,6 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
 		ASSERT0(range_tree_space(msp->ms_freed));
 
 		if (msp->ms_sm != NULL) {
-			/*
-			 * Assert that the in-core spacemap has the same
-			 * length as the on-disk one, so we can use the
-			 * existing in-core spacemap to load it from disk.
-			 */
-			ASSERT3U(msp->ms_sm->sm_alloc, ==,
-			    msp->ms_sm->sm_phys->smp_alloc);
-			ASSERT3U(msp->ms_sm->sm_length, ==,
-			    msp->ms_sm->sm_phys->smp_objsize);
-
 			mutex_enter(&svr->svr_lock);
 			VERIFY0(space_map_load(msp->ms_sm,
 			    svr->svr_allocd_segs, SM_ALLOC));
@@ -1713,9 +1682,6 @@ spa_vdev_remove_cancel(spa_t *spa)
 	return (error);
 }
 
-/*
- * Called every sync pass of every txg if there's a svr.
- */
 void
 svr_sync(spa_t *spa, dmu_tx_t *tx)
 {
@@ -1779,6 +1745,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
 
 	ASSERT(vd->vdev_islog);
 	ASSERT(vd == vd->vdev_top);
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/*
 	 * Stop allocating from this vdev.
@@ -1793,15 +1760,14 @@
 	    *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
 
 	/*
-	 * Evacuate the device.  We don't hold the config lock as writer
-	 * since we need to do I/O but we do keep the
+	 * Evacuate the device.  We don't hold the config lock as
+	 * writer since we need to do I/O but we do keep the
 	 * spa_namespace_lock held.  Once this completes the device
 	 * should no longer have any blocks allocated on it.
 	 */
-	if (vd->vdev_islog) {
-		if (vd->vdev_stat.vs_alloc != 0)
-			error = spa_reset_logs(spa);
-	}
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+	if (vd->vdev_stat.vs_alloc != 0)
+		error = spa_reset_logs(spa);
 
 	*txg = spa_vdev_config_enter(spa);
 
@@ -1820,6 +1786,8 @@
 	vdev_dirty_leaves(vd, VDD_DTL, *txg);
 	vdev_config_dirty(vd);
 
+	vdev_metaslab_fini(vd);
+
 	spa_history_log_internal(spa, "vdev remove", NULL,
 	    "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
 	    (vd->vdev_path != NULL) ? vd->vdev_path : "-");
@@ -1849,6 +1817,8 @@
 	if (list_link_active(&vd->vdev_config_dirty_node))
 		vdev_config_clean(vd);
 
+	ASSERT0(vd->vdev_stat.vs_alloc);
+
 	/*
 	 * Clean up the vdev namespace.
 	 */

From 9ce2c74957bc5fb61c8b8e9fc9ca5fcf6a25b844 Mon Sep 17 00:00:00 2001
From: Andriy Gapon
Date: Wed, 6 Nov 2019 09:04:19 +0000
Subject: [PATCH 6/7] 10570 Need workaround to EFI boot on AMI BIOS

illumos/illumos-gate@fd7977362aae2eaa5dcb89671159f4fd82f22ca1
https://github.com/illumos/illumos-gate/commit/fd7977362aae2eaa5dcb89671159f4fd82f22ca1

https://www.illumos.org/issues/10570
  Some BIOS types are very picky about the GPT header size.

Author: John Levon
---
 uts/common/fs/zfs/zvol.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/uts/common/fs/zfs/zvol.c b/uts/common/fs/zfs/zvol.c
index 3566984ab4b0..10ea804f8de2 100644
--- a/uts/common/fs/zfs/zvol.c
+++ b/uts/common/fs/zfs/zvol.c
@@ -27,6 +27,7 @@
  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2019, Joyent, Inc.
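For reference on the EFI change that follows: the fix is to checksum only the spec-defined header bytes rather than the padded in-memory struct. A standalone sketch of the difference, assuming EFI_HEADER_SIZE is the 92-byte size the GPT specification defines, and using a placeholder checksum in place of the kernel's CRC32 macro:

/* Why hashing sizeof (gpt) differs from hashing EFI_HEADER_SIZE. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	EFI_HEADER_SIZE	92	/* assumed: size the GPT spec defines */

typedef struct gpt_header {
	uint8_t defined[EFI_HEADER_SIZE];	/* spec-defined fields */
	uint8_t pad[36];	/* struct padding, not part of the header */
} gpt_header_t;

/* placeholder checksum; the real code uses CRC32 over the bytes */
static uint32_t
checksum(const uint8_t *p, size_t len)
{
	uint32_t sum = 0;

	for (size_t i = 0; i < len; i++)
		sum = sum * 31 + p[i];
	return (sum);
}

int
main(void)
{
	gpt_header_t gpt;

	memset(&gpt, 0, sizeof (gpt));
	memset(gpt.pad, 0xff, sizeof (gpt.pad));	/* junk in the pad */

	/* Only the first EFI_HEADER_SIZE bytes may influence the CRC. */
	printf("crc over header: %u\n",
	    checksum((uint8_t *)&gpt, EFI_HEADER_SIZE));
	printf("crc over struct: %u\n",
	    checksum((uint8_t *)&gpt, sizeof (gpt)));
	return (0);
}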
*/ /* @@ -1509,7 +1510,7 @@ zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs) gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE); gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); - gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt)); + gpt.efi_gpt_HeaderSize = LE_32(EFI_HEADER_SIZE); gpt.efi_gpt_MyLBA = LE_64(1ULL); gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL); gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1); @@ -1519,7 +1520,7 @@ zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs) LE_32(sizeof (efi_gpe_t)); CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table); gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); - CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table); + CRC32(crc, &gpt, EFI_HEADER_SIZE, -1U, crc32_table); gpt.efi_gpt_HeaderCRC32 = LE_32(~crc); if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length), flag)) From f4eba6fe5e46352368dd64b7df8ed55278b1175d Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Wed, 6 Nov 2019 09:05:39 +0000 Subject: [PATCH 7/7] 11541 allocation_classes feature must be enabled to add log device illumos/illumos-gate@c1064fd7ce62fe763a4475e9988ffea3b22137de https://github.com/illumos/illumos-gate/commit/c1064fd7ce62fe763a4475e9988ffea3b22137de https://www.illumos.org/issues/11541 After the allocation_classes feature was integrated, one can no longer add a log device to a pool unless that feature is enabled. There is an explicit check for this, but it is unnecessary in the case of log devices, so we should handle this better instead of forcing the feature to be enabled. Author: Jerry Jelinek --- uts/common/fs/zfs/vdev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/uts/common/fs/zfs/vdev.c b/uts/common/fs/zfs/vdev.c index 11767fdcad59..3935f39942e0 100644 --- a/uts/common/fs/zfs/vdev.c +++ b/uts/common/fs/zfs/vdev.c @@ -25,7 +25,7 @@ * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome - * Copyright 2017 Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. */ @@ -633,7 +633,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, alloc_bias = vdev_derive_alloc_bias(bias); /* spa_vdev_add() expects feature to be enabled */ - if (spa->spa_load_state != SPA_LOAD_CREATE && + if (alloc_bias != VDEV_BIAS_LOG && + spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { return (SET_ERROR(ENOTSUP));
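A reduced model of the gate that patch 7 relaxes in vdev_alloc(): only non-log allocation biases require the allocation_classes feature outside of pool creation. The enum and helper below are illustrative stand-ins for the kernel logic:

/* Model of the allocation-bias feature gate after patch 7. */
#include <assert.h>
#include <stdbool.h>

typedef enum {
	VDEV_BIAS_NONE, VDEV_BIAS_LOG, VDEV_BIAS_SPECIAL, VDEV_BIAS_DEDUP
} vdev_alloc_bias_t;

static bool
bias_requires_feature(vdev_alloc_bias_t bias, bool creating,
    bool feature_enabled)
{
	/*
	 * spa_vdev_add() expects the feature to be enabled, except
	 * for log devices, which predate allocation classes.
	 */
	return (bias != VDEV_BIAS_LOG && !creating && !feature_enabled);
}

int
main(void)
{
	/* log vdevs are always allowed */
	assert(!bias_requires_feature(VDEV_BIAS_LOG, false, false));
	/* special/dedup vdevs still need the feature ... */
	assert(bias_requires_feature(VDEV_BIAS_SPECIAL, false, false));
	/* ... unless the pool is being created */
	assert(!bias_requires_feature(VDEV_BIAS_SPECIAL, true, false));
	return (0);
}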