MFV r354382,r354385: 10601 10757 Pool allocation classes
illumos/illumos-gate@663207adb1663207adb1
10601 Pool allocation classes https://www.illumos.org/issues/10601 illumos port of ZoL Pool allocation classes. Includes at least these two commits: 441709695 Pool allocation classes misplacing small file blocks cc99f275a Pool allocation classes 10757 Add -gLp to zpool subcommands for alt vdev names https://www.illumos.org/issues/10757 Port from ZoL of d2f3e292d Add -gLp to zpool subcommands for alt vdev names Note that a subsequent ZoL commit changed -p to -P a77f29f93 Change full path subcommand flag from -p to -P Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com> Portions contributed by: Håkan Johansson <f96hajo@chalmers.se> Portions contributed by: Richard Yao <ryao@gentoo.org> Portions contributed by: Chunwei Chen <david.chen@nutanix.com> Portions contributed by: loli10K <ezomori.nozomu@gmail.com> Author: Don Brady <don.brady@delphix.com> 11541 allocation_classes feature must be enabled to add log device illumos/illumos-gate@c1064fd7cec1064fd7ce
https://www.illumos.org/issues/11541 After the allocation_classes feature was integrated, one can no longer add a log device to a pool unless that feature is enabled. There is an explicit check for this, but it is unnecessary in the case of log devices, so we should handle this better instead of forcing the feature to be enabled. Author: Jerry Jelinek <jerry.jelinek@joyent.com> FreeBSD notes. I faithfully added the new -g, -L, -P flags, but only -g does something: vdev GUIDs are displayed instead of device names. -L, resolve symlinks, and -P, display full disk paths, do nothing at the moment. The use of special vdevs is backward compatible for read-only access, so root pools should be bootable, but exercise caution. MFC after: 4 weeks
This commit is contained in:
parent
35761346d5
commit
8f9d69492c
@ -21,7 +21,7 @@
|
||||
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2014 Integros [integros.com]
|
||||
* Copyright 2017 Nexenta Systems, Inc.
|
||||
* Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
|
||||
@ -931,13 +931,23 @@ dump_metaslab(metaslab_t *msp)
|
||||
static void
|
||||
print_vdev_metaslab_header(vdev_t *vd)
|
||||
{
|
||||
(void) printf("\tvdev %10llu\n\t%-10s%5llu %-19s %-15s %-10s\n",
|
||||
(u_longlong_t)vd->vdev_id,
|
||||
vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
|
||||
const char *bias_str;
|
||||
|
||||
bias_str = (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) ?
|
||||
VDEV_ALLOC_BIAS_LOG :
|
||||
(alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
|
||||
(alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP :
|
||||
vd->vdev_islog ? "log" : "";
|
||||
|
||||
(void) printf("\tvdev %10llu %s\n"
|
||||
"\t%-10s%5llu %-19s %-15s %-12s\n",
|
||||
(u_longlong_t)vd->vdev_id, bias_str,
|
||||
"metaslabs", (u_longlong_t)vd->vdev_ms_count,
|
||||
"offset", "spacemap", "free");
|
||||
(void) printf("\t%15s %19s %15s %10s\n",
|
||||
(void) printf("\t%15s %19s %15s %12s\n",
|
||||
"---------------", "-------------------",
|
||||
"---------------", "-------------");
|
||||
"---------------", "------------");
|
||||
}
|
||||
|
||||
static void
|
||||
@ -953,7 +963,7 @@ dump_metaslab_groups(spa_t *spa)
|
||||
vdev_t *tvd = rvd->vdev_child[c];
|
||||
metaslab_group_t *mg = tvd->vdev_mg;
|
||||
|
||||
if (mg->mg_class != mc)
|
||||
if (mg == NULL || mg->mg_class != mc)
|
||||
continue;
|
||||
|
||||
metaslab_group_histogram_verify(mg);
|
||||
@ -2807,6 +2817,7 @@ typedef struct zdb_blkstats {
|
||||
uint64_t zb_count;
|
||||
uint64_t zb_gangs;
|
||||
uint64_t zb_ditto_samevdev;
|
||||
uint64_t zb_ditto_same_ms;
|
||||
uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
|
||||
} zdb_blkstats_t;
|
||||
|
||||
@ -2846,6 +2857,16 @@ typedef struct zdb_cb {
|
||||
uint32_t **zcb_vd_obsolete_counts;
|
||||
} zdb_cb_t;
|
||||
|
||||
/* test if two DVA offsets from same vdev are within the same metaslab */
|
||||
static boolean_t
|
||||
same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2)
|
||||
{
|
||||
vdev_t *vd = vdev_lookup_top(spa, vdev);
|
||||
uint64_t ms_shift = vd->vdev_ms_shift;
|
||||
|
||||
return ((off1 >> ms_shift) == (off2 >> ms_shift));
|
||||
}
|
||||
|
||||
static void
|
||||
zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
|
||||
dmu_object_type_t type)
|
||||
@ -2857,6 +2878,8 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
|
||||
if (zilog && zil_bp_tree_add(zilog, bp) != 0)
|
||||
return;
|
||||
|
||||
spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
|
||||
int t = (i & 1) ? type : ZDB_OT_TOTAL;
|
||||
@ -2882,8 +2905,15 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
|
||||
switch (BP_GET_NDVAS(bp)) {
|
||||
case 2:
|
||||
if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
|
||||
DVA_GET_VDEV(&bp->blk_dva[1]))
|
||||
DVA_GET_VDEV(&bp->blk_dva[1])) {
|
||||
zb->zb_ditto_samevdev++;
|
||||
|
||||
if (same_metaslab(zcb->zcb_spa,
|
||||
DVA_GET_VDEV(&bp->blk_dva[0]),
|
||||
DVA_GET_OFFSET(&bp->blk_dva[0]),
|
||||
DVA_GET_OFFSET(&bp->blk_dva[1])))
|
||||
zb->zb_ditto_same_ms++;
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
|
||||
@ -2892,13 +2922,37 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
|
||||
DVA_GET_VDEV(&bp->blk_dva[2])) +
|
||||
(DVA_GET_VDEV(&bp->blk_dva[1]) ==
|
||||
DVA_GET_VDEV(&bp->blk_dva[2]));
|
||||
if (equal != 0)
|
||||
if (equal != 0) {
|
||||
zb->zb_ditto_samevdev++;
|
||||
|
||||
if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
|
||||
DVA_GET_VDEV(&bp->blk_dva[1]) &&
|
||||
same_metaslab(zcb->zcb_spa,
|
||||
DVA_GET_VDEV(&bp->blk_dva[0]),
|
||||
DVA_GET_OFFSET(&bp->blk_dva[0]),
|
||||
DVA_GET_OFFSET(&bp->blk_dva[1])))
|
||||
zb->zb_ditto_same_ms++;
|
||||
else if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
|
||||
DVA_GET_VDEV(&bp->blk_dva[2]) &&
|
||||
same_metaslab(zcb->zcb_spa,
|
||||
DVA_GET_VDEV(&bp->blk_dva[0]),
|
||||
DVA_GET_OFFSET(&bp->blk_dva[0]),
|
||||
DVA_GET_OFFSET(&bp->blk_dva[2])))
|
||||
zb->zb_ditto_same_ms++;
|
||||
else if (DVA_GET_VDEV(&bp->blk_dva[1]) ==
|
||||
DVA_GET_VDEV(&bp->blk_dva[2]) &&
|
||||
same_metaslab(zcb->zcb_spa,
|
||||
DVA_GET_VDEV(&bp->blk_dva[1]),
|
||||
DVA_GET_OFFSET(&bp->blk_dva[1]),
|
||||
DVA_GET_OFFSET(&bp->blk_dva[2])))
|
||||
zb->zb_ditto_same_ms++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG);
|
||||
|
||||
if (BP_IS_EMBEDDED(bp)) {
|
||||
zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
|
||||
zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
|
||||
@ -3709,6 +3763,7 @@ dump_block_stats(spa_t *spa)
|
||||
uint64_t norm_alloc, norm_space, total_alloc, total_found;
|
||||
int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
|
||||
boolean_t leaks = B_FALSE;
|
||||
int err;
|
||||
|
||||
bzero(&zcb, sizeof (zcb));
|
||||
(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
|
||||
@ -3751,8 +3806,10 @@ dump_block_stats(spa_t *spa)
|
||||
flags |= TRAVERSE_PREFETCH_DATA;
|
||||
|
||||
zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
|
||||
zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
|
||||
zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
|
||||
zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
|
||||
zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
|
||||
err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
|
||||
|
||||
/*
|
||||
* If we've traversed the data blocks then we need to wait for those
|
||||
@ -3768,6 +3825,12 @@ dump_block_stats(spa_t *spa)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Done after zio_wait() since zcb_haderrors is modified in
|
||||
* zdb_blkptr_done()
|
||||
*/
|
||||
zcb.zcb_haderrors |= err;
|
||||
|
||||
if (zcb.zcb_haderrors) {
|
||||
(void) printf("\nError counts:\n\n");
|
||||
(void) printf("\t%5s %s\n", "errno", "count");
|
||||
@ -3789,7 +3852,10 @@ dump_block_stats(spa_t *spa)
|
||||
norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
|
||||
norm_space = metaslab_class_get_space(spa_normal_class(spa));
|
||||
|
||||
total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
|
||||
total_alloc = norm_alloc +
|
||||
metaslab_class_get_alloc(spa_log_class(spa)) +
|
||||
metaslab_class_get_alloc(spa_special_class(spa)) +
|
||||
metaslab_class_get_alloc(spa_dedup_class(spa));
|
||||
total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
|
||||
zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
|
||||
|
||||
@ -3811,31 +3877,50 @@ dump_block_stats(spa_t *spa)
|
||||
return (2);
|
||||
|
||||
(void) printf("\n");
|
||||
(void) printf("\tbp count: %10llu\n",
|
||||
(void) printf("\t%-16s %14llu\n", "bp count:",
|
||||
(u_longlong_t)tzb->zb_count);
|
||||
(void) printf("\tganged count: %10llu\n",
|
||||
(void) printf("\t%-16s %14llu\n", "ganged count:",
|
||||
(longlong_t)tzb->zb_gangs);
|
||||
(void) printf("\tbp logical: %10llu avg: %6llu\n",
|
||||
(void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:",
|
||||
(u_longlong_t)tzb->zb_lsize,
|
||||
(u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
|
||||
(void) printf("\tbp physical: %10llu avg:"
|
||||
" %6llu compression: %6.2f\n",
|
||||
(u_longlong_t)tzb->zb_psize,
|
||||
(void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n",
|
||||
"bp physical:", (u_longlong_t)tzb->zb_psize,
|
||||
(u_longlong_t)(tzb->zb_psize / tzb->zb_count),
|
||||
(double)tzb->zb_lsize / tzb->zb_psize);
|
||||
(void) printf("\tbp allocated: %10llu avg:"
|
||||
" %6llu compression: %6.2f\n",
|
||||
(u_longlong_t)tzb->zb_asize,
|
||||
(void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n",
|
||||
"bp allocated:", (u_longlong_t)tzb->zb_asize,
|
||||
(u_longlong_t)(tzb->zb_asize / tzb->zb_count),
|
||||
(double)tzb->zb_lsize / tzb->zb_asize);
|
||||
(void) printf("\tbp deduped: %10llu ref>1:"
|
||||
" %6llu deduplication: %6.2f\n",
|
||||
(u_longlong_t)zcb.zcb_dedup_asize,
|
||||
(void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n",
|
||||
"bp deduped:", (u_longlong_t)zcb.zcb_dedup_asize,
|
||||
(u_longlong_t)zcb.zcb_dedup_blocks,
|
||||
(double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
|
||||
(void) printf("\tSPA allocated: %10llu used: %5.2f%%\n",
|
||||
(void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:",
|
||||
(u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
|
||||
|
||||
if (spa_special_class(spa)->mc_rotor != NULL) {
|
||||
uint64_t alloc = metaslab_class_get_alloc(
|
||||
spa_special_class(spa));
|
||||
uint64_t space = metaslab_class_get_space(
|
||||
spa_special_class(spa));
|
||||
|
||||
(void) printf("\t%-16s %14llu used: %5.2f%%\n",
|
||||
"Special class", (u_longlong_t)alloc,
|
||||
100.0 * alloc / space);
|
||||
}
|
||||
|
||||
if (spa_dedup_class(spa)->mc_rotor != NULL) {
|
||||
uint64_t alloc = metaslab_class_get_alloc(
|
||||
spa_dedup_class(spa));
|
||||
uint64_t space = metaslab_class_get_space(
|
||||
spa_dedup_class(spa));
|
||||
|
||||
(void) printf("\t%-16s %14llu used: %5.2f%%\n",
|
||||
"Dedup class", (u_longlong_t)alloc,
|
||||
100.0 * alloc / space);
|
||||
}
|
||||
|
||||
for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
|
||||
if (zcb.zcb_embedded_blocks[i] == 0)
|
||||
continue;
|
||||
@ -3857,6 +3942,10 @@ dump_block_stats(spa_t *spa)
|
||||
(void) printf("\tDittoed blocks on same vdev: %llu\n",
|
||||
(longlong_t)tzb->zb_ditto_samevdev);
|
||||
}
|
||||
if (tzb->zb_ditto_same_ms != 0) {
|
||||
(void) printf("\tDittoed blocks in same metaslab: %llu\n",
|
||||
(longlong_t)tzb->zb_ditto_same_ms);
|
||||
}
|
||||
|
||||
for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) {
|
||||
vdev_t *vd = spa->spa_root_vdev->vdev_child[v];
|
||||
|
@ -1134,8 +1134,23 @@ This feature must be enabled to be used
|
||||
.Po see
|
||||
.Xr zpool-features 7
|
||||
.Pc .
|
||||
.It Sy special_small_blocks Ns = Ns Ar size
|
||||
This value represents the threshold block size for including small file
|
||||
blocks into the special allocation class.
|
||||
Blocks smaller than or equal to this value will be assigned to the special
|
||||
allocation class while greater blocks will be assigned to the regular class.
|
||||
Valid values are zero or a power of two from 512B up to 128K.
|
||||
The default size is 0 which means no small file blocks will be allocated in
|
||||
the special class.
|
||||
.Pp
|
||||
Before setting this property, a special class vdev must be added to the
|
||||
pool.
|
||||
See
|
||||
.Xr zpool 8
|
||||
for more details on the special allocation class.
|
||||
.It Sy mountpoint Ns = Ns Ar path | Cm none | legacy
|
||||
Controls the mount point used for this file system. See the
|
||||
Controls the mount point used for this file system.
|
||||
See the
|
||||
.Qq Sx Mount Points
|
||||
section for more information on how this property is used.
|
||||
.Pp
|
||||
@ -3023,7 +3038,7 @@ property of the filesystem or volume which is received into.
|
||||
To use this flag, the storage pool must have the
|
||||
.Sy extensible_dataset
|
||||
feature enabled. See
|
||||
.Xr zpool-features 5
|
||||
.Xr zpool-features 7
|
||||
for details on ZFS feature flags.
|
||||
.El
|
||||
.It Xo
|
||||
|
@ -632,6 +632,25 @@ and will return to being
|
||||
once all filesystems that have ever had their checksum set to
|
||||
.Sy skein
|
||||
are destroyed.
|
||||
.It Sy allocation_classes
|
||||
.Bl -column "READ\-ONLY COMPATIBLE" "com.intel:allocation_classes"
|
||||
.It GUID Ta com.intel:allocation_classes
|
||||
.It READ\-ONLY COMPATIBLE Ta yes
|
||||
.It DEPENDENCIES Ta none
|
||||
.El
|
||||
.Pp
|
||||
This feature enables support for separate allocation classes.
|
||||
.Pp
|
||||
This feature becomes
|
||||
.Sy active
|
||||
when a dedicated allocation class vdev
|
||||
(dedup or special) is created with
|
||||
.Dq zpool create
|
||||
or
|
||||
.Dq zpool add .
|
||||
With device removal, it can be returned to the
|
||||
.Sy enabled
|
||||
state if all the top-level vdevs from an allocation class are removed.
|
||||
.El
|
||||
.Sh SEE ALSO
|
||||
.Xr zpool 8
|
||||
|
@ -24,6 +24,8 @@
|
||||
.\" Copyright (c) 2012, 2017 by Delphix. All Rights Reserved.
|
||||
.\" Copyright 2017 Nexenta Systems, Inc.
|
||||
.\" Copyright (c) 2017 Datto Inc.
|
||||
.\" Copyright (c) 2017 George Melikov. All Rights Reserved.
|
||||
.\" Copyright 2019 Joyent, Inc.
|
||||
.\"
|
||||
.\" $FreeBSD$
|
||||
.\"
|
||||
@ -38,7 +40,7 @@
|
||||
.Op Fl \&?
|
||||
.Nm
|
||||
.Cm add
|
||||
.Op Fl fn
|
||||
.Op Fl fgLnP
|
||||
.Ar pool vdev ...
|
||||
.Nm
|
||||
.Cm attach
|
||||
@ -127,17 +129,19 @@
|
||||
.Op Ar device Ns ...
|
||||
.Nm
|
||||
.Cm iostat
|
||||
.Op Fl T Cm d Ns | Ns Cm u
|
||||
.Op Fl v
|
||||
.Op Fl T Cm d Ns | Ns Cm u
|
||||
.Op Fl gLP
|
||||
.Op Ar pool
|
||||
.Ar ...
|
||||
.Op Ar interval Op Ar count
|
||||
.Nm
|
||||
.Cm labelclear
|
||||
.Op Fl f
|
||||
.Ar device
|
||||
.Nm
|
||||
.Cm list
|
||||
.Op Fl Hpv
|
||||
.Op Fl HgLpPv
|
||||
.Op Fl o Ar property Ns Op , Ns Ar ...
|
||||
.Op Fl T Cm d Ns | Ns Cm u
|
||||
.Op Ar pool
|
||||
@ -179,7 +183,7 @@
|
||||
.Ar property Ns = Ns Ar value pool
|
||||
.Nm
|
||||
.Cm split
|
||||
.Op Fl n
|
||||
.Op Fl gLnP
|
||||
.Op Fl R Ar altroot
|
||||
.Op Fl o Ar mntopts
|
||||
.Op Fl o Ar property Ns = Ns Ar value
|
||||
@ -187,7 +191,7 @@
|
||||
.Op Ar device ...
|
||||
.Nm
|
||||
.Cm status
|
||||
.Op Fl Dvx
|
||||
.Op Fl DgLPvx
|
||||
.Op Fl T Cm d Ns | Ns Cm u
|
||||
.Op Ar pool
|
||||
.Ar ...
|
||||
@ -320,11 +324,27 @@ types are not supported for the intent log. For more information,
|
||||
see the
|
||||
.Qq Sx Intent Log
|
||||
section.
|
||||
.It Sy dedup
|
||||
A device dedicated solely for allocating dedup data.
|
||||
The redundancy of this device should match the redundancy of the other normal
|
||||
devices in the pool.
|
||||
If more than one dedup device is specified, then allocations are load-balanced
|
||||
between devices.
|
||||
.It Sy special
|
||||
A device dedicated solely for allocating various kinds of internal metadata,
|
||||
and optionally small file data.
|
||||
The redundancy of this device should match the redundancy of the other normal
|
||||
devices in the pool.
|
||||
If more than one special device is specified, then allocations are
|
||||
load-balanced between devices.
|
||||
.Pp
|
||||
For more information on special allocations, see the
|
||||
.Sx Special Allocation Class
|
||||
section.
|
||||
.It Sy cache
|
||||
A device used to cache storage pool data. A cache device cannot be configured
|
||||
as a mirror or
|
||||
.No raidz
|
||||
group. For more information, see the
|
||||
A device used to cache storage pool data.
|
||||
A cache device cannot be configured as a mirror or raidz group.
|
||||
For more information, see the
|
||||
.Qq Sx Cache Devices
|
||||
section.
|
||||
.El
|
||||
@ -602,6 +622,31 @@ zfs properties) may be unenforceable while a checkpoint exists, because the
|
||||
checkpoint is allowed to consume the dataset's reservation.
|
||||
Finally, data that is part of the checkpoint but has been freed in the
|
||||
current state of the pool won't be scanned during a scrub.
|
||||
.Ss Special Allocation Class
|
||||
The allocations in the special class are dedicated to specific block types.
|
||||
By default this includes all metadata, the indirect blocks of user data, and
|
||||
any dedup data.
|
||||
The class can also be provisioned to accept a limited percentage of small file
|
||||
data blocks.
|
||||
.Pp
|
||||
A pool must always have at least one general (non-specified) vdev before
|
||||
other devices can be assigned to the special class.
|
||||
If the special class becomes full, then allocations intended for it will spill
|
||||
back into the normal class.
|
||||
.Pp
|
||||
Dedup data can be excluded from the special class by setting the
|
||||
.Sy vfs.zfs.ddt_data_is_special
|
||||
sysctl to false (0).
|
||||
.Pp
|
||||
Inclusion of small file blocks in the special class is opt-in.
|
||||
Each dataset can control the size of small file blocks allowed in the special
|
||||
class by setting the
|
||||
.Sy special_small_blocks
|
||||
dataset property.
|
||||
It defaults to zero so you must opt-in by setting it to a non-zero value.
|
||||
See
|
||||
.Xr zfs 8
|
||||
for more info on setting this property.
|
||||
.Ss Properties
|
||||
Each pool has several properties associated with it. Some properties are
|
||||
read-only statistics while others are configurable and change the behavior of
|
||||
@ -872,7 +917,7 @@ Displays a help message.
|
||||
.It Xo
|
||||
.Nm
|
||||
.Cm add
|
||||
.Op Fl fn
|
||||
.Op Fl fgLnP
|
||||
.Ar pool vdev ...
|
||||
.Xc
|
||||
.Pp
|
||||
@ -891,11 +936,30 @@ Forces use of
|
||||
.Ar vdev ,
|
||||
even if they appear in use or specify a conflicting replication level.
|
||||
Not all devices can be overridden in this manner.
|
||||
.It Fl g
|
||||
Display
|
||||
.Ar vdev
|
||||
GUIDs instead of the normal device names.
|
||||
These GUIDs can be used in place of
|
||||
device names for the zpool detach/offline/remove/replace commands.
|
||||
.It Fl L
|
||||
Display real paths for
|
||||
.Ar vdev Ns s
|
||||
resolving all symbolic links.
|
||||
This can be used to look up the current block
|
||||
device name regardless of the /dev/disk/ path used to open it.
|
||||
.It Fl n
|
||||
Displays the configuration that would be used without actually adding the
|
||||
.Ar vdev Ns s.
|
||||
The actual pool creation can still fail due to insufficient privileges or device
|
||||
sharing.
|
||||
The actual pool creation can still fail due to insufficient privileges or
|
||||
device sharing.
|
||||
.It Fl P
|
||||
Display full paths for
|
||||
.Ar vdev Ns s
|
||||
instead of only the last component of the path.
|
||||
This can be used in conjunction with the
|
||||
.Fl L
|
||||
flag.
|
||||
.El
|
||||
.It Xo
|
||||
.Nm
|
||||
@ -1512,7 +1576,7 @@ with no flags on the relevant target devices.
|
||||
.Nm
|
||||
.Cm iostat
|
||||
.Op Fl T Cm d Ns | Ns Cm u
|
||||
.Op Fl v
|
||||
.Op Fl gLPv
|
||||
.Op Ar pool
|
||||
.Ar ...
|
||||
.Op Ar interval Op Ar count
|
||||
@ -1544,10 +1608,25 @@ Use modifier
|
||||
.Cm u
|
||||
for unixtime
|
||||
.Pq equals Qq Ic date +%s .
|
||||
.It Fl g
|
||||
Display vdev GUIDs instead of the normal device names.
|
||||
These GUIDs can be used in place of device names for the zpool
|
||||
detach/offline/remove/replace commands.
|
||||
.It Fl L
|
||||
Display real paths for vdevs resolving all symbolic links.
|
||||
This can be used to look up the current block device name regardless of the
|
||||
.Pa /dev/disk/
|
||||
path used to open it.
|
||||
.It Fl P
|
||||
Display full paths for vdevs instead of only the last component of
|
||||
the path.
|
||||
This can be used in conjunction with the
|
||||
.Fl L
|
||||
flag.
|
||||
.It Fl v
|
||||
Verbose statistics. Reports usage statistics for individual
|
||||
.No vdev Ns s
|
||||
within the pool, in addition to the pool-wide statistics.
|
||||
Verbose statistics.
|
||||
Reports usage statistics for individual vdevs within the
|
||||
pool, in addition to the pool-wide statistics.
|
||||
.El
|
||||
.It Xo
|
||||
.Nm
|
||||
@ -1570,7 +1649,7 @@ Treat exported or foreign devices as inactive.
|
||||
.It Xo
|
||||
.Nm
|
||||
.Cm list
|
||||
.Op Fl Hpv
|
||||
.Op Fl HgLpPv
|
||||
.Op Fl o Ar property Ns Op , Ns Ar ...
|
||||
.Op Fl T Cm d Ns | Ns Cm u
|
||||
.Op Ar pool
|
||||
@ -1603,11 +1682,27 @@ Use modifier
|
||||
.Cm u
|
||||
for unixtime
|
||||
.Pq equals Qq Ic date +%s .
|
||||
.It Fl g
|
||||
Display vdev GUIDs instead of the normal device names.
|
||||
These GUIDs can be used in place of device names for the zpool
|
||||
detach/offline/remove/replace commands.
|
||||
.It Fl H
|
||||
Scripted mode. Do not display headers, and separate fields by a single tab
|
||||
instead of arbitrary space.
|
||||
.It Fl L
|
||||
Display real paths for vdevs resolving all symbolic links.
|
||||
This can be used to look up the current block device name regardless of the
|
||||
/dev/disk/ path used to open it.
|
||||
.It Fl p
|
||||
Display numbers in parsable (exact) values.
|
||||
Display numbers in parsable
|
||||
.Pq exact
|
||||
values.
|
||||
.It Fl P
|
||||
Display full paths for vdevs instead of only the last component of
|
||||
the path.
|
||||
This can be used in conjunction with the
|
||||
.Fl L
|
||||
flag.
|
||||
.It Fl v
|
||||
Verbose statistics. Reports usage statistics for individual
|
||||
.Em vdevs
|
||||
@ -1702,7 +1797,7 @@ the background.
|
||||
The removal progress can be monitored with
|
||||
.Nm zpool Cm status .
|
||||
This feature must be enabled to be used, see
|
||||
.Xr zpool-features 5
|
||||
.Xr zpool-features 7
|
||||
.Pp
|
||||
A mirrored top-level device (log or data) can be removed by specifying the
|
||||
top-level mirror for the same.
|
||||
@ -1844,7 +1939,7 @@ values.
|
||||
.It Xo
|
||||
.Nm
|
||||
.Cm split
|
||||
.Op Fl n
|
||||
.Op Fl gLnP
|
||||
.Op Fl R Ar altroot
|
||||
.Op Fl o Ar mntopts
|
||||
.Op Fl o Ar property Ns = Ns Ar value
|
||||
@ -1884,6 +1979,15 @@ parameter for the new pool's alternate root. See the
|
||||
description in the
|
||||
.Qq Sx Properties
|
||||
section, above.
|
||||
.It Fl g
|
||||
Display vdev GUIDs instead of the normal device names.
|
||||
These GUIDs can be used in place of device names for the zpool
|
||||
detach/offline/remove/replace commands.
|
||||
.It Fl L
|
||||
Display real paths for vdevs resolving all symbolic links.
|
||||
This can be used to look up the current block device name regardless of the
|
||||
.Pa /dev/disk/
|
||||
path used to open it.
|
||||
.It Fl n
|
||||
Displays the configuration that would be created without actually splitting the
|
||||
pool. The actual pool split could still fail due to insufficient privileges or
|
||||
@ -1900,11 +2004,17 @@ option.
|
||||
Sets the specified property on the new pool. See the
|
||||
.Qq Sx Properties
|
||||
section, above, for more information on the available pool properties.
|
||||
.It Fl P
|
||||
Display full paths for vdevs instead of only the last component of
|
||||
the path.
|
||||
This can be used in conjunction with the
|
||||
.Fl L
|
||||
flag.
|
||||
.El
|
||||
.It Xo
|
||||
.Nm
|
||||
.Cm status
|
||||
.Op Fl Dvx
|
||||
.Op Fl DgLPvx
|
||||
.Op Fl T Cm d Ns | Ns Cm u
|
||||
.Op Ar pool
|
||||
.Ar ...
|
||||
@ -1939,6 +2049,21 @@ Display a histogram of deduplication statistics, showing the allocated
|
||||
and referenced
|
||||
.Pq logically referenced in the pool
|
||||
block counts and sizes by reference count.
|
||||
.It Fl g
|
||||
Display vdev GUIDs instead of the normal device names.
|
||||
These GUIDs can be used in place of device names for the zpool
|
||||
detach/offline/remove/replace commands.
|
||||
.It Fl L
|
||||
Display real paths for vdevs resolving all symbolic links.
|
||||
This can be used to look up the current block device name regardless of the
|
||||
.Pa /dev/disk/
|
||||
path used to open it.
|
||||
.It Fl P
|
||||
Display full paths for vdevs instead of only the last component of
|
||||
the path.
|
||||
This can be used in conjunction with the
|
||||
.Fl L
|
||||
flag.
|
||||
.It Fl T Cm d Ns | Ns Cm u
|
||||
Print a timestamp.
|
||||
.Pp
|
||||
@ -2030,6 +2155,30 @@ An error occurred.
|
||||
.It 2
|
||||
Invalid command line options were specified.
|
||||
.El
|
||||
.Sh ENVIRONMENT VARIABLES
|
||||
.Bl -tag -width "ZPOOL_VDEV_NAME_FOLLOW_LINKS"
|
||||
.It Ev ZPOOL_VDEV_NAME_GUID
|
||||
Cause
|
||||
.Nm zpool
|
||||
subcommands to output vdev GUIDs by default.
|
||||
This behavior is identical to the
|
||||
.Nm zpool status -g
|
||||
command line option.
|
||||
.It Ev ZPOOL_VDEV_NAME_FOLLOW_LINKS
|
||||
Cause
|
||||
.Nm zpool
|
||||
subcommands to follow links for vdev names by default.
|
||||
This behavior is identical to the
|
||||
.Nm zpool status -L
|
||||
command line option.
|
||||
.It Ev ZPOOL_VDEV_NAME_PATH
|
||||
Cause
|
||||
.Nm zpool
|
||||
subcommands to output full vdev path names by default.
|
||||
This behavior is identical to the
|
||||
.Nm zpool status -P
|
||||
command line option.
|
||||
.El
|
||||
.Sh EXAMPLES
|
||||
.Bl -tag -width 0n
|
||||
.It Sy Example 1 No Creating a RAID-Z Storage Pool
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -21,7 +21,8 @@
|
||||
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2016, 2017 Intel Corporation.
|
||||
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
|
||||
*/
|
||||
|
||||
@ -508,6 +509,9 @@ make_leaf_vdev(const char *arg, uint64_t is_log)
|
||||
verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
|
||||
verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
|
||||
verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
|
||||
if (is_log)
|
||||
verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS,
|
||||
VDEV_ALLOC_BIAS_LOG) == 0);
|
||||
if (strcmp(type, VDEV_TYPE_DISK) == 0)
|
||||
verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
|
||||
(uint64_t)wholedisk) == 0);
|
||||
@ -564,6 +568,9 @@ make_leaf_vdev(const char *arg, uint64_t is_log)
|
||||
*
|
||||
* Otherwise, make sure that the current spec (if there is one) and the new
|
||||
* spec have consistent replication levels.
|
||||
*
|
||||
* If there is no current spec (create), make sure new spec has at least
|
||||
* one general purpose vdev.
|
||||
*/
|
||||
typedef struct replication_level {
|
||||
char *zprl_type;
|
||||
@ -573,6 +580,19 @@ typedef struct replication_level {
|
||||
|
||||
#define ZPOOL_FUZZ (16 * 1024 * 1024)
|
||||
|
||||
static boolean_t
|
||||
is_raidz_mirror(replication_level_t *a, replication_level_t *b,
|
||||
replication_level_t **raidz, replication_level_t **mirror)
|
||||
{
|
||||
if (strcmp(a->zprl_type, "raidz") == 0 &&
|
||||
strcmp(b->zprl_type, "mirror") == 0) {
|
||||
*raidz = a;
|
||||
*mirror = b;
|
||||
return (B_TRUE);
|
||||
}
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a list of toplevel vdevs, return the current replication level. If
|
||||
* the config is inconsistent, then NULL is returned. If 'fatal' is set, then
|
||||
@ -590,6 +610,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
|
||||
replication_level_t lastrep = {0};
|
||||
replication_level_t rep;
|
||||
replication_level_t *ret;
|
||||
replication_level_t *raidz, *mirror;
|
||||
boolean_t dontreport;
|
||||
|
||||
ret = safe_malloc(sizeof (replication_level_t));
|
||||
@ -787,11 +808,39 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
|
||||
|
||||
/*
|
||||
* At this point, we have the replication of the last toplevel
|
||||
* vdev in 'rep'. Compare it to 'lastrep' to see if its
|
||||
* vdev in 'rep'. Compare it to 'lastrep' to see if it is
|
||||
* different.
|
||||
*/
|
||||
if (lastrep.zprl_type != NULL) {
|
||||
if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) {
|
||||
if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) ||
|
||||
is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) {
|
||||
/*
|
||||
* Accept raidz and mirror when they can
|
||||
* handle the same number of disk failures.
|
||||
*/
|
||||
if (raidz->zprl_parity !=
|
||||
mirror->zprl_children - 1) {
|
||||
if (ret != NULL)
|
||||
free(ret);
|
||||
ret = NULL;
|
||||
if (fatal)
|
||||
vdev_error(gettext(
|
||||
"mismatched replication "
|
||||
"level: "
|
||||
"%s and %s vdevs with "
|
||||
"different redundancy, "
|
||||
"%llu vs. %llu (%llu-way) "
|
||||
"are present\n"),
|
||||
raidz->zprl_type,
|
||||
mirror->zprl_type,
|
||||
raidz->zprl_parity,
|
||||
mirror->zprl_children - 1,
|
||||
mirror->zprl_children);
|
||||
else
|
||||
return (NULL);
|
||||
}
|
||||
} else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
|
||||
0) {
|
||||
if (ret != NULL)
|
||||
free(ret);
|
||||
ret = NULL;
|
||||
@ -854,6 +903,7 @@ check_replication(nvlist_t *config, nvlist_t *newroot)
|
||||
nvlist_t **child;
|
||||
uint_t children;
|
||||
replication_level_t *current = NULL, *new;
|
||||
replication_level_t *raidz, *mirror;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
@ -901,7 +951,21 @@ check_replication(nvlist_t *config, nvlist_t *newroot)
|
||||
*/
|
||||
ret = 0;
|
||||
if (current != NULL) {
|
||||
if (strcmp(current->zprl_type, new->zprl_type) != 0) {
|
||||
if (is_raidz_mirror(current, new, &raidz, &mirror) ||
|
||||
is_raidz_mirror(new, current, &raidz, &mirror)) {
|
||||
if (raidz->zprl_parity != mirror->zprl_children - 1) {
|
||||
vdev_error(gettext(
|
||||
"mismatched replication level: pool and "
|
||||
"new vdev with different redundancy, %s "
|
||||
"and %s vdevs, %llu vs. %llu (%llu-way)\n"),
|
||||
raidz->zprl_type,
|
||||
mirror->zprl_type,
|
||||
raidz->zprl_parity,
|
||||
mirror->zprl_children - 1,
|
||||
mirror->zprl_children);
|
||||
ret = -1;
|
||||
}
|
||||
} else if (strcmp(current->zprl_type, new->zprl_type) != 0) {
|
||||
vdev_error(gettext(
|
||||
"mismatched replication level: pool uses %s "
|
||||
"and new vdev is %s\n"),
|
||||
@ -1237,6 +1301,13 @@ is_grouping(const char *type, int *mindev, int *maxdev)
|
||||
return (VDEV_TYPE_LOG);
|
||||
}
|
||||
|
||||
if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 ||
|
||||
strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
|
||||
if (mindev != NULL)
|
||||
*mindev = 1;
|
||||
return (type);
|
||||
}
|
||||
|
||||
if (strcmp(type, "cache") == 0) {
|
||||
if (mindev != NULL)
|
||||
*mindev = 1;
|
||||
@ -1258,7 +1329,7 @@ construct_spec(int argc, char **argv)
|
||||
nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
|
||||
int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
|
||||
const char *type;
|
||||
uint64_t is_log;
|
||||
uint64_t is_log, is_special, is_dedup;
|
||||
boolean_t seen_logs;
|
||||
|
||||
top = NULL;
|
||||
@ -1268,7 +1339,7 @@ construct_spec(int argc, char **argv)
|
||||
nspares = 0;
|
||||
nlogs = 0;
|
||||
nl2cache = 0;
|
||||
is_log = B_FALSE;
|
||||
is_log = is_special = is_dedup = B_FALSE;
|
||||
seen_logs = B_FALSE;
|
||||
|
||||
while (argc > 0) {
|
||||
@ -1290,7 +1361,7 @@ construct_spec(int argc, char **argv)
|
||||
"specified only once\n"));
|
||||
return (NULL);
|
||||
}
|
||||
is_log = B_FALSE;
|
||||
is_log = is_special = is_dedup = B_FALSE;
|
||||
}
|
||||
|
||||
if (strcmp(type, VDEV_TYPE_LOG) == 0) {
|
||||
@ -1303,6 +1374,8 @@ construct_spec(int argc, char **argv)
|
||||
}
|
||||
seen_logs = B_TRUE;
|
||||
is_log = B_TRUE;
|
||||
is_special = B_FALSE;
|
||||
is_dedup = B_FALSE;
|
||||
argc--;
|
||||
argv++;
|
||||
/*
|
||||
@ -1312,6 +1385,24 @@ construct_spec(int argc, char **argv)
|
||||
continue;
|
||||
}
|
||||
|
||||
if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
|
||||
is_special = B_TRUE;
|
||||
is_log = B_FALSE;
|
||||
is_dedup = B_FALSE;
|
||||
argc--;
|
||||
argv++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
|
||||
is_dedup = B_TRUE;
|
||||
is_log = B_FALSE;
|
||||
is_special = B_FALSE;
|
||||
argc--;
|
||||
argv++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
|
||||
if (l2cache != NULL) {
|
||||
(void) fprintf(stderr,
|
||||
@ -1320,15 +1411,16 @@ construct_spec(int argc, char **argv)
|
||||
"specified only once\n"));
|
||||
return (NULL);
|
||||
}
|
||||
is_log = B_FALSE;
|
||||
is_log = is_special = is_dedup = B_FALSE;
|
||||
}
|
||||
|
||||
if (is_log) {
|
||||
if (is_log || is_special || is_dedup) {
|
||||
if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
|
||||
(void) fprintf(stderr,
|
||||
gettext("invalid vdev "
|
||||
"specification: unsupported 'log' "
|
||||
"device: %s\n"), type);
|
||||
"specification: unsupported '%s' "
|
||||
"device: %s\n"), is_log ? "log" :
|
||||
"special", type);
|
||||
return (NULL);
|
||||
}
|
||||
nlogs++;
|
||||
@ -1374,12 +1466,27 @@ construct_spec(int argc, char **argv)
|
||||
nl2cache = children;
|
||||
continue;
|
||||
} else {
|
||||
/* create a top-level vdev with children */
|
||||
verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
|
||||
0) == 0);
|
||||
verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
|
||||
type) == 0);
|
||||
verify(nvlist_add_uint64(nv,
|
||||
ZPOOL_CONFIG_IS_LOG, is_log) == 0);
|
||||
if (is_log)
|
||||
verify(nvlist_add_string(nv,
|
||||
ZPOOL_CONFIG_ALLOCATION_BIAS,
|
||||
VDEV_ALLOC_BIAS_LOG) == 0);
|
||||
if (is_special) {
|
||||
verify(nvlist_add_string(nv,
|
||||
ZPOOL_CONFIG_ALLOCATION_BIAS,
|
||||
VDEV_ALLOC_BIAS_SPECIAL) == 0);
|
||||
}
|
||||
if (is_dedup) {
|
||||
verify(nvlist_add_string(nv,
|
||||
ZPOOL_CONFIG_ALLOCATION_BIAS,
|
||||
VDEV_ALLOC_BIAS_DEDUP) == 0);
|
||||
}
|
||||
if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
|
||||
verify(nvlist_add_uint64(nv,
|
||||
ZPOOL_CONFIG_NPARITY,
|
||||
@ -1402,6 +1509,16 @@ construct_spec(int argc, char **argv)
|
||||
return (NULL);
|
||||
if (is_log)
|
||||
nlogs++;
|
||||
if (is_special) {
|
||||
verify(nvlist_add_string(nv,
|
||||
ZPOOL_CONFIG_ALLOCATION_BIAS,
|
||||
VDEV_ALLOC_BIAS_SPECIAL) == 0);
|
||||
}
|
||||
if (is_dedup) {
|
||||
verify(nvlist_add_string(nv,
|
||||
ZPOOL_CONFIG_ALLOCATION_BIAS,
|
||||
VDEV_ALLOC_BIAS_DEDUP) == 0);
|
||||
}
|
||||
argc--;
|
||||
argv++;
|
||||
}
|
||||
@ -1513,6 +1630,30 @@ split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
|
||||
return (newroot);
|
||||
}
|
||||
|
||||
/*
 * Count the top-level vdevs in nvroot that belong to the normal class:
 * children of ZPOOL_CONFIG_CHILDREN that are not marked as log devices
 * and carry no ZPOOL_CONFIG_ALLOCATION_BIAS entry.  The children array
 * must be present (the lookup is wrapped in verify()).
 */
static int
|
||||
num_normal_vdevs(nvlist_t *nvroot)
|
||||
{
|
||||
nvlist_t **top;
|
||||
uint_t t, toplevels, normal = 0;
|
||||
|
||||
verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
|
||||
&top, &toplevels) == 0);
|
||||
|
||||
for (t = 0; t < toplevels; t++) {
|
||||
/* stays B_FALSE when the IS_LOG key is absent (result ignored) */
uint64_t log = B_FALSE;
|
||||
|
||||
(void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log);
|
||||
if (log)
|
||||
continue;
|
||||
/* any allocation bias at all excludes the vdev from the count */
if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS))
|
||||
continue;
|
||||
|
||||
normal++;
|
||||
}
|
||||
|
||||
return (normal);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get and validate the contents of the given vdev specification. This ensures
|
||||
* that the nvlist returned is well-formed, that all the devices exist, and that
|
||||
@ -1565,6 +1706,16 @@ make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
|
||||
}
|
||||
|
||||
#ifdef illumos
|
||||
/*
|
||||
* On pool create the new vdev spec must have one normal vdev.
|
||||
*/
|
||||
if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) {
|
||||
vdev_error(gettext("at least one general top-level vdev must "
|
||||
"be specified\n"));
|
||||
nvlist_free(newroot);
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Run through the vdev specification and label any whole disks found.
|
||||
*/
|
||||
|
@ -20,12 +20,13 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
||||
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
|
||||
* Copyright (c) 2013 Steven Hartland. All rights reserved.
|
||||
* Copyright (c) 2014 Integros [integros.com]
|
||||
* Copyright 2017 Joyent, Inc.
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
* Copyright 2017 RackTop Systems.
|
||||
*/
|
||||
|
||||
@ -146,6 +147,12 @@ typedef struct ztest_shared_hdr {
|
||||
|
||||
static ztest_shared_hdr_t *ztest_shared_hdr;
|
||||
|
||||
/*
 * Setting for the "-C special=on|off|random" option.  The chosen value is
 * stored in zo_special_vdevs and consulted by ztest_vdev_class_add().
 */
enum ztest_class_state {
|
||||
ZTEST_VDEV_CLASS_OFF,	/* ztest_vdev_class_add() always returns early */
|
||||
ZTEST_VDEV_CLASS_ON,	/* ztest_vdev_class_add() never skips on this setting */
|
||||
ZTEST_VDEV_CLASS_RND	/* skip when ztest_random(2) == 0 (the default) */
|
||||
};
|
||||
|
||||
typedef struct ztest_shared_opts {
|
||||
char zo_pool[ZFS_MAX_DATASET_NAME_LEN];
|
||||
char zo_dir[ZFS_MAX_DATASET_NAME_LEN];
|
||||
@ -168,6 +175,7 @@ typedef struct ztest_shared_opts {
|
||||
uint64_t zo_maxloops;
|
||||
uint64_t zo_metaslab_force_ganging;
|
||||
int zo_mmp_test;
|
||||
int zo_special_vdevs;
|
||||
} ztest_shared_opts_t;
|
||||
|
||||
static const ztest_shared_opts_t ztest_opts_defaults = {
|
||||
@ -190,7 +198,8 @@ static const ztest_shared_opts_t ztest_opts_defaults = {
|
||||
.zo_init = 1,
|
||||
.zo_time = 300, /* 5 minutes */
|
||||
.zo_maxloops = 50, /* max loops during spa_freeze() */
|
||||
.zo_metaslab_force_ganging = 32 << 10
|
||||
.zo_metaslab_force_ganging = 32 << 10,
|
||||
.zo_special_vdevs = ZTEST_VDEV_CLASS_RND,
|
||||
};
|
||||
|
||||
extern uint64_t metaslab_force_ganging;
|
||||
@ -352,6 +361,7 @@ ztest_func_t ztest_dsl_dataset_promote_busy;
|
||||
ztest_func_t ztest_vdev_attach_detach;
|
||||
ztest_func_t ztest_vdev_LUN_growth;
|
||||
ztest_func_t ztest_vdev_add_remove;
|
||||
ztest_func_t ztest_vdev_class_add;
|
||||
ztest_func_t ztest_vdev_aux_add_remove;
|
||||
ztest_func_t ztest_split_pool;
|
||||
ztest_func_t ztest_reguid;
|
||||
@ -401,6 +411,8 @@ ztest_info_t ztest_info[] = {
|
||||
{ ztest_vdev_LUN_growth, 1, &zopt_rarely },
|
||||
{ ztest_vdev_add_remove, 1,
|
||||
&ztest_opts.zo_vdevtime },
|
||||
{ ztest_vdev_class_add, 1,
|
||||
&ztest_opts.zo_vdevtime },
|
||||
{ ztest_vdev_aux_add_remove, 1,
|
||||
&ztest_opts.zo_vdevtime },
|
||||
{ ztest_device_removal, 1, &zopt_sometimes },
|
||||
@ -613,6 +625,7 @@ usage(boolean_t requested)
|
||||
"\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n"
|
||||
"\t[-P passtime (default: %llu sec)] time per pass\n"
|
||||
"\t[-B alt_ztest (default: <none>)] alternate ztest path\n"
|
||||
"\t[-C vdev class state (default: random)] special=on|off|random\n"
|
||||
"\t[-o variable=value] ... set global variable to an unsigned\n"
|
||||
"\t 32-bit integer value\n"
|
||||
"\t[-h] (print help)\n"
|
||||
@ -637,6 +650,46 @@ usage(boolean_t requested)
|
||||
exit(requested ? 0 : 1);
|
||||
}
|
||||
|
||||
|
||||
/*
 * Parse the "name=value" argument of the -C option and record it in zo.
 *
 * The only name accepted is "special"; the value must be "on", "off" or
 * "random" and is stored in zo->zo_special_vdevs as a ztest_class_state.
 * A missing '=', an unknown value or an unknown name prints a message to
 * stderr and calls usage(B_FALSE), which ends in exit().
 */
static void
|
||||
ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo)
|
||||
{
|
||||
char name[32];
|
||||
char *value;
|
||||
int state = ZTEST_VDEV_CLASS_RND;
|
||||
|
||||
/* work on a private copy; input beyond sizeof (name) - 1 is truncated */
(void) strlcpy(name, input, sizeof (name));
|
||||
|
||||
value = strchr(name, '=');
|
||||
if (value == NULL) {
|
||||
(void) fprintf(stderr, "missing value in property=value "
|
||||
"'-C' argument (%s)\n", input);
|
||||
usage(B_FALSE);
|
||||
}
|
||||
/* split in place: name ends at the '=', value starts right after it */
*(value) = '\0';
|
||||
value++;
|
||||
|
||||
if (strcmp(value, "on") == 0) {
|
||||
state = ZTEST_VDEV_CLASS_ON;
|
||||
} else if (strcmp(value, "off") == 0) {
|
||||
state = ZTEST_VDEV_CLASS_OFF;
|
||||
} else if (strcmp(value, "random") == 0) {
|
||||
state = ZTEST_VDEV_CLASS_RND;
|
||||
} else {
|
||||
(void) fprintf(stderr, "invalid property value '%s'\n", value);
|
||||
usage(B_FALSE);
|
||||
}
|
||||
|
||||
/* the value is validated first, the property name second */
if (strcmp(name, "special") == 0) {
|
||||
zo->zo_special_vdevs = state;
|
||||
} else {
|
||||
(void) fprintf(stderr, "invalid property name '%s'\n", name);
|
||||
usage(B_FALSE);
|
||||
}
|
||||
if (zo->zo_verbose >= 3)
|
||||
(void) printf("%s vdev state is '%s'\n", name, value);
|
||||
}
|
||||
|
||||
static void
|
||||
process_options(int argc, char **argv)
|
||||
{
|
||||
@ -650,7 +703,7 @@ process_options(int argc, char **argv)
|
||||
bcopy(&ztest_opts_defaults, zo, sizeof (*zo));
|
||||
|
||||
while ((opt = getopt(argc, argv,
|
||||
"v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:o:")) != EOF) {
|
||||
"v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:")) != EOF) {
|
||||
value = 0;
|
||||
switch (opt) {
|
||||
case 'v':
|
||||
@ -740,6 +793,9 @@ process_options(int argc, char **argv)
|
||||
case 'B':
|
||||
(void) strlcpy(altdir, optarg, sizeof (altdir));
|
||||
break;
|
||||
case 'C':
|
||||
ztest_parse_name_value(optarg, zo);
|
||||
break;
|
||||
case 'o':
|
||||
if (set_global_var(optarg) != 0)
|
||||
usage(B_FALSE);
|
||||
@ -962,13 +1018,16 @@ make_vdev_mirror(char *path, char *aux, char *pool, size_t size,
|
||||
|
||||
static nvlist_t *
|
||||
make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift,
|
||||
int log, int r, int m, int t)
|
||||
const char *class, int r, int m, int t)
|
||||
{
|
||||
nvlist_t *root, **child;
|
||||
int c;
|
||||
boolean_t log;
|
||||
|
||||
ASSERT(t > 0);
|
||||
|
||||
log = (class != NULL && strcmp(class, "log") == 0);
|
||||
|
||||
child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);
|
||||
|
||||
for (c = 0; c < t; c++) {
|
||||
@ -976,6 +1035,12 @@ make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift,
|
||||
r, m);
|
||||
VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
|
||||
log) == 0);
|
||||
|
||||
if (class != NULL && class[0] != '\0') {
|
||||
ASSERT(m > 1 || log); /* expecting a mirror */
|
||||
VERIFY(nvlist_add_string(child[c],
|
||||
ZPOOL_CONFIG_ALLOCATION_BIAS, class) == 0);
|
||||
}
|
||||
}
|
||||
|
||||
VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
|
||||
@ -1016,6 +1081,9 @@ static int
|
||||
ztest_random_blocksize(void)
|
||||
{
|
||||
uint64_t block_shift;
|
||||
|
||||
ASSERT(ztest_spa->spa_max_ashift != 0);
|
||||
|
||||
/*
|
||||
* Choose a block size >= the ashift.
|
||||
* If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
|
||||
@ -2495,7 +2563,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
|
||||
/*
|
||||
* Attempt to create using a bad file.
|
||||
*/
|
||||
nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
|
||||
nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1);
|
||||
VERIFY3U(ENOENT, ==,
|
||||
spa_create("ztest_bad_file", nvroot, NULL, NULL));
|
||||
nvlist_free(nvroot);
|
||||
@ -2503,7 +2571,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
|
||||
/*
|
||||
* Attempt to create using a bad mirror.
|
||||
*/
|
||||
nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 2, 1);
|
||||
nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1);
|
||||
VERIFY3U(ENOENT, ==,
|
||||
spa_create("ztest_bad_mirror", nvroot, NULL, NULL));
|
||||
nvlist_free(nvroot);
|
||||
@ -2513,7 +2581,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
|
||||
* what's in the nvroot; we should fail with EEXIST.
|
||||
*/
|
||||
rw_enter(&ztest_name_lock, RW_READER);
|
||||
nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
|
||||
nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1);
|
||||
VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL));
|
||||
nvlist_free(nvroot);
|
||||
VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG));
|
||||
@ -2595,7 +2663,7 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
|
||||
(void) spa_destroy(name);
|
||||
|
||||
nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
|
||||
0, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
|
||||
NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
|
||||
|
||||
/*
|
||||
* If we're configuring a RAIDZ device then make sure that the
|
||||
@ -2768,10 +2836,16 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
|
||||
* If we have slogs then remove them 1/4 of the time.
|
||||
*/
|
||||
if (spa_has_slogs(spa) && ztest_random(4) == 0) {
|
||||
metaslab_group_t *mg;
|
||||
|
||||
/*
|
||||
* Grab the guid from the head of the log class rotor.
|
||||
* find the first real slog in log allocation class
|
||||
*/
|
||||
guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;
|
||||
mg = spa_log_class(spa)->mc_rotor;
|
||||
while (!mg->mg_vd->vdev_islog)
|
||||
mg = mg->mg_next;
|
||||
|
||||
guid = mg->mg_vd->vdev_guid;
|
||||
|
||||
spa_config_exit(spa, SCL_VDEV, FTAG);
|
||||
|
||||
@ -2800,12 +2874,11 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
|
||||
spa_config_exit(spa, SCL_VDEV, FTAG);
|
||||
|
||||
/*
|
||||
* Make 1/4 of the devices be log devices.
|
||||
* Make 1/4 of the devices be log devices
|
||||
*/
|
||||
nvroot = make_vdev_root(NULL, NULL, NULL,
|
||||
ztest_opts.zo_vdev_size, 0,
|
||||
ztest_random(4) == 0, ztest_opts.zo_raidz,
|
||||
zs->zs_mirrors, 1);
|
||||
ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ?
|
||||
"log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
|
||||
|
||||
error = spa_vdev_add(spa, nvroot);
|
||||
nvlist_free(nvroot);
|
||||
@ -2824,6 +2897,83 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
|
||||
mutex_exit(&ztest_vdev_lock);
|
||||
}
|
||||
|
||||
/*
 * Try to add one top-level vdev with an allocation bias to the test pool.
 *
 * The bias is chosen with ztest_random(2): VDEV_ALLOC_BIAS_SPECIAL or
 * VDEV_ALLOC_BIAS_DEDUP.  Nothing is added when zo_special_vdevs is
 * ZTEST_VDEV_CLASS_OFF, when it is ZTEST_VDEV_CLASS_RND and a second
 * ztest_random(2) draw is 0, when zs_mirrors < 2, or when the
 * allocation_classes feature is not enabled on the pool.
 *
 * ENOSPC from spa_vdev_add() is recorded; any other error is passed to
 * fatal().  After a successful add, the special_small_blocks property may
 * additionally be set to 32768 on zd->zd_name.
 */
/* ARGSUSED */
|
||||
void
|
||||
ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id)
|
||||
{
|
||||
ztest_shared_t *zs = ztest_shared;
|
||||
spa_t *spa = ztest_spa;
|
||||
uint64_t leaves;
|
||||
nvlist_t *nvroot;
|
||||
const char *class = (ztest_random(2) == 0) ?
|
||||
VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP;
|
||||
int error;
|
||||
|
||||
/*
|
||||
* By default add a special vdev 50% of the time
* (the vdev added may be of the special or the dedup class, see above)
|
||||
*/
|
||||
if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) ||
|
||||
(ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND &&
|
||||
ztest_random(2) == 0)) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* held until after the optional property update below */
mutex_enter(&ztest_vdev_lock);
|
||||
|
||||
/* Only test with mirrors */
|
||||
if (zs->zs_mirrors < 2) {
|
||||
mutex_exit(&ztest_vdev_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
/* requires feature@allocation_classes */
|
||||
if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) {
|
||||
mutex_exit(&ztest_vdev_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
/* leaf devices per top-level vdev, used to scale the hole index below */
leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
|
||||
|
||||
/* find_vdev_hole() is called with SCL_VDEV held as reader */
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
|
||||
ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
|
||||
spa_config_exit(spa, SCL_VDEV, FTAG);
|
||||
|
||||
/* one top-level vdev (t = 1) carrying the chosen class string */
nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
|
||||
class, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
|
||||
|
||||
error = spa_vdev_add(spa, nvroot);
|
||||
nvlist_free(nvroot);
|
||||
|
||||
/* ENOSPC is tolerated; presumably fatal() does not return */
if (error == ENOSPC)
|
||||
ztest_record_enospc("spa_vdev_add");
|
||||
else if (error != 0)
|
||||
fatal(0, "spa_vdev_add() = %d", error);
|
||||
|
||||
/*
|
||||
* 50% of the time allow small blocks in the special class
* (only after a successful add, and only while the special class
* has exactly one metaslab group)
|
||||
*/
|
||||
if (error == 0 &&
|
||||
spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) {
|
||||
if (ztest_opts.zo_verbose >= 3)
|
||||
(void) printf("Enabling special VDEV small blocks\n");
|
||||
(void) ztest_dsl_prop_set_uint64(zd->zd_name,
|
||||
ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE);
|
||||
}
|
||||
|
||||
mutex_exit(&ztest_vdev_lock);
|
||||
|
||||
/*
 * NOTE(review): this message is also printed when spa_vdev_add()
 * returned ENOSPC, i.e. when nothing was actually added.
 */
if (ztest_opts.zo_verbose >= 3) {
|
||||
metaslab_class_t *mc;
|
||||
|
||||
if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0)
|
||||
mc = spa_special_class(spa);
|
||||
else
|
||||
mc = spa_dedup_class(spa);
|
||||
(void) printf("Added a %s mirrored vdev (of %d)\n",
|
||||
class, (int)mc->mc_groups);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
|
||||
*/
|
||||
@ -2888,7 +3038,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
|
||||
* Add a new device.
|
||||
*/
|
||||
nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
|
||||
(ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1);
|
||||
(ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1);
|
||||
error = spa_vdev_add(spa, nvroot);
|
||||
|
||||
switch (error) {
|
||||
@ -3085,11 +3235,15 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
|
||||
* Locate this vdev.
|
||||
*/
|
||||
oldvd = rvd->vdev_child[top];
|
||||
|
||||
/* pick a child from the mirror */
|
||||
if (zs->zs_mirrors >= 1) {
|
||||
ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
|
||||
ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
|
||||
oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz];
|
||||
}
|
||||
|
||||
/* pick a child out of the raidz group */
|
||||
if (ztest_opts.zo_raidz > 1) {
|
||||
ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
|
||||
ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz);
|
||||
@ -3192,7 +3346,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
|
||||
* Build the nvlist describing newpath.
|
||||
*/
|
||||
root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0,
|
||||
ashift, 0, 0, 0, 1);
|
||||
ashift, NULL, 0, 0, 1);
|
||||
|
||||
error = spa_vdev_attach(spa, oldguid, root, replacing);
|
||||
|
||||
@ -3453,7 +3607,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
|
||||
return;
|
||||
}
|
||||
ASSERT(psize > 0);
|
||||
newsize = psize + psize / 8;
|
||||
newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE);
|
||||
ASSERT3U(newsize, >, psize);
|
||||
|
||||
if (ztest_opts.zo_verbose >= 6) {
|
||||
@ -6470,6 +6624,7 @@ make_random_props()
|
||||
nvlist_t *props;
|
||||
|
||||
VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
|
||||
|
||||
if (ztest_random(2) == 0)
|
||||
return (props);
|
||||
VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0);
|
||||
@ -6551,7 +6706,7 @@ ztest_init(ztest_shared_t *zs)
|
||||
zs->zs_splits = 0;
|
||||
zs->zs_mirrors = ztest_opts.zo_mirrors;
|
||||
nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
|
||||
0, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
|
||||
NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
|
||||
props = make_random_props();
|
||||
for (int i = 0; i < SPA_FEATURES; i++) {
|
||||
char buf[1024];
|
||||
|
@ -260,6 +260,7 @@ typedef struct splitflags {
|
||||
|
||||
/* after splitting, import the pool */
|
||||
int import : 1;
|
||||
int name_flags;
|
||||
} splitflags_t;
|
||||
|
||||
/*
|
||||
@ -428,8 +429,15 @@ struct zfs_cmd;
|
||||
|
||||
extern const char *zfs_history_event_names[];
|
||||
|
||||
/*
 * Bit flags for the name_flags argument of zpool_vdev_name().  That
 * function also sets the PATH, GUID and FOLLOW_LINKS bits when the
 * ZPOOL_VDEV_NAME_PATH, ZPOOL_VDEV_NAME_GUID or
 * ZPOOL_VDEV_NAME_FOLLOW_LINKS environment variables are enabled.
 */
typedef enum {
|
||||
VDEV_NAME_PATH = 1 << 0,	/* do not strip the partition from whole-disk paths */
|
||||
VDEV_NAME_GUID = 1 << 1,	/* name the vdev by its GUID */
|
||||
VDEV_NAME_FOLLOW_LINKS = 1 << 2,	/* resolve the path with realpath() */
|
||||
VDEV_NAME_TYPE_ID = 1 << 3,	/* use <type-id> naming for top-level vdevs */
|
||||
} vdev_name_t;
|
||||
|
||||
extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *,
|
||||
boolean_t verbose);
|
||||
int name_flags);
|
||||
extern int zpool_upgrade(zpool_handle_t *, uint64_t);
|
||||
extern int zpool_get_history(zpool_handle_t *, nvlist_t **);
|
||||
extern int zpool_history_unpack(char *, uint64_t, uint64_t *,
|
||||
|
@ -1186,6 +1186,36 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case ZFS_PROP_SPECIAL_SMALL_BLOCKS:
|
||||
if (zpool_hdl != NULL) {
|
||||
char state[64] = "";
|
||||
|
||||
/*
|
||||
* Issue a warning but do not fail so that
|
||||
* tests for settable properties succeed.
|
||||
*/
|
||||
if (zpool_prop_get_feature(zpool_hdl,
|
||||
"feature@allocation_classes", state,
|
||||
sizeof (state)) != 0 ||
|
||||
strcmp(state, ZFS_FEATURE_ACTIVE) != 0) {
|
||||
(void) fprintf(stderr, gettext(
|
||||
"%s: property requires a special "
|
||||
"device in the pool\n"), propname);
|
||||
}
|
||||
}
|
||||
if (intval != 0 &&
|
||||
(intval < SPA_MINBLOCKSIZE ||
|
||||
intval > SPA_OLD_MAXBLOCKSIZE || !ISP2(intval))) {
|
||||
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
||||
"invalid '%s=%d' property: must be zero or "
|
||||
"a power of 2 from 512B to 128K"), propname,
|
||||
intval);
|
||||
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
|
||||
goto error;
|
||||
}
|
||||
break;
|
||||
|
||||
case ZFS_PROP_MLSLABEL:
|
||||
{
|
||||
#ifdef illumos
|
||||
|
@ -26,6 +26,7 @@
|
||||
* Copyright 2016 Nexenta Systems, Inc.
|
||||
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
|
||||
* Copyright (c) 2017 Datto Inc.
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
*/
|
||||
|
||||
#include <sys/types.h>
|
||||
@ -1125,6 +1126,30 @@ zpool_get_state(zpool_handle_t *zhp)
|
||||
return (zhp->zpool_state);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if vdev list contains a special vdev
*
* Returns B_TRUE when any direct child of nvroot has a
* ZPOOL_CONFIG_ALLOCATION_BIAS string equal to VDEV_ALLOC_BIAS_SPECIAL.
* Only the immediate ZPOOL_CONFIG_CHILDREN entries are examined; if that
* array cannot be looked up the result is B_FALSE.
|
||||
*/
|
||||
static boolean_t
|
||||
zpool_has_special_vdev(nvlist_t *nvroot)
|
||||
{
|
||||
nvlist_t **child;
|
||||
uint_t children;
|
||||
|
||||
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child,
|
||||
&children) == 0) {
|
||||
for (uint_t c = 0; c < children; c++) {
|
||||
char *bias;
|
||||
|
||||
/* bias is only read if the string lookup succeeded */
if (nvlist_lookup_string(child[c],
|
||||
ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0 &&
|
||||
strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
|
||||
return (B_TRUE);
|
||||
}
|
||||
}
|
||||
}
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Create the named pool, using the provided vdev list. It is assumed
|
||||
* that the consumer has already validated the contents of the nvlist, so we
|
||||
@ -1170,6 +1195,17 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
|
||||
fsprops, zoned, NULL, NULL, msg)) == NULL) {
|
||||
goto create_failed;
|
||||
}
|
||||
|
||||
if (nvlist_exists(zc_fsprops,
|
||||
zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)) &&
|
||||
!zpool_has_special_vdev(nvroot)) {
|
||||
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
||||
"%s property requires a special vdev"),
|
||||
zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS));
|
||||
(void) zfs_error(hdl, EZFS_BADPROP, msg);
|
||||
goto create_failed;
|
||||
}
|
||||
|
||||
if (!zc_props &&
|
||||
(nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) {
|
||||
goto create_failed;
|
||||
@ -1694,7 +1730,7 @@ print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv,
|
||||
return;
|
||||
|
||||
for (c = 0; c < children; c++) {
|
||||
vname = zpool_vdev_name(hdl, NULL, child[c], B_TRUE);
|
||||
vname = zpool_vdev_name(hdl, NULL, child[c], VDEV_NAME_TYPE_ID);
|
||||
print_vdev_tree(hdl, vname, child[c], indent + 2);
|
||||
free(vname);
|
||||
}
|
||||
@ -2892,7 +2928,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
|
||||
verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
|
||||
ZPOOL_CONFIG_VDEV_TREE, &config_root) == 0);
|
||||
|
||||
if ((newname = zpool_vdev_name(NULL, NULL, child[0], B_FALSE)) == NULL)
|
||||
if ((newname = zpool_vdev_name(NULL, NULL, child[0], 0)) == NULL)
|
||||
return (-1);
|
||||
|
||||
/*
|
||||
@ -3093,11 +3129,11 @@ find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren,
|
||||
for (mc = 0; mc < mchildren; mc++) {
|
||||
uint_t sc;
|
||||
char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp,
|
||||
mchild[mc], B_FALSE);
|
||||
mchild[mc], 0);
|
||||
|
||||
for (sc = 0; sc < schildren; sc++) {
|
||||
char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp,
|
||||
schild[sc], B_FALSE);
|
||||
schild[sc], 0);
|
||||
boolean_t result = (strcmp(mpath, spath) == 0);
|
||||
|
||||
free(spath);
|
||||
@ -3685,9 +3721,9 @@ set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path)
|
||||
*/
|
||||
char *
|
||||
zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
|
||||
boolean_t verbose)
|
||||
int name_flags)
|
||||
{
|
||||
char *path, *devid;
|
||||
char *path, *devid, *env;
|
||||
uint64_t value;
|
||||
char buf[64];
|
||||
vdev_stat_t *vs;
|
||||
@ -3695,6 +3731,21 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
|
||||
int have_stats;
|
||||
int have_path;
|
||||
|
||||
env = getenv("ZPOOL_VDEV_NAME_PATH");
|
||||
if (env && (strtoul(env, NULL, 0) > 0 ||
|
||||
!strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2)))
|
||||
name_flags |= VDEV_NAME_PATH;
|
||||
|
||||
env = getenv("ZPOOL_VDEV_NAME_GUID");
|
||||
if (env && (strtoul(env, NULL, 0) > 0 ||
|
||||
!strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2)))
|
||||
name_flags |= VDEV_NAME_GUID;
|
||||
|
||||
env = getenv("ZPOOL_VDEV_NAME_FOLLOW_LINKS");
|
||||
if (env && (strtoul(env, NULL, 0) > 0 ||
|
||||
!strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2)))
|
||||
name_flags |= VDEV_NAME_FOLLOW_LINKS;
|
||||
|
||||
have_stats = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
|
||||
(uint64_t **)&vs, &vsc) == 0;
|
||||
have_path = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0;
|
||||
@ -3704,11 +3755,10 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
|
||||
* come back at the same device path. Display the device by GUID.
|
||||
*/
|
||||
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &value) == 0 ||
|
||||
(name_flags & VDEV_NAME_GUID) != 0 ||
|
||||
have_path && have_stats && vs->vs_state <= VDEV_STATE_CANT_OPEN) {
|
||||
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
|
||||
&value) == 0);
|
||||
(void) snprintf(buf, sizeof (buf), "%llu",
|
||||
(u_longlong_t)value);
|
||||
nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value);
|
||||
(void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)value);
|
||||
path = buf;
|
||||
} else if (have_path) {
|
||||
|
||||
@ -3750,11 +3800,23 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
|
||||
}
|
||||
|
||||
#ifdef illumos
|
||||
if (name_flags & VDEV_NAME_FOLLOW_LINKS) {
|
||||
char *rp = realpath(path, NULL);
|
||||
if (rp) {
|
||||
strlcpy(buf, rp, sizeof (buf));
|
||||
path = buf;
|
||||
free(rp);
|
||||
}
|
||||
}
|
||||
|
||||
if (strncmp(path, ZFS_DISK_ROOTD, strlen(ZFS_DISK_ROOTD)) == 0)
|
||||
path += strlen(ZFS_DISK_ROOTD);
|
||||
|
||||
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
|
||||
&value) == 0 && value) {
|
||||
/*
|
||||
* Remove the partition from the path if this is a whole disk.
|
||||
*/
|
||||
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value)
|
||||
== 0 && value && !(name_flags & VDEV_NAME_PATH)) {
|
||||
int pathlen = strlen(path);
|
||||
char *tmp = zfs_strdup(hdl, path);
|
||||
|
||||
@ -3798,7 +3860,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
|
||||
* We identify each top-level vdev by using a <type-id>
|
||||
* naming convention.
|
||||
*/
|
||||
if (verbose) {
|
||||
if (name_flags & VDEV_NAME_TYPE_ID) {
|
||||
uint64_t id;
|
||||
|
||||
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
|
||||
|
@ -21,6 +21,7 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2016 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
@ -49,7 +50,6 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
|
||||
uint_t c, children;
|
||||
char used[6], avail[6];
|
||||
char rops[6], wops[6], rbytes[6], wbytes[6], rerr[6], werr[6], cerr[6];
|
||||
char *prefix = "";
|
||||
|
||||
if (indent == 0 && desc != NULL) {
|
||||
(void) printf(" "
|
||||
@ -59,15 +59,24 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
|
||||
}
|
||||
|
||||
if (desc != NULL) {
|
||||
char *suffix = "", *bias = NULL;
|
||||
char bias_suffix[32];
|
||||
|
||||
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
|
||||
|
||||
if (is_log)
|
||||
prefix = "log ";
|
||||
|
||||
(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
|
||||
&bias);
|
||||
if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
|
||||
(uint64_t **)&vs, &c) != 0)
|
||||
vs = &v0;
|
||||
|
||||
if (bias != NULL) {
|
||||
(void) snprintf(bias_suffix, sizeof (bias_suffix),
|
||||
" (%s)", bias);
|
||||
suffix = bias_suffix;
|
||||
} else if (is_log) {
|
||||
suffix = " (log)";
|
||||
}
|
||||
|
||||
sec = MAX(1, vs->vs_timestamp / NANOSEC);
|
||||
|
||||
nicenum(vs->vs_alloc, used, sizeof (used));
|
||||
@ -84,9 +93,9 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
|
||||
|
||||
(void) printf("%*s%s%*s%*s%*s %5s %5s %5s %5s %5s %5s %5s\n",
|
||||
indent, "",
|
||||
prefix,
|
||||
(int)(indent + strlen(prefix) - 25 - (vs->vs_space ? 0 : 12)),
|
||||
desc,
|
||||
(int)(indent+strlen(desc)-25-(vs->vs_space ? 0 : 12)),
|
||||
suffix,
|
||||
vs->vs_space ? 6 : 0, vs->vs_space ? used : "",
|
||||
vs->vs_space ? 6 : 0, vs->vs_space ? avail : "",
|
||||
rops, wops, rbytes, wbytes, rerr, werr, cerr);
|
||||
|
@ -125,6 +125,7 @@ static const char *features_for_read[] = {
|
||||
"com.datto:resilver_defer",
|
||||
"com.delphix:device_removal",
|
||||
"com.delphix:obsolete_counts",
|
||||
"com.intel:allocation_classes",
|
||||
NULL
|
||||
};
|
||||
|
||||
|
@ -20,11 +20,12 @@
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
||||
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
|
||||
* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Integros [integros.com]
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
*/
|
||||
|
||||
#ifdef _KERNEL
|
||||
@ -299,4 +300,11 @@ zpool_feature_init(void)
|
||||
"Reduce memory used by removed devices when their blocks are "
|
||||
"freed or remapped.",
|
||||
ZFEATURE_FLAG_READONLY_COMPAT, obsolete_counts_deps);
|
||||
|
||||
{
|
||||
zfeature_register(SPA_FEATURE_ALLOCATION_CLASSES,
|
||||
"org.zfsonlinux:allocation_classes", "allocation_classes",
|
||||
"Support for separate allocation classes.",
|
||||
ZFEATURE_FLAG_READONLY_COMPAT, NULL);
|
||||
}
|
||||
}
|
||||
|
@ -24,6 +24,7 @@
|
||||
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
||||
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Integros [integros.com]
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
*/
|
||||
|
||||
#ifndef _ZFEATURE_COMMON_H
|
||||
@ -62,6 +63,7 @@ typedef enum spa_feature {
|
||||
SPA_FEATURE_OBSOLETE_COUNTS,
|
||||
SPA_FEATURE_POOL_CHECKPOINT,
|
||||
SPA_FEATURE_SPACEMAP_V2,
|
||||
SPA_FEATURE_ALLOCATION_CLASSES,
|
||||
SPA_FEATURES
|
||||
} spa_feature_t;
|
||||
|
||||
|
@ -451,6 +451,9 @@ zfs_prop_init(void)
|
||||
zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
|
||||
SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
|
||||
ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");
|
||||
zprop_register_number(ZFS_PROP_SPECIAL_SMALL_BLOCKS,
|
||||
"special_small_blocks", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
|
||||
"zero or 512 to 128K, power of 2", "SPECIAL_SMALL_BLOCKS");
|
||||
|
||||
/* hidden properties */
|
||||
zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER,
|
||||
|
@ -2514,6 +2514,8 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
||||
zp->zp_dedup = dedup;
|
||||
zp->zp_dedup_verify = dedup && dedup_verify;
|
||||
zp->zp_nopwrite = nopwrite;
|
||||
zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ?
|
||||
os->os_zpl_special_smallblock : 0;
|
||||
}
|
||||
|
||||
int
|
||||
|
@ -303,6 +303,20 @@ dnodesize_changed_cb(void *arg, uint64_t newval)
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
smallblk_changed_cb(void *arg, uint64_t newval)
|
||||
{
|
||||
objset_t *os = arg;
|
||||
|
||||
/*
|
||||
* Inheritance and range checking should have been done by now.
|
||||
*/
|
||||
ASSERT(newval <= SPA_OLD_MAXBLOCKSIZE);
|
||||
ASSERT(ISP2(newval));
|
||||
|
||||
os->os_zpl_special_smallblock = newval;
|
||||
}
|
||||
|
||||
static void
|
||||
logbias_changed_cb(void *arg, uint64_t newval)
|
||||
{
|
||||
@ -518,6 +532,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
|
||||
zfs_prop_to_name(ZFS_PROP_DNODESIZE),
|
||||
dnodesize_changed_cb, os);
|
||||
}
|
||||
if (err == 0) {
|
||||
err = dsl_prop_register(ds,
|
||||
zfs_prop_to_name(
|
||||
ZFS_PROP_SPECIAL_SMALL_BLOCKS),
|
||||
smallblk_changed_cb, os);
|
||||
}
|
||||
}
|
||||
if (needlock)
|
||||
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
|
||||
|
@ -23,6 +23,7 @@
|
||||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
||||
* Copyright (c) 2014 Integros [integros.com]
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
*/
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
@ -356,7 +357,7 @@ metaslab_class_validate(metaslab_class_t *mc)
|
||||
return (0);
|
||||
}
|
||||
|
||||
void
|
||||
static void
|
||||
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
|
||||
int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
|
||||
{
|
||||
@ -420,7 +421,8 @@ metaslab_class_get_minblocksize(metaslab_class_t *mc)
|
||||
void
|
||||
metaslab_class_histogram_verify(metaslab_class_t *mc)
|
||||
{
|
||||
vdev_t *rvd = mc->mc_spa->spa_root_vdev;
|
||||
spa_t *spa = mc->mc_spa;
|
||||
vdev_t *rvd = spa->spa_root_vdev;
|
||||
uint64_t *mc_hist;
|
||||
int i;
|
||||
|
||||
@ -928,7 +930,8 @@ metaslab_group_histogram_verify(metaslab_group_t *mg)
|
||||
for (int m = 0; m < vd->vdev_ms_count; m++) {
|
||||
metaslab_t *msp = vd->vdev_ms[m];
|
||||
|
||||
if (msp->ms_sm == NULL)
|
||||
/* skip if not active or not a member */
|
||||
if (msp->ms_sm == NULL || msp->ms_group != mg)
|
||||
continue;
|
||||
|
||||
for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
|
||||
@ -1061,12 +1064,14 @@ metaslab_group_fragmentation(metaslab_group_t *mg)
|
||||
|
||||
if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
|
||||
continue;
|
||||
if (msp->ms_group != mg)
|
||||
continue;
|
||||
|
||||
valid_ms++;
|
||||
fragmentation += msp->ms_fragmentation;
|
||||
}
|
||||
|
||||
if (valid_ms <= vd->vdev_ms_count / 2)
|
||||
if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
|
||||
return (ZFS_FRAG_INVALID);
|
||||
|
||||
fragmentation /= valid_ms;
|
||||
@ -1097,7 +1102,10 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
|
||||
* groups to select from. Otherwise, we always consider it eligible
|
||||
* for allocations.
|
||||
*/
|
||||
if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
|
||||
if ((mc != spa_normal_class(spa) &&
|
||||
mc != spa_special_class(spa) &&
|
||||
mc != spa_dedup_class(spa)) ||
|
||||
mc->mc_groups <= 1)
|
||||
return (B_TRUE);
|
||||
|
||||
/*
|
||||
@ -1559,12 +1567,26 @@ metaslab_unload(metaslab_t *msp)
|
||||
msp->ms_max_size = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
|
||||
int64_t defer_delta, int64_t space_delta)
|
||||
{
|
||||
vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
|
||||
|
||||
ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
|
||||
ASSERT(vd->vdev_ms_count != 0);
|
||||
|
||||
metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
|
||||
vdev_deflated_space(vd, space_delta));
|
||||
}
|
||||
|
||||
int
|
||||
metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
|
||||
metaslab_t **msp)
|
||||
{
|
||||
vdev_t *vd = mg->mg_vd;
|
||||
objset_t *mos = vd->vdev_spa->spa_meta_objset;
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
objset_t *mos = spa->spa_meta_objset;
|
||||
metaslab_t *ms;
|
||||
int error;
|
||||
|
||||
@ -1622,8 +1644,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
|
||||
|
||||
/*
|
||||
* If metaslab_debug_load is set and we're initializing a metaslab
|
||||
* that has an allocated space map object then load the its space
|
||||
* map so that can verify frees.
|
||||
* that has an allocated space map object then load the space map
|
||||
* so that we can verify frees.
|
||||
*/
|
||||
if (metaslab_debug_load && ms->ms_sm != NULL) {
|
||||
mutex_enter(&ms->ms_lock);
|
||||
@ -1645,16 +1667,19 @@ void
|
||||
metaslab_fini(metaslab_t *msp)
|
||||
{
|
||||
metaslab_group_t *mg = msp->ms_group;
|
||||
vdev_t *vd = mg->mg_vd;
|
||||
|
||||
metaslab_group_remove(mg, msp);
|
||||
|
||||
mutex_enter(&msp->ms_lock);
|
||||
VERIFY(msp->ms_group == NULL);
|
||||
vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
|
||||
0, -msp->ms_size);
|
||||
metaslab_space_update(vd, mg->mg_class,
|
||||
-space_map_allocated(msp->ms_sm), 0, -msp->ms_size);
|
||||
|
||||
space_map_close(msp->ms_sm);
|
||||
|
||||
metaslab_unload(msp);
|
||||
|
||||
range_tree_destroy(msp->ms_allocatable);
|
||||
range_tree_destroy(msp->ms_freeing);
|
||||
range_tree_destroy(msp->ms_freed);
|
||||
@ -2669,7 +2694,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
|
||||
ASSERT3P(msp->ms_checkpointing, ==, NULL);
|
||||
msp->ms_checkpointing = range_tree_create(NULL, NULL);
|
||||
|
||||
vdev_space_update(vd, 0, 0, msp->ms_size);
|
||||
metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
|
||||
}
|
||||
ASSERT0(range_tree_space(msp->ms_freeing));
|
||||
ASSERT0(range_tree_space(msp->ms_checkpointing));
|
||||
@ -2691,7 +2716,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
|
||||
defer_delta -= range_tree_space(*defer_tree);
|
||||
}
|
||||
|
||||
vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
|
||||
metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
|
||||
defer_delta, 0);
|
||||
|
||||
/*
|
||||
* If there's a metaslab_load() in progress, wait for it to complete
|
||||
@ -2790,21 +2816,25 @@ metaslab_sync_reassess(metaslab_group_t *mg)
|
||||
spa_config_exit(spa, SCL_ALLOC, FTAG);
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
metaslab_distance(metaslab_t *msp, dva_t *dva)
|
||||
/*
|
||||
* When writing a ditto block (i.e. more than one DVA for a given BP) on
|
||||
* the same vdev as an existing DVA of this BP, then try to allocate it
|
||||
* on a different metaslab than existing DVAs (i.e. a unique metaslab).
|
||||
*/
|
||||
static boolean_t
|
||||
metaslab_is_unique(metaslab_t *msp, dva_t *dva)
|
||||
{
|
||||
uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
|
||||
uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
|
||||
uint64_t start = msp->ms_id;
|
||||
uint64_t dva_ms_id;
|
||||
|
||||
if (DVA_GET_ASIZE(dva) == 0)
|
||||
return (B_TRUE);
|
||||
|
||||
if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
|
||||
return (1ULL << 63);
|
||||
return (B_TRUE);
|
||||
|
||||
if (offset < start)
|
||||
return ((start - offset) << ms_shift);
|
||||
if (offset > start)
|
||||
return ((offset - start) << ms_shift);
|
||||
return (0);
|
||||
dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
|
||||
|
||||
return (msp->ms_id != dva_ms_id);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -3065,7 +3095,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
|
||||
*/
|
||||
static metaslab_t *
|
||||
find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
|
||||
dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator,
|
||||
dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
|
||||
zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
|
||||
{
|
||||
avl_index_t idx;
|
||||
@ -3100,13 +3130,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
|
||||
if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
|
||||
break;
|
||||
|
||||
uint64_t target_distance = min_distance
|
||||
+ (space_map_allocated(msp->ms_sm) != 0 ? 0 :
|
||||
min_distance >> 1);
|
||||
|
||||
for (i = 0; i < d; i++) {
|
||||
if (metaslab_distance(msp, &dva[i]) < target_distance)
|
||||
break;
|
||||
if (want_unique &&
|
||||
!metaslab_is_unique(msp, &dva[i]))
|
||||
break; /* try another metaslab */
|
||||
}
|
||||
if (i == d)
|
||||
break;
|
||||
@ -3124,8 +3151,8 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
|
||||
/* ARGSUSED */
|
||||
static uint64_t
|
||||
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
|
||||
uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
|
||||
int allocator)
|
||||
uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
|
||||
int d, int allocator)
|
||||
{
|
||||
metaslab_t *msp = NULL;
|
||||
uint64_t offset = -1ULL;
|
||||
@ -3179,7 +3206,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
|
||||
was_active = B_TRUE;
|
||||
} else {
|
||||
msp = find_valid_metaslab(mg, activation_weight, dva, d,
|
||||
min_distance, asize, allocator, zal, search,
|
||||
want_unique, asize, allocator, zal, search,
|
||||
&was_active);
|
||||
}
|
||||
|
||||
@ -3317,6 +3344,7 @@ next:
|
||||
* metaslab.
|
||||
*/
|
||||
ASSERT(!metaslab_should_allocate(msp, asize));
|
||||
|
||||
mutex_exit(&msp->ms_lock);
|
||||
}
|
||||
mutex_exit(&msp->ms_lock);
|
||||
@ -3326,14 +3354,14 @@ next:
|
||||
|
||||
static uint64_t
|
||||
metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
|
||||
uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
|
||||
int allocator)
|
||||
uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
|
||||
int d, int allocator)
|
||||
{
|
||||
uint64_t offset;
|
||||
ASSERT(mg->mg_initialized);
|
||||
|
||||
offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
|
||||
min_distance, dva, d, allocator);
|
||||
offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
|
||||
dva, d, allocator);
|
||||
|
||||
mutex_enter(&mg->mg_lock);
|
||||
if (offset == -1ULL) {
|
||||
@ -3360,14 +3388,6 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
|
||||
return (offset);
|
||||
}
|
||||
|
||||
/*
|
||||
* If we have to write a ditto block (i.e. more than one DVA for a given BP)
|
||||
* on the same vdev as an existing DVA of this BP, then try to allocate it
|
||||
* at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
|
||||
* existing DVAs.
|
||||
*/
|
||||
int ditto_same_vdev_distance_shift = 3;
|
||||
|
||||
/*
|
||||
* Allocate a block for the specified i/o.
|
||||
*/
|
||||
@ -3384,6 +3404,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
|
||||
|
||||
/*
|
||||
* For testing, make some blocks above a certain size be gang blocks.
|
||||
* This will also test spilling from special to normal.
|
||||
*/
|
||||
if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
|
||||
metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
|
||||
@ -3435,6 +3456,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
|
||||
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
|
||||
mg = vd->vdev_mg->mg_next;
|
||||
} else {
|
||||
ASSERT(mc->mc_rotor != NULL);
|
||||
mg = mc->mc_rotor;
|
||||
}
|
||||
|
||||
@ -3499,25 +3521,17 @@ top:
|
||||
|
||||
ASSERT(mg->mg_class == mc);
|
||||
|
||||
/*
|
||||
* If we don't need to try hard, then require that the
|
||||
* block be 1/8th of the device away from any other DVAs
|
||||
* in this BP. If we are trying hard, allow any offset
|
||||
* to be used (distance=0).
|
||||
*/
|
||||
uint64_t distance = 0;
|
||||
if (!try_hard) {
|
||||
distance = vd->vdev_asize >>
|
||||
ditto_same_vdev_distance_shift;
|
||||
if (distance <= (1ULL << vd->vdev_ms_shift))
|
||||
distance = 0;
|
||||
}
|
||||
|
||||
uint64_t asize = vdev_psize_to_asize(vd, psize);
|
||||
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
|
||||
|
||||
/*
|
||||
* If we don't need to try hard, then require that the
|
||||
* block be on a different metaslab from any other DVAs
|
||||
* in this BP (unique=true). If we are trying hard, then
|
||||
* allow any metaslab to be used (unique=false).
|
||||
*/
|
||||
uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
|
||||
distance, dva, d, allocator);
|
||||
!try_hard, dva, d, allocator);
|
||||
|
||||
if (offset != -1ULL) {
|
||||
/*
|
||||
@ -3896,7 +3910,8 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
|
||||
if (reserved_slots < max)
|
||||
available_slots = max - reserved_slots;
|
||||
|
||||
if (slots <= available_slots || GANG_ALLOCATION(flags)) {
|
||||
if (slots <= available_slots || GANG_ALLOCATION(flags) ||
|
||||
flags & METASLAB_MUST_RESERVE) {
|
||||
/*
|
||||
* We reserve the slots individually so that we can unreserve
|
||||
* them individually when an I/O completes.
|
||||
@ -4179,9 +4194,11 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
|
||||
|
||||
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
|
||||
|
||||
for (int d = 0; d < ndvas; d++)
|
||||
if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
|
||||
for (int d = 0; d < ndvas; d++) {
|
||||
error = metaslab_claim_dva(spa, &dva[d], txg);
|
||||
if (error != 0)
|
||||
break;
|
||||
}
|
||||
|
||||
spa_config_exit(spa, SCL_ALLOC, FTAG);
|
||||
|
||||
|
@ -29,6 +29,7 @@
|
||||
* Copyright (c) 2014 Integros [integros.com]
|
||||
* Copyright 2016 Toomas Soome <tsoome@me.com>
|
||||
* Copyright 2018 Joyent, Inc.
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
* Copyright (c) 2017 Datto Inc.
|
||||
* Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
|
||||
*/
|
||||
@ -312,8 +313,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
|
||||
ASSERT(MUTEX_HELD(&spa->spa_props_lock));
|
||||
|
||||
if (rvd != NULL) {
|
||||
alloc = metaslab_class_get_alloc(spa_normal_class(spa));
|
||||
size = metaslab_class_get_space(spa_normal_class(spa));
|
||||
alloc = metaslab_class_get_alloc(mc);
|
||||
alloc += metaslab_class_get_alloc(spa_special_class(spa));
|
||||
alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
|
||||
|
||||
size = metaslab_class_get_space(mc);
|
||||
size += metaslab_class_get_space(spa_special_class(spa));
|
||||
size += metaslab_class_get_space(spa_dedup_class(spa));
|
||||
|
||||
spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
|
||||
spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
|
||||
spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
|
||||
@ -1227,6 +1234,8 @@ spa_activate(spa_t *spa, int mode)
|
||||
|
||||
spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
|
||||
spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
|
||||
spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops);
|
||||
spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops);
|
||||
|
||||
/* Try to create a covering process */
|
||||
mutex_enter(&spa->spa_proc_lock);
|
||||
@ -1338,6 +1347,12 @@ spa_deactivate(spa_t *spa)
|
||||
metaslab_class_destroy(spa->spa_log_class);
|
||||
spa->spa_log_class = NULL;
|
||||
|
||||
metaslab_class_destroy(spa->spa_special_class);
|
||||
spa->spa_special_class = NULL;
|
||||
|
||||
metaslab_class_destroy(spa->spa_dedup_class);
|
||||
spa->spa_dedup_class = NULL;
|
||||
|
||||
/*
|
||||
* If this was part of an import or the open otherwise failed, we may
|
||||
* still have errors left in the queues. Empty them just in case.
|
||||
@ -5096,7 +5111,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
|
||||
char *poolname;
|
||||
nvlist_t *nvl;
|
||||
|
||||
if (nvlist_lookup_string(props,
|
||||
if (props == NULL ||
|
||||
nvlist_lookup_string(props,
|
||||
zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
|
||||
poolname = (char *)pool;
|
||||
|
||||
@ -5184,10 +5200,16 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
|
||||
(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
|
||||
(error = spa_validate_aux(spa, nvroot, txg,
|
||||
VDEV_ALLOC_ADD)) == 0) {
|
||||
for (int c = 0; c < rvd->vdev_children; c++) {
|
||||
vdev_ashift_optimize(rvd->vdev_child[c]);
|
||||
vdev_metaslab_set_size(rvd->vdev_child[c]);
|
||||
vdev_expand(rvd->vdev_child[c], txg);
|
||||
/*
|
||||
* instantiate the metaslab groups (this will dirty the vdevs)
|
||||
* we can no longer error exit past this point
|
||||
*/
|
||||
for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
|
||||
vdev_t *vd = rvd->vdev_child[c];
|
||||
|
||||
vdev_ashift_optimize(vd);
|
||||
vdev_metaslab_set_size(vd);
|
||||
vdev_expand(vd, txg);
|
||||
}
|
||||
}
|
||||
|
||||
@ -7564,8 +7586,14 @@ spa_async_thread(void *arg)
|
||||
|
||||
mutex_enter(&spa_namespace_lock);
|
||||
old_space = metaslab_class_get_space(spa_normal_class(spa));
|
||||
old_space += metaslab_class_get_space(spa_special_class(spa));
|
||||
old_space += metaslab_class_get_space(spa_dedup_class(spa));
|
||||
|
||||
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
|
||||
|
||||
new_space = metaslab_class_get_space(spa_normal_class(spa));
|
||||
new_space += metaslab_class_get_space(spa_special_class(spa));
|
||||
new_space += metaslab_class_get_space(spa_dedup_class(spa));
|
||||
mutex_exit(&spa_namespace_lock);
|
||||
|
||||
/*
|
||||
@ -8303,6 +8331,9 @@ spa_sync(spa_t *spa, uint64_t txg)
|
||||
dsl_pool_t *dp = spa->spa_dsl_pool;
|
||||
objset_t *mos = spa->spa_meta_objset;
|
||||
bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
|
||||
metaslab_class_t *normal = spa_normal_class(spa);
|
||||
metaslab_class_t *special = spa_special_class(spa);
|
||||
metaslab_class_t *dedup = spa_dedup_class(spa);
|
||||
vdev_t *rvd = spa->spa_root_vdev;
|
||||
vdev_t *vd;
|
||||
dmu_tx_t *tx;
|
||||
@ -8402,9 +8433,13 @@ spa_sync(spa_t *spa, uint64_t txg)
|
||||
for (int c = 0; c < rvd->vdev_children; c++) {
|
||||
vdev_t *tvd = rvd->vdev_child[c];
|
||||
metaslab_group_t *mg = tvd->vdev_mg;
|
||||
metaslab_class_t *mc;
|
||||
|
||||
if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
|
||||
!metaslab_group_initialized(mg))
|
||||
if (mg == NULL || !metaslab_group_initialized(mg))
|
||||
continue;
|
||||
|
||||
mc = mg->mg_class;
|
||||
if (mc != normal && mc != special && mc != dedup)
|
||||
continue;
|
||||
|
||||
/*
|
||||
@ -8423,12 +8458,18 @@ spa_sync(spa_t *spa, uint64_t txg)
|
||||
}
|
||||
slots_per_allocator += zfs_vdev_def_queue_depth;
|
||||
}
|
||||
metaslab_class_t *mc = spa_normal_class(spa);
|
||||
|
||||
for (int i = 0; i < spa->spa_alloc_count; i++) {
|
||||
ASSERT0(zfs_refcount_count(&mc->mc_alloc_slots[i]));
|
||||
mc->mc_alloc_max_slots[i] = slots_per_allocator;
|
||||
ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i]));
|
||||
ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i]));
|
||||
ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i]));
|
||||
normal->mc_alloc_max_slots[i] = slots_per_allocator;
|
||||
special->mc_alloc_max_slots[i] = slots_per_allocator;
|
||||
dedup->mc_alloc_max_slots[i] = slots_per_allocator;
|
||||
}
|
||||
mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
|
||||
normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
|
||||
special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
|
||||
dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
|
||||
|
||||
for (int c = 0; c < rvd->vdev_children; c++) {
|
||||
vdev_t *vd = rvd->vdev_child[c];
|
||||
|
@ -27,6 +27,7 @@
|
||||
* Copyright 2013 Saso Kiselkov. All rights reserved.
|
||||
* Copyright (c) 2014 Integros [integros.com]
|
||||
* Copyright (c) 2017 Datto Inc.
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
*/
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
@ -468,6 +469,31 @@ spa_load_note(spa_t *spa, const char *fmt, ...)
|
||||
spa->spa_trust_config ? "trusted" : "untrusted", buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* By default, DDT data and user data indirect blocks are eligible for the special class.
|
||||
*/
|
||||
int zfs_ddt_data_is_special = B_TRUE;
|
||||
int zfs_user_indirect_is_special = B_TRUE;
|
||||
|
||||
/*
|
||||
* The percentage of special class final space reserved for metadata only.
|
||||
* Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only
|
||||
* let metadata into the class.
|
||||
*/
|
||||
int zfs_special_class_metadata_reserve_pct = 25;
|
||||
|
||||
#if defined(__FreeBSD__) && defined(_KERNEL)
|
||||
SYSCTL_INT(_vfs_zfs, OID_AUTO, ddt_data_is_special, CTLFLAG_RWTUN,
|
||||
&zfs_ddt_data_is_special, 0,
|
||||
"Whether DDT data is eligible for the special class vdevs");
|
||||
SYSCTL_INT(_vfs_zfs, OID_AUTO, user_indirect_is_special, CTLFLAG_RWTUN,
|
||||
&zfs_user_indirect_is_special, 0,
|
||||
"Whether indirect blocks are eligible for the special class vdevs");
|
||||
SYSCTL_INT(_vfs_zfs, OID_AUTO, special_class_metadata_reserve_pct,
|
||||
CTLFLAG_RWTUN, &zfs_special_class_metadata_reserve_pct, 0,
|
||||
"Percentage of space in the special class reserved solely for metadata");
|
||||
#endif
|
||||
|
||||
/*
|
||||
* ==========================================================================
|
||||
* SPA config locking
|
||||
@ -1297,6 +1323,8 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
|
||||
*/
|
||||
ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
|
||||
ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
|
||||
ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0);
|
||||
ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0);
|
||||
|
||||
spa_config_exit(spa, SCL_ALL, spa);
|
||||
|
||||
@ -1640,6 +1668,16 @@ zfs_strtonum(const char *str, char **nptr)
|
||||
return (val);
|
||||
}
|
||||
|
||||
void
|
||||
spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx)
|
||||
{
|
||||
/*
|
||||
* We bump the feature refcount for each special vdev added to the pool
|
||||
*/
|
||||
ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES));
|
||||
spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx);
|
||||
}
|
||||
|
||||
/*
|
||||
* ==========================================================================
|
||||
* Accessor functions
|
||||
@ -1889,6 +1927,79 @@ spa_log_class(spa_t *spa)
|
||||
return (spa->spa_log_class);
|
||||
}
|
||||
|
||||
metaslab_class_t *
|
||||
spa_special_class(spa_t *spa)
|
||||
{
|
||||
return (spa->spa_special_class);
|
||||
}
|
||||
|
||||
metaslab_class_t *
|
||||
spa_dedup_class(spa_t *spa)
|
||||
{
|
||||
return (spa->spa_dedup_class);
|
||||
}
|
||||
|
||||
/*
|
||||
* Locate an appropriate allocation class
|
||||
*/
|
||||
metaslab_class_t *
|
||||
spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype,
|
||||
uint_t level, uint_t special_smallblk)
|
||||
{
|
||||
if (DMU_OT_IS_ZIL(objtype)) {
|
||||
if (spa->spa_log_class->mc_groups != 0)
|
||||
return (spa_log_class(spa));
|
||||
else
|
||||
return (spa_normal_class(spa));
|
||||
}
|
||||
|
||||
boolean_t has_special_class = spa->spa_special_class->mc_groups != 0;
|
||||
|
||||
if (DMU_OT_IS_DDT(objtype)) {
|
||||
if (spa->spa_dedup_class->mc_groups != 0)
|
||||
return (spa_dedup_class(spa));
|
||||
else if (has_special_class && zfs_ddt_data_is_special)
|
||||
return (spa_special_class(spa));
|
||||
else
|
||||
return (spa_normal_class(spa));
|
||||
}
|
||||
|
||||
/* Indirect blocks for user data can land in special if allowed */
|
||||
if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
|
||||
if (has_special_class && zfs_user_indirect_is_special)
|
||||
return (spa_special_class(spa));
|
||||
else
|
||||
return (spa_normal_class(spa));
|
||||
}
|
||||
|
||||
if (DMU_OT_IS_METADATA(objtype) || level > 0) {
|
||||
if (has_special_class)
|
||||
return (spa_special_class(spa));
|
||||
else
|
||||
return (spa_normal_class(spa));
|
||||
}
|
||||
|
||||
/*
|
||||
* Allow small file blocks in special class in some cases (like
|
||||
* for the dRAID vdev feature). But always leave a reserve of
|
||||
* zfs_special_class_metadata_reserve_pct exclusively for metadata.
|
||||
*/
|
||||
if (DMU_OT_IS_FILE(objtype) &&
|
||||
has_special_class && size <= special_smallblk) {
|
||||
metaslab_class_t *special = spa_special_class(spa);
|
||||
uint64_t alloc = metaslab_class_get_alloc(special);
|
||||
uint64_t space = metaslab_class_get_space(special);
|
||||
uint64_t limit =
|
||||
(space * (100 - zfs_special_class_metadata_reserve_pct))
|
||||
/ 100;
|
||||
|
||||
if (alloc < limit)
|
||||
return (special);
|
||||
}
|
||||
|
||||
return (spa_normal_class(spa));
|
||||
}
|
||||
|
||||
void
|
||||
spa_evicting_os_register(spa_t *spa, objset_t *os)
|
||||
{
|
||||
|
@ -21,13 +21,14 @@
|
||||
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
||||
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
|
||||
* Copyright 2013 DEY Storage Systems, Inc.
|
||||
* Copyright 2014 HybridCluster. All rights reserved.
|
||||
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
||||
* Copyright 2013 Saso Kiselkov. All rights reserved.
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
* Copyright (c) 2014 Integros [integros.com]
|
||||
*/
|
||||
|
||||
@ -126,6 +127,16 @@ typedef enum dmu_object_byteswap {
|
||||
((ot) & DMU_OT_METADATA) : \
|
||||
dmu_ot[(ot)].ot_metadata)
|
||||
|
||||
/* True when the object type is DMU_OT_DDT_ZAP. */
#define	DMU_OT_IS_DDT(ot)	((ot) == DMU_OT_DDT_ZAP)

/* True when the object type is DMU_OT_INTENT_LOG. */
#define	DMU_OT_IS_ZIL(ot)	((ot) == DMU_OT_INTENT_LOG)

/*
 * True when the object type holds file contents.
 * Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks.
 * The argument is evaluated twice, so it must be free of side effects.
 */
#define	DMU_OT_IS_FILE(ot) \
	((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER)
|
||||
|
||||
#define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \
|
||||
B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache)
|
||||
|
||||
@ -216,6 +227,7 @@ typedef enum dmu_object_type {
|
||||
*
|
||||
* The DMU_OTN_* types do not have entries in the dmu_ot table,
|
||||
* use the DMU_OT_IS_METDATA() and DMU_OT_BYTESWAP() macros instead
|
||||
* use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead
|
||||
* of indexing into dmu_ot directly (this works for both DMU_OT_* types
|
||||
* and DMU_OTN_* types).
|
||||
*/
|
||||
|
@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
||||
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
||||
* Copyright (c) 2014 Integros [integros.com]
|
||||
@ -113,6 +113,11 @@ struct objset {
|
||||
uint64_t os_normalization;
|
||||
uint64_t os_utf8only;
|
||||
uint64_t os_casesensitivity;
|
||||
/*
|
||||
* The largest zpl file block allowed in special class.
|
||||
* Cached here instead of zfsvfs for easier access.
|
||||
*/
|
||||
int os_zpl_special_smallblock;
|
||||
|
||||
/*
|
||||
* Pointer is constant; the blkptr it points to is protected by
|
||||
|
@ -21,6 +21,7 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_METASLAB_H
|
||||
@ -56,12 +57,17 @@ void metaslab_sync_done(metaslab_t *, uint64_t);
|
||||
void metaslab_sync_reassess(metaslab_group_t *);
|
||||
uint64_t metaslab_block_maxsize(metaslab_t *);
|
||||
|
||||
/*
 * Metaslab allocation flags.
 *
 * METASLAB_HINTBP_FAVOR is zero; every other flag occupies its own bit, so
 * the flags can be OR-ed together and tested with a bitwise AND.
 */
#define	METASLAB_HINTBP_FAVOR		0x00
#define	METASLAB_HINTBP_AVOID		0x01
#define	METASLAB_GANG_HEADER		0x02
#define	METASLAB_GANG_CHILD		0x04
#define	METASLAB_ASYNC_ALLOC		0x08
#define	METASLAB_DONT_THROTTLE		0x10
#define	METASLAB_MUST_RESERVE		0x20
#define	METASLAB_FASTWRITE		0x40
|
||||
|
||||
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
|
||||
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *,
|
||||
@ -92,8 +98,6 @@ boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
|
||||
zio_t *, int);
|
||||
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);
|
||||
|
||||
void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
|
||||
int64_t, int64_t);
|
||||
uint64_t metaslab_class_get_alloc(metaslab_class_t *);
|
||||
uint64_t metaslab_class_get_space(metaslab_class_t *);
|
||||
uint64_t metaslab_class_get_dspace(metaslab_class_t *);
|
||||
|
@ -27,6 +27,7 @@
|
||||
* Copyright (c) 2014 Integros [integros.com]
|
||||
* Copyright 2017 Joyent, Inc.
|
||||
* Copyright (c) 2017 Datto Inc.
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_SPA_H
|
||||
@ -821,6 +822,11 @@ extern uint64_t spa_version(spa_t *spa);
|
||||
extern boolean_t spa_deflate(spa_t *spa);
|
||||
extern metaslab_class_t *spa_normal_class(spa_t *spa);
|
||||
extern metaslab_class_t *spa_log_class(spa_t *spa);
|
||||
extern metaslab_class_t *spa_special_class(spa_t *spa);
|
||||
extern metaslab_class_t *spa_dedup_class(spa_t *spa);
|
||||
extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size,
|
||||
dmu_object_type_t objtype, uint_t level, uint_t special_smallblk);
|
||||
|
||||
extern void spa_evicting_os_register(spa_t *, objset_t *os);
|
||||
extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
|
||||
extern void spa_evicting_os_wait(spa_t *spa);
|
||||
@ -883,6 +889,7 @@ extern boolean_t spa_trust_config(spa_t *spa);
|
||||
extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
|
||||
extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
|
||||
extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
|
||||
extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
|
||||
|
||||
extern int spa_mode(spa_t *spa);
|
||||
extern uint64_t zfs_strtonum(const char *str, char **nptr);
|
||||
|
@ -26,6 +26,7 @@
|
||||
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
||||
* Copyright 2013 Saso Kiselkov. All rights reserved.
|
||||
* Copyright (c) 2017 Datto Inc.
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_SPA_IMPL_H
|
||||
@ -219,6 +220,8 @@ struct spa {
|
||||
boolean_t spa_is_initializing; /* true while opening pool */
|
||||
metaslab_class_t *spa_normal_class; /* normal data class */
|
||||
metaslab_class_t *spa_log_class; /* intent log data class */
|
||||
metaslab_class_t *spa_special_class; /* special allocation class */
|
||||
metaslab_class_t *spa_dedup_class; /* dedup allocation class */
|
||||
uint64_t spa_first_txg; /* first txg after spa_open() */
|
||||
uint64_t spa_final_txg; /* txg of export/destroy */
|
||||
uint64_t spa_freeze_txg; /* freeze pool at this txg */
|
||||
|
@ -22,6 +22,7 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_VDEV_H
|
||||
@ -110,6 +111,8 @@ extern boolean_t vdev_children_are_offline(vdev_t *vd);
|
||||
extern void vdev_space_update(vdev_t *vd,
|
||||
int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
|
||||
|
||||
extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space);
|
||||
|
||||
extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
|
||||
|
||||
extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
|
||||
|
@ -21,6 +21,7 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_VDEV_IMPL_H
|
||||
@ -149,6 +150,14 @@ struct vdev_queue {
|
||||
uint64_t vq_lastoffset;
|
||||
};
|
||||
|
||||
/*
 * Allocation bias of a top-level vdev.  The bias picks the metaslab class
 * that the vdev's metaslab group is created in.
 */
typedef enum vdev_alloc_bias {
	VDEV_BIAS_NONE = 0,	/* no bias; normal class */
	VDEV_BIAS_LOG = 1,	/* dedicated to ZIL data (SLOG) */
	VDEV_BIAS_SPECIAL = 2,	/* dedicated to ddt, metadata, and small blks */
	VDEV_BIAS_DEDUP = 3	/* dedicated to dedup metadata */
} vdev_alloc_bias_t;
|
||||
|
||||
|
||||
/*
|
||||
* On-disk indirect vdev state.
|
||||
*
|
||||
@ -261,6 +270,7 @@ struct vdev {
|
||||
boolean_t vdev_ishole; /* is a hole in the namespace */
|
||||
kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
|
||||
uint64_t vdev_top_zap;
|
||||
vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
|
||||
|
||||
/* pool checkpoint related */
|
||||
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
|
||||
|
@ -307,6 +307,7 @@ typedef struct zio_prop {
|
||||
boolean_t zp_dedup;
|
||||
boolean_t zp_dedup_verify;
|
||||
boolean_t zp_nopwrite;
|
||||
uint32_t zp_zpl_smallblk;
|
||||
} zio_prop_t;
|
||||
|
||||
typedef struct zio_cksum_report zio_cksum_report_t;
|
||||
@ -460,6 +461,7 @@ struct zio {
|
||||
vdev_t *io_vd;
|
||||
void *io_vsd;
|
||||
const zio_vsd_ops_t *io_vsd_ops;
|
||||
metaslab_class_t *io_metaslab_class; /* dva throttle class */
|
||||
|
||||
uint64_t io_offset;
|
||||
hrtime_t io_timestamp;
|
||||
|
@ -26,7 +26,8 @@
|
||||
* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
|
||||
* Copyright (c) 2014 Integros [integros.com]
|
||||
* Copyright 2016 Toomas Soome <tsoome@me.com>
|
||||
* Copyright 2017 Joyent, Inc.
|
||||
* Copyright 2019 Joyent, Inc.
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
*/
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
@ -314,6 +315,25 @@ vdev_getops(const char *type)
|
||||
return (ops);
|
||||
}
|
||||
|
||||
/*
|
||||
* Derive the enumerated alloction bias from string input.
|
||||
* String origin is either the per-vdev zap or zpool(1M).
|
||||
*/
|
||||
static vdev_alloc_bias_t
|
||||
vdev_derive_alloc_bias(const char *bias)
|
||||
{
|
||||
vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
|
||||
|
||||
if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
|
||||
alloc_bias = VDEV_BIAS_LOG;
|
||||
else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
|
||||
alloc_bias = VDEV_BIAS_SPECIAL;
|
||||
else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
|
||||
alloc_bias = VDEV_BIAS_DEDUP;
|
||||
|
||||
return (alloc_bias);
|
||||
}
|
||||
|
||||
/* ARGSUSED */
|
||||
void
|
||||
vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res)
|
||||
@ -645,6 +665,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
|
||||
uint64_t guid = 0, islog, nparity;
|
||||
vdev_t *vd;
|
||||
vdev_indirect_config_t *vic;
|
||||
vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
|
||||
boolean_t top_level = (parent && !parent->vdev_parent);
|
||||
|
||||
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
|
||||
|
||||
@ -731,11 +753,33 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
|
||||
}
|
||||
ASSERT(nparity != -1ULL);
|
||||
|
||||
/*
|
||||
* If creating a top-level vdev, check for allocation classes input
|
||||
*/
|
||||
if (top_level && alloctype == VDEV_ALLOC_ADD) {
|
||||
char *bias;
|
||||
|
||||
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
|
||||
&bias) == 0) {
|
||||
alloc_bias = vdev_derive_alloc_bias(bias);
|
||||
|
||||
/* spa_vdev_add() expects feature to be enabled */
|
||||
if (alloc_bias != VDEV_BIAS_LOG &&
|
||||
spa->spa_load_state != SPA_LOAD_CREATE &&
|
||||
!spa_feature_is_enabled(spa,
|
||||
SPA_FEATURE_ALLOCATION_CLASSES)) {
|
||||
return (SET_ERROR(ENOTSUP));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
vd = vdev_alloc_common(spa, id, guid, ops);
|
||||
vic = &vd->vdev_indirect_config;
|
||||
|
||||
vd->vdev_islog = islog;
|
||||
vd->vdev_nparity = nparity;
|
||||
if (top_level && alloc_bias != VDEV_BIAS_NONE)
|
||||
vd->vdev_alloc_bias = alloc_bias;
|
||||
|
||||
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
|
||||
vd->vdev_path = spa_strdup(vd->vdev_path);
|
||||
@ -786,7 +830,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
|
||||
/*
|
||||
* If we're a top-level vdev, try to load the allocation parameters.
|
||||
*/
|
||||
if (parent && !parent->vdev_parent &&
|
||||
if (top_level &&
|
||||
(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
|
||||
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
|
||||
&vd->vdev_ms_array);
|
||||
@ -802,14 +846,12 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
|
||||
ASSERT0(vd->vdev_top_zap);
|
||||
}
|
||||
|
||||
if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
|
||||
if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
|
||||
ASSERT(alloctype == VDEV_ALLOC_LOAD ||
|
||||
alloctype == VDEV_ALLOC_ADD ||
|
||||
alloctype == VDEV_ALLOC_SPLIT ||
|
||||
alloctype == VDEV_ALLOC_ROOTPOOL);
|
||||
vd->vdev_mg = metaslab_group_create(islog ?
|
||||
spa_log_class(spa) : spa_normal_class(spa), vd,
|
||||
spa->spa_alloc_count);
|
||||
/* Note: metaslab_group_create() is now deferred */
|
||||
}
|
||||
|
||||
if (vd->vdev_ops->vdev_op_leaf &&
|
||||
@ -1043,6 +1085,9 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
|
||||
tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
|
||||
svd->vdev_checkpoint_sm = NULL;
|
||||
|
||||
tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
|
||||
svd->vdev_alloc_bias = VDEV_BIAS_NONE;
|
||||
|
||||
tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
|
||||
tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
|
||||
tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
|
||||
@ -1198,6 +1243,55 @@ vdev_remove_parent(vdev_t *cvd)
|
||||
vdev_free(mvd);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_metaslab_group_create(vdev_t *vd)
|
||||
{
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
|
||||
/*
|
||||
* metaslab_group_create was delayed until allocation bias was available
|
||||
*/
|
||||
if (vd->vdev_mg == NULL) {
|
||||
metaslab_class_t *mc;
|
||||
|
||||
if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
|
||||
vd->vdev_alloc_bias = VDEV_BIAS_LOG;
|
||||
|
||||
ASSERT3U(vd->vdev_islog, ==,
|
||||
(vd->vdev_alloc_bias == VDEV_BIAS_LOG));
|
||||
|
||||
switch (vd->vdev_alloc_bias) {
|
||||
case VDEV_BIAS_LOG:
|
||||
mc = spa_log_class(spa);
|
||||
break;
|
||||
case VDEV_BIAS_SPECIAL:
|
||||
mc = spa_special_class(spa);
|
||||
break;
|
||||
case VDEV_BIAS_DEDUP:
|
||||
mc = spa_dedup_class(spa);
|
||||
break;
|
||||
default:
|
||||
mc = spa_normal_class(spa);
|
||||
}
|
||||
|
||||
vd->vdev_mg = metaslab_group_create(mc, vd,
|
||||
spa->spa_alloc_count);
|
||||
|
||||
/*
|
||||
* The spa ashift values currently only reflect the
|
||||
* general vdev classes. Class destination is late
|
||||
* binding so ashift checking had to wait until now
|
||||
*/
|
||||
if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
|
||||
mc == spa_normal_class(spa) && vd->vdev_aux == NULL) {
|
||||
if (vd->vdev_ashift > spa->spa_max_ashift)
|
||||
spa->spa_max_ashift = vd->vdev_ashift;
|
||||
if (vd->vdev_ashift < spa->spa_min_ashift)
|
||||
spa->spa_min_ashift = vd->vdev_ashift;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
|
||||
{
|
||||
@ -1208,6 +1302,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
|
||||
uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
|
||||
metaslab_t **mspp;
|
||||
int error;
|
||||
boolean_t expanding = (oldc != 0);
|
||||
|
||||
ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
|
||||
|
||||
@ -1223,7 +1318,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
|
||||
|
||||
mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
|
||||
|
||||
if (oldc != 0) {
|
||||
if (expanding) {
|
||||
bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
|
||||
kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
|
||||
}
|
||||
@ -1249,6 +1344,17 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef _KERNEL
|
||||
/*
|
||||
* To accommodate zdb_leak_init() fake indirect
|
||||
* metaslabs, we allocate a metaslab group for
|
||||
* indirect vdevs which normally don't have one.
|
||||
*/
|
||||
if (vd->vdev_mg == NULL) {
|
||||
ASSERT0(vdev_is_concrete(vd));
|
||||
vdev_metaslab_group_create(vd);
|
||||
}
|
||||
#endif
|
||||
error = metaslab_init(vd->vdev_mg, m, object, txg,
|
||||
&(vd->vdev_ms[m]));
|
||||
if (error != 0) {
|
||||
@ -1266,8 +1372,9 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
|
||||
* the metaslabs since we want to ensure that no new
|
||||
* allocations are performed on this device.
|
||||
*/
|
||||
if (oldc == 0 && !vd->vdev_removing)
|
||||
if (!expanding && !vd->vdev_removing) {
|
||||
metaslab_group_activate(vd->vdev_mg);
|
||||
}
|
||||
|
||||
if (txg == 0)
|
||||
spa_config_exit(spa, SCL_ALLOC, FTAG);
|
||||
@ -1750,9 +1857,13 @@ vdev_open(vdev_t *vd)
|
||||
|
||||
/*
|
||||
* Track the min and max ashift values for normal data devices.
|
||||
*
|
||||
* DJB - TBD these should perhaps be tracked per allocation class
|
||||
* (e.g. spa_min_ashift is used to round up post compression buffers)
|
||||
*/
|
||||
if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
|
||||
!vd->vdev_islog && vd->vdev_aux == NULL) {
|
||||
vd->vdev_alloc_bias == VDEV_BIAS_NONE &&
|
||||
vd->vdev_aux == NULL) {
|
||||
if (vd->vdev_ashift > spa->spa_max_ashift)
|
||||
spa->spa_max_ashift = vd->vdev_ashift;
|
||||
if (vd->vdev_ashift < spa->spa_min_ashift)
|
||||
@ -2250,13 +2361,13 @@ vdev_metaslab_set_size(vdev_t *vd)
|
||||
*
|
||||
* The net effect of applying the above constraints is summarized below.
|
||||
*
|
||||
* vdev size metaslab count
|
||||
* vdev size metaslab count
|
||||
* --------------|-----------------
|
||||
* < 8GB ~16
|
||||
* 8GB - 100GB one per 512MB
|
||||
* 100GB - 3TB ~200
|
||||
* 3TB - 2PB one per 16GB
|
||||
* > 2PB ~131,072
|
||||
* < 8GB ~16
|
||||
* 8GB - 100GB one per 512MB
|
||||
* 100GB - 3TB ~200
|
||||
* 3TB - 2PB one per 16GB
|
||||
* > 2PB ~131,072
|
||||
* --------------------------------
|
||||
*
|
||||
* Finally, note that all of the above calculate the initial
|
||||
@ -2679,6 +2790,30 @@ vdev_dtl_load(vdev_t *vd)
|
||||
return (error);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
|
||||
{
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
objset_t *mos = spa->spa_meta_objset;
|
||||
vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
|
||||
const char *string;
|
||||
|
||||
ASSERT(alloc_bias != VDEV_BIAS_NONE);
|
||||
|
||||
string =
|
||||
(alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG :
|
||||
(alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
|
||||
(alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL;
|
||||
|
||||
ASSERT(string != NULL);
|
||||
VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS,
|
||||
1, strlen(string) + 1, string, tx));
|
||||
|
||||
if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) {
|
||||
spa_activate_allocation_classes(spa, tx);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
|
||||
{
|
||||
@ -2715,8 +2850,11 @@ vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
|
||||
}
|
||||
if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
|
||||
vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
|
||||
if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
|
||||
vdev_zap_allocation_data(vd, tx);
|
||||
}
|
||||
}
|
||||
|
||||
for (uint64_t i = 0; i < vd->vdev_children; i++) {
|
||||
vdev_construct_zaps(vd->vdev_child[i], tx);
|
||||
}
|
||||
@ -2913,10 +3051,27 @@ vdev_load(vdev_t *vd)
|
||||
|
||||
vdev_set_deflate_ratio(vd);
|
||||
|
||||
/*
|
||||
* On spa_load path, grab the allocation bias from our zap
|
||||
*/
|
||||
if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
char bias_str[64];
|
||||
|
||||
if (zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
|
||||
VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
|
||||
bias_str) == 0) {
|
||||
ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
|
||||
vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If this is a top-level vdev, initialize its metaslabs.
|
||||
*/
|
||||
if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
|
||||
vdev_metaslab_group_create(vd);
|
||||
|
||||
if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
|
||||
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
|
||||
VDEV_AUX_CORRUPT_DATA);
|
||||
@ -3111,6 +3266,7 @@ vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
|
||||
|
||||
metaslab_group_histogram_verify(mg);
|
||||
metaslab_class_histogram_verify(mg->mg_class);
|
||||
|
||||
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
|
||||
ASSERT0(mg->mg_histogram[i]);
|
||||
}
|
||||
@ -3700,7 +3856,8 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
|
||||
vs->vs_physical_ashift = vd->vdev_physical_ashift;
|
||||
if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
|
||||
vdev_is_concrete(vd)) {
|
||||
vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
|
||||
vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
|
||||
vd->vdev_mg->mg_fragmentation : 0;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -3886,19 +4043,25 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
|
||||
}
|
||||
}
|
||||
|
||||
int64_t
|
||||
vdev_deflated_space(vdev_t *vd, int64_t space)
|
||||
{
|
||||
ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0);
|
||||
ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
|
||||
|
||||
return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio);
|
||||
}
|
||||
|
||||
/*
|
||||
* Update the in-core space usage stats for this vdev, its metaslab class,
|
||||
* and the root vdev.
|
||||
* Update the in-core space usage stats for this vdev and the root vdev.
|
||||
*/
|
||||
void
|
||||
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
|
||||
int64_t space_delta)
|
||||
{
|
||||
int64_t dspace_delta = space_delta;
|
||||
int64_t dspace_delta;
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
vdev_t *rvd = spa->spa_root_vdev;
|
||||
metaslab_group_t *mg = vd->vdev_mg;
|
||||
metaslab_class_t *mc = mg ? mg->mg_class : NULL;
|
||||
|
||||
ASSERT(vd == vd->vdev_top);
|
||||
|
||||
@ -3908,10 +4071,7 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
|
||||
* because the root vdev's psize-to-asize is simply the max of its
|
||||
* children's, thus not accurate enough for us.
|
||||
*/
|
||||
ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
|
||||
ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
|
||||
dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
|
||||
vd->vdev_deflate_ratio;
|
||||
dspace_delta = vdev_deflated_space(vd, space_delta);
|
||||
|
||||
mutex_enter(&vd->vdev_stat_lock);
|
||||
vd->vdev_stat.vs_alloc += alloc_delta;
|
||||
@ -3919,21 +4079,15 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
|
||||
vd->vdev_stat.vs_dspace += dspace_delta;
|
||||
mutex_exit(&vd->vdev_stat_lock);
|
||||
|
||||
if (mc == spa_normal_class(spa)) {
|
||||
/* every class but log contributes to root space stats */
|
||||
if (vd->vdev_mg != NULL && !vd->vdev_islog) {
|
||||
mutex_enter(&rvd->vdev_stat_lock);
|
||||
rvd->vdev_stat.vs_alloc += alloc_delta;
|
||||
rvd->vdev_stat.vs_space += space_delta;
|
||||
rvd->vdev_stat.vs_dspace += dspace_delta;
|
||||
mutex_exit(&rvd->vdev_stat_lock);
|
||||
}
|
||||
|
||||
if (mc != NULL) {
|
||||
ASSERT(rvd == vd->vdev_parent);
|
||||
ASSERT(vd->vdev_ms_count != 0);
|
||||
|
||||
metaslab_class_space_update(mc,
|
||||
alloc_delta, defer_delta, space_delta, dspace_delta);
|
||||
}
|
||||
/* Note: metaslab_class_space_update moved to metaslab_space_update */
|
||||
}
|
||||
|
||||
/*
|
||||
@ -4349,7 +4503,9 @@ vdev_expand(vdev_t *vd, uint64_t txg)
|
||||
|
||||
vdev_set_deflate_ratio(vd);
|
||||
|
||||
if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
|
||||
if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
|
||||
vdev_is_concrete(vd)) {
|
||||
vdev_metaslab_group_create(vd);
|
||||
VERIFY(vdev_metaslab_init(vd, txg) == 0);
|
||||
vdev_config_dirty(vd);
|
||||
}
|
||||
|
@ -22,6 +22,7 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
* Copyright 2019 Joyent, Inc.
|
||||
*/
|
||||
|
||||
@ -323,6 +324,28 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
|
||||
fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
|
||||
vd->vdev_removing);
|
||||
}
|
||||
|
||||
/* zpool command expects alloc class data */
|
||||
if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) {
|
||||
const char *bias = NULL;
|
||||
|
||||
switch (vd->vdev_alloc_bias) {
|
||||
case VDEV_BIAS_LOG:
|
||||
bias = VDEV_ALLOC_BIAS_LOG;
|
||||
break;
|
||||
case VDEV_BIAS_SPECIAL:
|
||||
bias = VDEV_ALLOC_BIAS_SPECIAL;
|
||||
break;
|
||||
case VDEV_BIAS_DEDUP:
|
||||
bias = VDEV_ALLOC_BIAS_DEDUP;
|
||||
break;
|
||||
default:
|
||||
ASSERT3U(vd->vdev_alloc_bias, ==,
|
||||
VDEV_BIAS_NONE);
|
||||
}
|
||||
fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
|
||||
bias);
|
||||
}
|
||||
}
|
||||
|
||||
if (vd->vdev_dtl_sm != NULL) {
|
||||
|
@ -950,14 +950,17 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
|
||||
ASSERT3U(size, <=, maxalloc);
|
||||
|
||||
/*
|
||||
* We use allocator 0 for this I/O because we don't expect device remap
|
||||
* to be the steady state of the system, so parallelizing is not as
|
||||
* critical as it is for other allocation types. We also want to ensure
|
||||
* that the IOs are allocated together as much as possible, to reduce
|
||||
* mapping sizes.
|
||||
* An allocation class might not have any remaining vdevs or space
|
||||
*/
|
||||
int error = metaslab_alloc_dva(spa, mg->mg_class, size,
|
||||
&dst, 0, NULL, txg, 0, zal, 0);
|
||||
metaslab_class_t *mc = mg->mg_class;
|
||||
if (mc != spa_normal_class(spa) && mc->mc_groups <= 1)
|
||||
mc = spa_normal_class(spa);
|
||||
int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0,
|
||||
zal, 0);
|
||||
if (error == ENOSPC && mc != spa_normal_class(spa)) {
|
||||
error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
|
||||
&dst, 0, NULL, txg, 0, zal, 0);
|
||||
}
|
||||
if (error != 0)
|
||||
return (error);
|
||||
|
||||
@ -1869,15 +1872,31 @@ spa_vdev_remove_top_check(vdev_t *vd)
|
||||
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
|
||||
return (SET_ERROR(ENOTSUP));
|
||||
|
||||
/* available space in the pool's normal class */
|
||||
uint64_t available = dsl_dir_space_available(
|
||||
spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
|
||||
|
||||
metaslab_class_t *mc = vd->vdev_mg->mg_class;
|
||||
|
||||
/*
|
||||
* When removing a vdev from an allocation class that has
|
||||
* remaining vdevs, include available space from the class.
|
||||
*/
|
||||
if (mc != spa_normal_class(spa) && mc->mc_groups > 1) {
|
||||
uint64_t class_avail = metaslab_class_get_space(mc) -
|
||||
metaslab_class_get_alloc(mc);
|
||||
|
||||
/* add class space, adjusted for overhead */
|
||||
available += (class_avail * 94) / 100;
|
||||
}
|
||||
|
||||
/*
|
||||
* There has to be enough free space to remove the
|
||||
* device and leave double the "slop" space (i.e. we
|
||||
* must leave at least 3% of the pool free, in addition to
|
||||
* the normal slop space).
|
||||
*/
|
||||
if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir,
|
||||
NULL, 0, B_TRUE) <
|
||||
vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
|
||||
if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
|
||||
return (SET_ERROR(ENOSPC));
|
||||
}
|
||||
|
||||
|
@ -4325,6 +4325,15 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
|
||||
}
|
||||
break;
|
||||
|
||||
case ZFS_PROP_SPECIAL_SMALL_BLOCKS:
|
||||
/*
|
||||
* This property could require the allocation classes
|
||||
* feature to be active for setting, however we allow
|
||||
* it so that tests of settable properties succeed.
|
||||
* The CLI will issue a warning in this case.
|
||||
*/
|
||||
break;
|
||||
|
||||
case ZFS_PROP_SHARESMB:
|
||||
if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
|
||||
return (SET_ERROR(ENOTSUP));
|
||||
|
@ -23,6 +23,7 @@
|
||||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Integros [integros.com]
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
*/
|
||||
|
||||
#include <sys/sysmacros.h>
|
||||
@ -710,6 +711,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
||||
zio->io_bookmark = *zb;
|
||||
|
||||
if (pio != NULL) {
|
||||
if (zio->io_metaslab_class == NULL)
|
||||
zio->io_metaslab_class = pio->io_metaslab_class;
|
||||
if (zio->io_logical == NULL)
|
||||
zio->io_logical = pio->io_logical;
|
||||
if (zio->io_child_type == ZIO_CHILD_GANG)
|
||||
@ -1206,9 +1209,8 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
|
||||
*/
|
||||
if (flags & ZIO_FLAG_IO_ALLOCATING &&
|
||||
(vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
|
||||
metaslab_class_t *mc = spa_normal_class(pio->io_spa);
|
||||
|
||||
ASSERT(mc->mc_alloc_throttle_enabled);
|
||||
ASSERT(pio->io_metaslab_class != NULL);
|
||||
ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
|
||||
ASSERT(type == ZIO_TYPE_WRITE);
|
||||
ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
|
||||
ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
|
||||
@ -1524,8 +1526,9 @@ zio_write_compress(zio_t *zio)
|
||||
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
|
||||
BP_GET_PSIZE(bp) == psize &&
|
||||
pass >= zfs_sync_pass_rewrite) {
|
||||
ASSERT(psize != 0);
|
||||
VERIFY3U(psize, !=, 0);
|
||||
enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
|
||||
|
||||
zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
|
||||
zio->io_flags |= ZIO_FLAG_IO_REWRITE;
|
||||
} else {
|
||||
@ -2952,7 +2955,7 @@ zio_io_to_allocate(spa_t *spa, int allocator)
|
||||
* reserve then we throttle.
|
||||
*/
|
||||
ASSERT3U(zio->io_allocator, ==, allocator);
|
||||
if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
|
||||
if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
|
||||
zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
|
||||
return (NULL);
|
||||
}
|
||||
@ -2968,9 +2971,14 @@ zio_dva_throttle(zio_t *zio)
|
||||
{
|
||||
spa_t *spa = zio->io_spa;
|
||||
zio_t *nio;
|
||||
metaslab_class_t *mc;
|
||||
|
||||
/* locate an appropriate allocation class */
|
||||
mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type,
|
||||
zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk);
|
||||
|
||||
if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
|
||||
!spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
|
||||
!mc->mc_alloc_throttle_enabled ||
|
||||
zio->io_child_type == ZIO_CHILD_GANG ||
|
||||
zio->io_flags & ZIO_FLAG_NODATA) {
|
||||
return (zio);
|
||||
@ -2992,17 +3000,16 @@ zio_dva_throttle(zio_t *zio)
|
||||
zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
|
||||
bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
|
||||
mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
|
||||
|
||||
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
|
||||
zio->io_metaslab_class = mc;
|
||||
avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
|
||||
|
||||
nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator);
|
||||
nio = zio_io_to_allocate(spa, zio->io_allocator);
|
||||
mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
|
||||
|
||||
return (nio);
|
||||
}
|
||||
|
||||
void
|
||||
static void
|
||||
zio_allocate_dispatch(spa_t *spa, int allocator)
|
||||
{
|
||||
zio_t *zio;
|
||||
@ -3022,7 +3029,7 @@ static zio_t *
|
||||
zio_dva_allocate(zio_t *zio)
|
||||
{
|
||||
spa_t *spa = zio->io_spa;
|
||||
metaslab_class_t *mc = spa_normal_class(spa);
|
||||
metaslab_class_t *mc;
|
||||
blkptr_t *bp = zio->io_bp;
|
||||
int error;
|
||||
int flags = 0;
|
||||
@ -3038,20 +3045,57 @@ zio_dva_allocate(zio_t *zio)
|
||||
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
|
||||
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
|
||||
|
||||
if (zio->io_flags & ZIO_FLAG_NODATA) {
|
||||
if (zio->io_flags & ZIO_FLAG_NODATA)
|
||||
flags |= METASLAB_DONT_THROTTLE;
|
||||
}
|
||||
if (zio->io_flags & ZIO_FLAG_GANG_CHILD) {
|
||||
if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
|
||||
flags |= METASLAB_GANG_CHILD;
|
||||
}
|
||||
if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) {
|
||||
if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
|
||||
flags |= METASLAB_ASYNC_ALLOC;
|
||||
|
||||
/*
|
||||
* if not already chosen, locate an appropriate allocation class
|
||||
*/
|
||||
mc = zio->io_metaslab_class;
|
||||
if (mc == NULL) {
|
||||
mc = spa_preferred_class(spa, zio->io_size,
|
||||
zio->io_prop.zp_type, zio->io_prop.zp_level,
|
||||
zio->io_prop.zp_zpl_smallblk);
|
||||
zio->io_metaslab_class = mc;
|
||||
}
|
||||
|
||||
error = metaslab_alloc(spa, mc, zio->io_size, bp,
|
||||
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
|
||||
&zio->io_alloc_list, zio, zio->io_allocator);
|
||||
|
||||
/*
|
||||
* Fall back to the normal class when an alloc class is full
|
||||
*/
|
||||
if (error == ENOSPC && mc != spa_normal_class(spa)) {
|
||||
/*
|
||||
* If throttling, transfer reservation over to normal class.
|
||||
* The io_allocator slot can remain the same even though we
|
||||
* are switching classes.
|
||||
*/
|
||||
if (mc->mc_alloc_throttle_enabled &&
|
||||
(zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) {
|
||||
metaslab_class_throttle_unreserve(mc,
|
||||
zio->io_prop.zp_copies, zio->io_allocator, zio);
|
||||
zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
|
||||
|
||||
mc = spa_normal_class(spa);
|
||||
VERIFY(metaslab_class_throttle_reserve(mc,
|
||||
zio->io_prop.zp_copies, zio->io_allocator, zio,
|
||||
flags | METASLAB_MUST_RESERVE));
|
||||
} else {
|
||||
mc = spa_normal_class(spa);
|
||||
}
|
||||
zio->io_metaslab_class = mc;
|
||||
|
||||
error = metaslab_alloc(spa, mc, zio->io_size, bp,
|
||||
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
|
||||
&zio->io_alloc_list, zio, zio->io_allocator);
|
||||
}
|
||||
|
||||
if (error != 0) {
|
||||
zfs_dbgmsg("%s: metaslab allocation failure: zio %p, "
|
||||
"size %llu, error %d", spa_name(spa), zio, zio->io_size,
|
||||
@ -3119,6 +3163,15 @@ zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, blkptr_t *new_bp,
|
||||
ASSERT(txg > spa_syncing_txg(spa));
|
||||
|
||||
metaslab_trace_init(&io_alloc_list);
|
||||
|
||||
/*
|
||||
* Block pointer fields are useful to metaslabs for stats and debugging.
|
||||
* Fill in the obvious ones before calling into metaslab_alloc().
|
||||
*/
|
||||
BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
|
||||
BP_SET_PSIZE(new_bp, size);
|
||||
BP_SET_LEVEL(new_bp, 0);
|
||||
|
||||
/*
|
||||
* When allocating a zil block, we don't have information about
|
||||
* the final destination of the block except the objset it's part
|
||||
@ -3721,13 +3774,15 @@ zio_ready(zio_t *zio)
|
||||
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
|
||||
ASSERT(IO_IS_ALLOCATING(zio));
|
||||
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
|
||||
ASSERT(zio->io_metaslab_class != NULL);
|
||||
|
||||
/*
|
||||
* We were unable to allocate anything, unreserve and
|
||||
* issue the next I/O to allocate.
|
||||
*/
|
||||
metaslab_class_throttle_unreserve(
|
||||
spa_normal_class(zio->io_spa),
|
||||
zio->io_prop.zp_copies, zio->io_allocator, zio);
|
||||
zio->io_metaslab_class, zio->io_prop.zp_copies,
|
||||
zio->io_allocator, zio);
|
||||
zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
|
||||
}
|
||||
}
|
||||
@ -3809,14 +3864,15 @@ zio_dva_throttle_done(zio_t *zio)
|
||||
ASSERT(zio->io_logical != NULL);
|
||||
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
|
||||
ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
|
||||
ASSERT(zio->io_metaslab_class != NULL);
|
||||
|
||||
mutex_enter(&pio->io_lock);
|
||||
metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
|
||||
pio->io_allocator, B_TRUE);
|
||||
mutex_exit(&pio->io_lock);
|
||||
|
||||
metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
|
||||
1, pio->io_allocator, pio);
|
||||
metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1,
|
||||
pio->io_allocator, pio);
|
||||
|
||||
/*
|
||||
* Call into the pipeline to see if there is more work that
|
||||
@ -3835,7 +3891,6 @@ zio_done(zio_t *zio)
|
||||
vdev_t *vd = zio->io_vd;
|
||||
uint64_t psize = zio->io_size;
|
||||
zio_t *pio, *pio_next;
|
||||
metaslab_class_t *mc = spa_normal_class(spa);
|
||||
zio_link_t *zl = NULL;
|
||||
|
||||
/*
|
||||
@ -3854,7 +3909,8 @@ zio_done(zio_t *zio)
|
||||
*/
|
||||
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
|
||||
zio->io_child_type == ZIO_CHILD_VDEV) {
|
||||
ASSERT(mc->mc_alloc_throttle_enabled);
|
||||
ASSERT(zio->io_metaslab_class != NULL);
|
||||
ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
|
||||
zio_dva_throttle_done(zio);
|
||||
}
|
||||
|
||||
@ -3866,10 +3922,12 @@ zio_done(zio_t *zio)
|
||||
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
|
||||
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
|
||||
ASSERT(bp != NULL);
|
||||
|
||||
metaslab_group_alloc_verify(spa, zio->io_bp, zio,
|
||||
zio->io_allocator);
|
||||
VERIFY(zfs_refcount_not_held(
|
||||
&mc->mc_alloc_slots[zio->io_allocator], zio));
|
||||
&zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator],
|
||||
zio));
|
||||
}
|
||||
|
||||
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
|
||||
|
@ -27,6 +27,7 @@
|
||||
* Copyright (c) 2014 Integros [integros.com]
|
||||
* Copyright 2017 Joyent, Inc.
|
||||
* Copyright (c) 2019 Datto Inc.
|
||||
* Copyright (c) 2017, Intel Corporation.
|
||||
*/
|
||||
|
||||
/* Portions Copyright 2010 Robert Milkowski */
|
||||
@ -167,6 +168,7 @@ typedef enum {
|
||||
ZFS_PROP_PREV_SNAP,
|
||||
ZFS_PROP_RECEIVE_RESUME_TOKEN,
|
||||
ZFS_PROP_REMAPTXG, /* not exposed to the user */
|
||||
ZFS_PROP_SPECIAL_SMALL_BLOCKS,
|
||||
ZFS_NUM_PROPS
|
||||
} zfs_prop_t;
|
||||
|
||||
@ -611,6 +613,8 @@ typedef struct zpool_load_policy {
|
||||
#define ZPOOL_CONFIG_MMP_SEQ "mmp_seq" /* not stored on disk */
|
||||
#define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */
|
||||
#define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */
|
||||
#define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */
|
||||
|
||||
/*
|
||||
* The persistent vdev state is stored as separate values rather than a single
|
||||
* 'vdev_state' entry. This is because a device can be in multiple states, such
|
||||
@ -656,6 +660,14 @@ typedef struct zpool_load_policy {
|
||||
#define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \
|
||||
"com.delphix:pool_checkpoint_sm"
|
||||
|
||||
#define VDEV_TOP_ZAP_ALLOCATION_BIAS \
|
||||
"org.zfsonlinux:allocation_bias"
|
||||
|
||||
/* vdev metaslab allocation bias */
|
||||
#define VDEV_ALLOC_BIAS_LOG "log"
|
||||
#define VDEV_ALLOC_BIAS_SPECIAL "special"
|
||||
#define VDEV_ALLOC_BIAS_DEDUP "dedup"
|
||||
|
||||
#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \
|
||||
"com.delphix:next_offset_to_initialize"
|
||||
#define VDEV_LEAF_ZAP_INITIALIZE_STATE \
|
||||
|
Loading…
x
Reference in New Issue
Block a user