vdev_ashift should only be set once

== Motivation and Context

The new vdev ashift optimization prevents the removal of devices when
a zfs configuration is comprised of disks which have different logical
and physical block sizes. This is caused because we set 'spa_min_ashift'
in vdev_open and then later call 'vdev_ashift_optimize'. This would
result in an inconsistency between spa's ashift calculations and that
of the top-level vdev.

In addition, the optimization logical ignores the overridden ashift
value that would be provided by '-o ashift=<val>'.

== Description

This change reworks the vdev ashift optimization so that it's only
set the first time the device is configured. It still allows the
physical and logical ahsift values to be set every time the device
is opened but those values are only consulted on first open.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Cedric Berger <cedric@precidata.com>
Signed-off-by: George Wilson <gwilson@delphix.com>
External-Issue: DLPX-71831
Closes #10932
This commit is contained in:
George Wilson 2020-09-18 14:13:47 -05:00 committed by GitHub
parent 908d43d0a9
commit c494aa7f57
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 198 additions and 52 deletions

View File

@ -2654,6 +2654,13 @@ show_import(nvlist_t *config)
errata);
break;
case ZPOOL_STATUS_NON_NATIVE_ASHIFT:
printf_color(ANSI_BOLD, gettext("status: "));
printf_color(ANSI_YELLOW, gettext("One or more devices are "
"configured to use a non-native block size.\n"
"\tExpect reduced performance.\n"));
break;
default:
/*
* No other status can be seen when importing pools.

View File

@ -21,6 +21,7 @@
/*
* Copyright (C) 2016 Gvozden Neskovic <neskovic@gmail.com>.
* Copyright (c) 2020 by Delphix. All rights reserved.
*/
#ifndef _MOD_COMPAT_H
@ -71,6 +72,7 @@ enum scope_prefix_types {
zfs_txg,
zfs_vdev,
zfs_vdev_cache,
zfs_vdev_file,
zfs_vdev_mirror,
zfs_zevent,
zfs_zio,

View File

@ -94,7 +94,6 @@ extern void vdev_rele(vdev_t *);
extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
extern void vdev_metaslab_fini(vdev_t *vd);
extern void vdev_metaslab_set_size(vdev_t *);
extern void vdev_ashift_optimize(vdev_t *);
extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_split(vdev_t *vd);
extern void vdev_deadman(vdev_t *vd, char *tag);

View File

@ -772,6 +772,28 @@ regular reads (but there's no reason it has to be the same).
Default value: \fB32,768\fR.
.RE
.sp
.ne 2
.na
\fBvdev_file_logical_ashift\fR (ulong)
.ad
.RS 12n
Logical ashift for file-based devices.
.sp
Default value: \fB9\fR.
.RE
.sp
.ne 2
.na
\fBvdev_file_physical_ashift\fR (ulong)
.ad
.RS 12n
Physical ashift for file-based devices.
.sp
Default value: \fB9\fR.
.RE
.sp
.ne 2
.na

View File

@ -121,6 +121,7 @@ SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0,
"ZFS livelist condense");
SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache");
SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, file, CTLFLAG_RW, 0, "ZFS VDEV file");
SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0,
"ZFS VDEV mirror");

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright (c) 2011, 2020 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -40,6 +40,9 @@
static taskq_t *vdev_file_taskq;
unsigned long vdev_file_logical_ashift = SPA_MINBLOCKSHIFT;
unsigned long vdev_file_physical_ashift = SPA_MINBLOCKSHIFT;
void
vdev_file_init(void)
{
@ -167,8 +170,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
}
*max_psize = *psize = zfa.zfa_size;
*logical_ashift = SPA_MINBLOCKSHIFT;
*physical_ashift = SPA_MINBLOCKSHIFT;
*logical_ashift = vdev_file_logical_ashift;
*physical_ashift = vdev_file_physical_ashift;
return (0);
}
@ -326,3 +329,8 @@ vdev_ops_t vdev_disk_ops = {
};
#endif
ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, ULONG, ZMOD_RW,
"Logical ashift for file-based devices");
ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, ULONG, ZMOD_RW,
"Physical ashift for file-based devices");

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright (c) 2011, 2020 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -45,6 +45,17 @@
static taskq_t *vdev_file_taskq;
/*
* By default, the logical/physical ashift for file vdevs is set to
* SPA_MINBLOCKSHIFT (9). This allows all file vdevs to use 512B (1 << 9)
* blocksizes. Users may opt to change one or both of these for testing
* or performance reasons. Care should be taken as these values will
* impact the vdev_ashift setting which can only be set at vdev creation
* time.
*/
unsigned long vdev_file_logical_ashift = SPA_MINBLOCKSHIFT;
unsigned long vdev_file_physical_ashift = SPA_MINBLOCKSHIFT;
static void
vdev_file_hold(vdev_t *vd)
{
@ -159,8 +170,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
}
*max_psize = *psize = zfa.zfa_size;
*logical_ashift = SPA_MINBLOCKSHIFT;
*physical_ashift = SPA_MINBLOCKSHIFT;
*logical_ashift = vdev_file_logical_ashift;
*physical_ashift = vdev_file_physical_ashift;
return (0);
}
@ -346,3 +357,8 @@ vdev_ops_t vdev_disk_ops = {
};
#endif
ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, ULONG, ZMOD_RW,
"Logical ashift for file-based devices");
ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, ULONG, ZMOD_RW,
"Physical ashift for file-based devices");

View File

@ -9434,8 +9434,6 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
ASSERT(!l2arc_vdev_present(vd));
vdev_ashift_optimize(vd);
/*
* Create a new l2arc device entry.
*/

View File

@ -5762,7 +5762,6 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
vdev_ashift_optimize(vd);
vdev_metaslab_set_size(vd);
vdev_expand(vd, txg);
}

View File

@ -577,10 +577,8 @@ spa_config_update(spa_t *spa, int what)
(tvd->vdev_islog && tvd->vdev_removing))
continue;
if (tvd->vdev_ms_array == 0) {
vdev_ashift_optimize(tvd);
if (tvd->vdev_ms_array == 0)
vdev_metaslab_set_size(tvd);
}
vdev_expand(tvd, txg);
}
}

View File

@ -1672,6 +1672,38 @@ vdev_set_deflate_ratio(vdev_t *vd)
}
}
/*
* Maximize performance by inflating the configured ashift for top level
* vdevs to be as close to the physical ashift as possible while maintaining
* administrator defined limits and ensuring it doesn't go below the
* logical ashift.
*/
static void
vdev_ashift_optimize(vdev_t *vd)
{
ASSERT(vd == vd->vdev_top);
if (vd->vdev_ashift < vd->vdev_physical_ashift) {
vd->vdev_ashift = MIN(
MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
MAX(zfs_vdev_min_auto_ashift,
vd->vdev_physical_ashift));
} else {
/*
* If the logical and physical ashifts are the same, then
* we ensure that the top-level vdev's ashift is not smaller
* than our minimum ashift value. For the unusual case
* where logical ashift > physical ashift, we can't cap
* the calculated ashift based on max ashift as that
* would cause failures.
* We still check if we need to increase it to match
* the min ashift.
*/
vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
vd->vdev_ashift);
}
}
/*
* Prepare a virtual device for access.
*/
@ -1830,16 +1862,17 @@ vdev_open(vdev_t *vd)
return (SET_ERROR(EINVAL));
}
/*
* We can always set the logical/physical ashift members since
* their values are only used to calculate the vdev_ashift when
* the device is first added to the config. These values should
* not be used for anything else since they may change whenever
* the device is reopened and we don't store them in the label.
*/
vd->vdev_physical_ashift =
MAX(physical_ashift, vd->vdev_physical_ashift);
vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift);
vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift);
if (vd->vdev_logical_ashift > ASHIFT_MAX) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_ASHIFT_TOO_BIG);
return (SET_ERROR(EDOM));
}
vd->vdev_logical_ashift = MAX(logical_ashift,
vd->vdev_logical_ashift);
if (vd->vdev_asize == 0) {
/*
@ -1848,6 +1881,24 @@ vdev_open(vdev_t *vd)
*/
vd->vdev_asize = asize;
vd->vdev_max_asize = max_asize;
/*
* If the vdev_ashift was not overriden at creation time,
* then set it the logical ashift and optimize the ashift.
*/
if (vd->vdev_ashift == 0) {
vd->vdev_ashift = vd->vdev_logical_ashift;
if (vd->vdev_logical_ashift > ASHIFT_MAX) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_ASHIFT_TOO_BIG);
return (SET_ERROR(EDOM));
}
if (vd->vdev_top == vd) {
vdev_ashift_optimize(vd);
}
}
if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
vd->vdev_ashift > ASHIFT_MAX)) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
@ -2444,35 +2495,6 @@ vdev_metaslab_set_size(vdev_t *vd)
ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
}
/*
* Maximize performance by inflating the configured ashift for top level
* vdevs to be as close to the physical ashift as possible while maintaining
* administrator defined limits and ensuring it doesn't go below the
* logical ashift.
*/
void
vdev_ashift_optimize(vdev_t *vd)
{
if (vd == vd->vdev_top) {
if (vd->vdev_ashift < vd->vdev_physical_ashift) {
vd->vdev_ashift = MIN(
MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
MAX(zfs_vdev_min_auto_ashift,
vd->vdev_physical_ashift));
} else {
/*
* Unusual case where logical ashift > physical ashift
* so we can't cap the calculated ashift based on max
* ashift as that would cause failures.
* We still check if we need to increase it to match
* the min ashift.
*/
vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
vd->vdev_ashift);
}
}
}
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{

View File

@ -391,7 +391,7 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
*physical_ashift = MAX(*physical_ashift,
vd->vdev_physical_ashift);
cvd->vdev_physical_ashift);
}
if (numerrors == vd->vdev_children) {

View File

@ -77,6 +77,7 @@ TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch
TXG_HISTORY txg.history zfs_txg_history
TXG_TIMEOUT txg.timeout zfs_txg_timeout
UNLINK_SUSPEND_PROGRESS UNSUPPORTED zfs_unlink_suspend_progress
VDEV_FILE_PHYSICAL_ASHIFT vdev.file.physical_ashift vdev_file_physical_ashift
VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count
VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip
VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev

View File

@ -22,6 +22,7 @@
#
# Copyright 2017, loli10K. All rights reserved.
# Copyright (c) 2020 by Delphix. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
@ -35,13 +36,15 @@
# STRATEGY:
# 1. Create a pool with default values.
# 2. Verify 'zpool add -o ashift=<n>' works with allowed values (9-16).
# 3. Verify 'zpool add -o ashift=<n>' doesn't accept other invalid values.
# 3. Verify setting kernel tunable for file vdevs works correctly.
# 4. Verify 'zpool add -o ashift=<n>' doesn't accept other invalid values.
#
verify_runnable "global"
function cleanup
{
log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $orig_ashift
poolexists $TESTPOOL && destroy_pool $TESTPOOL
rm -f $disk1 $disk2
}
@ -54,6 +57,8 @@ disk2=$TEST_BASE_DIR/disk2
log_must mkfile $SIZE $disk1
log_must mkfile $SIZE $disk2
orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT)
typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16")
for ashift in ${ashifts[@]}
do
@ -69,6 +74,24 @@ do
log_must zpool destroy $TESTPOOL
log_must zpool labelclear $disk1
log_must zpool labelclear $disk2
#
# Make sure we can also set the ashift using the tunable.
#
log_must zpool create $TESTPOOL $disk1
log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $ashift
log_must zpool add $TESTPOOL $disk2
verify_ashift $disk2 $ashift
if [[ $? -ne 0 ]]
then
log_fail "Device was added without setting ashift value to "\
"$ashift"
fi
# clean things for the next run
log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $orig_ashift
log_must zpool destroy $TESTPOOL
log_must zpool labelclear $disk1
log_must zpool labelclear $disk2
done
typeset badvals=("off" "on" "1" "8" "17" "1b" "ff" "-")

View File

@ -22,6 +22,7 @@
#
# Copyright 2017, loli10K. All rights reserved.
# Copyright (c) 2020 by Delphix. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
@ -43,6 +44,7 @@ verify_runnable "global"
function cleanup
{
log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $orig_ashift
poolexists $TESTPOOL && destroy_pool $TESTPOOL
log_must rm -f $disk1 $disk2
}
@ -55,6 +57,14 @@ disk2=$TEST_BASE_DIR/disk2
log_must mkfile $SIZE $disk1
log_must mkfile $SIZE $disk2
orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT)
#
# Set the file vdev's ashift to the max. Overriding
# the ashift using the -o ashift property should still
# be honored.
#
log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT 16
typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16")
for ashift in ${ashifts[@]}
do

View File

@ -22,6 +22,7 @@
#
# Copyright 2017, loli10K. All rights reserved.
# Copyright (c) 2020 by Delphix. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
@ -41,6 +42,7 @@ verify_runnable "global"
function cleanup
{
log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $orig_ashift
poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1
rm -f $disk1 $disk2
}
@ -53,6 +55,14 @@ disk2=$TEST_BASE_DIR/disk2
log_must truncate -s $SIZE $disk1
log_must truncate -s $SIZE $disk2
orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT)
#
# Set the file vdev's ashift to the max. Overriding
# the ashift using the -o ashift property should still
# be honored.
#
log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT 16
typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16")
for ashift in ${ashifts[@]}
do

View File

@ -22,6 +22,7 @@
#
# Copyright 2017, loli10K. All rights reserved.
# Copyright (c) 2020 by Delphix. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
@ -41,6 +42,7 @@ verify_runnable "global"
function cleanup
{
log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $orig_ashift
poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1
rm -f $disk1 $disk2
}
@ -53,6 +55,14 @@ disk2=$TEST_BASE_DIR/disk2
log_must truncate -s $SIZE $disk1
log_must truncate -s $SIZE $disk2
orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT)
#
# Set the file vdev's ashift to the max. Overriding
# the ashift using the -o ashift property should still
# be honored.
#
log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT 16
typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16")
for ashift in ${ashifts[@]}
do

View File

@ -22,6 +22,7 @@
#
# Copyright 2017, loli10K. All rights reserved.
# Copyright (c) 2020 by Delphix. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
@ -43,6 +44,7 @@ verify_runnable "global"
function cleanup
{
log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $orig_ashift
poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1
rm -f $disk1 $disk2
}
@ -55,6 +57,14 @@ disk2=$TEST_BASE_DIR/disk2
log_must truncate -s $SIZE $disk1
log_must truncate -s $SIZE $disk2
orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT)
#
# Set the file vdev's ashift to the max. Overriding
# the ashift using the -o ashift property should still
# be honored.
#
log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT 16
typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16")
for ashift in ${ashifts[@]}
do

View File

@ -22,6 +22,7 @@
#
# Copyright 2017, loli10K. All rights reserved.
# Copyright (c) 2020 by Delphix. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
@ -41,6 +42,7 @@ verify_runnable "global"
function cleanup
{
log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $orig_ashift
destroy_pool $TESTPOOL1
rm -f $disk
}
@ -52,6 +54,14 @@ log_onexit cleanup
log_assert "zpool set can modify 'ashift' property"
orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT)
#
# Set the file vdev's ashift to the max. Overriding
# the ashift using the -o ashift property should still
# be honored.
#
log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT 16
disk=$TEST_BASE_DIR/disk
log_must mkfile $SIZE $disk
log_must zpool create $TESTPOOL1 $disk