From 32a9872bbae90a7cd9793c371f558701ec268976 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Thu, 3 May 2012 05:49:19 -0700 Subject: [PATCH 1/3] Illumos #2671: zpool import should not fail if vdev ashift has increased Reviewed by: Adam Leventhal Reviewed by: Eric Schrock Reviewed by: Richard Elling Reviewed by: Gordon Ross Reviewed by: Garrett D'Amore Approved by: Richard Lowe Refererces to Illumos issue: https://www.illumos.org/issues/2671 This patch has been slightly modified from the upstream Illumos version. In the upstream implementation a warning message is logged to the console. To prevent pointless console noise this notification is now posted as a "ereport.fs.zfs.vdev.bad_ashift" event. The event indicates a non-optimial (but entirely safe) ashift value was used to create the pool. Depending on your workload this may impact pool performance. Unfortunately, the only way to correct the issue is to recreate the pool with a new ashift. NOTE: The unrelated fix to the comment in zpool_main.c appears in the upstream commit and was preserved for consistnecy. Ported-by: Cyril Plisko Reworked-by: Brian Behlendorf Closes #955 --- cmd/zpool/zpool_main.c | 2 +- include/sys/fm/fs/zfs.h | 2 ++ module/zfs/vdev.c | 13 ++++++++----- module/zfs/zfs_fm.c | 4 ++++ 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 56617323fa83..96798c4ad8b3 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -3736,7 +3736,7 @@ print_dedup_stats(nvlist_t *config) /* * If the pool was faulted then we may not have been able to - * obtain the config. Otherwise, if have anything in the dedup + * obtain the config. Otherwise, if we have anything in the dedup * table continue processing the stats. */ if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index 15803c034e08..d5c6004c261a 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -47,6 +47,7 @@ extern "C" { #define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum" #define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small" #define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label" +#define FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT "vdev.bad_ashift" #define FM_EREPORT_ZFS_DEVICE_REMOVE "vdev.remove" #define FM_EREPORT_ZFS_DEVICE_CLEAR "vdev.clear" #define FM_EREPORT_ZFS_DEVICE_CHECK "vdev.check" @@ -71,6 +72,7 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE "vdev_state" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT "vdev_ashift" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 7d6d5278a0db..e0d82e6737c4 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1271,13 +1271,16 @@ vdev_open(vdev_t *vd) vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); } else { /* - * Make sure the alignment requirement hasn't increased. + * Detect if the alignment requirement has increased. + * We don't want to make the pool unavailable, just + * post an event instead. */ - if (ashift > vd->vdev_top->vdev_ashift) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - return (EINVAL); + if (ashift > vd->vdev_top->vdev_ashift && + vd->vdev_ops->vdev_op_leaf) { + zfs_ereport_post(FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, + spa, vd, NULL, 0, 0); } + vd->vdev_max_asize = max_asize; } diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index db6a831d2253..820291bf4b04 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -267,6 +267,10 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, DATA_TYPE_STRING, vd->vdev_fru, NULL); + if (vd->vdev_ashift) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT, + DATA_TYPE_UINT64, vd->vdev_ashift, NULL); if (pvd != NULL) { fm_payload_set(ereport, From 2404b01499019f6a8589cec79201b8871ec48081 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Sun, 2 Sep 2012 16:34:12 -0700 Subject: [PATCH 2/3] Improve AF hard disk detection Use the bdev_physical_block_size() interface to determine the minimize write size which can be issued without incurring a read-modify-write operation. This is used to set the ashift correctly to prevent a performance penalty when using AF hard disks. Unfortunately, this interface isn't entirely reliable because it's not uncommon for disks to misreport this value. For this reason you may still need to manually set your ashift with: zpool create -o ashift=12 ... The solution to this in the upstream Illumos source was to add a white list of known offending drives. Maintaining such a list will be a burden, but it still may be worth doing if we can detect a large number of these drives. This should be considered as future work. Reported-by: Richard Yao Signed-off-by: Brian Behlendorf Closes #916 --- config/kernel-bdev-physical-size.m4 | 39 +++++++++++++++++++++++++++++ config/kernel.m4 | 1 + include/linux/blkdev_compat.h | 24 ++++++++++++++---- 3 files changed, 59 insertions(+), 5 deletions(-) create mode 100644 config/kernel-bdev-physical-size.m4 diff --git a/config/kernel-bdev-physical-size.m4 b/config/kernel-bdev-physical-size.m4 new file mode 100644 index 000000000000..0a1fe8e26385 --- /dev/null +++ b/config/kernel-bdev-physical-size.m4 @@ -0,0 +1,39 @@ +dnl # +dnl # 2.6.30 API change +dnl # +dnl # The bdev_physical_block_size() interface was added to provide a way +dnl # to determine the smallest write which can be performed without a +dnl # read-modify-write operation. From the kernel documentation: +dnl # +dnl # What: /sys/block//queue/physical_block_size +dnl # Date: May 2009 +dnl # Contact: Martin K. Petersen +dnl # Description: +dnl # This is the smallest unit the storage device can write +dnl # without resorting to read-modify-write operation. It is +dnl # usually the same as the logical block size but may be +dnl # bigger. One example is SATA drives with 4KB sectors +dnl # that expose a 512-byte logical block size to the +dnl # operating system. +dnl # +dnl # Unfortunately, this interface isn't entirely reliable because +dnl # drives are sometimes known to misreport this value. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_BDEV_PHYSICAL_BLOCK_SIZE], [ + AC_MSG_CHECKING([whether bdev_physical_block_size() is available]) + tmp_flags="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="-Wno-unused-but-set-variable" + ZFS_LINUX_TRY_COMPILE([ + #include + ],[ + struct block_device *bdev = NULL; + bdev_physical_block_size(bdev); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BDEV_PHYSICAL_BLOCK_SIZE, 1, + [bdev_physical_block_size() is available]) + ],[ + AC_MSG_RESULT(no) + ]) + EXTRA_KCFLAGS="$tmp_flags" +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 13238d8ac21b..34969c3163a9 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -14,6 +14,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ ZFS_AC_KERNEL_OPEN_BDEV_EXCLUSIVE ZFS_AC_KERNEL_INVALIDATE_BDEV_ARGS ZFS_AC_KERNEL_BDEV_LOGICAL_BLOCK_SIZE + ZFS_AC_KERNEL_BDEV_PHYSICAL_BLOCK_SIZE ZFS_AC_KERNEL_BIO_EMPTY_BARRIER ZFS_AC_KERNEL_BIO_FAILFAST ZFS_AC_KERNEL_BIO_FAILFAST_DTD diff --git a/include/linux/blkdev_compat.h b/include/linux/blkdev_compat.h index a5294ceba3f0..1ff8eeaf329e 100644 --- a/include/linux/blkdev_compat.h +++ b/include/linux/blkdev_compat.h @@ -394,13 +394,27 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) /* * 2.6.30 API change - * Change to make it explicit there this is the logical block size. + * To ensure good performance preferentially use the physical block size + * for proper alignment. The physical size is supposed to be the internal + * sector size used by the device. This is often 4096 byte for AF devices, + * while a smaller 512 byte logical size is supported for compatibility. + * + * Unfortunately, many drives still misreport their physical sector size. + * For devices which are known to lie you may need to manually set this + * at pool creation time with 'zpool create -o ashift=12 ...'. + * + * When the physical block size interface isn't available, we fall back to + * the logical block size interface and then the older hard sector size. */ -#ifdef HAVE_BDEV_LOGICAL_BLOCK_SIZE -# define vdev_bdev_block_size(bdev) bdev_logical_block_size(bdev) +#ifdef HAVE_BDEV_PHYSICAL_BLOCK_SIZE +# define vdev_bdev_block_size(bdev) bdev_physical_block_size(bdev) #else -# define vdev_bdev_block_size(bdev) bdev_hardsect_size(bdev) -#endif +# ifdef HAVE_BDEV_LOGICAL_BLOCK_SIZE +# define vdev_bdev_block_size(bdev) bdev_logical_block_size(bdev) +# else +# define vdev_bdev_block_size(bdev) bdev_hardsect_size(bdev) +# endif /* HAVE_BDEV_LOGICAL_BLOCK_SIZE */ +#endif /* HAVE_BDEV_PHYSICAL_BLOCK_SIZE */ /* * 2.6.37 API change From df83110856950c8e7b16a7e94cdf42b8531b9cc8 Mon Sep 17 00:00:00 2001 From: Cyril Plisko Date: Tue, 6 Nov 2012 14:39:00 +0200 Subject: [PATCH 3/3] Add "-o ashift" to zpool add and zpool attach When adding devices to an existing pool "ashift" property is auto-detected. However, if this property was overridden at the pool creation time (i.e. zpool create -o ashift=12 tank ...) this may not be what the user wants. This commit lets the user specify the value of "ashift" property to be used with newly added drives. For example, zpool add -o ashift=12 tank disk1 zpool attach -o ashift=12 tank disk1 disk2 Signed-off-by: Cyril Plisko Signed-off-by: Brian Behlendorf Closes #566 --- cmd/zpool/zpool_main.c | 51 +++++++++++++++++++++++++++++++++++------- man/man8/zpool.8 | 31 +++++++++++++++++++++---- 2 files changed, 70 insertions(+), 12 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 96798c4ad8b3..a684f3bbb0fb 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -24,6 +24,7 @@ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. + * Copyright (c) 2012 by Cyril Plisko. All rights reserved. */ #include @@ -199,10 +200,11 @@ static const char * get_usage(zpool_help_t idx) { switch (idx) { case HELP_ADD: - return (gettext("\tadd [-fn] ...\n")); + return (gettext("\tadd [-fn] [-o property=value] " + " ...\n")); case HELP_ATTACH: - return (gettext("\tattach [-f] " - "\n")); + return (gettext("\tattach [-f] [-o property=value] " + " \n")); case HELP_CLEAR: return (gettext("\tclear [-nF] [device]\n")); case HELP_CREATE: @@ -436,11 +438,12 @@ add_prop_list(const char *propname, char *propval, nvlist_t **props, } /* - * zpool add [-fn] ... + * zpool add [-fn] [-o property=value] ... * * -f Force addition of devices, even if they appear in use * -n Do not add the devices, but display the resulting layout if * they were to be added. + * -o Set property=value. * * Adds the given vdevs to 'pool'. As with create, the bulk of this work is * handled by get_vdev_spec(), which constructs the nvlist needed to pass to @@ -457,9 +460,11 @@ zpool_do_add(int argc, char **argv) int ret; zpool_handle_t *zhp; nvlist_t *config; + nvlist_t *props = NULL; + char *propval; /* check options */ - while ((c = getopt(argc, argv, "fn")) != -1) { + while ((c = getopt(argc, argv, "fno:")) != -1) { switch (c) { case 'f': force = B_TRUE; @@ -467,6 +472,19 @@ zpool_do_add(int argc, char **argv) case 'n': dryrun = B_TRUE; break; + case 'o': + if ((propval = strchr(optarg, '=')) == NULL) { + (void) fprintf(stderr, gettext("missing " + "'=' for -o option\n")); + usage(B_FALSE); + } + *propval = '\0'; + propval++; + + if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || + (add_prop_list(optarg, propval, &props, B_TRUE))) + usage(B_FALSE); + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -503,7 +521,7 @@ zpool_do_add(int argc, char **argv) } /* pass off to get_vdev_spec for processing */ - nvroot = make_root_vdev(zhp, NULL, force, !force, B_FALSE, dryrun, + nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun, argc, argv); if (nvroot == NULL) { zpool_close(zhp); @@ -536,6 +554,7 @@ zpool_do_add(int argc, char **argv) ret = (zpool_add(zhp, nvroot) != 0); } + nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); @@ -2865,6 +2884,8 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) nvlist_t *nvroot; char *poolname, *old_disk, *new_disk; zpool_handle_t *zhp; + nvlist_t *props = NULL; + char *propval; int ret; /* check options */ @@ -2873,6 +2894,19 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) case 'f': force = B_TRUE; break; + case 'o': + if ((propval = strchr(optarg, '=')) == NULL) { + (void) fprintf(stderr, gettext("missing " + "'=' for -o option\n")); + usage(B_FALSE); + } + *propval = '\0'; + propval++; + + if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || + (add_prop_list(optarg, propval, &props, B_TRUE))) + usage(B_FALSE); + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -2929,7 +2963,7 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) return (1); } - nvroot = make_root_vdev(zhp, NULL, force, B_FALSE, replacing, B_FALSE, + nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE, argc, argv); if (nvroot == NULL) { zpool_close(zhp); @@ -2959,9 +2993,10 @@ zpool_do_replace(int argc, char **argv) } /* - * zpool attach [-f] + * zpool attach [-f] [-o property=value] * * -f Force attach, even if appears to be in use. + * -o Set property=value. * * Attach to the mirror containing . If is not * part of a mirror, then will be transformed into a mirror of diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index 1ac30507b39e..826a6e78800c 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -2,6 +2,7 @@ .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Nexenta Systems, Inc. All rights reserved. .\" Copyright (c) 2012 by Delphix. All Rights Reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] @@ -16,12 +17,12 @@ zpool \- configures ZFS storage pools .LP .nf -\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev\fR ... +\fBzpool add\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] \fIpool\fR \fIvdev\fR ... .fi .LP .nf -\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR +\fBzpool attach\fR [\fB-f\fR] [\fB-o\fR \fIproperty=value\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR .fi .LP @@ -711,7 +712,7 @@ Displays a help message. .ne 2 .mk .na -\fB\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev\fR ...\fR +\fB\fBzpool add\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] \fIpool\fR \fIvdev\fR ...\fR .ad .sp .6 .RS 4n @@ -738,6 +739,17 @@ Forces use of \fBvdev\fRs, even if they appear in use or specify a conflicting r Displays the configuration that would be used without actually adding the \fBvdev\fRs. The actual pool creation can still fail due to insufficient privileges or device sharing. .RE +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIproperty=value\fR +.ad +.sp .6 +.RS 4n +Sets the given pool properties. See the "Properties" section for a list of valid properties that can be set. The only property supported at the moment is "ashift". +.RE + Do not add a disk that is currently configured as a quorum device to a zpool. After a disk is in the pool, that disk can then be configured as a quorum device. .RE @@ -745,7 +757,7 @@ Do not add a disk that is currently configured as a quorum device to a zpool. Af .ne 2 .mk .na -\fB\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR\fR +\fB\fBzpool attach\fR [\fB-f\fR] [\fB-o\fR \fIproperty=value\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR\fR .ad .sp .6 .RS 4n @@ -761,6 +773,17 @@ Attaches \fInew_device\fR to an existing \fBzpool\fR device. The existing device Forces use of \fInew_device\fR, even if its appears to be in use. Not all devices can be overridden in this manner. .RE +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIproperty=value\fR +.ad +.sp .6 +.RS 4n +Sets the given pool properties. See the "Properties" section for a list of valid properties that can be set. The only property supported at the moment is "ashift". +.RE + .RE .sp