diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 56617323fa83..a684f3bbb0fb 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -24,6 +24,7 @@ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. + * Copyright (c) 2012 by Cyril Plisko. All rights reserved. */ #include @@ -199,10 +200,11 @@ static const char * get_usage(zpool_help_t idx) { switch (idx) { case HELP_ADD: - return (gettext("\tadd [-fn] ...\n")); + return (gettext("\tadd [-fn] [-o property=value] " + " ...\n")); case HELP_ATTACH: - return (gettext("\tattach [-f] " - "\n")); + return (gettext("\tattach [-f] [-o property=value] " + " \n")); case HELP_CLEAR: return (gettext("\tclear [-nF] [device]\n")); case HELP_CREATE: @@ -436,11 +438,12 @@ add_prop_list(const char *propname, char *propval, nvlist_t **props, } /* - * zpool add [-fn] ... + * zpool add [-fn] [-o property=value] ... * * -f Force addition of devices, even if they appear in use * -n Do not add the devices, but display the resulting layout if * they were to be added. + * -o Set property=value. * * Adds the given vdevs to 'pool'. As with create, the bulk of this work is * handled by get_vdev_spec(), which constructs the nvlist needed to pass to @@ -457,9 +460,11 @@ zpool_do_add(int argc, char **argv) int ret; zpool_handle_t *zhp; nvlist_t *config; + nvlist_t *props = NULL; + char *propval; /* check options */ - while ((c = getopt(argc, argv, "fn")) != -1) { + while ((c = getopt(argc, argv, "fno:")) != -1) { switch (c) { case 'f': force = B_TRUE; @@ -467,6 +472,19 @@ zpool_do_add(int argc, char **argv) case 'n': dryrun = B_TRUE; break; + case 'o': + if ((propval = strchr(optarg, '=')) == NULL) { + (void) fprintf(stderr, gettext("missing " + "'=' for -o option\n")); + usage(B_FALSE); + } + *propval = '\0'; + propval++; + + if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || + (add_prop_list(optarg, propval, &props, B_TRUE))) + usage(B_FALSE); + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -503,7 +521,7 @@ zpool_do_add(int argc, char **argv) } /* pass off to get_vdev_spec for processing */ - nvroot = make_root_vdev(zhp, NULL, force, !force, B_FALSE, dryrun, + nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun, argc, argv); if (nvroot == NULL) { zpool_close(zhp); @@ -536,6 +554,7 @@ zpool_do_add(int argc, char **argv) ret = (zpool_add(zhp, nvroot) != 0); } + nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); @@ -2865,6 +2884,8 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) nvlist_t *nvroot; char *poolname, *old_disk, *new_disk; zpool_handle_t *zhp; + nvlist_t *props = NULL; + char *propval; int ret; /* check options */ @@ -2873,6 +2894,19 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) case 'f': force = B_TRUE; break; + case 'o': + if ((propval = strchr(optarg, '=')) == NULL) { + (void) fprintf(stderr, gettext("missing " + "'=' for -o option\n")); + usage(B_FALSE); + } + *propval = '\0'; + propval++; + + if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || + (add_prop_list(optarg, propval, &props, B_TRUE))) + usage(B_FALSE); + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -2929,7 +2963,7 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) return (1); } - nvroot = make_root_vdev(zhp, NULL, force, B_FALSE, replacing, B_FALSE, + nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE, argc, argv); if (nvroot == NULL) { zpool_close(zhp); @@ -2959,9 +2993,10 @@ zpool_do_replace(int argc, char **argv) } /* - * zpool attach [-f] + * zpool attach [-f] [-o property=value] * * -f Force attach, even if appears to be in use. + * -o Set property=value. * * Attach to the mirror containing . If is not * part of a mirror, then will be transformed into a mirror of @@ -3736,7 +3771,7 @@ print_dedup_stats(nvlist_t *config) /* * If the pool was faulted then we may not have been able to - * obtain the config. Otherwise, if have anything in the dedup + * obtain the config. Otherwise, if we have anything in the dedup * table continue processing the stats. */ if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, diff --git a/config/kernel-bdev-physical-size.m4 b/config/kernel-bdev-physical-size.m4 new file mode 100644 index 000000000000..0a1fe8e26385 --- /dev/null +++ b/config/kernel-bdev-physical-size.m4 @@ -0,0 +1,39 @@ +dnl # +dnl # 2.6.30 API change +dnl # +dnl # The bdev_physical_block_size() interface was added to provide a way +dnl # to determine the smallest write which can be performed without a +dnl # read-modify-write operation. From the kernel documentation: +dnl # +dnl # What: /sys/block//queue/physical_block_size +dnl # Date: May 2009 +dnl # Contact: Martin K. Petersen +dnl # Description: +dnl # This is the smallest unit the storage device can write +dnl # without resorting to read-modify-write operation. It is +dnl # usually the same as the logical block size but may be +dnl # bigger. One example is SATA drives with 4KB sectors +dnl # that expose a 512-byte logical block size to the +dnl # operating system. +dnl # +dnl # Unfortunately, this interface isn't entirely reliable because +dnl # drives are sometimes known to misreport this value. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_BDEV_PHYSICAL_BLOCK_SIZE], [ + AC_MSG_CHECKING([whether bdev_physical_block_size() is available]) + tmp_flags="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="-Wno-unused-but-set-variable" + ZFS_LINUX_TRY_COMPILE([ + #include + ],[ + struct block_device *bdev = NULL; + bdev_physical_block_size(bdev); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BDEV_PHYSICAL_BLOCK_SIZE, 1, + [bdev_physical_block_size() is available]) + ],[ + AC_MSG_RESULT(no) + ]) + EXTRA_KCFLAGS="$tmp_flags" +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 13238d8ac21b..34969c3163a9 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -14,6 +14,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ ZFS_AC_KERNEL_OPEN_BDEV_EXCLUSIVE ZFS_AC_KERNEL_INVALIDATE_BDEV_ARGS ZFS_AC_KERNEL_BDEV_LOGICAL_BLOCK_SIZE + ZFS_AC_KERNEL_BDEV_PHYSICAL_BLOCK_SIZE ZFS_AC_KERNEL_BIO_EMPTY_BARRIER ZFS_AC_KERNEL_BIO_FAILFAST ZFS_AC_KERNEL_BIO_FAILFAST_DTD diff --git a/include/linux/blkdev_compat.h b/include/linux/blkdev_compat.h index a5294ceba3f0..1ff8eeaf329e 100644 --- a/include/linux/blkdev_compat.h +++ b/include/linux/blkdev_compat.h @@ -394,13 +394,27 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) /* * 2.6.30 API change - * Change to make it explicit there this is the logical block size. + * To ensure good performance preferentially use the physical block size + * for proper alignment. The physical size is supposed to be the internal + * sector size used by the device. This is often 4096 byte for AF devices, + * while a smaller 512 byte logical size is supported for compatibility. + * + * Unfortunately, many drives still misreport their physical sector size. + * For devices which are known to lie you may need to manually set this + * at pool creation time with 'zpool create -o ashift=12 ...'. + * + * When the physical block size interface isn't available, we fall back to + * the logical block size interface and then the older hard sector size. */ -#ifdef HAVE_BDEV_LOGICAL_BLOCK_SIZE -# define vdev_bdev_block_size(bdev) bdev_logical_block_size(bdev) +#ifdef HAVE_BDEV_PHYSICAL_BLOCK_SIZE +# define vdev_bdev_block_size(bdev) bdev_physical_block_size(bdev) #else -# define vdev_bdev_block_size(bdev) bdev_hardsect_size(bdev) -#endif +# ifdef HAVE_BDEV_LOGICAL_BLOCK_SIZE +# define vdev_bdev_block_size(bdev) bdev_logical_block_size(bdev) +# else +# define vdev_bdev_block_size(bdev) bdev_hardsect_size(bdev) +# endif /* HAVE_BDEV_LOGICAL_BLOCK_SIZE */ +#endif /* HAVE_BDEV_PHYSICAL_BLOCK_SIZE */ /* * 2.6.37 API change diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index 15803c034e08..d5c6004c261a 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -47,6 +47,7 @@ extern "C" { #define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum" #define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small" #define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label" +#define FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT "vdev.bad_ashift" #define FM_EREPORT_ZFS_DEVICE_REMOVE "vdev.remove" #define FM_EREPORT_ZFS_DEVICE_CLEAR "vdev.clear" #define FM_EREPORT_ZFS_DEVICE_CHECK "vdev.check" @@ -71,6 +72,7 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE "vdev_state" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT "vdev_ashift" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index 1ac30507b39e..826a6e78800c 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -2,6 +2,7 @@ .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Nexenta Systems, Inc. All rights reserved. .\" Copyright (c) 2012 by Delphix. All Rights Reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] @@ -16,12 +17,12 @@ zpool \- configures ZFS storage pools .LP .nf -\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev\fR ... +\fBzpool add\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] \fIpool\fR \fIvdev\fR ... .fi .LP .nf -\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR +\fBzpool attach\fR [\fB-f\fR] [\fB-o\fR \fIproperty=value\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR .fi .LP @@ -711,7 +712,7 @@ Displays a help message. .ne 2 .mk .na -\fB\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev\fR ...\fR +\fB\fBzpool add\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] \fIpool\fR \fIvdev\fR ...\fR .ad .sp .6 .RS 4n @@ -738,6 +739,17 @@ Forces use of \fBvdev\fRs, even if they appear in use or specify a conflicting r Displays the configuration that would be used without actually adding the \fBvdev\fRs. The actual pool creation can still fail due to insufficient privileges or device sharing. .RE +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIproperty=value\fR +.ad +.sp .6 +.RS 4n +Sets the given pool properties. See the "Properties" section for a list of valid properties that can be set. The only property supported at the moment is "ashift". +.RE + Do not add a disk that is currently configured as a quorum device to a zpool. After a disk is in the pool, that disk can then be configured as a quorum device. .RE @@ -745,7 +757,7 @@ Do not add a disk that is currently configured as a quorum device to a zpool. Af .ne 2 .mk .na -\fB\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR\fR +\fB\fBzpool attach\fR [\fB-f\fR] [\fB-o\fR \fIproperty=value\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR\fR .ad .sp .6 .RS 4n @@ -761,6 +773,17 @@ Attaches \fInew_device\fR to an existing \fBzpool\fR device. The existing device Forces use of \fInew_device\fR, even if its appears to be in use. Not all devices can be overridden in this manner. .RE +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIproperty=value\fR +.ad +.sp .6 +.RS 4n +Sets the given pool properties. See the "Properties" section for a list of valid properties that can be set. The only property supported at the moment is "ashift". +.RE + .RE .sp diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 7d6d5278a0db..e0d82e6737c4 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1271,13 +1271,16 @@ vdev_open(vdev_t *vd) vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); } else { /* - * Make sure the alignment requirement hasn't increased. + * Detect if the alignment requirement has increased. + * We don't want to make the pool unavailable, just + * post an event instead. */ - if (ashift > vd->vdev_top->vdev_ashift) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - return (EINVAL); + if (ashift > vd->vdev_top->vdev_ashift && + vd->vdev_ops->vdev_op_leaf) { + zfs_ereport_post(FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, + spa, vd, NULL, 0, 0); } + vd->vdev_max_asize = max_asize; } diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index db6a831d2253..820291bf4b04 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -267,6 +267,10 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, DATA_TYPE_STRING, vd->vdev_fru, NULL); + if (vd->vdev_ashift) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT, + DATA_TYPE_UINT64, vd->vdev_ashift, NULL); if (pvd != NULL) { fm_payload_set(ereport,