freebsd-nq/module/zfs/vdev_disk.c

857 lines
21 KiB
C
Raw Normal View History

/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
* LLNL-CODE-403049.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/mod_compat.h>
/*
 * Name of the I/O scheduler (elevator) that vdev_disk_open() passes to
 * vdev_elevator_switch(); initialized from VDEV_SCHEDULER.
 */
char *zfs_vdev_scheduler = VDEV_SCHEDULER;
/*
 * Holder token handed to vdev_bdev_open() when a block device is opened
 * by path; initialized from VDEV_HOLDER.
 */
static void *zfs_vdev_holder = VDEV_HOLDER;
/*
* Virtual device vector for disks.
*/
/*
 * Bookkeeping for one zio that has been split into one or more bios.
 * The structure is allocated with room for dr_bio_count trailing bio
 * pointers and is reference counted through dr_ref.
 */
typedef struct dio_request {
	zio_t			*dr_zio;	/* Parent ZIO */
	atomic_t		dr_ref;		/* References */
	int			dr_error;	/* Bio error */
	int			dr_bio_count;	/* Count of bio's */
	struct bio		*dr_bio[];	/* Attached bio's (flexible array) */
} dio_request_t;
#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
/*
 * Convert an open mode expressed with FREAD/FWRITE bits into the
 * corresponding FMODE_READ/FMODE_WRITE flags.  At least one of the two
 * bits must be set.
 */
static fmode_t
vdev_bdev_mode(int smode)
{
	fmode_t rd, wr;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	rd = (smode & FREAD) ? FMODE_READ : 0;
	wr = (smode & FWRITE) ? FMODE_WRITE : 0;

	return (rd | wr);
}
#else
/*
 * Convert an open mode expressed with FREAD/FWRITE bits into mount-style
 * flags: MS_RDONLY when only FREAD is requested, otherwise 0.  At least
 * one of the two bits must be set.
 */
static int
vdev_bdev_mode(int smode)
{
	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	/* Anything that is not "read without write" maps to no flags. */
	if (!(smode & FREAD) || (smode & FWRITE))
		return (0);

	return (MS_RDONLY);
}
#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
/*
 * Return the capacity of a block device in bytes.
 *
 * If the block device refers to a partition, the partition's sector
 * count is used; otherwise the capacity of the whole disk is used.
 * Sector counts are multiplied by 512 (<< 9).  The count is widened to
 * 64 bits before shifting so the result cannot be truncated when the
 * kernel's sector type is narrower than uint64_t.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	struct hd_struct *part = bdev->bd_part;

	/* The partition capacity referenced by the block device */
	if (part)
		return ((uint64_t)part->nr_sects << 9);

	/* Otherwise assume the full device capacity */
	return ((uint64_t)get_capacity(bdev->bd_disk) << 9);
}
/*
 * Log a kernel warning describing a failed zio: its error, type, offset,
 * size and flags.  The body is compiled only when ZFS_DEBUG is defined;
 * otherwise the function does nothing.
 */
static void
vdev_disk_error(zio_t *zio)
{
#ifdef ZFS_DEBUG
	printk(KERN_WARNING "ZFS: zio error=%d type=%d offset=%llu size=%llu "
	    "flags=%x\n", zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
#endif
}
/*
 * Use the Linux 'noop' elevator for zfs managed block devices.  This
 * strikes the ideal balance: the zfs elevator does all request ordering
 * and prioritization, while the Linux elevator is still allowed to do
 * the maximum front/back merging permitted by the physical device.  This
 * yields the largest possible requests for the device with the lowest
 * total overhead.
 *
 * The function first recurses into every child of 'v', then, if 'v' is
 * an opened leaf, attempts to switch its request queue to 'elevator'.
 * Failures are reported with printk() and are not returned to the caller.
 */
static void
vdev_elevator_switch(vdev_t *v, char *elevator)
{
	vdev_disk_t *vd = v->vdev_tsd;
	struct request_queue *q;
	char *device;
	int error;

	for (int c = 0; c < v->vdev_children; c++)
		vdev_elevator_switch(v->vdev_child[c], elevator);

	/*
	 * Only leaf vdevs with an open block device can be configured.
	 * The explicit vd check keeps a leaf without per-vdev state from
	 * being dereferenced.
	 */
	if (!v->vdev_ops->vdev_op_leaf || vd == NULL || vd->vd_bdev == NULL)
		return;

	q = bdev_get_queue(vd->vd_bdev);
	device = vd->vd_bdev->bd_disk->disk_name;

	/*
	 * Skip devices which are not whole disks (partitions).
	 * Device-mapper devices are excepted since they may be whole
	 * disks despite the vdev_wholedisk flag, in which case we can
	 * and should switch the elevator. If the device-mapper device
	 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
	 * "Skip devices without schedulers" check below will fail.
	 */
	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
		return;

	/* Skip devices without schedulers (loop, ram, dm, etc) */
	if (!q->elevator || !blk_queue_stackable(q))
		return;

	/* Leave existing scheduler when set to exactly "none" */
	if (strcmp(elevator, "none") == 0)
		return;

#ifdef HAVE_ELEVATOR_CHANGE
	error = elevator_change(q, elevator);
#else
	/*
	 * For pre-2.6.36 kernels elevator_change() is not available.
	 * Therefore we fall back to using a usermodehelper to echo the
	 * elevator into sysfs; This requires /bin/echo and sysfs to be
	 * mounted which may not be true early in the boot process.
	 */
#define	SET_SCHEDULER_CMD \
	"exec 0</dev/null " \
	"     1>/sys/block/%s/queue/scheduler " \
	"     2>/dev/null; " \
	"echo %s"

	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
	char *envp[] = { NULL };

	argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	strfree(argv[2]);
#endif /* HAVE_ELEVATOR_CHANGE */
	if (error)
		printk(KERN_NOTICE "ZFS: Unable to set \"%s\" scheduler"
		    " for %s (%s): %d\n", elevator, v->vdev_path, device,
		    error);
}
/*
 * Expanding a whole disk vdev requires issuing BLKRRPART against the
 * whole disk device.  BLKRRPART returns EBUSY while any partition of
 * the disk is open, so it has to be done here, right before the data
 * partition is opened.
 *
 * BLKRRPART drops and recreates every partition, so for a short window
 * all /dev/sdxN device files vanish until udev recreates them.  Two
 * consequences follow:
 *
 * - Immediately after a BLKRRPART the data partition cannot be opened
 *   through its normal device file path because of the obvious race
 *   with udev.  Instead, reliable kernel APIs are used to obtain a
 *   handle to the new partition device from the whole disk device.
 *
 * - Because vdev_disk_open() initially has to locate the device by
 *   path, several vdev_disk_open() calls in quick succession on the
 *   same disk with BLKRRPARTs in between are likely to fail (again the
 *   race with udev).  A typical case is the zpool userspace tool doing
 *   a TRYIMPORT immediately followed by an IMPORT.  For this reason
 *   BLKRRPART is only invoked in the module when strictly necessary
 *   (zpool online -e case), and userspace is relied upon to do it when
 *   possible.
 *
 * path - device path used to locate the disk and partition number.
 * mode - open mode, translated with vdev_bdev_mode().
 * vd   - passed as the holder argument to blkdev_get().
 *
 * Returns the exclusively opened partition block device on success or
 * an ERR_PTR() value on failure.  When the required kernel interfaces
 * are not available, ERR_PTR(-EOPNOTSUPP) is returned.
 */
static struct block_device *
vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd)
{
#if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK)
	struct block_device *probe, *whole, *part;
	struct block_device *result = ERR_PTR(-ENXIO);
	struct gendisk *disk;
	int partno;

	/* Open by path just long enough to learn the disk and partition. */
	probe = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder);
	if (IS_ERR(probe))
		return (probe);

	disk = get_gendisk(probe->bd_dev, &partno);
	vdev_bdev_close(probe, vdev_bdev_mode(mode));

	if (!disk)
		return (result);

	/* Ask the whole-disk device to re-read its partition table. */
	whole = bdget(disk_devt(disk));
	if (whole) {
		if (blkdev_get(whole, vdev_bdev_mode(mode), vd) == 0)
			(void) ioctl_by_bdev(whole, BLKRRPART, 0);
		vdev_bdev_close(whole, vdev_bdev_mode(mode));
	}

	/* Reacquire the partition directly from the gendisk. */
	part = bdget_disk(disk, partno);
	if (part &&
	    blkdev_get(part, vdev_bdev_mode(mode) | FMODE_EXCL, vd) == 0)
		result = part;

	put_disk(disk);

	return (result);
#else
	return (ERR_PTR(-EOPNOTSUPP));
#endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */
}
/*
 * Open the block device backing a disk vdev, or refresh the reported
 * geometry when the vdev is already open and being reopened.
 *
 * v         - vdev to open; v->vdev_path must be an absolute path.
 * psize     - out: capacity of the device as returned by bdev_capacity().
 * max_psize - out: currently set to the same value as *psize.
 * ashift    - out: derived via highbit64() from the larger of the device
 *             block size and SPA_MINBLOCKSIZE.
 *
 * Returns 0 on success, EINVAL when the path is missing or not absolute,
 * ENOMEM when the per-vdev state cannot be allocated, or the positive
 * errno reported by the failed device open.
 */
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	struct block_device *bdev = ERR_PTR(-ENXIO);
	vdev_disk_t *vd;
	int count = 0, mode, block_size;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "vdev_disk_open: invalid "
		    "vdev_path '%s'", v->vdev_path);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it's not currently open.  Otherwise,
	 * just update the physical size of the device.
	 */
	if (v->vdev_tsd != NULL) {
		ASSERT(v->vdev_reopening);
		vd = v->vdev_tsd;
		goto skip_open;
	}

	vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
	if (vd == NULL)
		return (SET_ERROR(ENOMEM));

	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be recabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the systems tolerance to component failure.
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * a ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice delays have been observed to be on the order of 100ms.
	 */
	mode = spa_mode(v->vdev_spa);
	if (v->vdev_wholedisk && v->vdev_expanding)
		bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);

	/* Retry only on ENOENT: at most 50 attempts, 10ms apart. */
	while (IS_ERR(bdev) && count < 50) {
		bdev = vdev_bdev_open(v->vdev_path,
		    vdev_bdev_mode(mode), zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			msleep(10);
			count++;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		/* PTR_ERR() yields a long; narrow it once for "%d" and return. */
		int error = (int)-PTR_ERR(bdev);

		dprintf("failed open v->vdev_path=%s, error=%d count=%d\n",
		    v->vdev_path, error, count);
		kmem_free(vd, sizeof (vdev_disk_t));
		return (SET_ERROR(error));
	}

	v->vdev_tsd = vd;
	vd->vd_bdev = bdev;

skip_open:
	/*  Determine the physical block size */
	block_size = vdev_bdev_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));

	/* Physical volume size in bytes */
	*psize = bdev_capacity(vd->vd_bdev);

	/* TODO: report possible expansion size */
	*max_psize = *psize;

	/* Based on the minimum sector size set the block size */
	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;

	/* Try to set the io scheduler elevator algorithm */
	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);

	return (0);
}
/*
 * Release the block device and per-vdev state of a disk vdev.  Nothing
 * is done while the vdev is being reopened or when it has no state.
 */
static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *disk = v->vdev_tsd;

	if (disk == NULL || v->vdev_reopening)
		return;

	if (disk->vd_bdev != NULL) {
		vdev_bdev_close(disk->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)));
	}

	kmem_free(disk, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}
/*
 * Allocate a dio_request with room for 'bio_count' bio pointers.  The
 * reference count and error start at zero and every bio slot is NULL.
 * Returns the new request, or NULL if the allocation returned NULL.
 */
static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *req;

	req = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	if (req == NULL)
		return (req);

	atomic_set(&req->dr_ref, 0);
	req->dr_error = 0;
	req->dr_bio_count = bio_count;

	for (int slot = 0; slot < req->dr_bio_count; slot++)
		req->dr_bio[slot] = NULL;

	return (req);
}
/*
 * Drop the reference held on every attached bio and free the
 * dio_request, including its trailing array of bio pointers.
 */
static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int slot = 0;

	while (slot < dr->dr_bio_count) {
		struct bio *attached = dr->dr_bio[slot];

		if (attached)
			bio_put(attached);
		slot++;
	}

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}
/*
 * Take an additional reference on a dio_request by atomically
 * incrementing dr_ref.
 */
static void
vdev_disk_dio_get(dio_request_t *dr)
{
atomic_inc(&dr->dr_ref);
}
/*
 * Drop one reference on a dio_request and return the remaining count.
 *
 * When the last reference goes away the request is freed and, if it has
 * a parent zio, the accumulated error is stored in that zio and the zio
 * is handed to zio_delay_interrupt() exactly once.
 */
static int
vdev_disk_dio_put(dio_request_t *dr)
{
	zio_t *parent;
	int status;
	int remaining = atomic_dec_return(&dr->dr_ref);

	if (remaining != 0)
		return (remaining);

	/* Capture what is still needed before the request is freed. */
	parent = dr->dr_zio;
	status = dr->dr_error;

	vdev_disk_dio_free(dr);

	if (parent) {
		parent->io_error = status;
		ASSERT3S(parent->io_error, >=, 0);
		if (parent->io_error)
			vdev_disk_error(parent);

		zio_delay_interrupt(parent);
	}

	return (remaining);
}
/*
 * bi_end_io callback installed by __vdev_disk_physio() on every bio it
 * builds.  Records an error for the owning dio_request (only if none has
 * been recorded yet) and drops the reference taken for this bio.
 */
BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;

	/* Keep the first error; later completions do not overwrite it. */
	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/*
	 * Drop reference acquired by __vdev_disk_physio.  The remaining
	 * count is of no interest here, so it is explicitly discarded.
	 */
	(void) vdev_disk_dio_put(dr);
}
/*
 * Attach the pages backing the buffer that starts at 'bio_ptr' and is
 * 'bio_size' bytes long to 'bio', one page-bounded piece at a time.
 *
 * Mapping stops when the buffer is exhausted, when bi_max_vecs pieces have
 * been tried, or when bio_add_page() does not accept a full piece.
 *
 * Returns the number of bytes that were NOT mapped (0 when the whole
 * buffer fit).
 */
static unsigned int
bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
{
	char *cursor = bio_ptr;
	unsigned int remaining = bio_size;
	unsigned int page_off = offset_in_page(bio_ptr);
	unsigned int vec;

	for (vec = 0; vec < bio->bi_max_vecs && remaining != 0; vec++) {
		unsigned int chunk = PAGE_SIZE - page_off;
		struct page *pg;

		/* Never map past the end of the caller's buffer. */
		if (chunk > remaining)
			chunk = remaining;

		if (is_vmalloc_addr(cursor))
			pg = vmalloc_to_page(cursor);
		else
			pg = virt_to_page(cursor);

		/*
		 * Some network related block device uses tcp_sendpage, which
		 * doesn't behave well when using 0-count page, this is a
		 * safety net to catch them.
		 */
		ASSERT3S(page_count(pg), >, 0);

		if (bio_add_page(bio, pg, chunk, page_off) != chunk)
			break;

		cursor += chunk;
		remaining -= chunk;
		/* Only the first piece can start in the middle of a page. */
		page_off = 0;
	}

	return (remaining);
}
/*
 * Map 'size' bytes of 'abd', starting 'off' bytes into it, onto 'bio'.
 * Linear ABDs go through bio_map() on the underlying buffer; all others
 * are handed to abd_scatter_bio_map_off().  Returns whatever the chosen
 * mapper returns (for bio_map() that is the number of unmapped bytes).
 */
static unsigned int
bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off)
{
	if (!abd_is_linear(abd))
		return (abd_scatter_bio_map_off(bio, abd, size, off));

	return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size));
}
/*
 * Compatibility wrapper around submit_bio(): calls the one-argument form
 * when HAVE_1ARG_SUBMIT_BIO is defined, otherwise the two-argument form
 * with 0 as the first argument.
 */
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifndef HAVE_1ARG_SUBMIT_BIO
	submit_bio(0, bio);
#else
	submit_bio(bio);
#endif
}
#ifndef HAVE_BIO_SET_DEV
/*
 * Fallback used when HAVE_BIO_SET_DEV is not defined: point the bio at its
 * target block device by assigning bi_bdev directly.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
bio->bi_bdev = bdev;
}
#endif /* !HAVE_BIO_SET_DEV */
zvol processing should use struct bio Internally, zvols are files exposed through the block device API. This is intended to reduce overhead when things require block devices. However, the ZoL zvol code emulates a traditional block device in that it has a top half and a bottom half. This is an unnecessary source of overhead that does not exist on any other OpenZFS platform does this. This patch removes it. Early users of this patch reported double digit performance gains in IOPS on zvols in the range of 50% to 80%. Comments in the code suggest that the current implementation was done to obtain IO merging from Linux's IO elevator. However, the DMU already does write merging while arc_read() should implicitly merge read IOs because only 1 thread is permitted to fetch the buffer into ARC. In addition, commercial ZFSOnLinux distributions report that regular files are more performant than zvols under the current implementation, and the main consumers of zvols are VMs and iSCSI targets, which have their own elevators to merge IOs. Some minor refactoring allows us to register zfs_request() as our ->make_request() handler in place of the generic_make_request() function. This eliminates the layer of code that broke IO requests on zvols into a top half and a bottom half. This has several benefits: 1. No per zvol spinlocks. 2. No redundant IO elevator processing. 3. Interrupts are disabled only when actually necessary. 4. No redispatching of IOs when all taskq threads are busy. 5. Linux's page out routines will properly block. 6. Many autotools checks become obsolete. An unfortunate consequence of eliminating the layer that generic_make_request() is that we no longer calls the instrumentation hooks for block IO accounting. Those hooks are GPL-exported, so we cannot call them ourselves and consequently, we lose the ability to do IO monitoring via iostat. Since zvols are internally files mapped as block devices, this should be okay. 
Anyone who is willing to accept the performance penalty for the block IO layer's accounting could use the loop device in between the zvol and its consumer. Alternatively, perf and ftrace likely could be used. Also, tools like latencytop will still work. Tools such as latencytop sometimes provide a better view of performance bottlenecks than the traditional block IO accounting tools do. Lastly, if direct reclaim occurs during spacemap loading and swap is on a zvol, this code will deadlock. That deadlock could already occur with sync=always on zvols. Given that swap on zvols is not yet production ready, this is not a blocker. Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-04 22:43:47 +00:00
static inline void
vdev_submit_bio(struct bio *bio)
zvol processing should use struct bio Internally, zvols are files exposed through the block device API. This is intended to reduce overhead when things require block devices. However, the ZoL zvol code emulates a traditional block device in that it has a top half and a bottom half. This is an unnecessary source of overhead that does not exist on any other OpenZFS platform does this. This patch removes it. Early users of this patch reported double digit performance gains in IOPS on zvols in the range of 50% to 80%. Comments in the code suggest that the current implementation was done to obtain IO merging from Linux's IO elevator. However, the DMU already does write merging while arc_read() should implicitly merge read IOs because only 1 thread is permitted to fetch the buffer into ARC. In addition, commercial ZFSOnLinux distributions report that regular files are more performant than zvols under the current implementation, and the main consumers of zvols are VMs and iSCSI targets, which have their own elevators to merge IOs. Some minor refactoring allows us to register zfs_request() as our ->make_request() handler in place of the generic_make_request() function. This eliminates the layer of code that broke IO requests on zvols into a top half and a bottom half. This has several benefits: 1. No per zvol spinlocks. 2. No redundant IO elevator processing. 3. Interrupts are disabled only when actually necessary. 4. No redispatching of IOs when all taskq threads are busy. 5. Linux's page out routines will properly block. 6. Many autotools checks become obsolete. An unfortunate consequence of eliminating the layer that generic_make_request() is that we no longer calls the instrumentation hooks for block IO accounting. Those hooks are GPL-exported, so we cannot call them ourselves and consequently, we lose the ability to do IO monitoring via iostat. Since zvols are internally files mapped as block devices, this should be okay. 
Anyone who is willing to accept the performance penalty for the block IO layer's accounting could use the loop device in between the zvol and its consumer. Alternatively, perf and ftrace likely could be used. Also, tools like latencytop will still work. Tools such as latencytop sometimes provide a better view of performance bottlenecks than the traditional block IO accounting tools do. Lastly, if direct reclaim occurs during spacemap loading and swap is on a zvol, this code will deadlock. That deadlock could already occur with sync=always on zvols. Given that swap on zvols is not yet production ready, this is not a blocker. Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-04 22:43:47 +00:00
{
#ifdef HAVE_CURRENT_BIO_TAIL
struct bio **bio_tail = current->bio_tail;
current->bio_tail = NULL;
vdev_submit_bio_impl(bio);
zvol processing should use struct bio Internally, zvols are files exposed through the block device API. This is intended to reduce overhead when things require block devices. However, the ZoL zvol code emulates a traditional block device in that it has a top half and a bottom half. This is an unnecessary source of overhead that does not exist on any other OpenZFS platform does this. This patch removes it. Early users of this patch reported double digit performance gains in IOPS on zvols in the range of 50% to 80%. Comments in the code suggest that the current implementation was done to obtain IO merging from Linux's IO elevator. However, the DMU already does write merging while arc_read() should implicitly merge read IOs because only 1 thread is permitted to fetch the buffer into ARC. In addition, commercial ZFSOnLinux distributions report that regular files are more performant than zvols under the current implementation, and the main consumers of zvols are VMs and iSCSI targets, which have their own elevators to merge IOs. Some minor refactoring allows us to register zfs_request() as our ->make_request() handler in place of the generic_make_request() function. This eliminates the layer of code that broke IO requests on zvols into a top half and a bottom half. This has several benefits: 1. No per zvol spinlocks. 2. No redundant IO elevator processing. 3. Interrupts are disabled only when actually necessary. 4. No redispatching of IOs when all taskq threads are busy. 5. Linux's page out routines will properly block. 6. Many autotools checks become obsolete. An unfortunate consequence of eliminating the layer that generic_make_request() is that we no longer calls the instrumentation hooks for block IO accounting. Those hooks are GPL-exported, so we cannot call them ourselves and consequently, we lose the ability to do IO monitoring via iostat. Since zvols are internally files mapped as block devices, this should be okay. 
Anyone who is willing to accept the performance penalty for the block IO layer's accounting could use the loop device in between the zvol and its consumer. Alternatively, perf and ftrace likely could be used. Also, tools like latencytop will still work. Tools such as latencytop sometimes provide a better view of performance bottlenecks than the traditional block IO accounting tools do. Lastly, if direct reclaim occurs during spacemap loading and swap is on a zvol, this code will deadlock. That deadlock could already occur with sync=always on zvols. Given that swap on zvols is not yet production ready, this is not a blocker. Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-04 22:43:47 +00:00
current->bio_tail = bio_tail;
#else
struct bio_list *bio_list = current->bio_list;
current->bio_list = NULL;
vdev_submit_bio_impl(bio);
zvol processing should use struct bio Internally, zvols are files exposed through the block device API. This is intended to reduce overhead when things require block devices. However, the ZoL zvol code emulates a traditional block device in that it has a top half and a bottom half. This is an unnecessary source of overhead that does not exist on any other OpenZFS platform does this. This patch removes it. Early users of this patch reported double digit performance gains in IOPS on zvols in the range of 50% to 80%. Comments in the code suggest that the current implementation was done to obtain IO merging from Linux's IO elevator. However, the DMU already does write merging while arc_read() should implicitly merge read IOs because only 1 thread is permitted to fetch the buffer into ARC. In addition, commercial ZFSOnLinux distributions report that regular files are more performant than zvols under the current implementation, and the main consumers of zvols are VMs and iSCSI targets, which have their own elevators to merge IOs. Some minor refactoring allows us to register zfs_request() as our ->make_request() handler in place of the generic_make_request() function. This eliminates the layer of code that broke IO requests on zvols into a top half and a bottom half. This has several benefits: 1. No per zvol spinlocks. 2. No redundant IO elevator processing. 3. Interrupts are disabled only when actually necessary. 4. No redispatching of IOs when all taskq threads are busy. 5. Linux's page out routines will properly block. 6. Many autotools checks become obsolete. An unfortunate consequence of eliminating the layer that generic_make_request() is that we no longer calls the instrumentation hooks for block IO accounting. Those hooks are GPL-exported, so we cannot call them ourselves and consequently, we lose the ability to do IO monitoring via iostat. Since zvols are internally files mapped as block devices, this should be okay. 
Anyone who is willing to accept the performance penalty for the block IO layer's accounting could use the loop device in between the zvol and its consumer. Alternatively, perf and ftrace likely could be used. Also, tools like latencytop will still work. Tools such as latencytop sometimes provide a better view of performance bottlenecks than the traditional block IO accounting tools do. Lastly, if direct reclaim occurs during spacemap loading and swap is on a zvol, this code will deadlock. That deadlock could already occur with sync=always on zvols. Given that swap on zvols is not yet production ready, this is not a blocker. Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-04 22:43:47 +00:00
current->bio_list = bio_list;
#endif
}
/*
 * Build and submit the bio(s) needed to carry out 'zio' against 'bdev'.
 *
 *   bdev      - target block device
 *   zio       - parent zio; zio->io_abd supplies the data buffer
 *   io_size   - number of bytes to transfer
 *   io_offset - byte offset on the device
 *   rw, flags - handed to bio_set_op_attrs() for every bio
 *
 * A dio_request tracks all bios of the request; each bio holds one
 * reference that vdev_disk_physio_completion() drops, and the zio is
 * completed from vdev_disk_dio_put() when the last reference goes away.
 *
 * Returns 0 once every bio has been submitted, or SET_ERROR(ENOMEM) if the
 * dio_request or a bio could not be allocated (nothing is submitted in
 * that case).
 */
static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size, bio_count = 16;
	int i = 0;
#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	struct blk_plug plug;
#endif

	ASSERT(zio != NULL);
	ASSERT3U(io_offset + io_size, <=, bdev->bd_inode->i_size);

retry:
	dr = vdev_disk_dio_alloc(bio_count);
	if (dr == NULL)
		return (SET_ERROR(ENOMEM));

	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	dr->dr_zio = zio;

	/*
	 * When the IO size exceeds the maximum bio size for the request
	 * queue we are forced to break the IO in multiple bio's and wait
	 * for them all to complete.  Ideally, all pool users will set
	 * their volume block size to match the maximum request size and
	 * the common case will be one bio per vdev IO request.
	 */
	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * By default only 'bio_count' bio's per dio are allowed.
		 * If more are needed the current dio (and the bio's built
		 * so far) is discarded and construction restarts with
		 * twice as many slots.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		/* bio_alloc() with __GFP_WAIT never returns NULL */
		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
		    MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
		    BIO_MAX_PAGES));
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		bio_set_dev(dr->dr_bio[i], bdev);
		/* Byte offset to 512-byte sector number. */
		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);
#endif

	/* Submit all bio's associated with this dio */
	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);
#endif

	/* Drop the submission reference; completes the zio if it was last. */
	(void) vdev_disk_dio_put(dr);

	return (0);
}
/*
 * bi_end_io callback for the flush bio issued by vdev_disk_io_flush().
 * Stores the bio's result in the zio, marks the vdev as having no write
 * cache when the device answered EOPNOTSUPP, releases the bio and hands
 * the zio to zio_interrupt().
 */
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;

#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	/* Remember that flushes are unsupported so they are not retried. */
	if (zio->io_error == EOPNOTSUPP)
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);

	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error != 0)
		vdev_disk_error(zio);

	zio_interrupt(zio);
}
/*
 * Issue a cache flush to 'bdev' on behalf of 'zio'.
 *
 * An empty bio is allocated, marked as a flush with bio_set_flush() and
 * submitted; vdev_disk_io_flush_completion() finishes the zio when the bio
 * completes.
 *
 * Returns 0 when the flush bio was submitted, SET_ERROR(ENXIO) when the
 * device has no request queue, or SET_ERROR(ENOMEM) when the bio could not
 * be allocated.
 */
static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = bio_alloc(GFP_NOIO, 0);
	/* bio_alloc() with __GFP_WAIT never returns NULL */
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_dev(bio, bdev);
	bio_set_flush(bio);
	vdev_submit_bio(bio);

	/*
	 * ZFS submits bios directly and so bypasses the Linux buffer cache,
	 * while userland tools read vdevs through it.  Invalidate the
	 * device's cached pages on every flush so those tools do not see
	 * stale on-disk state.
	 */
	invalidate_bdev(bdev);

	return (0);
}
static void
vdev_disk_io_start(zio_t *zio)
{
vdev_t *v = zio->io_vd;
vdev_disk_t *vd = v->vdev_tsd;
int rw, flags, error;
switch (zio->io_type) {
case ZIO_TYPE_IOCTL:
if (!vdev_readable(v)) {
zio->io_error = SET_ERROR(ENXIO);
zio_interrupt(zio);
return;
}
switch (zio->io_cmd) {
case DKIOCFLUSHWRITECACHE:
if (zfs_nocacheflush)
break;
if (v->vdev_nowritecache) {
zio->io_error = SET_ERROR(ENOTSUP);
break;
}
error = vdev_disk_io_flush(vd->vd_bdev, zio);
if (error == 0)
return;
zio->io_error = error;
break;
default:
zio->io_error = SET_ERROR(ENOTSUP);
}
zio_execute(zio);
return;
case ZIO_TYPE_WRITE:
rw = WRITE;
Fix sync behavior for disk vdevs Prior to b39c22b, which was first generally available in the 0.6.5 release as b39c22b, ZoL never actually submitted synchronous read or write requests to the Linux block layer. This means the vdev_disk_dio_is_sync() function had always returned false and, therefore, the completion in dio_request_t.dr_comp was never actually used. In b39c22b, synchronous ZIO operations were translated to synchronous BIO requests in vdev_disk_io_start(). The follow-on commits 5592404 and aa159af fixed several problems introduced by b39c22b. In particular, 5592404 introduced the new flag parameter "wait" to __vdev_disk_physio() but under ZoL, since vdev_disk_physio() is never actually used, the wait flag was always zero so the new code had no effect other than to cause a bug in the use of the dio_request_t.dr_comp which was fixed by aa159af. The original rationale for introducing synchronous operations in b39c22b was to hurry certains requests through the BIO layer which would have otherwise been subject to its unplug timer which would increase the latency. This behavior of the unplug timer, however, went away during the transition of the plug/unplug system between kernels 2.6.32 and 2.6.39. To handle the unplug timer behavior on 2.6.32-2.6.35 kernels the BIO_RW_UNPLUG flag is used as a hint to suppress the plugging behavior. For kernels 2.6.36-2.6.38, the REQ_UNPLUG macro will be available and ise used for the same purpose. Signed-off-by: Tim Chase <tim@chase2k.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #4858
2016-07-08 15:33:01 +00:00
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
flags = (1 << BIO_RW_UNPLUG);
Fix sync behavior for disk vdevs Prior to b39c22b, which was first generally available in the 0.6.5 release as b39c22b, ZoL never actually submitted synchronous read or write requests to the Linux block layer. This means the vdev_disk_dio_is_sync() function had always returned false and, therefore, the completion in dio_request_t.dr_comp was never actually used. In b39c22b, synchronous ZIO operations were translated to synchronous BIO requests in vdev_disk_io_start(). The follow-on commits 5592404 and aa159af fixed several problems introduced by b39c22b. In particular, 5592404 introduced the new flag parameter "wait" to __vdev_disk_physio() but under ZoL, since vdev_disk_physio() is never actually used, the wait flag was always zero so the new code had no effect other than to cause a bug in the use of the dio_request_t.dr_comp which was fixed by aa159af. The original rationale for introducing synchronous operations in b39c22b was to hurry certains requests through the BIO layer which would have otherwise been subject to its unplug timer which would increase the latency. This behavior of the unplug timer, however, went away during the transition of the plug/unplug system between kernels 2.6.32 and 2.6.39. To handle the unplug timer behavior on 2.6.32-2.6.35 kernels the BIO_RW_UNPLUG flag is used as a hint to suppress the plugging behavior. For kernels 2.6.36-2.6.38, the REQ_UNPLUG macro will be available and ise used for the same purpose. Signed-off-by: Tim Chase <tim@chase2k.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #4858
2016-07-08 15:33:01 +00:00
#elif defined(REQ_UNPLUG)
flags = REQ_UNPLUG;
Fix sync behavior for disk vdevs Prior to b39c22b, which was first generally available in the 0.6.5 release as b39c22b, ZoL never actually submitted synchronous read or write requests to the Linux block layer. This means the vdev_disk_dio_is_sync() function had always returned false and, therefore, the completion in dio_request_t.dr_comp was never actually used. In b39c22b, synchronous ZIO operations were translated to synchronous BIO requests in vdev_disk_io_start(). The follow-on commits 5592404 and aa159af fixed several problems introduced by b39c22b. In particular, 5592404 introduced the new flag parameter "wait" to __vdev_disk_physio() but under ZoL, since vdev_disk_physio() is never actually used, the wait flag was always zero so the new code had no effect other than to cause a bug in the use of the dio_request_t.dr_comp which was fixed by aa159af. The original rationale for introducing synchronous operations in b39c22b was to hurry certains requests through the BIO layer which would have otherwise been subject to its unplug timer which would increase the latency. This behavior of the unplug timer, however, went away during the transition of the plug/unplug system between kernels 2.6.32 and 2.6.39. To handle the unplug timer behavior on 2.6.32-2.6.35 kernels the BIO_RW_UNPLUG flag is used as a hint to suppress the plugging behavior. For kernels 2.6.36-2.6.38, the REQ_UNPLUG macro will be available and ise used for the same purpose. Signed-off-by: Tim Chase <tim@chase2k.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #4858
2016-07-08 15:33:01 +00:00
#else
flags = 0;
Fix sync behavior for disk vdevs Prior to b39c22b, which was first generally available in the 0.6.5 release as b39c22b, ZoL never actually submitted synchronous read or write requests to the Linux block layer. This means the vdev_disk_dio_is_sync() function had always returned false and, therefore, the completion in dio_request_t.dr_comp was never actually used. In b39c22b, synchronous ZIO operations were translated to synchronous BIO requests in vdev_disk_io_start(). The follow-on commits 5592404 and aa159af fixed several problems introduced by b39c22b. In particular, 5592404 introduced the new flag parameter "wait" to __vdev_disk_physio() but under ZoL, since vdev_disk_physio() is never actually used, the wait flag was always zero so the new code had no effect other than to cause a bug in the use of the dio_request_t.dr_comp which was fixed by aa159af. The original rationale for introducing synchronous operations in b39c22b was to hurry certains requests through the BIO layer which would have otherwise been subject to its unplug timer which would increase the latency. This behavior of the unplug timer, however, went away during the transition of the plug/unplug system between kernels 2.6.32 and 2.6.39. To handle the unplug timer behavior on 2.6.32-2.6.35 kernels the BIO_RW_UNPLUG flag is used as a hint to suppress the plugging behavior. For kernels 2.6.36-2.6.38, the REQ_UNPLUG macro will be available and ise used for the same purpose. Signed-off-by: Tim Chase <tim@chase2k.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #4858
2016-07-08 15:33:01 +00:00
#endif
break;
case ZIO_TYPE_READ:
rw = READ;
Fix sync behavior for disk vdevs Prior to b39c22b, which was first generally available in the 0.6.5 release as b39c22b, ZoL never actually submitted synchronous read or write requests to the Linux block layer. This means the vdev_disk_dio_is_sync() function had always returned false and, therefore, the completion in dio_request_t.dr_comp was never actually used. In b39c22b, synchronous ZIO operations were translated to synchronous BIO requests in vdev_disk_io_start(). The follow-on commits 5592404 and aa159af fixed several problems introduced by b39c22b. In particular, 5592404 introduced the new flag parameter "wait" to __vdev_disk_physio() but under ZoL, since vdev_disk_physio() is never actually used, the wait flag was always zero so the new code had no effect other than to cause a bug in the use of the dio_request_t.dr_comp which was fixed by aa159af. The original rationale for introducing synchronous operations in b39c22b was to hurry certains requests through the BIO layer which would have otherwise been subject to its unplug timer which would increase the latency. This behavior of the unplug timer, however, went away during the transition of the plug/unplug system between kernels 2.6.32 and 2.6.39. To handle the unplug timer behavior on 2.6.32-2.6.35 kernels the BIO_RW_UNPLUG flag is used as a hint to suppress the plugging behavior. For kernels 2.6.36-2.6.38, the REQ_UNPLUG macro will be available and ise used for the same purpose. Signed-off-by: Tim Chase <tim@chase2k.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #4858
2016-07-08 15:33:01 +00:00
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
flags = (1 << BIO_RW_UNPLUG);
Fix sync behavior for disk vdevs Prior to b39c22b, which was first generally available in the 0.6.5 release as b39c22b, ZoL never actually submitted synchronous read or write requests to the Linux block layer. This means the vdev_disk_dio_is_sync() function had always returned false and, therefore, the completion in dio_request_t.dr_comp was never actually used. In b39c22b, synchronous ZIO operations were translated to synchronous BIO requests in vdev_disk_io_start(). The follow-on commits 5592404 and aa159af fixed several problems introduced by b39c22b. In particular, 5592404 introduced the new flag parameter "wait" to __vdev_disk_physio() but under ZoL, since vdev_disk_physio() is never actually used, the wait flag was always zero so the new code had no effect other than to cause a bug in the use of the dio_request_t.dr_comp which was fixed by aa159af. The original rationale for introducing synchronous operations in b39c22b was to hurry certains requests through the BIO layer which would have otherwise been subject to its unplug timer which would increase the latency. This behavior of the unplug timer, however, went away during the transition of the plug/unplug system between kernels 2.6.32 and 2.6.39. To handle the unplug timer behavior on 2.6.32-2.6.35 kernels the BIO_RW_UNPLUG flag is used as a hint to suppress the plugging behavior. For kernels 2.6.36-2.6.38, the REQ_UNPLUG macro will be available and ise used for the same purpose. Signed-off-by: Tim Chase <tim@chase2k.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #4858
2016-07-08 15:33:01 +00:00
#elif defined(REQ_UNPLUG)
flags = REQ_UNPLUG;
Fix sync behavior for disk vdevs Prior to b39c22b, which was first generally available in the 0.6.5 release as b39c22b, ZoL never actually submitted synchronous read or write requests to the Linux block layer. This means the vdev_disk_dio_is_sync() function had always returned false and, therefore, the completion in dio_request_t.dr_comp was never actually used. In b39c22b, synchronous ZIO operations were translated to synchronous BIO requests in vdev_disk_io_start(). The follow-on commits 5592404 and aa159af fixed several problems introduced by b39c22b. In particular, 5592404 introduced the new flag parameter "wait" to __vdev_disk_physio() but under ZoL, since vdev_disk_physio() is never actually used, the wait flag was always zero so the new code had no effect other than to cause a bug in the use of the dio_request_t.dr_comp which was fixed by aa159af. The original rationale for introducing synchronous operations in b39c22b was to hurry certain requests through the BIO layer which would have otherwise been subject to its unplug timer which would increase the latency. This behavior of the unplug timer, however, went away during the transition of the plug/unplug system between kernels 2.6.32 and 2.6.39. To handle the unplug timer behavior on 2.6.32-2.6.35 kernels the BIO_RW_UNPLUG flag is used as a hint to suppress the plugging behavior. For kernels 2.6.36-2.6.38, the REQ_UNPLUG macro will be available and is used for the same purpose. Signed-off-by: Tim Chase <tim@chase2k.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #4858
2016-07-08 15:33:01 +00:00
#else
flags = 0;
Fix sync behavior for disk vdevs Prior to b39c22b, which was first generally available in the 0.6.5 release as b39c22b, ZoL never actually submitted synchronous read or write requests to the Linux block layer. This means the vdev_disk_dio_is_sync() function had always returned false and, therefore, the completion in dio_request_t.dr_comp was never actually used. In b39c22b, synchronous ZIO operations were translated to synchronous BIO requests in vdev_disk_io_start(). The follow-on commits 5592404 and aa159af fixed several problems introduced by b39c22b. In particular, 5592404 introduced the new flag parameter "wait" to __vdev_disk_physio() but under ZoL, since vdev_disk_physio() is never actually used, the wait flag was always zero so the new code had no effect other than to cause a bug in the use of the dio_request_t.dr_comp which was fixed by aa159af. The original rationale for introducing synchronous operations in b39c22b was to hurry certain requests through the BIO layer which would have otherwise been subject to its unplug timer which would increase the latency. This behavior of the unplug timer, however, went away during the transition of the plug/unplug system between kernels 2.6.32 and 2.6.39. To handle the unplug timer behavior on 2.6.32-2.6.35 kernels the BIO_RW_UNPLUG flag is used as a hint to suppress the plugging behavior. For kernels 2.6.36-2.6.38, the REQ_UNPLUG macro will be available and is used for the same purpose. Signed-off-by: Tim Chase <tim@chase2k.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #4858
2016-07-08 15:33:01 +00:00
#endif
break;
default:
zio->io_error = SET_ERROR(ENOTSUP);
zio_interrupt(zio);
return;
}
zio->io_target_timestamp = zio_handle_io_delay(zio);
error = __vdev_disk_physio(vd->vd_bdev, zio,
zio->io_size, zio->io_offset, rw, flags);
if (error) {
zio->io_error = error;
zio_interrupt(zio);
return;
}
}
/*
 * I/O completion hook for disk vdevs.
 *
 * Nothing is done unless the zio finished with EIO.  On EIO the media is
 * revalidated with check_disk_change(); when that reports a change the
 * block device is invalidated, the vdev is flagged as wanting removal and
 * an asynchronous SPA_ASYNC_REMOVE request is posted to the owning pool.
 */
static void
vdev_disk_io_done(zio_t *zio)
{
	vdev_t *leaf;
	vdev_disk_t *disk;

	/* Only an EIO result triggers media revalidation. */
	if (zio->io_error != EIO)
		return;

	leaf = zio->io_vd;
	disk = leaf->vdev_tsd;

	/* Media unchanged: leave the vdev in the configuration. */
	if (!check_disk_change(disk->vd_bdev))
		return;

	vdev_bdev_invalidate(disk->vd_bdev);
	leaf->vdev_remove_wanted = B_TRUE;
	spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
}
/*
 * Hold hook for disk vdevs.
 *
 * Asserts that spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER) is
 * true.  The path/devid prefetch only applies when the vdev has an
 * absolute pathname and has never been opened (vdev_tsd is still NULL);
 * in every other case the vdev is left untouched.
 */
static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/*
	 * Proceed only with an absolute pathname on a device that has
	 * never been opened.
	 */
	if (vd->vdev_path != NULL && vd->vdev_path[0] == '/' &&
	    vd->vdev_tsd == NULL) {
		/* XXX: Implement me as a vnode lookup for the device */
		vd->vdev_name_vp = NULL;
		vd->vdev_devid_vp = NULL;
	}
}
/*
 * Release hook for disk vdevs, the counterpart of vdev_disk_hold().
 *
 * Currently a placeholder: apart from asserting that
 * spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER) is true, it does
 * nothing and releases nothing.
 */
static void
vdev_disk_rele(vdev_t *vd)
{
ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
/* XXX: Implement me as a vnode rele for the device */
}
/*
 * Setter for the zfs_vdev_scheduler module parameter.
 *
 * val - requested scheduler name; truncated in place at the first
 *       newline, if any.
 * kp  - kernel parameter descriptor handed on to param_set_charp().
 *
 * Returns SET_ERROR(-EINVAL) when val is NULL, otherwise whatever
 * param_set_charp() returns.
 *
 * When spa_mode_global is non-zero, every pool that is active, writeable
 * and not suspended has vdev_elevator_switch() applied to its root vdev
 * before the new value is stored.
 *
 * NOTE(review): the newline is stripped by writing through a pointer
 * derived from the const-qualified val, so the caller's buffer must be
 * writable -- confirm against the callers.
 */
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	spa_t *pool;
	char *eol;

	if (val == NULL)
		return (SET_ERROR(-EINVAL));

	/* Cut the string at the first newline. */
	eol = strchr(val, '\n');
	if (eol != NULL)
		*eol = '\0';

	/* Nothing to walk: just record the new value. */
	if (spa_mode_global == 0)
		return (param_set_charp(val, kp));

	mutex_enter(&spa_namespace_lock);
	for (pool = spa_next(NULL); pool != NULL; pool = spa_next(pool)) {
		if (spa_state(pool) == POOL_STATE_ACTIVE &&
		    spa_writeable(pool) && !spa_suspended(pool)) {
			/*
			 * Take a reference, then drop the namespace lock
			 * around the switch; presumably the reference
			 * keeps the pool valid while unlocked.
			 */
			spa_open_ref(pool, FTAG);
			mutex_exit(&spa_namespace_lock);
			vdev_elevator_switch(pool->spa_root_vdev, (char *)val);
			mutex_enter(&spa_namespace_lock);
			spa_close(pool, FTAG);
		}
	}
	mutex_exit(&spa_namespace_lock);

	return (param_set_charp(val, kp));
}
/*
 * Operations vector for disk vdevs.
 *
 * This is a positional initializer of vdev_ops_t, so the order of the
 * entries is significant.  In order it supplies: the open, close, asize
 * (vdev_default_asize), io_start and io_done handlers defined for disk
 * vdevs, two NULL entries, the hold and rele handlers, one more NULL
 * entry, the vdev type name (VDEV_TYPE_DISK) and the leaf flag (B_TRUE).
 *
 * NOTE(review): which vdev_ops_t members the three NULL entries
 * (positions 6, 7 and 10) correspond to is determined by the struct
 * declaration, which is not visible here -- confirm before reordering.
 *
 * NOTE(review): the prose and timestamp between vdev_disk_rele and the
 * last NULL entry are revision-history residue, not C; they are kept
 * untouched here.
 */
vdev_ops_t vdev_disk_ops = {
vdev_disk_open,
vdev_disk_close,
vdev_default_asize,
vdev_disk_io_start,
vdev_disk_io_done,
NULL,
NULL,
vdev_disk_hold,
vdev_disk_rele,
OpenZFS 7614, 9064 - zfs device evacuation/removal OpenZFS 7614 - zfs device evacuation/removal OpenZFS 9064 - remove_mirror should wait for device removal to complete This project allows top-level vdevs to be removed from the storage pool with "zpool remove", reducing the total amount of storage in the pool. This operation copies all allocated regions of the device to be removed onto other devices, recording the mapping from old to new location. After the removal is complete, read and free operations to the removed (now "indirect") vdev must be remapped and performed at the new location on disk. The indirect mapping table is kept in memory whenever the pool is loaded, so there is minimal performance overhead when doing operations on the indirect vdev. The size of the in-memory mapping table will be reduced when its entries become "obsolete" because they are no longer used by any block pointers in the pool. An entry becomes obsolete when all the blocks that use it are freed. An entry can also become obsolete when all the snapshots that reference it are deleted, and the block pointers that reference it have been "remapped" in all filesystems/zvols (and clones). Whenever an indirect block is written, all the block pointers in it will be "remapped" to their new (concrete) locations if possible. This process can be accelerated by using the "zfs remap" command to proactively rewrite all indirect blocks that reference indirect (removed) vdevs. Note that when a device is removed, we do not verify the checksum of the data that is copied. This makes the process much faster, but if it were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be possible to copy the wrong data, when we have the correct data on e.g. the other side of the mirror. At the moment, only mirrors and simple top-level vdevs can be removed and no removal is allowed if any of the top-level vdevs are raidz. Porting Notes: * Avoid zero-sized kmem_alloc() in vdev_compact_children(). 
The device evacuation code adds a dependency that vdev_compact_children() be able to properly empty the vdev_child array by setting it to NULL and zeroing vdev_children. Under Linux, kmem_alloc() and related functions return a sentinel pointer rather than NULL for zero-sized allocations. * Remove comment regarding "mpt" driver where zfs_remove_max_segment is initialized to SPA_MAXBLOCKSIZE. Change zfs_condense_indirect_commit_entry_delay_ticks to zfs_condense_indirect_commit_entry_delay_ms for consistency with most other tunables in which delays are specified in ms. * ZTS changes: Use set_tunable rather than mdb Use zpool sync as appropriate Use sync_pool instead of sync Kill jobs during test_removal_with_operation to allow unmount/export Don't add non-disk names such as "mirror" or "raidz" to $DISKS Use $TEST_BASE_DIR instead of /tmp Increase HZ from 100 to 1000 which is more common on Linux removal_multiple_indirection.ksh Reduce iterations in order to not time out on the code coverage builders. removal_resume_export: Functionally, the test case is correct but there exists a race where the kernel thread hasn't been fully started yet and is not visible. Wait for up to 1 second for the removal thread to be started before giving up on it. Also, increase the amount of data copied in order that the removal not finish before the export has a chance to fail. * MMP compatibility, the concept of concrete versus non-concrete devices has slightly changed the semantics of vdev_writeable(). Update mmp_random_leaf_impl() accordingly. * Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool feature which is not supported by OpenZFS. * Added support for new vdev removal tracepoints. * Test cases removal_with_zdb and removal_condense_export have been intentionally disabled. When run manually they pass as intended, but when running in the automated test environment they produce unreliable results on the latest Fedora release. 
They may work better once the upstream pool import refectoring is merged into ZoL at which point they will be re-enabled. Authored by: Matthew Ahrens <mahrens@delphix.com> Reviewed-by: Alex Reece <alex@delphix.com> Reviewed-by: George Wilson <george.wilson@delphix.com> Reviewed-by: John Kennedy <john.kennedy@delphix.com> Reviewed-by: Prakash Surya <prakash.surya@delphix.com> Reviewed by: Richard Laager <rlaager@wiktel.com> Reviewed by: Tim Chase <tim@chase2k.com> Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov> Approved by: Garrett D'Amore <garrett@damore.org> Ported-by: Tim Chase <tim@chase2k.com> Signed-off-by: Tim Chase <tim@chase2k.com> OpenZFS-issue: https://www.illumos.org/issues/7614 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb Closes #6900
2016-09-22 16:30:13 +00:00
NULL,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
/*
 * Expose zfs_vdev_scheduler as a module parameter with mode 0644.
 * Writes are routed through param_set_vdev_scheduler() instead of the
 * stock charp setter; reads use the stock param_get_charp() getter.
 */
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
param_get_charp, &zfs_vdev_scheduler, 0644);
Add missing ZFS tunables This commit adds module options for all existing zfs tunables. Ideally the average user should never need to modify any of these values. However, in practice sometimes you do need to tweak these values for one reason or another. In those cases it's nice not to have to resort to rebuilding from source. All tunables are visable to modinfo and the list is as follows: $ modinfo module/zfs/zfs.ko filename: module/zfs/zfs.ko license: CDDL author: Sun Microsystems/Oracle, Lawrence Livermore National Laboratory description: ZFS srcversion: 8EAB1D71DACE05B5AA61567 depends: spl,znvpair,zcommon,zunicode,zavl vermagic: 2.6.32-131.0.5.el6.x86_64 SMP mod_unload modversions parm: zvol_major:Major number for zvol device (uint) parm: zvol_threads:Number of threads for zvol device (uint) parm: zio_injection_enabled:Enable fault injection (int) parm: zio_bulk_flags:Additional flags to pass to bulk buffers (int) parm: zio_delay_max:Max zio millisec delay before posting event (int) parm: zio_requeue_io_start_cut_in_line:Prioritize requeued I/O (bool) parm: zil_replay_disable:Disable intent logging replay (int) parm: zfs_nocacheflush:Disable cache flushes (bool) parm: zfs_read_chunk_size:Bytes to read per chunk (long) parm: zfs_vdev_max_pending:Max pending per-vdev I/Os (int) parm: zfs_vdev_min_pending:Min pending per-vdev I/Os (int) parm: zfs_vdev_aggregation_limit:Max vdev I/O aggregation size (int) parm: zfs_vdev_time_shift:Deadline time shift for vdev I/O (int) parm: zfs_vdev_ramp_rate:Exponential I/O issue ramp-up rate (int) parm: zfs_vdev_read_gap_limit:Aggregate read I/O over gap (int) parm: zfs_vdev_write_gap_limit:Aggregate write I/O over gap (int) parm: zfs_vdev_scheduler:I/O scheduler (charp) parm: zfs_vdev_cache_max:Inflate reads small than max (int) parm: zfs_vdev_cache_size:Total size of the per-disk cache (int) parm: zfs_vdev_cache_bshift:Shift size to inflate reads too (int) parm: zfs_scrub_limit:Max scrub/resilver I/O per leaf vdev (int) parm: 
zfs_recover:Set to attempt to recover from fatal errors (int) parm: spa_config_path:SPA config file (/etc/zfs/zpool.cache) (charp) parm: zfs_zevent_len_max:Max event queue length (int) parm: zfs_zevent_cols:Max event column width (int) parm: zfs_zevent_console:Log events to the console (int) parm: zfs_top_maxinflight:Max I/Os per top-level (int) parm: zfs_resilver_delay:Number of ticks to delay resilver (int) parm: zfs_scrub_delay:Number of ticks to delay scrub (int) parm: zfs_scan_idle:Idle window in clock ticks (int) parm: zfs_scan_min_time_ms:Min millisecs to scrub per txg (int) parm: zfs_free_min_time_ms:Min millisecs to free per txg (int) parm: zfs_resilver_min_time_ms:Min millisecs to resilver per txg (int) parm: zfs_no_scrub_io:Set to disable scrub I/O (bool) parm: zfs_no_scrub_prefetch:Set to disable scrub prefetching (bool) parm: zfs_txg_timeout:Max seconds worth of delta per txg (int) parm: zfs_no_write_throttle:Disable write throttling (int) parm: zfs_write_limit_shift:log2(fraction of memory) per txg (int) parm: zfs_txg_synctime_ms:Target milliseconds between tgx sync (int) parm: zfs_write_limit_min:Min tgx write limit (ulong) parm: zfs_write_limit_max:Max tgx write limit (ulong) parm: zfs_write_limit_inflated:Inflated tgx write limit (ulong) parm: zfs_write_limit_override:Override tgx write limit (ulong) parm: zfs_prefetch_disable:Disable all ZFS prefetching (int) parm: zfetch_max_streams:Max number of streams per zfetch (uint) parm: zfetch_min_sec_reap:Min time before stream reclaim (uint) parm: zfetch_block_cap:Max number of blocks to fetch at a time (uint) parm: zfetch_array_rd_sz:Number of bytes in a array_read (ulong) parm: zfs_pd_blks_max:Max number of blocks to prefetch (int) parm: zfs_dedup_prefetch:Enable prefetching dedup-ed blks (int) parm: zfs_arc_min:Min arc size (ulong) parm: zfs_arc_max:Max arc size (ulong) parm: zfs_arc_meta_limit:Meta limit for arc size (ulong) parm: zfs_arc_reduce_dnlc_percent:Meta reclaim percentage (int) parm: 
zfs_arc_grow_retry:Seconds before growing arc size (int) parm: zfs_arc_shrink_shift:log2(fraction of arc to reclaim) (int) parm: zfs_arc_p_min_shift:arc_c shift to calc min/max arc_p (int)
2011-05-03 22:09:28 +00:00
/* Description string attached to the zfs_vdev_scheduler module parameter. */
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");