diff --git a/include/sys/vdev_disk.h b/include/sys/vdev_disk.h
index 021e66d1a9a4..03e7048ac665 100644
--- a/include/sys/vdev_disk.h
+++ b/include/sys/vdev_disk.h
@@ -81,6 +81,9 @@ extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
 # define vdev_bdev_block_size(bdev)	bdev_hardsect_size(bdev)
 #endif
 
+/* Default Linux IO Scheduler */
+#define VDEV_SCHEDULER "noop"
+
 #endif /* _KERNEL */
 
 #ifdef __cplusplus
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 3e59bd22615e..aba3c4ab57ef 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -33,6 +33,8 @@
 #include
 #include
 
+char *zfs_vdev_scheduler = VDEV_SCHEDULER;
+
 /*
  * Virtual device vector for disks.
  */
@@ -102,6 +104,55 @@ vdev_disk_error(zio_t *zio)
 #endif
 }
 
+/*
+ * Use the Linux 'noop' elevator for zfs managed block devices.  This
+ * strikes the ideal balance by allowing the zfs elevator to do all
+ * request ordering and prioritization, while allowing the Linux
+ * elevator to do the maximum front/back merging allowed by the
+ * physical device.  This yields the largest possible requests for
+ * the device with the lowest total overhead.
+ *
+ * Unfortunately we cannot directly call the elevator_switch() function
+ * because it is not exported from the block layer.  This means we have
+ * to use the sysfs interface and a user space upcall.  Pools will be
+ * automatically imported on module load so we must do this at device
+ * open time from the kernel.
+ *
+ * Passing "none" as the elevator leaves the device's scheduler untouched
+ * and returns 0.  Otherwise returns ENAMETOOLONG when the shell command
+ * does not fit in the local buffer, or the call_usermodehelper() result.
+ */
+static int
+vdev_elevator_switch(vdev_t *v, char *elevator, char *device)
+{
+	char sh_path[] = "/bin/sh";
+	char sh_cmd[128];
+	/* argv[] handed to call_usermodehelper() must be NULL terminated */
+	char *argv[] = { sh_path, "-c", sh_cmd, NULL };
+	char *envp[] = { NULL };
+	int error, len;
+
+	if (strcmp(elevator, "none") == 0)
+		return (0);
+
+	/* Bounded write: the scheduler name comes from a module parameter */
+	len = snprintf(sh_cmd, sizeof (sh_cmd),
+	    "%s \"%s\" >/sys/block/%s/queue/scheduler",
+	    "/bin/echo", elevator, device);
+	if (len < 0 || (size_t)len >= sizeof (sh_cmd)) {
+		printk("ZFS: Scheduler command too long for %s (%s)\n",
+		    v->vdev_path, device);
+		return (ENAMETOOLONG);
+	}
+
+	error = call_usermodehelper(sh_path, argv, envp, 1);
+	if (error)
+		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
+		    elevator, v->vdev_path, device, error);
+
+	return (error);
+}
+
 static int
 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 {
@@ -167,6 +218,10 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 	/* Based on the minimum sector size set the block size */
 	*ashift = highbit(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
 
+	/* Try to set the io scheduler elevator algorithm */
+	(void) vdev_elevator_switch(v, zfs_vdev_scheduler,
+	    bdev->bd_disk->disk_name);
+
 	return 0;
 }
 
@@ -702,3 +757,6 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
 
 	return 0;
 }
+
+module_param(zfs_vdev_scheduler, charp, 0644);
+MODULE_PARM_DESC(zfs_vdev_scheduler, "IO Scheduler (noop)");