From 5592404784d3125cbeb6df002674867c009c8b48 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 24 Sep 2015 16:32:25 -0700 Subject: [PATCH] Fix synchronous behavior in __vdev_disk_physio() Commit b39c22b set the READ_SYNC and WRITE_SYNC flags for a bio based on the ZIO_PRIORITY_* flag passed in. This had the unnoticed side-effect of making the vdev_disk_io_start() synchronous for certain I/Os. This in turn resulted in vdev_disk_io_start() being able to re-dispatch zio's which would result in a RCU stalls when a disk was removed from the system. Additionally, this could negatively impact performance and explains the performance regressions reported in both #3829 and #3780. This patch resolves the issue by making the blocking behavior dependent on a 'wait' flag being passed rather than overloading the passed bio flags. Finally, the WRITE_SYNC and READ_SYNC behavior is restricted to non-rotational devices where there is no benefit to queuing to aggregate the I/O. Signed-off-by: Brian Behlendorf Issue #3652 Issue #3780 Issue #3785 Issue #3817 Issue #3821 Issue #3829 Issue #3832 Issue #3870 --- config/kernel-bio-rw-syncio.m4 | 50 ---------------------------------- config/kernel.m4 | 3 -- module/zfs/vdev_disk.c | 36 ++++++------------------ 3 files changed, 8 insertions(+), 81 deletions(-) delete mode 100644 config/kernel-bio-rw-syncio.m4 diff --git a/config/kernel-bio-rw-syncio.m4 b/config/kernel-bio-rw-syncio.m4 deleted file mode 100644 index 4bff80a8f863..000000000000 --- a/config/kernel-bio-rw-syncio.m4 +++ /dev/null @@ -1,50 +0,0 @@ -dnl # -dnl # Preferred interface for flagging a synchronous bio: -dnl # 2.6.12-2.6.29: BIO_RW_SYNC -dnl # 2.6.30-2.6.35: BIO_RW_SYNCIO -dnl # 2.6.36-2.6.xx: REQ_SYNC -dnl # -AC_DEFUN([ZFS_AC_KERNEL_BIO_RW_SYNC], [ - AC_MSG_CHECKING([whether BIO_RW_SYNC is defined]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - int flags __attribute__ ((unused)); - flags = BIO_RW_SYNC; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BIO_RW_SYNC, 1, [BIO_RW_SYNC is defined]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_BIO_RW_SYNCIO], [ - AC_MSG_CHECKING([whether BIO_RW_SYNCIO is defined]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - int flags __attribute__ ((unused)); - flags = BIO_RW_SYNCIO; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BIO_RW_SYNCIO, 1, [BIO_RW_SYNCIO is defined]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_REQ_SYNC], [ - AC_MSG_CHECKING([whether REQ_SYNC is defined]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - int flags __attribute__ ((unused)); - flags = REQ_SYNC; - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_REQ_SYNC, 1, [REQ_SYNC is defined]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel.m4 b/config/kernel.m4 index e088c4da34ef..0a65f39ef21b 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -25,9 +25,6 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ ZFS_AC_KERNEL_BIO_END_IO_T_ARGS ZFS_AC_KERNEL_BIO_RW_BARRIER ZFS_AC_KERNEL_BIO_RW_DISCARD - ZFS_AC_KERNEL_BIO_RW_SYNC - ZFS_AC_KERNEL_BIO_RW_SYNCIO - ZFS_AC_KERNEL_REQ_SYNC ZFS_AC_KERNEL_BLK_QUEUE_FLUSH ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index eac0f296ecf1..5fb218f73a92 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -369,27 +369,6 @@ vdev_disk_dio_free(dio_request_t *dr) sizeof (struct bio *) * dr->dr_bio_count); } -static int -vdev_disk_dio_is_sync(dio_request_t *dr) -{ -#ifdef HAVE_BIO_RW_SYNC - /* BIO_RW_SYNC preferred interface from 2.6.12-2.6.29 */ - return (dr->dr_rw & (1 << BIO_RW_SYNC)); -#else -#ifdef HAVE_BIO_RW_SYNCIO - /* BIO_RW_SYNCIO preferred interface from 2.6.30-2.6.35 */ - return (dr->dr_rw & (1 << BIO_RW_SYNCIO)); -#else -#ifdef HAVE_REQ_SYNC - /* REQ_SYNC preferred interface from 2.6.36-2.6.xx */ - return (dr->dr_rw & REQ_SYNC); -#else -#error "Unable to determine bio sync flag" -#endif /* HAVE_REQ_SYNC */ -#endif /* HAVE_BIO_RW_SYNC */ -#endif /* HAVE_BIO_RW_SYNCIO */ -} - static void vdev_disk_dio_get(dio_request_t *dr) { @@ -444,7 +423,7 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) rc = vdev_disk_dio_put(dr); /* Wake up synchronous waiter this is the last outstanding bio */ - if ((rc == 1) && vdev_disk_dio_is_sync(dr)) + if (rc == 1) complete(&dr->dr_comp); } @@ -512,7 +491,7 @@ vdev_submit_bio(int rw, struct bio *bio) static int __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, - size_t kbuf_size, uint64_t kbuf_offset, int flags) + size_t kbuf_size, uint64_t kbuf_offset, int flags, int wait) { dio_request_t *dr; caddr_t bio_ptr; @@ -603,7 +582,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, * only synchronous consumer is vdev_disk_read_rootlabel() all other * IO originating from vdev_disk_io_start() is asynchronous. */ - if (vdev_disk_dio_is_sync(dr)) { + if (wait) { wait_for_completion(&dr->dr_comp); error = dr->dr_error; ASSERT3S(atomic_read(&dr->dr_ref), ==, 1); @@ -619,7 +598,7 @@ vdev_disk_physio(struct block_device *bdev, caddr_t kbuf, size_t size, uint64_t offset, int flags) { bio_set_flags_failfast(bdev, &flags); - return (__vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags)); + return (__vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags, 1)); } BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, rc) @@ -671,6 +650,7 @@ vdev_disk_io_start(zio_t *zio) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; + zio_priority_t pri = zio->io_priority; int flags, error; switch (zio->io_type) { @@ -710,14 +690,14 @@ vdev_disk_io_start(zio_t *zio) zio_execute(zio); return; case ZIO_TYPE_WRITE: - if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) + if ((pri == ZIO_PRIORITY_SYNC_WRITE) && (v->vdev_nonrot)) flags = WRITE_SYNC; else flags = WRITE; break; case ZIO_TYPE_READ: - if (zio->io_priority == ZIO_PRIORITY_SYNC_READ) + if ((pri == ZIO_PRIORITY_SYNC_READ) && (v->vdev_nonrot)) flags = READ_SYNC; else flags = READ; @@ -730,7 +710,7 @@ vdev_disk_io_start(zio_t *zio) } error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data, - zio->io_size, zio->io_offset, flags); + zio->io_size, zio->io_offset, flags, 0); if (error) { zio->io_error = error; zio_interrupt(zio);