Reinstate zvol_taskq to fix aio on zvol

Commit 37f9dac removed the zvol_taskq for processing zvol requests.
This was removed as part of switching to make_request_fn and was
motivated by a concern at the time over dispatch latency.

However, this also made all bio requests synchronous, and caused
serious performance issues as the bio submitter would wait for
every bio it submitted, effectively making the IO depth 1.

This patch reinstates zvol_taskq. To make sure overlapped I/Os
are ordered properly, we take the range lock in zvol_request and pass
it along with the bio to the I/O functions zvol_{write,discard,read}.

In order to facilitate benchmarks a zvol_request_sync module
option was added to switch between sync and async request handling.
For the moment, the default behavior is synchronous but this is
likely to change pending additional testing.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Chunwei Chen <david.chen@osnexus.com>
Closes #5824
This commit is contained in:
Chunwei Chen 2017-02-22 16:08:04 -08:00 committed by Brian Behlendorf
parent e815485fe9
commit 692e55b8fe
3 changed files with 201 additions and 84 deletions

View File

@ -498,8 +498,15 @@ blk_queue_discard_granularity(struct request_queue *q, unsigned int dg)
#define VDEV_HOLDER ((void *)0x2401de7) #define VDEV_HOLDER ((void *)0x2401de7)
#ifndef HAVE_GENERIC_IO_ACCT #ifndef HAVE_GENERIC_IO_ACCT
#define generic_start_io_acct(rw, slen, part) ((void)0) static inline void
#define generic_end_io_acct(rw, part, start_jiffies) ((void)0) generic_start_io_acct(int rw, unsigned long sectors, struct hd_struct *part)
{
}
static inline void
generic_end_io_acct(int rw, struct hd_struct *part, unsigned long start_time)
{
}
#endif #endif
#endif /* _ZFS_BLKDEV_H */ #endif /* _ZFS_BLKDEV_H */

View File

@ -2031,6 +2031,31 @@ table.
Default value: \fB131,072\fR. Default value: \fB131,072\fR.
.RE .RE
.sp
.ne 2
.na
\fBzvol_request_sync\fR (uint)
.ad
.RS 12n
When processing I/O requests for a zvol, submit them synchronously. This
effectively limits the queue depth to 1 for each I/O submitter. When set
to 0, requests are handled asynchronously by a thread pool. The number of
requests which can be handled concurrently is controlled by \fBzvol_threads\fR.
.sp
Default value: \fB1\fR.
.RE
.sp
.ne 2
.na
\fBzvol_threads\fR (uint)
.ad
.RS 12n
Max number of threads which can handle zvol I/O requests concurrently.
.sp
Default value: \fB32\fR.
.RE
.sp .sp
.ne 2 .ne 2
.na .na

View File

@ -56,9 +56,12 @@
unsigned int zvol_inhibit_dev = 0; unsigned int zvol_inhibit_dev = 0;
unsigned int zvol_major = ZVOL_MAJOR; unsigned int zvol_major = ZVOL_MAJOR;
unsigned int zvol_threads = 32;
unsigned int zvol_request_sync = 1;
unsigned int zvol_prefetch_bytes = (128 * 1024); unsigned int zvol_prefetch_bytes = (128 * 1024);
unsigned long zvol_max_discard_blocks = 16384; unsigned long zvol_max_discard_blocks = 16384;
static taskq_t *zvol_taskq;
static kmutex_t zvol_state_lock; static kmutex_t zvol_state_lock;
static list_t zvol_state_list; static list_t zvol_state_list;
@ -636,21 +639,48 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
} }
} }
static int typedef struct zv_request {
zvol_write(zvol_state_t *zv, uio_t *uio, boolean_t sync) zvol_state_t *zv;
struct bio *bio;
rl_t *rl;
} zv_request_t;
static void
uio_from_bio(uio_t *uio, struct bio *bio)
{ {
uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
uio->uio_skip = BIO_BI_SKIP(bio);
uio->uio_resid = BIO_BI_SIZE(bio);
uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
uio->uio_loffset = BIO_BI_SECTOR(bio) << 9;
uio->uio_limit = MAXOFFSET_T;
uio->uio_segflg = UIO_BVEC;
}
static void
zvol_write(void *arg)
{
zv_request_t *zvr = arg;
struct bio *bio = zvr->bio;
uio_t uio;
zvol_state_t *zv = zvr->zv;
uint64_t volsize = zv->zv_volsize; uint64_t volsize = zv->zv_volsize;
rl_t *rl; boolean_t sync;
int error = 0; int error = 0;
unsigned long start_jif;
uio_from_bio(&uio, bio);
ASSERT(zv && zv->zv_open_count > 0); ASSERT(zv && zv->zv_open_count > 0);
rl = zfs_range_lock(&zv->zv_range_lock, uio->uio_loffset, start_jif = jiffies;
uio->uio_resid, RL_WRITER); generic_start_io_acct(WRITE, bio_sectors(bio), &zv->zv_disk->part0);
while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
uint64_t off = uio->uio_loffset; while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
uint64_t off = uio.uio_loffset;
dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
if (bytes > volsize - off) /* don't write past the end */ if (bytes > volsize - off) /* don't write past the end */
@ -664,7 +694,7 @@ zvol_write(zvol_state_t *zv, uio_t *uio, boolean_t sync)
dmu_tx_abort(tx); dmu_tx_abort(tx);
break; break;
} }
error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx); error = dmu_write_uio_dbuf(zv->zv_dbuf, &uio, bytes, tx);
if (error == 0) if (error == 0)
zvol_log_write(zv, tx, off, bytes, sync); zvol_log_write(zv, tx, off, bytes, sync);
dmu_tx_commit(tx); dmu_tx_commit(tx);
@ -672,10 +702,14 @@ zvol_write(zvol_state_t *zv, uio_t *uio, boolean_t sync)
if (error) if (error)
break; break;
} }
zfs_range_unlock(rl); zfs_range_unlock(zvr->rl);
if (sync) if (sync)
zil_commit(zv->zv_zilog, ZVOL_OBJ); zil_commit(zv->zv_zilog, ZVOL_OBJ);
return (error);
rw_exit(&zv->zv_suspend_lock);
generic_end_io_acct(WRITE, &zv->zv_disk->part0, start_jif);
BIO_END_IO(bio, -error);
kmem_free(zvr, sizeof (zv_request_t));
} }
/* /*
@ -702,21 +736,28 @@ zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
zil_itx_assign(zilog, itx, tx); zil_itx_assign(zilog, itx, tx);
} }
static int static void
zvol_discard(struct bio *bio) zvol_discard(void *arg)
{ {
zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data; zv_request_t *zvr = arg;
struct bio *bio = zvr->bio;
zvol_state_t *zv = zvr->zv;
uint64_t start = BIO_BI_SECTOR(bio) << 9; uint64_t start = BIO_BI_SECTOR(bio) << 9;
uint64_t size = BIO_BI_SIZE(bio); uint64_t size = BIO_BI_SIZE(bio);
uint64_t end = start + size; uint64_t end = start + size;
int error; int error = 0;
rl_t *rl;
dmu_tx_t *tx; dmu_tx_t *tx;
unsigned long start_jif;
ASSERT(zv && zv->zv_open_count > 0); ASSERT(zv && zv->zv_open_count > 0);
if (end > zv->zv_volsize) start_jif = jiffies;
return (SET_ERROR(EIO)); generic_start_io_acct(WRITE, bio_sectors(bio), &zv->zv_disk->part0);
if (end > zv->zv_volsize) {
error = SET_ERROR(EIO);
goto out;
}
/* /*
* Align the request to volume block boundaries when a secure erase is * Align the request to volume block boundaries when a secure erase is
@ -731,9 +772,8 @@ zvol_discard(struct bio *bio)
} }
if (start >= end) if (start >= end)
return (0); goto out;
rl = zfs_range_lock(&zv->zv_range_lock, start, size, RL_WRITER);
tx = dmu_tx_create(zv->zv_objset); tx = dmu_tx_create(zv->zv_objset);
dmu_tx_mark_netfree(tx); dmu_tx_mark_netfree(tx);
error = dmu_tx_assign(tx, TXG_WAIT); error = dmu_tx_assign(tx, TXG_WAIT);
@ -746,30 +786,40 @@ zvol_discard(struct bio *bio)
ZVOL_OBJ, start, size); ZVOL_OBJ, start, size);
} }
zfs_range_unlock(rl); out:
zfs_range_unlock(zvr->rl);
return (error); rw_exit(&zv->zv_suspend_lock);
generic_end_io_acct(WRITE, &zv->zv_disk->part0, start_jif);
BIO_END_IO(bio, -error);
kmem_free(zvr, sizeof (zv_request_t));
} }
static int static void
zvol_read(zvol_state_t *zv, uio_t *uio) zvol_read(void *arg)
{ {
zv_request_t *zvr = arg;
struct bio *bio = zvr->bio;
uio_t uio;
zvol_state_t *zv = zvr->zv;
uint64_t volsize = zv->zv_volsize; uint64_t volsize = zv->zv_volsize;
rl_t *rl;
int error = 0; int error = 0;
unsigned long start_jif;
uio_from_bio(&uio, bio);
ASSERT(zv && zv->zv_open_count > 0); ASSERT(zv && zv->zv_open_count > 0);
rl = zfs_range_lock(&zv->zv_range_lock, uio->uio_loffset, start_jif = jiffies;
uio->uio_resid, RL_READER); generic_start_io_acct(READ, bio_sectors(bio), &zv->zv_disk->part0);
while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
/* don't read past the end */ /* don't read past the end */
if (bytes > volsize - uio->uio_loffset) if (bytes > volsize - uio.uio_loffset)
bytes = volsize - uio->uio_loffset; bytes = volsize - uio.uio_loffset;
error = dmu_read_uio_dbuf(zv->zv_dbuf, uio, bytes); error = dmu_read_uio_dbuf(zv->zv_dbuf, &uio, bytes);
if (error) { if (error) {
/* convert checksum errors into IO errors */ /* convert checksum errors into IO errors */
if (error == ECKSUM) if (error == ECKSUM)
@ -777,76 +827,93 @@ zvol_read(zvol_state_t *zv, uio_t *uio)
break; break;
} }
} }
zfs_range_unlock(rl); zfs_range_unlock(zvr->rl);
return (error);
rw_exit(&zv->zv_suspend_lock);
generic_end_io_acct(READ, &zv->zv_disk->part0, start_jif);
BIO_END_IO(bio, -error);
kmem_free(zvr, sizeof (zv_request_t));
} }
static MAKE_REQUEST_FN_RET static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio) zvol_request(struct request_queue *q, struct bio *bio)
{ {
uio_t uio;
zvol_state_t *zv = q->queuedata; zvol_state_t *zv = q->queuedata;
fstrans_cookie_t cookie = spl_fstrans_mark(); fstrans_cookie_t cookie = spl_fstrans_mark();
uint64_t offset = BIO_BI_SECTOR(bio) << 9;
uint64_t size = BIO_BI_SIZE(bio);
int rw = bio_data_dir(bio); int rw = bio_data_dir(bio);
#ifdef HAVE_GENERIC_IO_ACCT zv_request_t *zvr;
unsigned long start = jiffies;
#endif
int error = 0;
rw_enter(&zv->zv_suspend_lock, RW_READER); if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
uio.uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
uio.uio_skip = BIO_BI_SKIP(bio);
uio.uio_resid = BIO_BI_SIZE(bio);
uio.uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
uio.uio_loffset = BIO_BI_SECTOR(bio) << 9;
uio.uio_limit = MAXOFFSET_T;
uio.uio_segflg = UIO_BVEC;
if (bio_has_data(bio) && uio.uio_loffset + uio.uio_resid >
zv->zv_volsize) {
printk(KERN_INFO printk(KERN_INFO
"%s: bad access: offset=%llu, size=%lu\n", "%s: bad access: offset=%llu, size=%lu\n",
zv->zv_disk->disk_name, zv->zv_disk->disk_name,
(long long unsigned)uio.uio_loffset, (long long unsigned)offset,
(long unsigned)uio.uio_resid); (long unsigned)size);
error = SET_ERROR(EIO);
goto out1;
}
generic_start_io_acct(rw, bio_sectors(bio), &zv->zv_disk->part0); BIO_END_IO(bio, -SET_ERROR(EIO));
goto out;
}
if (rw == WRITE) { if (rw == WRITE) {
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
error = SET_ERROR(EROFS); BIO_END_IO(bio, -SET_ERROR(EROFS));
goto out2; goto out;
}
if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
error = zvol_discard(bio);
goto out2;
} }
/* /*
* Some requests are just for flush and nothing else. * To be released in the I/O function. See the comment on
* zfs_range_lock below.
*/ */
if (uio.uio_resid == 0) { rw_enter(&zv->zv_suspend_lock, RW_READER);
if (bio_is_flush(bio))
zil_commit(zv->zv_zilog, ZVOL_OBJ); /* bio marked as FLUSH need to flush before write */
goto out2; if (bio_is_flush(bio))
zil_commit(zv->zv_zilog, ZVOL_OBJ);
/* Some requests are just for flush and nothing else. */
if (size == 0) {
rw_exit(&zv->zv_suspend_lock);
BIO_END_IO(bio, 0);
goto out;
} }
error = zvol_write(zv, &uio, zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
bio_is_flush(bio) || bio_is_fua(bio) || zvr->zv = zv;
zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); zvr->bio = bio;
} else
error = zvol_read(zv, &uio);
out2: /*
generic_end_io_acct(rw, &zv->zv_disk->part0, start); * To be released in the I/O function. Since the I/O functions
out1: * are asynchronous, we take it here synchronously to make
BIO_END_IO(bio, -error); * sure overlapped I/Os are properly ordered.
rw_exit(&zv->zv_suspend_lock); */
zvr->rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
RL_WRITER);
if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
if (zvol_request_sync || taskq_dispatch(zvol_taskq,
zvol_discard, zvr, TQ_SLEEP) == TASKQID_INVALID)
zvol_discard(zvr);
} else {
if (zvol_request_sync || taskq_dispatch(zvol_taskq,
zvol_write, zvr, TQ_SLEEP) == TASKQID_INVALID)
zvol_write(zvr);
}
} else {
zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
zvr->zv = zv;
zvr->bio = bio;
rw_enter(&zv->zv_suspend_lock, RW_READER);
zvr->rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
RL_READER);
if (zvol_request_sync || taskq_dispatch(zvol_taskq,
zvol_read, zvr, TQ_SLEEP) == TASKQID_INVALID)
zvol_read(zvr);
}
out:
spl_fstrans_unmark(cookie); spl_fstrans_unmark(cookie);
#ifdef HAVE_MAKE_REQUEST_FN_RET_INT #ifdef HAVE_MAKE_REQUEST_FN_RET_INT
return (0); return (0);
@ -2166,6 +2233,7 @@ zvol_rename_minors(spa_t *spa, const char *name1, const char *name2,
int int
zvol_init(void) zvol_init(void)
{ {
int threads = MIN(MAX(zvol_threads, 1), 1024);
int i, error; int i, error;
list_create(&zvol_state_list, sizeof (zvol_state_t), list_create(&zvol_state_list, sizeof (zvol_state_t),
@ -2173,11 +2241,19 @@ zvol_init(void)
mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
ida_init(&zvol_ida); ida_init(&zvol_ida);
zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
if (zvol_taskq == NULL) {
printk(KERN_INFO "ZFS: taskq_create() failed\n");
error = -ENOMEM;
goto out;
}
zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head), zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head),
KM_SLEEP); KM_SLEEP);
if (!zvol_htable) { if (!zvol_htable) {
error = ENOMEM; error = -ENOMEM;
goto out; goto out_taskq;
} }
for (i = 0; i < ZVOL_HT_SIZE; i++) for (i = 0; i < ZVOL_HT_SIZE; i++)
INIT_HLIST_HEAD(&zvol_htable[i]); INIT_HLIST_HEAD(&zvol_htable[i]);
@ -2195,6 +2271,8 @@ zvol_init(void)
out_free: out_free:
kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
out_taskq:
taskq_destroy(zvol_taskq);
out: out:
mutex_destroy(&zvol_state_lock); mutex_destroy(&zvol_state_lock);
list_destroy(&zvol_state_list); list_destroy(&zvol_state_list);
@ -2211,6 +2289,7 @@ zvol_fini(void)
unregister_blkdev(zvol_major, ZVOL_DRIVER); unregister_blkdev(zvol_major, ZVOL_DRIVER);
kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
taskq_destroy(zvol_taskq);
list_destroy(&zvol_state_list); list_destroy(&zvol_state_list);
mutex_destroy(&zvol_state_lock); mutex_destroy(&zvol_state_lock);
@ -2224,6 +2303,12 @@ MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
module_param(zvol_major, uint, 0444); module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
module_param(zvol_threads, uint, 0444);
MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");
module_param(zvol_request_sync, uint, 0644);
MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
module_param(zvol_max_discard_blocks, ulong, 0444); module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");