diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 1df61a587d6c..8862d9bc9854 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -49,6 +49,7 @@ extern "C" { * Forward declarations that lots of things need. */ typedef struct vdev_queue vdev_queue_t; +typedef struct vdev_io vdev_io_t; typedef struct vdev_cache vdev_cache_t; typedef struct vdev_cache_entry vdev_cache_entry_t; @@ -102,9 +103,15 @@ struct vdev_queue { avl_tree_t vq_read_tree; avl_tree_t vq_write_tree; avl_tree_t vq_pending_tree; + list_t vq_io_list; kmutex_t vq_lock; }; +struct vdev_io { + char vi_buffer[SPA_MAXBLOCKSIZE]; /* Must be first */ + list_node_t vi_node; +}; + /* * Virtual device descriptor */ diff --git a/include/sys/zio.h b/include/sys/zio.h index c0da4e2d78cd..4f20cab6591e 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -494,6 +494,8 @@ extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); +extern void *zio_vdev_alloc(void); +extern void zio_vdev_free(void *buf); extern void zio_resubmit_stage_async(void *); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index aacc55c49f4e..7ba638952e10 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -106,6 +106,7 @@ void vdev_queue_init(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; + int i; mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); @@ -120,18 +121,36 @@ vdev_queue_init(vdev_t *vd) avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); + + /* + * A list of buffers which can be used for aggregate I/O, this + * avoids the need to allocate them on demand when memory is low. + */ + list_create(&vq->vq_io_list, sizeof (vdev_io_t), + offsetof(vdev_io_t, vi_node)); + + for (i = 0; i < zfs_vdev_max_pending; i++) + list_insert_tail(&vq->vq_io_list, zio_vdev_alloc()); } void vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; + vdev_io_t *vi; avl_destroy(&vq->vq_deadline_tree); avl_destroy(&vq->vq_read_tree); avl_destroy(&vq->vq_write_tree); avl_destroy(&vq->vq_pending_tree); + while ((vi = list_head(&vq->vq_io_list)) != NULL) { + list_remove(&vq->vq_io_list, vi); + zio_vdev_free(vi); + } + + list_destroy(&vq->vq_io_list); + mutex_destroy(&vq->vq_lock); } @@ -152,6 +171,8 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) static void vdev_queue_agg_io_done(zio_t *aio) { + vdev_queue_t *vq = &aio->io_vd->vdev_queue; + vdev_io_t *vi = aio->io_data; zio_t *pio; while ((pio = zio_walk_parents(aio)) != NULL) @@ -159,7 +180,9 @@ vdev_queue_agg_io_done(zio_t *aio) bcopy((char *)aio->io_data + (pio->io_offset - aio->io_offset), pio->io_data, pio->io_size); - zio_buf_free(aio->io_data, aio->io_size); + mutex_enter(&vq->vq_lock); + list_insert_tail(&vq->vq_io_list, vi); + mutex_exit(&vq->vq_lock); } /* @@ -176,6 +199,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) { zio_t *fio, *lio, *aio, *dio, *nio, *mio; avl_tree_t *t; + vdev_io_t *vi; int flags; uint64_t maxspan = zfs_vdev_aggregation_limit; uint64_t maxgap; @@ -194,6 +218,12 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0; + vi = list_head(&vq->vq_io_list); + if (vi == NULL) { + vi = zio_vdev_alloc(); + list_insert_head(&vq->vq_io_list, vi); + } + if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { /* * We can aggregate I/Os that are sufficiently adjacent and of @@ -283,9 +313,10 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) if (fio != lio) { uint64_t size = IO_SPAN(fio, lio); ASSERT(size <= zfs_vdev_aggregation_limit); + ASSERT(vi != NULL); aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, - zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG, + vi, size, fio->io_type, ZIO_PRIORITY_AGG, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); @@ -313,6 +344,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) } while (dio != lio); avl_add(&vq->vq_pending_tree, aio); + list_remove(&vq->vq_io_list, vi); return (aio); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 206ed9a936c3..fe2bdc867fea 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -73,6 +73,7 @@ char *zio_type_name[ZIO_TYPES] = { */ kmem_cache_t *zio_cache; kmem_cache_t *zio_link_cache; +kmem_cache_t *zio_vdev_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; int zio_bulk_flags = 0; @@ -141,6 +142,8 @@ zio_init(void) zio_cons, zio_dest, NULL, NULL, NULL, KMC_KMEM); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, KMC_KMEM); + zio_vdev_cache = kmem_cache_create("zio_vdev_cache", sizeof(vdev_io_t), + PAGESIZE, NULL, NULL, NULL, NULL, NULL, KMC_VMEM); /* * For small buffers, we want a cache for each multiple of @@ -230,6 +233,7 @@ zio_fini(void) zio_data_buf_cache[c] = NULL; } + kmem_cache_destroy(zio_vdev_cache); kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); @@ -294,6 +298,24 @@ zio_data_buf_free(void *buf, size_t size) kmem_cache_free(zio_data_buf_cache[c], buf); } +/* + * Dedicated I/O buffers to ensure that memory fragmentation never prevents + * or significantly delays the issuing of a zio. These buffers are used + * to aggregate I/O and could be used for raidz stripes. + */ +void * +zio_vdev_alloc(void) +{ + return (kmem_cache_alloc(zio_vdev_cache, KM_PUSHPAGE)); +} + +void +zio_vdev_free(void *buf) +{ + kmem_cache_free(zio_vdev_cache, buf); + +} + /* * ========================================================================== * Push and pop I/O transform buffers