Notable upstream pull request merges:
  #14925 Another set of vdev queue optimizations
  #14964 Use big transactions for small recordsize writes
  #14999 ZIL: Fix another use-after-free

Obtained from:	OpenZFS
OpenZFS commit:	a9d6b0690b
commit 7b5e687355
Author:	Martin Matuska
Date:	2023-06-29 01:39:19 +02:00
16 changed files with 306 additions and 239 deletions

include/os/linux/spl/sys/taskq.h

@@ -104,6 +104,7 @@ typedef struct taskq {
 	/* list node for the cpu hotplug callback */
 	struct hlist_node	tq_hp_cb_node;
 	boolean_t		tq_hp_support;
+	unsigned long		lastshouldstop;	/* when to purge dynamic */
 } taskq_t;
 
 typedef struct taskq_ent {

include/sys/vdev.h

@@ -164,8 +164,9 @@ extern zio_t *vdev_queue_io(zio_t *zio);
 extern void vdev_queue_io_done(zio_t *zio);
 extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
 
-extern int vdev_queue_length(vdev_t *vd);
+extern uint32_t vdev_queue_length(vdev_t *vd);
 extern uint64_t vdev_queue_last_offset(vdev_t *vd);
+extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p);
 
 extern void vdev_config_dirty(vdev_t *vd);
 extern void vdev_config_clean(vdev_t *vd);

include/sys/vdev_impl.h

@@ -130,27 +130,24 @@ typedef const struct vdev_ops {
 /*
  * Virtual device properties
  */
-typedef struct vdev_queue_class {
-	uint32_t	vqc_active;
-
-	/*
-	 * Sorted by offset or timestamp, depending on if the queue is
-	 * LBA-ordered vs FIFO.
-	 */
-	avl_tree_t	vqc_queued_tree;
+typedef union vdev_queue_class {
+	list_t		vqc_list;
+	avl_tree_t	vqc_tree;
 } vdev_queue_class_t;
 
 struct vdev_queue {
 	vdev_t		*vq_vdev;
 	vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
-	avl_tree_t	vq_active_tree;
 	avl_tree_t	vq_read_offset_tree;
 	avl_tree_t	vq_write_offset_tree;
-	avl_tree_t	vq_trim_offset_tree;
 	uint64_t	vq_last_offset;
 	zio_priority_t	vq_last_prio;	/* Last sent I/O priority. */
+	uint32_t	vq_cqueued;	/* Classes with queued I/Os. */
+	uint32_t	vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE];
+	uint32_t	vq_active;	/* Number of active I/Os. */
 	uint32_t	vq_ia_active;	/* Active interactive I/Os. */
 	uint32_t	vq_nia_credit;	/* Non-interactive I/Os credit. */
+	list_t		vq_active_list;	/* List of active I/Os. */
 	hrtime_t	vq_io_complete_ts; /* time last i/o completed */
 	hrtime_t	vq_io_delta_ts;
 	zio_t		vq_io_search; /* used as local for stack reduction */
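
A note on the union: this works because each priority class is dispatched in exactly one way. The FIFO classes (sync read, sync write, TRIM) keep a list_t and the LBA-sorted classes keep an avl_tree_t, so the two containers never coexist for one class and can share storage. A standalone sketch of the space saving, with made-up stand-in members rather than the real list_t/avl_tree_t:

    #include <stdio.h>

    struct both { long list[4]; long tree[5]; };    /* old shape: room for both */
    union either { long list[4]; long tree[5]; };   /* new shape: one or the other */

    int main(void)
    {
        /* The union is as big as its largest arm, not the sum. */
        printf("struct: %zu bytes, union: %zu bytes\n",
            sizeof (struct both), sizeof (union either));
        return (0);
    }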

include/sys/zio.h

@@ -436,6 +436,12 @@ typedef struct zio_link {
 	list_node_t	zl_child_node;
 } zio_link_t;
 
+enum zio_qstate {
+	ZIO_QS_NONE = 0,
+	ZIO_QS_QUEUED,
+	ZIO_QS_ACTIVE,
+};
+
 struct zio {
 	/* Core information about this I/O */
 	zbookmark_phys_t	io_bookmark;
@@ -479,6 +485,12 @@ struct zio {
 	const zio_vsd_ops_t *io_vsd_ops;
 	metaslab_class_t *io_metaslab_class;	/* dva throttle class */
 
+	enum zio_qstate	io_queue_state;	/* vdev queue state */
+	union {
+		list_node_t	l;
+		avl_node_t	a;
+	} io_queue_node ____cacheline_aligned;	/* allocator and vdev queues */
+	avl_node_t	io_offset_node;	/* vdev offset queues */
 	uint64_t	io_offset;
 	hrtime_t	io_timestamp;	/* submitted at */
 	hrtime_t	io_queued_timestamp;
@@ -486,9 +498,6 @@ struct zio {
 	hrtime_t	io_delta;	/* vdev queue service delta */
 	hrtime_t	io_delay;	/* Device access time (disk or */
 					/* file). */
-	avl_node_t	io_queue_node;
-	avl_node_t	io_offset_node;
-	avl_node_t	io_alloc_node;
 	zio_alloc_list_t	io_alloc_list;
 
 	/* Internal pipeline state */

man/man4/spl.4

@@ -193,4 +193,19 @@ The proc file will walk the lists with lock held,
 reading it could cause a lock-up if the list grow too large
 without limiting the output.
 "(truncated)" will be shown if the list is larger than the limit.
+.
+.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 10000 Pq uint
+(Linux-only)
+How long a taskq has to have had no work before we tear it down.
+Previously, we would tear down a dynamic taskq worker as soon
+as we noticed it had no work, but it was observed that this led
+to a lot of churn in tearing down things we then immediately
+spawned anew.
+In practice, it seems any nonzero value will remove the vast
+majority of this churn, while the nontrivially larger value
+was chosen to help filter out the little remaining churn on
+a mostly idle system.
+Setting this value to
+.Sy 0
+will revert to the previous behavior.
 .El

man/man4/zfs.4

@@ -2016,12 +2016,6 @@ Historical statistics for this many latest TXGs will be available in
 Flush dirty data to disk at least every this many seconds (maximum TXG
 duration).
 .
-.It Sy zfs_vdev_aggregate_trim Ns = Ns Sy 0 Ns | Ns 1 Pq uint
-Allow TRIM I/O operations to be aggregated.
-This is normally not helpful because the extents to be trimmed
-will have been already been aggregated by the metaslab.
-This option is provided for debugging and performance analysis.
-.
 .It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint
 Max vdev I/O aggregation size.
 .

man/man8/zpool-create.8

@@ -87,13 +87,13 @@ currently in use by another subsystem.
 However this check is not robust enough
 to detect simultaneous attempts to use a new device in different pools, even if
 .Sy multihost Ns = Sy enabled .
-The administrator must ensure, that simultaneous invocations of any combination
+The administrator must ensure that simultaneous invocations of any combination
 of
 .Nm zpool Cm replace ,
 .Nm zpool Cm create ,
 .Nm zpool Cm add ,
 or
-.Nm zpool Cm labelclear ,
+.Nm zpool Cm labelclear
 do not refer to the same device.
 Using the same device in two pools will result in pool corruption.
 .Pp

module/os/linux/spl/spl-taskq.c

@@ -36,6 +36,12 @@ static int spl_taskq_thread_bind = 0;
 module_param(spl_taskq_thread_bind, int, 0644);
 MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
 
+static uint_t spl_taskq_thread_timeout_ms = 10000;
+/* BEGIN CSTYLED */
+module_param(spl_taskq_thread_timeout_ms, uint, 0644);
+/* END CSTYLED */
+MODULE_PARM_DESC(spl_taskq_thread_timeout_ms,
+	"Time to require a dynamic thread be idle before it gets cleaned up");
+
 static int spl_taskq_thread_dynamic = 1;
 module_param(spl_taskq_thread_dynamic, int, 0444);
@@ -848,12 +854,37 @@ taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt)
 	    tqt_thread_list) == tqt)
 		return (0);
 
-	return
+	int no_work =
 	    ((tq->tq_nspawn == 0) &&	/* No threads are being spawned */
 	    (tq->tq_nactive == 0) &&	/* No threads are handling tasks */
 	    (tq->tq_nthreads > 1) &&	/* More than 1 thread is running */
 	    (!taskq_next_ent(tq)) &&	/* There are no pending tasks */
 	    (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */
+
+	/*
+	 * If we would have said stop before, let's instead wait a bit, maybe
+	 * we'll see more work come our way soon...
+	 */
+	if (no_work) {
+		/* if it's 0, we want the old behavior. */
+		/* if the taskq is being torn down, we also want to go away. */
+		if (spl_taskq_thread_timeout_ms == 0 ||
+		    !(tq->tq_flags & TASKQ_ACTIVE))
+			return (1);
+		unsigned long lasttime = tq->lastshouldstop;
+		if (lasttime > 0) {
+			if (time_after(jiffies, lasttime +
+			    msecs_to_jiffies(spl_taskq_thread_timeout_ms)))
+				return (1);
+			else
+				return (0);
+		} else {
+			tq->lastshouldstop = jiffies;
+		}
+	} else {
+		tq->lastshouldstop = 0;
+	}
+	return (0);
 }
 
 static int
@@ -1091,6 +1122,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri,
 	tq->tq_flags = (flags | TASKQ_ACTIVE);
 	tq->tq_next_id = TASKQID_INITIAL;
 	tq->tq_lowest_id = TASKQID_INITIAL;
+	tq->lastshouldstop = 0;
 	INIT_LIST_HEAD(&tq->tq_free_list);
 	INIT_LIST_HEAD(&tq->tq_pend_list);
 	INIT_LIST_HEAD(&tq->tq_prio_list);
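
The effect of the new logic: a dynamic worker that previously exited the moment it ran out of work now lingers until the queue has been idle for a full spl_taskq_thread_timeout_ms window, and seeing work again clears the stamp. A minimal userspace sketch of the same pattern, using a hypothetical millisecond clock in place of jiffies/time_after() and omitting the TASKQ_ACTIVE teardown check:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t timeout_ms = 10000;  /* spl_taskq_thread_timeout_ms */
    static uint64_t last_idle_ms = 0;    /* plays the role of tq->lastshouldstop */

    /* Same shape as the tail of the new taskq_thread_should_stop(): only
     * report "stop" once the queue has been idle for a whole window. */
    static bool should_stop(bool no_work, uint64_t now_ms)
    {
        if (!no_work) {
            last_idle_ms = 0;            /* work arrived: reset the clock */
            return (false);
        }
        if (timeout_ms == 0)             /* 0 restores the old behavior */
            return (true);
        if (last_idle_ms == 0) {         /* first idle observation: stamp it */
            last_idle_ms = now_ms;
            return (false);
        }
        return (now_ms > last_idle_ms + timeout_ms);
    }

    int main(void)
    {
        int a = should_stop(true, 1000);   /* idle: window starts */
        int b = should_stop(true, 6000);   /* still inside window: keep thread */
        int c = should_stop(true, 11001);  /* window expired: tear down */
        printf("%d %d %d\n", a, b, c);     /* prints: 0 0 1 */
        return (0);
    }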

module/zfs/spa_misc.c

@@ -730,7 +730,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 		mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
 		    NULL);
 		avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
-		    sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+		    sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
 	}
 	avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
 	    sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
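
The offsetof(zio_t, io_queue_node.a) here takes the offset of a member nested inside the embedded union, which is ordinary C; both union arms start at the same offset, so the embedded AVL node occupies the same bytes the list node would. A tiny generic illustration (not the ZFS types):

    #include <stddef.h>
    #include <stdio.h>

    struct node { struct node *next; };

    struct obj {
        int flags;
        union {
            struct node l;   /* list linkage */
            struct node a;   /* avl linkage, shares storage with l */
        } queue_node;
    };

    int main(void)
    {
        /* Both arms report the same offset inside struct obj. */
        printf("%zu %zu\n", offsetof(struct obj, queue_node.l),
            offsetof(struct obj, queue_node.a));
        return (0);
    }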

module/zfs/txg.c

@@ -895,15 +895,10 @@ txg_list_destroy(txg_list_t *tl)
 boolean_t
 txg_all_lists_empty(txg_list_t *tl)
 {
-	mutex_enter(&tl->tl_lock);
-	for (int i = 0; i < TXG_SIZE; i++) {
-		if (!txg_list_empty_impl(tl, i)) {
-			mutex_exit(&tl->tl_lock);
-			return (B_FALSE);
-		}
-	}
-	mutex_exit(&tl->tl_lock);
-	return (B_TRUE);
+	boolean_t res = B_TRUE;
+	for (int i = 0; i < TXG_SIZE; i++)
+		res &= (tl->tl_head[i] == NULL);
+	return (res);
 }
 
 /*

module/zfs/vdev.c

@@ -4608,11 +4608,9 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 		memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
 
-		for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
-			vsx->vsx_active_queue[t] =
-			    vd->vdev_queue.vq_class[t].vqc_active;
-			vsx->vsx_pend_queue[t] = avl_numnodes(
-			    &vd->vdev_queue.vq_class[t].vqc_queued_tree);
+		for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
+			vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
+			vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
 		}
 	}
 }
@@ -5470,20 +5468,20 @@ vdev_deadman(vdev_t *vd, const char *tag)
 		vdev_queue_t *vq = &vd->vdev_queue;
 
 		mutex_enter(&vq->vq_lock);
-		if (avl_numnodes(&vq->vq_active_tree) > 0) {
+		if (vq->vq_active > 0) {
 			spa_t *spa = vd->vdev_spa;
 			zio_t *fio;
 			uint64_t delta;
 
-			zfs_dbgmsg("slow vdev: %s has %lu active IOs",
-			    vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
+			zfs_dbgmsg("slow vdev: %s has %u active IOs",
+			    vd->vdev_path, vq->vq_active);
 
 			/*
 			 * Look at the head of all the pending queues,
 			 * if any I/O has been outstanding for longer than
 			 * the spa_deadman_synctime invoke the deadman logic.
 			 */
-			fio = avl_first(&vq->vq_active_tree);
+			fio = list_head(&vq->vq_active_list);
 			delta = gethrtime() - fio->io_timestamp;
 			if (delta > spa_deadman_synctime(spa))
 				zio_deadman(fio, tag);

module/zfs/vdev_queue.c

@@ -228,13 +228,6 @@ uint_t zfs_vdev_queue_depth_pct = 300;
  */
 uint_t zfs_vdev_def_queue_depth = 32;
 
-/*
- * Allow TRIM I/Os to be aggregated. This should normally not be needed since
- * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted
- * by the TRIM code in zfs_trim.c.
- */
-static uint_t zfs_vdev_aggregate_trim = 0;
-
 static int
 vdev_queue_offset_compare(const void *x1, const void *x2)
 {
@@ -249,38 +242,60 @@ vdev_queue_offset_compare(const void *x1, const void *x2)
 	return (TREE_PCMP(z1, z2));
 }
 
-static inline avl_tree_t *
-vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
-{
-	return (&vq->vq_class[p].vqc_queued_tree);
-}
-
-static inline avl_tree_t *
-vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
-{
-	ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM);
-	if (t == ZIO_TYPE_READ)
-		return (&vq->vq_read_offset_tree);
-	else if (t == ZIO_TYPE_WRITE)
-		return (&vq->vq_write_offset_tree);
-	else
-		return (&vq->vq_trim_offset_tree);
-}
+#define	VDQ_T_SHIFT	29
 
 static int
-vdev_queue_timestamp_compare(const void *x1, const void *x2)
+vdev_queue_to_compare(const void *x1, const void *x2)
 {
 	const zio_t *z1 = (const zio_t *)x1;
 	const zio_t *z2 = (const zio_t *)x2;
 
-	int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp);
+	int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT,
+	    z2->io_timestamp >> VDQ_T_SHIFT);
+	int ocmp = TREE_CMP(z1->io_offset, z2->io_offset);
+	int cmp = tcmp ? tcmp : ocmp;
 
-	if (likely(cmp))
+	if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE)))
 		return (cmp);
 
 	return (TREE_PCMP(z1, z2));
 }
 
+static inline boolean_t
+vdev_queue_class_fifo(zio_priority_t p)
+{
+	return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE ||
+	    p == ZIO_PRIORITY_TRIM);
+}
+
+static void
+vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio)
+{
+	zio_priority_t p = zio->io_priority;
+	vq->vq_cqueued |= 1U << p;
+	if (vdev_queue_class_fifo(p))
+		list_insert_tail(&vq->vq_class[p].vqc_list, zio);
+	else
+		avl_add(&vq->vq_class[p].vqc_tree, zio);
+}
+
+static void
+vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio)
+{
+	zio_priority_t p = zio->io_priority;
+	uint32_t empty;
+	if (vdev_queue_class_fifo(p)) {
+		list_t *list = &vq->vq_class[p].vqc_list;
+		list_remove(list, zio);
+		empty = list_is_empty(list);
+	} else {
+		avl_tree_t *tree = &vq->vq_class[p].vqc_tree;
+		avl_remove(tree, zio);
+		empty = avl_is_empty(tree);
+	}
+	vq->vq_cqueued &= ~(empty << p);
+}
+
 static uint_t
 vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
 {
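
Worked numbers for VDQ_T_SHIFT: shifting an hrtime_t timestamp (nanoseconds) right by 29 bits groups I/Os into buckets of 2^29 ns, roughly 0.54 s, which is the "0.5 second interval" referenced later in vdev_queue_io_to_issue(). Within a bucket the comparator falls through to offset order; across buckets the older bucket always sorts first. A quick check, assuming only that timestamps are in nanoseconds:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define VDQ_T_SHIFT 29   /* 2^29 ns is roughly 0.54 s per bucket */

    int main(void)
    {
        uint64_t a = 100000000;   /* submitted at 0.1 s */
        uint64_t b = 400000000;   /* 0.4 s: same bucket as a */
        uint64_t c = 700000000;   /* 0.7 s: next bucket */

        /* a and b tie on the shifted timestamp, so the tree orders them
         * by offset; c sorts after both regardless of its offset. */
        printf("%" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
            a >> VDQ_T_SHIFT, b >> VDQ_T_SHIFT, c >> VDQ_T_SHIFT);
        /* prints: 0 0 1 */
        return (0);
    }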
@@ -360,7 +375,7 @@ vdev_queue_max_async_writes(spa_t *spa)
 }
 
 static uint_t
-vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
+vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p)
 {
 	switch (p) {
 	case ZIO_PRIORITY_SYNC_READ:
@@ -370,7 +385,7 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
 	case ZIO_PRIORITY_ASYNC_READ:
 		return (zfs_vdev_async_read_max_active);
 	case ZIO_PRIORITY_ASYNC_WRITE:
-		return (vdev_queue_max_async_writes(spa));
+		return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa));
 	case ZIO_PRIORITY_SCRUB:
 		if (vq->vq_ia_active > 0) {
 			return (MIN(vq->vq_nia_credit,
@@ -414,10 +429,10 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
 
 static zio_priority_t
 vdev_queue_class_to_issue(vdev_queue_t *vq)
 {
-	spa_t *spa = vq->vq_vdev->vdev_spa;
-	zio_priority_t p, n;
+	uint32_t cq = vq->vq_cqueued;
+	zio_priority_t p, p1;
 
-	if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
+	if (cq == 0 || vq->vq_active >= zfs_vdev_max_active)
 		return (ZIO_PRIORITY_NUM_QUEUEABLE);
 
 	/*
@@ -425,14 +440,18 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
 	 * Do round-robin to reduce starvation due to zfs_vdev_max_active
 	 * and vq_nia_credit limits.
 	 */
-	for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) {
-		p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE;
-		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
-		    vq->vq_class[p].vqc_active <
-		    vdev_queue_class_min_active(vq, p)) {
-			vq->vq_last_prio = p;
-			return (p);
-		}
+	p1 = vq->vq_last_prio + 1;
+	if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE)
+		p1 = 0;
+	for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+		if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
+		    vdev_queue_class_min_active(vq, p))
+			goto found;
+	}
+	for (p = 0; p < p1; p++) {
+		if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
+		    vdev_queue_class_min_active(vq, p))
+			goto found;
 	}
 
 	/*
@@ -440,16 +459,14 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
 	 * If we haven't found a queue, look for one that hit
 	 * maximum # outstanding i/os.
 	 */
 	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
-		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
-		    vq->vq_class[p].vqc_active <
-		    vdev_queue_class_max_active(spa, vq, p)) {
-			vq->vq_last_prio = p;
-			return (p);
-		}
+		if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
+		    vdev_queue_class_max_active(vq, p))
+			break;
 	}
 
-	/* No eligible queued i/os */
-	return (ZIO_PRIORITY_NUM_QUEUEABLE);
+found:
+	vq->vq_last_prio = p;
+	return (p);
 }
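
How the vq_cqueued bitmask drives this function: bit p is set whenever class p has queued I/Os, so the early cq == 0 test and the cq & (1U << p) probes replace per-class avl_numnodes() calls. The clearing side in vdev_queue_class_remove() is the subtle part, since ~(empty << p) only clears the bit when the class just drained. A small demonstration of that bookkeeping:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t cqueued = 0;
        uint32_t p = 3;              /* some priority class */
        uint32_t empty;

        cqueued |= 1U << p;          /* vdev_queue_class_add() */

        empty = 0;                   /* removal left I/Os behind */
        cqueued &= ~(empty << p);    /* no-op: bit stays set */
        printf("%#x\n", cqueued);    /* 0x8 */

        empty = 1;                   /* removal drained the class */
        cqueued &= ~(empty << p);    /* clears bit p */
        printf("%#x\n", cqueued);    /* 0 */
        return (0);
    }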
 
 void
@@ -458,42 +475,30 @@ vdev_queue_init(vdev_t *vd)
 	vdev_queue_t *vq = &vd->vdev_queue;
 	zio_priority_t p;
 
-	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
 	vq->vq_vdev = vd;
 	taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent);
 
-	avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
-	    sizeof (zio_t), offsetof(struct zio, io_queue_node));
-	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
-	    vdev_queue_offset_compare, sizeof (zio_t),
-	    offsetof(struct zio, io_offset_node));
-	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
-	    vdev_queue_offset_compare, sizeof (zio_t),
-	    offsetof(struct zio, io_offset_node));
-	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM),
-	    vdev_queue_offset_compare, sizeof (zio_t),
-	    offsetof(struct zio, io_offset_node));
-
 	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
-		int (*compfn) (const void *, const void *);
-
-		/*
-		 * The synchronous/trim i/o queues are dispatched in FIFO rather
-		 * than LBA order. This provides more consistent latency for
-		 * these i/os.
-		 */
-		if (p == ZIO_PRIORITY_SYNC_READ ||
-		    p == ZIO_PRIORITY_SYNC_WRITE ||
-		    p == ZIO_PRIORITY_TRIM) {
-			compfn = vdev_queue_timestamp_compare;
+		if (vdev_queue_class_fifo(p)) {
+			list_create(&vq->vq_class[p].vqc_list,
+			    sizeof (zio_t),
+			    offsetof(struct zio, io_queue_node.l));
 		} else {
-			compfn = vdev_queue_offset_compare;
+			avl_create(&vq->vq_class[p].vqc_tree,
+			    vdev_queue_to_compare, sizeof (zio_t),
+			    offsetof(struct zio, io_queue_node.a));
 		}
-		avl_create(vdev_queue_class_tree(vq, p), compfn,
-		    sizeof (zio_t), offsetof(struct zio, io_queue_node));
 	}
+	avl_create(&vq->vq_read_offset_tree,
+	    vdev_queue_offset_compare, sizeof (zio_t),
+	    offsetof(struct zio, io_offset_node));
+	avl_create(&vq->vq_write_offset_tree,
+	    vdev_queue_offset_compare, sizeof (zio_t),
+	    offsetof(struct zio, io_offset_node));
 
 	vq->vq_last_offset = 0;
+	list_create(&vq->vq_active_list, sizeof (struct zio),
+	    offsetof(struct zio, io_queue_node.l));
+	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
 }
 
@@ -501,30 +506,39 @@ vdev_queue_fini(vdev_t *vd)
 void
 vdev_queue_fini(vdev_t *vd)
 {
 	vdev_queue_t *vq = &vd->vdev_queue;
 
-	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
-		avl_destroy(vdev_queue_class_tree(vq, p));
-	avl_destroy(&vq->vq_active_tree);
-	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
-	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
-	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM));
+	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+		if (vdev_queue_class_fifo(p))
+			list_destroy(&vq->vq_class[p].vqc_list);
+		else
+			avl_destroy(&vq->vq_class[p].vqc_tree);
+	}
+	avl_destroy(&vq->vq_read_offset_tree);
+	avl_destroy(&vq->vq_write_offset_tree);
+	list_destroy(&vq->vq_active_list);
 
 	mutex_destroy(&vq->vq_lock);
 }
 
 static void
 vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 {
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-	avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
-	avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
+	zio->io_queue_state = ZIO_QS_QUEUED;
+	vdev_queue_class_add(vq, zio);
+	if (zio->io_type == ZIO_TYPE_READ)
+		avl_add(&vq->vq_read_offset_tree, zio);
+	else if (zio->io_type == ZIO_TYPE_WRITE)
+		avl_add(&vq->vq_write_offset_tree, zio);
 }
 
 static void
 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 {
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-	avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
-	avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
+	vdev_queue_class_remove(vq, zio);
+	if (zio->io_type == ZIO_TYPE_READ)
+		avl_remove(&vq->vq_read_offset_tree, zio);
+	else if (zio->io_type == ZIO_TYPE_WRITE)
+		avl_remove(&vq->vq_write_offset_tree, zio);
+	zio->io_queue_state = ZIO_QS_NONE;
 }
 
 static boolean_t
@@ -546,14 +560,16 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 {
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-	vq->vq_class[zio->io_priority].vqc_active++;
+	vq->vq_cactive[zio->io_priority]++;
+	vq->vq_active++;
 	if (vdev_queue_is_interactive(zio->io_priority)) {
 		if (++vq->vq_ia_active == 1)
 			vq->vq_nia_credit = 1;
 	} else if (vq->vq_ia_active > 0) {
 		vq->vq_nia_credit--;
 	}
-	avl_add(&vq->vq_active_tree, zio);
+	zio->io_queue_state = ZIO_QS_ACTIVE;
+	list_insert_tail(&vq->vq_active_list, zio);
 }
 
 static void
@@ -561,7 +577,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 {
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-	vq->vq_class[zio->io_priority].vqc_active--;
+	vq->vq_cactive[zio->io_priority]--;
+	vq->vq_active--;
 	if (vdev_queue_is_interactive(zio->io_priority)) {
 		if (--vq->vq_ia_active == 0)
 			vq->vq_nia_credit = 0;
@@ -569,7 +586,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 			vq->vq_nia_credit = zfs_vdev_nia_credit;
 	} else if (vq->vq_ia_active == 0)
 		vq->vq_nia_credit++;
-	avl_remove(&vq->vq_active_tree, zio);
+	list_remove(&vq->vq_active_list, zio);
+	zio->io_queue_state = ZIO_QS_NONE;
 }
 
 static void
@@ -602,29 +620,28 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 	uint64_t maxgap = 0;
 	uint64_t size;
 	uint64_t limit;
-	int maxblocksize;
 	boolean_t stretch = B_FALSE;
-	avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
-	zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
 	uint64_t next_offset;
 	abd_t *abd;
+	avl_tree_t *t;
 
+	/*
+	 * TRIM aggregation should not be needed since code in zfs_trim.c can
+	 * submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M).
+	 */
+	if (zio->io_type == ZIO_TYPE_TRIM)
+		return (NULL);
+
+	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
+		return (NULL);
+
-	maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa);
 	if (vq->vq_vdev->vdev_nonrot)
 		limit = zfs_vdev_aggregation_limit_non_rotating;
 	else
 		limit = zfs_vdev_aggregation_limit;
-	limit = MIN(limit, maxblocksize);
-
-	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
-		return (NULL);
-
-	/*
-	 * While TRIM commands could be aggregated based on offset this
-	 * behavior is disabled until it's determined to be beneficial.
-	 */
-	if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
+	if (limit == 0)
 		return (NULL);
+	limit = MIN(limit, SPA_MAXBLOCKSIZE);
 
 	/*
 	 * I/Os to distributed spares are directly dispatched to the dRAID
@@ -635,8 +652,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 	first = last = zio;
 
-	if (zio->io_type == ZIO_TYPE_READ)
+	if (zio->io_type == ZIO_TYPE_READ) {
 		maxgap = zfs_vdev_read_gap_limit;
+		t = &vq->vq_read_offset_tree;
+	} else {
+		ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+		t = &vq->vq_write_offset_tree;
+	}
 
 	/*
 	 * We can aggregate I/Os that are sufficiently adjacent and of
@@ -657,6 +679,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 	 * Walk backwards through sufficiently contiguous I/Os
 	 * recording the last non-optional I/O.
 	 */
+	zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
 	while ((dio = AVL_PREV(t, first)) != NULL &&
 	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 	    IO_SPAN(dio, last) <= limit &&
@@ -686,7 +709,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 	    (IO_SPAN(first, dio) <= limit ||
 	    (dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
-	    IO_SPAN(first, dio) <= maxblocksize &&
+	    IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE &&
 	    IO_GAP(last, dio) <= maxgap &&
 	    dio->io_type == zio->io_type) {
 		last = dio;
@@ -740,7 +763,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 		return (NULL);
 
 	size = IO_SPAN(first, last);
-	ASSERT3U(size, <=, maxblocksize);
+	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 
 	abd = abd_alloc_gang();
 	if (abd == NULL)
@@ -824,19 +847,30 @@ vdev_queue_io_to_issue(vdev_queue_t *vq)
 		return (NULL);
 	}
 
-	/*
-	 * For LBA-ordered queues (async / scrub / initializing), issue the
-	 * i/o which follows the most recently issued i/o in LBA (offset) order.
-	 *
-	 * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp.
-	 */
-	tree = vdev_queue_class_tree(vq, p);
-	vq->vq_io_search.io_timestamp = 0;
-	vq->vq_io_search.io_offset = vq->vq_last_offset - 1;
-	VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL);
-	zio = avl_nearest(tree, idx, AVL_AFTER);
-	if (zio == NULL)
-		zio = avl_first(tree);
+	if (vdev_queue_class_fifo(p)) {
+		zio = list_head(&vq->vq_class[p].vqc_list);
+	} else {
+		/*
+		 * For LBA-ordered queues (async / scrub / initializing),
+		 * issue the I/O which follows the most recently issued I/O
+		 * in LBA (offset) order, but to avoid starvation only within
+		 * the same 0.5 second interval as the first I/O.
+		 */
+		tree = &vq->vq_class[p].vqc_tree;
+		zio = aio = avl_first(tree);
+		if (zio->io_offset < vq->vq_last_offset) {
+			vq->vq_io_search.io_timestamp = zio->io_timestamp;
+			vq->vq_io_search.io_offset = vq->vq_last_offset;
+			zio = avl_find(tree, &vq->vq_io_search, &idx);
+			if (zio == NULL) {
+				zio = avl_nearest(tree, idx, AVL_AFTER);
+				if (zio == NULL ||
+				    (zio->io_timestamp >> VDQ_T_SHIFT) !=
+				    (aio->io_timestamp >> VDQ_T_SHIFT))
+					zio = aio;
+			}
+		}
+	}
 
 	ASSERT3U(zio->io_priority, ==, p);
 	aio = vdev_queue_aggregate(vq, zio);
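
To make the new selection rule concrete: the tree is ordered by (timestamp bucket, offset), and the queue prefers the first I/O at or past vq_last_offset that still belongs to the head I/O's half-second bucket, otherwise falling back to the head to avoid starvation. A simplified array-based sketch of just that decision (hypothetical qio/pick(), not the real AVL walk):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define VDQ_T_SHIFT 29

    struct qio { uint64_t ts, off; };   /* (timestamp, offset) */

    /* q[] is sorted the way vdev_queue_to_compare() sorts the tree:
     * by timestamp bucket, then by offset within the bucket. */
    static struct qio pick(const struct qio *q, int n, uint64_t last_off)
    {
        for (int i = 0; i < n; i++) {
            if ((q[i].ts >> VDQ_T_SHIFT) != (q[0].ts >> VDQ_T_SHIFT))
                break;                  /* left the head bucket */
            if (q[i].off >= last_off)
                return (q[i]);          /* next in LBA order */
        }
        return (q[0]);                  /* anti-starvation fallback */
    }

    int main(void)
    {
        struct qio q[] = {
            { 1000, 100 }, { 2000, 900 },   /* bucket 0 */
            { 900000000, 50 },              /* bucket 1 */
        };
        /* Last issued offset 400: picks off=900, not the older off=100
         * (already behind the head) and not off=50 (a newer bucket). */
        printf("%" PRIu64 "\n", pick(q, 3, 400).off);
        return (0);
    }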
@@ -967,7 +1001,6 @@ void
 vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
 {
 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
-	avl_tree_t *tree;
 
 	/*
 	 * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio
@@ -1002,12 +1035,11 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
 	 * Otherwise, the zio is currently active and we cannot change its
 	 * priority.
 	 */
-	tree = vdev_queue_class_tree(vq, zio->io_priority);
-	if (avl_find(tree, zio, NULL) == zio) {
-		avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+	if (zio->io_queue_state == ZIO_QS_QUEUED) {
+		vdev_queue_class_remove(vq, zio);
 		zio->io_priority = priority;
-		avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
-	} else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
+		vdev_queue_class_add(vq, zio);
+	} else if (zio->io_queue_state == ZIO_QS_NONE) {
 		zio->io_priority = priority;
 	}
@@ -1020,10 +1052,10 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
  * vq_lock mutex use here, instead we prefer to keep it lock free for
  * performance.
  */
-int
+uint32_t
 vdev_queue_length(vdev_t *vd)
 {
-	return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+	return (vd->vdev_queue.vq_active);
 }
 
 uint64_t
@@ -1032,15 +1064,22 @@ vdev_queue_last_offset(vdev_t *vd)
 	return (vd->vdev_queue.vq_last_offset);
 }
 
+uint64_t
+vdev_queue_class_length(vdev_t *vd, zio_priority_t p)
+{
+	vdev_queue_t *vq = &vd->vdev_queue;
+	if (vdev_queue_class_fifo(p))
+		return (list_is_empty(&vq->vq_class[p].vqc_list) == 0);
+	else
+		return (avl_numnodes(&vq->vq_class[p].vqc_tree));
+}
+
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW,
 	"Max vdev I/O aggregation size");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT,
 	ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media");
 
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, UINT, ZMOD_RW,
-	"Allow TRIM I/O to be aggregated");
-
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW,
 	"Aggregate read I/O over gap");

module/zfs/zfs_vnops.c

@@ -462,14 +462,12 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 		return (SET_ERROR(EINVAL));
 	}
 
-	const uint64_t max_blksz = zfsvfs->z_max_blksz;
-
 	/*
 	 * Pre-fault the pages to ensure slow (eg NFS) pages
 	 * don't hold up txg.
 	 * Skip this if uio contains loaned arc_buf.
 	 */
-	if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
+	ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1);
+	if (zfs_uio_prefaultpages(pfbytes, uio)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EFAULT));
 	}
@@ -544,10 +542,31 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 			break;
 		}
 
+		uint64_t blksz;
+		if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) {
+			if (zp->z_blksz > zfsvfs->z_max_blksz &&
+			    !ISP2(zp->z_blksz)) {
+				/*
+				 * File's blocksize is already larger than the
+				 * "recordsize" property. Only let it grow to
+				 * the next power of 2.
+				 */
+				blksz = 1 << highbit64(zp->z_blksz);
+			} else {
+				blksz = zfsvfs->z_max_blksz;
+			}
+			blksz = MIN(blksz, P2ROUNDUP(end_size,
+			    SPA_MINBLOCKSIZE));
+			blksz = MAX(blksz, zp->z_blksz);
+		} else {
+			blksz = zp->z_blksz;
+		}
+
 		arc_buf_t *abuf = NULL;
-		if (n >= max_blksz && woff >= zp->z_size &&
-		    P2PHASE(woff, max_blksz) == 0 &&
-		    zp->z_blksz == max_blksz) {
+		ssize_t nbytes = n;
+		if (n >= blksz && woff >= zp->z_size &&
+		    P2PHASE(woff, blksz) == 0 &&
+		    (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) {
 			/*
 			 * This write covers a full block. "Borrow" a buffer
 			 * from the dmu so that we can fill it before we enter
@@ -555,18 +574,26 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 			 * holding up the transaction if the data copy hangs
 			 * up on a pagefault (e.g., from an NFS server mapping).
 			 */
-			size_t cbytes;
-
 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
-			    max_blksz);
+			    blksz);
 			ASSERT(abuf != NULL);
-			ASSERT(arc_buf_size(abuf) == max_blksz);
-			if ((error = zfs_uiocopy(abuf->b_data, max_blksz,
-			    UIO_WRITE, uio, &cbytes))) {
+			ASSERT(arc_buf_size(abuf) == blksz);
+			if ((error = zfs_uiocopy(abuf->b_data, blksz,
+			    UIO_WRITE, uio, &nbytes))) {
 				dmu_return_arcbuf(abuf);
 				break;
 			}
-			ASSERT3S(cbytes, ==, max_blksz);
+			ASSERT3S(nbytes, ==, blksz);
+		} else {
+			nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) -
+			    P2PHASE(woff, blksz));
+			if (pfbytes < nbytes) {
+				if (zfs_uio_prefaultpages(nbytes, uio)) {
+					error = SET_ERROR(EFAULT);
+					break;
+				}
+				pfbytes = nbytes;
+			}
 		}
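
Worked example of the hoisted block-size computation, which now runs before any buffer is borrowed rather than after the transaction is assigned: take a file whose current block is 192 KiB (left over from when the dataset's recordsize was larger) and a recordsize of 128 KiB. The block exceeds the recordsize and is not a power of two, so it may only grow to the next power of two. A standalone check using a portable stand-in for the ZFS highbit64() macro:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for ZFS highbit64(): index of the highest set bit, from 1. */
    static int highbit64(uint64_t x)
    {
        int h = 0;
        while (x != 0) {
            h++;
            x >>= 1;
        }
        return (h);
    }

    int main(void)
    {
        uint64_t z_blksz = 192 * 1024;      /* current file block */
        uint64_t max_blksz = 128 * 1024;    /* dataset recordsize */

        if (z_blksz > max_blksz && (z_blksz & (z_blksz - 1)) != 0) {
            /* Not a power of 2: grow only to the next power of 2. */
            printf("grow to %" PRIu64 "\n",
                (uint64_t)1 << highbit64(z_blksz)); /* 262144 = 256 KiB */
        }
        return (0);
    }

The other half of the change is visible in the else branch above: a write that does not cover a full block is now chunked at up to half of DMU_MAX_ACCESS per transaction instead of one block at a time, which is what makes small-recordsize writes cheaper.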
@@ -576,8 +603,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
 		DB_DNODE_ENTER(db);
-		dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
-		    MIN(n, max_blksz));
+		dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes);
 		DB_DNODE_EXIT(db);
 		zfs_sa_upgrade_txholds(tx, zp);
 		error = dmu_tx_assign(tx, TXG_WAIT);
@@ -600,31 +626,10 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 		 * shrink down lr_length to the appropriate size.
 		 */
 		if (lr->lr_length == UINT64_MAX) {
-			uint64_t new_blksz;
-
-			if (zp->z_blksz > max_blksz) {
-				/*
-				 * File's blocksize is already larger than the
-				 * "recordsize" property. Only let it grow to
-				 * the next power of 2.
-				 */
-				ASSERT(!ISP2(zp->z_blksz));
-				new_blksz = MIN(end_size,
-				    1 << highbit64(zp->z_blksz));
-			} else {
-				new_blksz = MIN(end_size, max_blksz);
-			}
-			zfs_grow_blocksize(zp, new_blksz, tx);
+			zfs_grow_blocksize(zp, blksz, tx);
 			zfs_rangelock_reduce(lr, woff, n);
 		}
 
-		/*
-		 * XXX - should we really limit each write to z_max_blksz?
-		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
-		 */
-		const ssize_t nbytes =
-		    MIN(n, max_blksz - P2PHASE(woff, max_blksz));
-
 		ssize_t tx_bytes;
 		if (abuf == NULL) {
 			tx_bytes = zfs_uio_resid(uio);
@@ -644,12 +649,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 			 * zfs_uio_prefaultpages, or prefaultpages may
 			 * error, and we may break the loop early.
 			 */
-			if (tx_bytes != zfs_uio_resid(uio))
-				n -= tx_bytes - zfs_uio_resid(uio);
-			if (zfs_uio_prefaultpages(MIN(n, max_blksz),
-			    uio)) {
-				break;
-			}
+			n -= tx_bytes - zfs_uio_resid(uio);
+			pfbytes -= tx_bytes - zfs_uio_resid(uio);
 			continue;
 		}
 #endif
@@ -665,15 +666,6 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 			}
 			tx_bytes -= zfs_uio_resid(uio);
 		} else {
-			/* Implied by abuf != NULL: */
-			ASSERT3S(n, >=, max_blksz);
-			ASSERT0(P2PHASE(woff, max_blksz));
-			/*
-			 * We can simplify nbytes to MIN(n, max_blksz) since
-			 * P2PHASE(woff, max_blksz) is 0, and knowing
-			 * n >= max_blksz lets us simplify further:
-			 */
-			ASSERT3S(nbytes, ==, max_blksz);
 			/*
 			 * Thus, we're writing a full block at a block-aligned
 			 * offset and extending the file past EOF.
@@ -758,13 +750,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 			break;
 		ASSERT3S(tx_bytes, ==, nbytes);
 		n -= nbytes;
-
-		if (n > 0) {
-			if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
-				error = SET_ERROR(EFAULT);
-				break;
-			}
-		}
+		pfbytes -= nbytes;
 	}
 
 	zfs_znode_update_vfs(zp);

module/zfs/zil.c

@@ -1425,6 +1425,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
 	list_move_tail(&itxs, &lwb->lwb_itxs);
 	list_move_tail(&waiters, &lwb->lwb_waiters);
+	txg = lwb->lwb_issued_txg;
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
 	lwb->lwb_state = LWB_STATE_FLUSH_DONE;
@@ -1465,7 +1466,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
 	list_destroy(&waiters);
 
 	mutex_enter(&zilog->zl_lwb_io_lock);
-	txg = lwb->lwb_issued_txg;
 	ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0);
 	zilog->zl_lwb_inflight[txg & TXG_MASK]--;
 	if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0)

zfs_config.h

@@ -1051,7 +1051,7 @@
 /* #undef ZFS_IS_GPL_COMPATIBLE */
 
 /* Define the project alias string. */
-#define ZFS_META_ALIAS "zfs-2.1.99-FreeBSD_g8e8acabdc"
+#define ZFS_META_ALIAS "zfs-2.1.99-FreeBSD_ga9d6b0690"
 
 /* Define the project author. */
 #define ZFS_META_AUTHOR "OpenZFS"
@@ -1081,7 +1081,7 @@
 #define ZFS_META_NAME "zfs"
 
 /* Define the project release. */
-#define ZFS_META_RELEASE "FreeBSD_g8e8acabdc"
+#define ZFS_META_RELEASE "FreeBSD_ga9d6b0690"
 
 /* Define the project version. */
 #define ZFS_META_VERSION "2.1.99"

include/zfs_gitrev.h

@@ -1 +1 @@
-#define ZFS_META_GITREV "zfs-2.1.99-1999-g8e8acabdc"
+#define ZFS_META_GITREV "zfs-2.1.99-2004-ga9d6b0690"