OpenZFS 6531 - Provide mechanism to artificially limit disk performance

Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>

OpenZFS-issue: https://www.illumos.org/issues/6531
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/97e8130

Porting notes:
- Added new IO delay tracepoints, and moved common ZIO tracepoint macros
  to a new trace_common.h file.
- Used zio_delay_taskq() in place of OpenZFS's timeout_generic() function.
- Updated zinject man page
- Updated zpool_scrub test files
This commit is contained in:
Tony Hutter 2016-05-23 10:41:29 -07:00 committed by Brian Behlendorf
parent 7e945072d1
commit 26ef0cc7db
18 changed files with 681 additions and 121 deletions

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
/*
@ -239,6 +239,38 @@ usage(void)
"\tzinject -d device -A <degrade|fault> -D <delay secs> pool\n"
"\t\tPerform a specific action on a particular device.\n"
"\n"
"\tzinject -d device -D latency:lanes pool\n"
"\n"
"\t\tAdd an artificial delay to IO requests on a particular\n"
"\t\tdevice, such that the requests take a minimum of 'latency'\n"
"\t\tmilliseconds to complete. Each delay has an associated\n"
"\t\tnumber of 'lanes' which defines the number of concurrent\n"
"\t\tIO requests that can be processed.\n"
"\n"
"\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n"
"\t\tthe device will only be able to service a single IO request\n"
"\t\tat a time with each request taking 10 ms to complete. So,\n"
"\t\tif only a single request is submitted every 10 ms, the\n"
"\t\taverage latency will be 10 ms; but if more than one request\n"
"\t\tis submitted every 10 ms, the average latency will be more\n"
"\t\tthan 10 ms.\n"
"\n"
"\t\tSimilarly, if a delay of 10 ms is specified to have two\n"
"\t\tlanes (-D 10:2), then the device will be able to service\n"
"\t\ttwo requests at a time, each with a minimum latency of\n"
"\t\t10 ms. So, if two requests are submitted every 10 ms, then\n"
"\t\tthe average latency will be 10 ms; but if more than two\n"
"\t\trequests are submitted every 10 ms, the average latency\n"
"\t\twill be more than 10 ms.\n"
"\n"
"\t\tAlso note, these delays are additive. So two invocations\n"
"\t\tof '-D 10:1', is roughly equivalent to a single invocation\n"
"\t\tof '-D 10:2'. This also means, one can specify multiple\n"
"\t\tlanes with differing target latencies. For example, an\n"
"\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n"
"\t\tcreate 3 lanes on the device; one lane with a latency\n"
"\t\tof 10 ms and two lanes with a 25 ms latency.\n"
"\n"
"\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
"\t\tCause the pool to stop writing blocks yet not\n"
"\t\treport errors for a duration. Simulates buggy hardware\n"
@ -353,6 +385,9 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
if (record->zi_guid == 0 || record->zi_func[0] != '\0')
return (0);
if (record->zi_cmd == ZINJECT_DELAY_IO)
return (0);
if (*count == 0) {
(void) printf("%3s %-15s %s\n", "ID", "POOL", "GUID");
(void) printf("--- --------------- ----------------\n");
@ -366,6 +401,35 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
return (0);
}
static int
print_delay_handler(int id, const char *pool, zinject_record_t *record,
void *data)
{
int *count = data;
if (record->zi_guid == 0 || record->zi_func[0] != '\0')
return (0);
if (record->zi_cmd != ZINJECT_DELAY_IO)
return (0);
if (*count == 0) {
(void) printf("%3s %-15s %-15s %-15s %s\n",
"ID", "POOL", "DELAY (ms)", "LANES", "GUID");
(void) printf("--- --------------- --------------- "
"--------------- ----------------\n");
}
*count += 1;
(void) printf("%3d %-15s %-15llu %-15llu %llx\n", id, pool,
(u_longlong_t)NSEC2MSEC(record->zi_timer),
(u_longlong_t)record->zi_nlanes,
(u_longlong_t)record->zi_guid);
return (0);
}
static int
print_panic_handler(int id, const char *pool, zinject_record_t *record,
void *data)
@ -403,6 +467,13 @@ print_all_handlers(void)
count = 0;
}
(void) iter_handlers(print_delay_handler, &count);
if (count > 0) {
total += count;
(void) printf("\n");
count = 0;
}
(void) iter_handlers(print_data_handler, &count);
if (count > 0) {
total += count;
@ -545,6 +616,35 @@ perform_action(const char *pool, zinject_record_t *record, int cmd)
return (1);
}
static int
parse_delay(char *str, uint64_t *delay, uint64_t *nlanes)
{
unsigned long scan_delay;
unsigned long scan_nlanes;
if (sscanf(str, "%lu:%lu", &scan_delay, &scan_nlanes) != 2)
return (1);
/*
* We explicitly disallow a delay of zero here, because we key
* off this value being non-zero in translate_device(), to
* determine if the fault is a ZINJECT_DELAY_IO fault or not.
*/
if (scan_delay == 0)
return (1);
/*
* The units for the CLI delay parameter is milliseconds, but
* the data passed to the kernel is interpreted as nanoseconds.
* Thus we scale the milliseconds to nanoseconds here, and this
* nanosecond value is used to pass the delay to the kernel.
*/
*delay = MSEC2NSEC(scan_delay);
*nlanes = scan_nlanes;
return (0);
}
int
main(int argc, char **argv)
{
@ -628,8 +728,10 @@ main(int argc, char **argv)
break;
case 'D':
errno = 0;
record.zi_timer = strtoull(optarg, &end, 10);
if (errno != 0 || *end != '\0') {
ret = parse_delay(optarg, &record.zi_timer,
&record.zi_nlanes);
if (ret != 0) {
(void) fprintf(stderr, "invalid i/o delay "
"value: '%s'\n", optarg);
usage();

View File

@ -53,6 +53,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/trace.h \
$(top_srcdir)/include/sys/trace_acl.h \
$(top_srcdir)/include/sys/trace_arc.h \
$(top_srcdir)/include/sys/trace_common.h \
$(top_srcdir)/include/sys/trace_dbgmsg.h \
$(top_srcdir)/include/sys/trace_dbuf.h \
$(top_srcdir)/include/sys/trace_dmu.h \
@ -60,6 +61,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/trace_multilist.h \
$(top_srcdir)/include/sys/trace_txg.h \
$(top_srcdir)/include/sys/trace_zil.h \
$(top_srcdir)/include/sys/trace_zio.h \
$(top_srcdir)/include/sys/trace_zrlock.h \
$(top_srcdir)/include/sys/txg.h \
$(top_srcdir)/include/sys/txg_impl.h \

View File

@ -34,6 +34,7 @@
#include <linux/tracepoint.h>
#include <sys/types.h>
#include <sys/trace_common.h> /* For ZIO macros */
/*
* Generic support for one argument tracepoints of the form:
@ -115,86 +116,6 @@ DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss);
* zio_t *, ...);
*/
#define ZIO_TP_STRUCT_ENTRY \
__field(zio_type_t, zio_type) \
__field(int, zio_cmd) \
__field(zio_priority_t, zio_priority) \
__field(uint64_t, zio_size) \
__field(uint64_t, zio_orig_size) \
__field(uint64_t, zio_offset) \
__field(hrtime_t, zio_timestamp) \
__field(hrtime_t, zio_delta) \
__field(uint64_t, zio_delay) \
__field(enum zio_flag, zio_flags) \
__field(enum zio_stage, zio_stage) \
__field(enum zio_stage, zio_pipeline) \
__field(enum zio_flag, zio_orig_flags) \
__field(enum zio_stage, zio_orig_stage) \
__field(enum zio_stage, zio_orig_pipeline) \
__field(uint8_t, zio_reexecute) \
__field(uint64_t, zio_txg) \
__field(int, zio_error) \
__field(uint64_t, zio_ena) \
\
__field(enum zio_checksum, zp_checksum) \
__field(enum zio_compress, zp_compress) \
__field(dmu_object_type_t, zp_type) \
__field(uint8_t, zp_level) \
__field(uint8_t, zp_copies) \
__field(boolean_t, zp_dedup) \
__field(boolean_t, zp_dedup_verify) \
__field(boolean_t, zp_nopwrite)
#define ZIO_TP_FAST_ASSIGN \
__entry->zio_type = zio->io_type; \
__entry->zio_cmd = zio->io_cmd; \
__entry->zio_priority = zio->io_priority; \
__entry->zio_size = zio->io_size; \
__entry->zio_orig_size = zio->io_orig_size; \
__entry->zio_offset = zio->io_offset; \
__entry->zio_timestamp = zio->io_timestamp; \
__entry->zio_delta = zio->io_delta; \
__entry->zio_delay = zio->io_delay; \
__entry->zio_flags = zio->io_flags; \
__entry->zio_stage = zio->io_stage; \
__entry->zio_pipeline = zio->io_pipeline; \
__entry->zio_orig_flags = zio->io_orig_flags; \
__entry->zio_orig_stage = zio->io_orig_stage; \
__entry->zio_orig_pipeline = zio->io_orig_pipeline; \
__entry->zio_reexecute = zio->io_reexecute; \
__entry->zio_txg = zio->io_txg; \
__entry->zio_error = zio->io_error; \
__entry->zio_ena = zio->io_ena; \
\
__entry->zp_checksum = zio->io_prop.zp_checksum; \
__entry->zp_compress = zio->io_prop.zp_compress; \
__entry->zp_type = zio->io_prop.zp_type; \
__entry->zp_level = zio->io_prop.zp_level; \
__entry->zp_copies = zio->io_prop.zp_copies; \
__entry->zp_dedup = zio->io_prop.zp_dedup; \
__entry->zp_nopwrite = zio->io_prop.zp_nopwrite; \
__entry->zp_dedup_verify = zio->io_prop.zp_dedup_verify;
#define ZIO_TP_PRINTK_FMT \
"zio { type %u cmd %i prio %u size %llu orig_size %llu " \
"offset %llu timestamp %llu delta %llu delay %llu " \
"flags 0x%x stage 0x%x pipeline 0x%x orig_flags 0x%x " \
"orig_stage 0x%x orig_pipeline 0x%x reexecute %u " \
"txg %llu error %d ena %llu prop { checksum %u compress %u " \
"type %u level %u copies %u dedup %u dedup_verify %u nopwrite %u } }"
#define ZIO_TP_PRINTK_ARGS \
__entry->zio_type, __entry->zio_cmd, __entry->zio_priority, \
__entry->zio_size, __entry->zio_orig_size, __entry->zio_offset, \
__entry->zio_timestamp, __entry->zio_delta, __entry->zio_delay, \
__entry->zio_flags, __entry->zio_stage, __entry->zio_pipeline, \
__entry->zio_orig_flags, __entry->zio_orig_stage, \
__entry->zio_orig_pipeline, __entry->zio_reexecute, \
__entry->zio_txg, __entry->zio_error, __entry->zio_ena, \
__entry->zp_checksum, __entry->zp_compress, __entry->zp_type, \
__entry->zp_level, __entry->zp_copies, __entry->zp_dedup, \
__entry->zp_dedup_verify, __entry->zp_nopwrite
DECLARE_EVENT_CLASS(zfs_l2arc_rw_class,
TP_PROTO(vdev_t *vd, zio_t *zio),
TP_ARGS(vd, zio),

112
include/sys/trace_common.h Normal file
View File

@ -0,0 +1,112 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* This file contains commonly used trace macros. Feel free to add and use
* them in your tracepoint headers.
*/
#ifndef _SYS_TRACE_COMMON_H
#define _SYS_TRACE_COMMON_H
#include <linux/tracepoint.h>
/* ZIO macros */
#define ZIO_TP_STRUCT_ENTRY \
__field(zio_type_t, zio_type) \
__field(int, zio_cmd) \
__field(zio_priority_t, zio_priority) \
__field(uint64_t, zio_size) \
__field(uint64_t, zio_orig_size) \
__field(uint64_t, zio_offset) \
__field(hrtime_t, zio_timestamp) \
__field(hrtime_t, zio_delta) \
__field(uint64_t, zio_delay) \
__field(enum zio_flag, zio_flags) \
__field(enum zio_stage, zio_stage) \
__field(enum zio_stage, zio_pipeline) \
__field(enum zio_flag, zio_orig_flags) \
__field(enum zio_stage, zio_orig_stage) \
__field(enum zio_stage, zio_orig_pipeline) \
__field(uint8_t, zio_reexecute) \
__field(uint64_t, zio_txg) \
__field(int, zio_error) \
__field(uint64_t, zio_ena) \
\
__field(enum zio_checksum, zp_checksum) \
__field(enum zio_compress, zp_compress) \
__field(dmu_object_type_t, zp_type) \
__field(uint8_t, zp_level) \
__field(uint8_t, zp_copies) \
__field(boolean_t, zp_dedup) \
__field(boolean_t, zp_dedup_verify) \
__field(boolean_t, zp_nopwrite)
#define ZIO_TP_FAST_ASSIGN \
__entry->zio_type = zio->io_type; \
__entry->zio_cmd = zio->io_cmd; \
__entry->zio_priority = zio->io_priority; \
__entry->zio_size = zio->io_size; \
__entry->zio_orig_size = zio->io_orig_size; \
__entry->zio_offset = zio->io_offset; \
__entry->zio_timestamp = zio->io_timestamp; \
__entry->zio_delta = zio->io_delta; \
__entry->zio_delay = zio->io_delay; \
__entry->zio_flags = zio->io_flags; \
__entry->zio_stage = zio->io_stage; \
__entry->zio_pipeline = zio->io_pipeline; \
__entry->zio_orig_flags = zio->io_orig_flags; \
__entry->zio_orig_stage = zio->io_orig_stage; \
__entry->zio_orig_pipeline = zio->io_orig_pipeline; \
__entry->zio_reexecute = zio->io_reexecute; \
__entry->zio_txg = zio->io_txg; \
__entry->zio_error = zio->io_error; \
__entry->zio_ena = zio->io_ena; \
\
__entry->zp_checksum = zio->io_prop.zp_checksum; \
__entry->zp_compress = zio->io_prop.zp_compress; \
__entry->zp_type = zio->io_prop.zp_type; \
__entry->zp_level = zio->io_prop.zp_level; \
__entry->zp_copies = zio->io_prop.zp_copies; \
__entry->zp_dedup = zio->io_prop.zp_dedup; \
__entry->zp_nopwrite = zio->io_prop.zp_nopwrite; \
__entry->zp_dedup_verify = zio->io_prop.zp_dedup_verify;
#define ZIO_TP_PRINTK_FMT \
"zio { type %u cmd %i prio %u size %llu orig_size %llu " \
"offset %llu timestamp %llu delta %llu delay %llu " \
"flags 0x%x stage 0x%x pipeline 0x%x orig_flags 0x%x " \
"orig_stage 0x%x orig_pipeline 0x%x reexecute %u " \
"txg %llu error %d ena %llu prop { checksum %u compress %u " \
"type %u level %u copies %u dedup %u dedup_verify %u nopwrite %u } }"
#define ZIO_TP_PRINTK_ARGS \
__entry->zio_type, __entry->zio_cmd, __entry->zio_priority, \
__entry->zio_size, __entry->zio_orig_size, __entry->zio_offset, \
__entry->zio_timestamp, __entry->zio_delta, __entry->zio_delay, \
__entry->zio_flags, __entry->zio_stage, __entry->zio_pipeline, \
__entry->zio_orig_flags, __entry->zio_orig_stage, \
__entry->zio_orig_pipeline, __entry->zio_reexecute, \
__entry->zio_txg, __entry->zio_error, __entry->zio_ena, \
__entry->zp_checksum, __entry->zp_compress, __entry->zp_type, \
__entry->zp_level, __entry->zp_copies, __entry->zp_dedup, \
__entry->zp_dedup_verify, __entry->zp_nopwrite
#endif /* _SYS_TRACE_COMMON_H */

87
include/sys/trace_zio.h Normal file
View File

@ -0,0 +1,87 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
#include <sys/list.h>
#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS)
#undef TRACE_SYSTEM
#define TRACE_SYSTEM zfs
#undef TRACE_SYSTEM_VAR
#define TRACE_SYSTEM_VAR zfs_zio
#if !defined(_TRACE_ZIO_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_ZIO_H
#include <linux/tracepoint.h>
#include <sys/types.h>
#include <sys/trace_common.h> /* For ZIO macros */
TRACE_EVENT(zfs_zio__delay__miss,
TP_PROTO(zio_t *zio, hrtime_t now),
TP_ARGS(zio, now),
TP_STRUCT__entry(
ZIO_TP_STRUCT_ENTRY
__field(hrtime_t, now)
),
TP_fast_assign(
ZIO_TP_FAST_ASSIGN
__entry->now = now;
),
TP_printk("now %llu " ZIO_TP_PRINTK_FMT, __entry->now,
ZIO_TP_PRINTK_ARGS)
);
TRACE_EVENT(zfs_zio__delay__hit,
TP_PROTO(zio_t *zio, hrtime_t now, hrtime_t diff),
TP_ARGS(zio, now, diff),
TP_STRUCT__entry(
ZIO_TP_STRUCT_ENTRY
__field(hrtime_t, now)
__field(hrtime_t, diff)
),
TP_fast_assign(
ZIO_TP_FAST_ASSIGN
__entry->now = now;
__entry->diff = diff;
),
TP_printk("now %llu diff %llu " ZIO_TP_PRINTK_FMT, __entry->now,
__entry->diff, ZIO_TP_PRINTK_ARGS)
);
TRACE_EVENT(zfs_zio__delay__skip,
TP_PROTO(zio_t *zio),
TP_ARGS(zio),
TP_STRUCT__entry(ZIO_TP_STRUCT_ENTRY),
TP_fast_assign(ZIO_TP_FAST_ASSIGN),
TP_printk(ZIO_TP_PRINTK_FMT, ZIO_TP_PRINTK_ARGS)
);
#endif /* _TRACE_ZIO_H */
#undef TRACE_INCLUDE_PATH
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_PATH sys
#define TRACE_INCLUDE_FILE trace_zio
#include <trace/define_trace.h>
#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */

View File

@ -287,6 +287,7 @@ typedef struct zinject_record {
uint32_t zi_iotype;
int32_t zi_duration;
uint64_t zi_timer;
uint64_t zi_nlanes;
uint32_t zi_cmd;
uint32_t zi_pad;
} zinject_record_t;

View File

@ -420,6 +420,7 @@ struct zio {
uint64_t io_offset;
hrtime_t io_timestamp; /* submitted at */
hrtime_t io_target_timestamp;
hrtime_t io_delta; /* vdev queue service delta */
hrtime_t io_delay; /* Device access time (disk or */
/* file). */
@ -510,6 +511,8 @@ extern int zio_wait(zio_t *zio);
extern void zio_nowait(zio_t *zio);
extern void zio_execute(zio_t *zio);
extern void zio_interrupt(zio_t *zio);
extern void zio_delay_init(zio_t *zio);
extern void zio_delay_interrupt(zio_t *zio);
extern zio_t *zio_walk_parents(zio_t *cio);
extern zio_t *zio_walk_children(zio_t *pio);
@ -572,7 +575,7 @@ extern int zio_handle_fault_injection(zio_t *zio, int error);
extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
extern int zio_handle_label_injection(zio_t *zio, int error);
extern void zio_handle_ignored_writes(zio_t *zio);
extern uint64_t zio_handle_io_delay(zio_t *zio);
extern hrtime_t zio_handle_io_delay(zio_t *zio);
/*
* Checksum ereport functions

View File

@ -42,6 +42,39 @@ Cancel injection records.
.TP
.B "zinject \-d \fIvdev\fB \-A <degrade|fault> \fIpool\fB
Force a vdev into the DEGRADED or FAULTED state.
.TP
.B "zinject -d \fIvdev\fB -D latency:lanes \fIpool\fB
Add an artificial delay to IO requests on a particular
device, such that the requests take a minimum of 'latency'
milliseconds to complete. Each delay has an associated
number of 'lanes' which defines the number of concurrent
IO requests that can be processed.
For example, with a single lane delay of 10 ms (-D 10:1),
the device will only be able to service a single IO request
at a time with each request taking 10 ms to complete. So,
if only a single request is submitted every 10 ms, the
average latency will be 10 ms; but if more than one request
is submitted every 10 ms, the average latency will be more
than 10 ms.
Similarly, if a delay of 10 ms is specified to have two
lanes (-D 10:2), then the device will be able to service
two requests at a time, each with a minimum latency of
10 ms. So, if two requests are submitted every 10 ms, then
the average latency will be 10 ms; but if more than two
requests are submitted every 10 ms, the average latency
will be more than 10 ms.
Also note, these delays are additive. So two invocations
of '-D 10:1', is roughly equivalent to a single invocation
of '-D 10:2'. This also means, one can specify multiple
lanes with differing target latencies. For example, an
invocation of '-D 10:1' followed by '-D 25:2' will
create 3 lanes on the device; one lane with a latency
of 10 ms and two lanes with a 25 ms latency.
.TP
.B "zinject \-d \fIvdev\fB [\-e \fIdevice_error\fB] [\-L \fIlabel_error\fB] [\-T \fIfailure\fB] [\-F] \fIpool\fB"
Force a vdev error.

View File

@ -47,4 +47,5 @@
#include <sys/trace_multilist.h>
#include <sys/trace_txg.h>
#include <sys/trace_zil.h>
#include <sys/trace_zio.h>
#include <sys/trace_zrlock.h>

View File

@ -23,7 +23,7 @@
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
* LLNL-CODE-403049.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -414,7 +414,7 @@ vdev_disk_dio_put(dio_request_t *dr)
ASSERT3S(zio->io_error, >=, 0);
if (zio->io_error)
vdev_disk_error(zio);
zio_interrupt(zio);
zio_delay_interrupt(zio);
}
}
@ -726,6 +726,7 @@ vdev_disk_io_start(zio_t *zio)
return;
}
zio->io_target_timestamp = zio_handle_io_delay(zio);
error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
zio->io_size, zio->io_offset, flags, 0);
if (error) {

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@ -159,7 +159,7 @@ vdev_file_io_strategy(void *arg)
if (resid != 0 && zio->io_error == 0)
zio->io_error = SET_ERROR(ENOSPC);
zio_interrupt(zio);
zio_delay_interrupt(zio);
}
static void
@ -217,6 +217,8 @@ vdev_file_io_start(zio_t *zio)
return;
}
zio->io_target_timestamp = zio_handle_io_delay(zio);
VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, zio,
TQ_SLEEP), !=, 0);
}

View File

@ -753,9 +753,6 @@ vdev_queue_io_done(zio_t *zio)
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
zio_t *nio;
if (zio_injection_enabled)
delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
mutex_enter(&vq->vq_lock);
vdev_queue_pending_remove(vq, zio);

View File

@ -40,6 +40,7 @@
#include <sys/blkptr.h>
#include <sys/zfeature.h>
#include <sys/time.h>
#include <sys/trace_zio.h>
/*
* ==========================================================================
@ -1390,6 +1391,76 @@ zio_interrupt(zio_t *zio)
zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}
void
zio_delay_interrupt(zio_t *zio)
{
/*
* The timeout_generic() function isn't defined in userspace, so
* rather than trying to implement the function, the zio delay
* functionality has been disabled for userspace builds.
*/
#ifdef _KERNEL
/*
* If io_target_timestamp is zero, then no delay has been registered
* for this IO, thus jump to the end of this function and "skip" the
* delay; issuing it directly to the zio layer.
*/
if (zio->io_target_timestamp != 0) {
hrtime_t now = gethrtime();
if (now >= zio->io_target_timestamp) {
/*
* This IO has already taken longer than the target
* delay to complete, so we don't want to delay it
* any longer; we "miss" the delay and issue it
* directly to the zio layer. This is likely due to
* the target latency being set to a value less than
* the underlying hardware can satisfy (e.g. delay
* set to 1ms, but the disks take 10ms to complete an
* IO request).
*/
DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
hrtime_t, now);
zio_interrupt(zio);
} else {
taskqid_t tid;
hrtime_t diff = zio->io_target_timestamp - now;
clock_t expire_at_tick = ddi_get_lbolt() +
NSEC_TO_TICK(diff);
DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
hrtime_t, now, hrtime_t, diff);
if (NSEC_TO_TICK(diff) == 0) {
/* Our delay is less than a jiffy - just spin */
zfs_sleep_until(zio->io_target_timestamp);
} else {
/*
* Use taskq_dispatch_delay() in the place of
* OpenZFS's timeout_generic().
*/
tid = taskq_dispatch_delay(system_taskq,
(task_func_t *) zio_interrupt,
zio, TQ_NOSLEEP, expire_at_tick);
if (!tid) {
/*
* Couldn't allocate a task. Just
* finish the zio without a delay.
*/
zio_interrupt(zio);
}
}
}
return;
}
#endif
DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
zio_interrupt(zio);
}
/*
* Execute the I/O pipeline until one of the following occurs:
* (1) the I/O completes; (2) the pipeline stalls waiting for

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
/*
@ -49,15 +49,53 @@
uint32_t zio_injection_enabled = 0;
/*
* Data describing each zinject handler registered on the system, and
* contains the list node linking the handler in the global zinject
* handler list.
*/
typedef struct inject_handler {
int zi_id;
spa_t *zi_spa;
zinject_record_t zi_record;
uint64_t *zi_lanes;
int zi_next_lane;
list_node_t zi_link;
} inject_handler_t;
/*
* List of all zinject handlers registered on the system, protected by
* the inject_lock defined below.
*/
static list_t inject_handlers;
/*
* This protects insertion into, and traversal of, the inject handler
* list defined above; as well as the inject_delay_count. Any time a
* handler is inserted or removed from the list, this lock should be
* taken as a RW_WRITER; and any time traversal is done over the list
* (without modification to it) this lock should be taken as a RW_READER.
*/
static krwlock_t inject_lock;
/*
* This holds the number of zinject delay handlers that have been
* registered on the system. It is protected by the inject_lock defined
* above. Thus modifications to this count must be a RW_WRITER of the
* inject_lock, and reads of this count must be (at least) a RW_READER
* of the lock.
*/
static int inject_delay_count = 0;
/*
* This lock is used only in zio_handle_io_delay(), refer to the comment
* in that function for more details.
*/
static kmutex_t inject_delay_mtx;
/*
* Used to assign unique identifying numbers to each new zinject handler.
*/
static int inject_next_id = 1;
/*
@ -361,21 +399,70 @@ spa_handle_ignored_writes(spa_t *spa)
rw_exit(&inject_lock);
}
uint64_t
hrtime_t
zio_handle_io_delay(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
inject_handler_t *min_handler = NULL;
hrtime_t min_target = 0;
inject_handler_t *handler;
uint64_t seconds = 0;
if (zio_injection_enabled == 0)
return (0);
hrtime_t idle;
hrtime_t busy;
hrtime_t target;
rw_enter(&inject_lock, RW_READER);
for (handler = list_head(&inject_handlers); handler != NULL;
handler = list_next(&inject_handlers, handler)) {
/*
* inject_delay_count is a subset of zio_injection_enabled that
* is only incremented for delay handlers. These checks are
* mainly added to remind the reader why we're not explicitly
* checking zio_injection_enabled like the other functions.
*/
IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);
/*
* If there aren't any inject delay handlers registered, then we
* can short circuit and simply return 0 here. A value of zero
* informs zio_delay_interrupt() that this request should not be
* delayed. This short circuit keeps us from acquiring the
* inject_delay_mutex unnecessarily.
*/
if (inject_delay_count == 0) {
rw_exit(&inject_lock);
return (0);
}
/*
* Each inject handler has a number of "lanes" associated with
* it. Each lane is able to handle requests independently of one
* another, and at a latency defined by the inject handler
* record's zi_timer field. Thus if a handler in configured with
* a single lane with a 10ms latency, it will delay requests
* such that only a single request is completed every 10ms. So,
* if more than one request is attempted per each 10ms interval,
* the average latency of the requests will be greater than
* 10ms; but if only a single request is submitted each 10ms
* interval the average latency will be 10ms.
*
* We need to acquire this mutex to prevent multiple concurrent
* threads being assigned to the same lane of a given inject
* handler. The mutex allows us to perform the following two
* operations atomically:
*
* 1. determine the minimum handler and minimum target
* value of all the possible handlers
* 2. update that minimum handler's lane array
*
* Without atomicity, two (or more) threads could pick the same
* lane in step (1), and then conflict with each other in step
* (2). This could allow a single lane handler to process
* multiple requests simultaneously, which shouldn't be possible.
*/
mutex_enter(&inject_delay_mtx);
for (handler = list_head(&inject_handlers);
handler != NULL; handler = list_next(&inject_handlers, handler)) {
if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
continue;
@ -384,14 +471,101 @@ zio_handle_io_delay(zio_t *zio)
continue;
}
if (vd->vdev_guid == handler->zi_record.zi_guid) {
seconds = handler->zi_record.zi_timer;
break;
if (vd->vdev_guid != handler->zi_record.zi_guid)
continue;
/*
* Defensive; should never happen as the array allocation
* occurs prior to inserting this handler on the list.
*/
ASSERT3P(handler->zi_lanes, !=, NULL);
/*
* This should never happen, the zinject command should
* prevent a user from setting an IO delay with zero lanes.
*/
ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);
ASSERT3U(handler->zi_record.zi_nlanes, >,
handler->zi_next_lane);
/*
* We want to issue this IO to the lane that will become
* idle the soonest, so we compare the soonest this
* specific handler can complete the IO with all other
* handlers, to find the lowest value of all possible
* lanes. We then use this lane to submit the request.
*
* Since each handler has a constant value for its
* delay, we can just use the "next" lane for that
* handler; as it will always be the lane with the
* lowest value for that particular handler (i.e. the
* lane that will become idle the soonest). This saves a
* scan of each handler's lanes array.
*
* There's two cases to consider when determining when
* this specific IO request should complete. If this
* lane is idle, we want to "submit" the request now so
* it will complete after zi_timer milliseconds. Thus,
* we set the target to now + zi_timer.
*
* If the lane is busy, we want this request to complete
* zi_timer milliseconds after the lane becomes idle.
* Since the 'zi_lanes' array holds the time at which
* each lane will become idle, we use that value to
* determine when this request should complete.
*/
idle = handler->zi_record.zi_timer + gethrtime();
busy = handler->zi_record.zi_timer +
handler->zi_lanes[handler->zi_next_lane];
target = MAX(idle, busy);
if (min_handler == NULL) {
min_handler = handler;
min_target = target;
continue;
}
ASSERT3P(min_handler, !=, NULL);
ASSERT3U(min_target, !=, 0);
/*
* We don't yet increment the "next lane" variable since
* we still might find a lower value lane in another
* handler during any remaining iterations. Once we're
* sure we've selected the absolute minimum, we'll claim
* the lane and increment the handler's "next lane"
* field below.
*/
if (target < min_target) {
min_handler = handler;
min_target = target;
}
}
/*
* 'min_handler' will be NULL if no IO delays are registered for
* this vdev, otherwise it will point to the handler containing
* the lane that will become idle the soonest.
*/
if (min_handler != NULL) {
ASSERT3U(min_target, !=, 0);
min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;
/*
* If we've used all possible lanes for this handler,
* loop back and start using the first lane again;
* otherwise, just increment the lane index.
*/
min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
min_handler->zi_record.zi_nlanes;
}
mutex_exit(&inject_delay_mtx);
rw_exit(&inject_lock);
return (seconds);
return (min_target);
}
/*
@ -415,6 +589,24 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
if ((error = spa_reset(name)) != 0)
return (error);
if (record->zi_cmd == ZINJECT_DELAY_IO) {
/*
* A value of zero for the number of lanes or for the
* delay time doesn't make sense.
*/
if (record->zi_timer == 0 || record->zi_nlanes == 0)
return (SET_ERROR(EINVAL));
/*
* The number of lanes is directly mapped to the size of
* an array used by the handler. Thus, to ensure the
* user doesn't trigger an allocation that's "too large"
* we cap the number of lanes here.
*/
if (record->zi_nlanes >= UINT16_MAX)
return (SET_ERROR(EINVAL));
}
if (!(flags & ZINJECT_NULL)) {
/*
* spa_inject_ref() will add an injection reference, which will
@ -426,11 +618,34 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
rw_enter(&inject_lock, RW_WRITER);
*id = handler->zi_id = inject_next_id++;
handler->zi_spa = spa;
handler->zi_record = *record;
if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
handler->zi_lanes = kmem_zalloc(
sizeof (*handler->zi_lanes) *
handler->zi_record.zi_nlanes, KM_SLEEP);
handler->zi_next_lane = 0;
} else {
handler->zi_lanes = NULL;
handler->zi_next_lane = 0;
}
rw_enter(&inject_lock, RW_WRITER);
/*
* We can't move this increment into the conditional
* above because we need to hold the RW_WRITER lock of
* inject_lock, and we don't want to hold that while
* allocating the handler's zi_lanes array.
*/
if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
ASSERT3S(inject_delay_count, >=, 0);
inject_delay_count++;
ASSERT3S(inject_delay_count, >, 0);
}
*id = handler->zi_id = inject_next_id++;
list_insert_tail(&inject_handlers, handler);
atomic_inc_32(&zio_injection_enabled);
@ -508,9 +723,23 @@ zio_clear_fault(int id)
return (SET_ERROR(ENOENT));
}
if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
ASSERT3S(inject_delay_count, >, 0);
inject_delay_count--;
ASSERT3S(inject_delay_count, >=, 0);
}
list_remove(&inject_handlers, handler);
rw_exit(&inject_lock);
if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
ASSERT3P(handler->zi_lanes, !=, NULL);
kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
handler->zi_record.zi_nlanes);
} else {
ASSERT3P(handler->zi_lanes, ==, NULL);
}
spa_inject_delref(handler->zi_spa);
kmem_free(handler, sizeof (inject_handler_t));
atomic_dec_32(&zio_injection_enabled);
@ -522,6 +751,7 @@ void
zio_inject_init(void)
{
rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
list_create(&inject_handlers, sizeof (inject_handler_t),
offsetof(inject_handler_t, zi_link));
}
@ -530,6 +760,7 @@ void
zio_inject_fini(void)
{
list_destroy(&inject_handlers);
mutex_destroy(&inject_delay_mtx);
rw_destroy(&inject_lock);
}

View File

@ -38,10 +38,7 @@ verify_disk_count "$DISKS" 2
default_mirror_setup_noexit $DISK1 $DISK2
mntpnt=$(get_prop mountpoint $TESTPOOL)
typeset -i i=0
while ((i < 10)); do
log_must $DD if=/dev/urandom of=$mntpnt/bigfile.$i bs=1024k count=100
((i += 1))
done
# Create 100MB of data
log_must $FILE_WRITE -b 1048576 -c 100 -o create -d 0 -f $mntpnt/bigfile
log_pass

View File

@ -38,16 +38,15 @@
# 3. Verify zpool scrub -s succeed when the system is scrubbing.
#
# NOTES:
# A 1 second delay is added to 10% of zio's in order to ensure that
# the scrub does not complete before it has a chance to be cancelled.
# A 10ms delay is added to the ZIOs in order to ensure that the
# scrub does not complete before it has a chance to be cancelled.
# This can occur when testing with small pools or very fast hardware.
#
verify_runnable "global"
log_assert "Verify scrub -s works correctly."
log_must $ZINJECT -d $DISK1 -f10 -D1 $TESTPOOL
log_must $ZINJECT -d $DISK1 -D10:1 $TESTPOOL
log_must $ZPOOL scrub $TESTPOOL
log_must $ZPOOL scrub -s $TESTPOOL
log_must is_pool_scrub_stopped $TESTPOOL

View File

@ -40,8 +40,8 @@
# 4. Check the percent again, verify a new scrub started.
#
# NOTES:
# A 1 second delay is added to 10% of zio's in order to ensure that
# the scrub does not complete before it has a chance to be restarted.
# A 10ms delay is added to the ZIOs in order to ensure that the
# scrub does not complete before it has a chance to be restarted.
# This can occur when testing with small pools or very fast hardware.
#
@ -61,7 +61,7 @@ function get_scrub_percent
log_assert "scrub command terminates the existing scrub process and starts" \
"a new scrub."
log_must $ZINJECT -d $DISK1 -f10 -D1 $TESTPOOL
log_must $ZINJECT -d $DISK1 -D10:1 $TESTPOOL
log_must $ZPOOL scrub $TESTPOOL
typeset -i PERCENT=30 percent=0
while ((percent < PERCENT)) ; do

View File

@ -42,16 +42,16 @@
# 3. Verify scrub failed until the resilver completed
#
# NOTES:
# A 1 second delay is added to 10% of zio's in order to ensure that
# the resilver does not complete before the scrub can be issue. This
# A 10ms delay is added to 10% of zio's in order to ensure that the
# resilver does not complete before the scrub can be issued. This
# can occur when testing with small pools or very fast hardware.
verify_runnable "global"
log_assert "Resilver prevent scrub from starting until the resilver completes"
log_must $ZINJECT -d $DISK1 -f10 -D1 $TESTPOOL
log_must $ZPOOL detach $TESTPOOL $DISK2
log_must $ZINJECT -d $DISK1 -D10:1 $TESTPOOL
log_must $ZPOOL attach $TESTPOOL $DISK1 $DISK2
log_must is_pool_resilvering $TESTPOOL
log_mustnot $ZPOOL scrub $TESTPOOL