MFV v242732:
Merge the ZFS I/O deadman thread from vendor (illumos). This feature panics the system on hanging ZFS I/O, which helps debugging and resuming a failed service. The panic behavior can be controlled with the loader-only tunables: vfs.zfs.deadman_enabled (enable or disable panic on stalled ZFS I/O) and vfs.zfs.deadman_synctime (expiration time for stalled ZFS I/O). By default, the ZFS I/O deadman is enabled on amd64 and i386, excluding virtual guest machines. Illumos ZFS issues: 3246 ZFS I/O deadman thread References: https://www.illumos.org/issues/3246 MFC after: 2 weeks
This commit is contained in:
commit
e70664bafc
@ -20,6 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <libzfs.h>
|
||||
@ -455,6 +456,20 @@ translate_device(const char *pool, const char *device, err_type_t label_type,
|
||||
&record->zi_guid) == 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Device faults can take on three different forms:
|
||||
* 1). delayed or hanging I/O
|
||||
* 2). zfs label faults
|
||||
* 3). generic disk faults
|
||||
*/
|
||||
if (record->zi_timer != 0) {
|
||||
record->zi_cmd = ZINJECT_DELAY_IO;
|
||||
} else if (label_type != TYPE_INVAL) {
|
||||
record->zi_cmd = ZINJECT_LABEL_FAULT;
|
||||
} else {
|
||||
record->zi_cmd = ZINJECT_DEVICE_FAULT;
|
||||
}
|
||||
|
||||
switch (label_type) {
|
||||
case TYPE_LABEL_UBERBLOCK:
|
||||
record->zi_start = offsetof(vdev_label_t, vl_uberblock[0]);
|
||||
|
@ -20,6 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
/*
|
||||
@ -603,7 +604,7 @@ main(int argc, char **argv)
|
||||
}
|
||||
|
||||
while ((c = getopt(argc, argv,
|
||||
":aA:b:d:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
|
||||
":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
|
||||
switch (c) {
|
||||
case 'a':
|
||||
flags |= ZINJECT_FLUSH_ARC;
|
||||
@ -629,6 +630,15 @@ main(int argc, char **argv)
|
||||
case 'd':
|
||||
device = optarg;
|
||||
break;
|
||||
case 'D':
|
||||
record.zi_timer = strtoull(optarg, &end, 10);
|
||||
if (errno != 0 || *end != '\0') {
|
||||
(void) fprintf(stderr, "invalid i/o delay "
|
||||
"value: '%s'\n", optarg);
|
||||
usage();
|
||||
return (1);
|
||||
}
|
||||
break;
|
||||
case 'e':
|
||||
if (strcasecmp(optarg, "io") == 0) {
|
||||
error = EIO;
|
||||
@ -693,6 +703,7 @@ main(int argc, char **argv)
|
||||
case 'p':
|
||||
(void) strlcpy(record.zi_func, optarg,
|
||||
sizeof (record.zi_func));
|
||||
record.zi_cmd = ZINJECT_PANIC;
|
||||
break;
|
||||
case 'q':
|
||||
quiet = 1;
|
||||
@ -766,13 +777,15 @@ main(int argc, char **argv)
|
||||
argc -= optind;
|
||||
argv += optind;
|
||||
|
||||
if (record.zi_duration != 0)
|
||||
record.zi_cmd = ZINJECT_IGNORED_WRITES;
|
||||
|
||||
if (cancel != NULL) {
|
||||
/*
|
||||
* '-c' is invalid with any other options.
|
||||
*/
|
||||
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
|
||||
level != 0 || record.zi_func[0] != '\0' ||
|
||||
record.zi_duration != 0) {
|
||||
level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
|
||||
(void) fprintf(stderr, "cancel (-c) incompatible with "
|
||||
"any other options\n");
|
||||
usage();
|
||||
@ -804,8 +817,7 @@ main(int argc, char **argv)
|
||||
* for doing injection, so handle it separately here.
|
||||
*/
|
||||
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
|
||||
level != 0 || record.zi_func[0] != '\0' ||
|
||||
record.zi_duration != 0) {
|
||||
level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
|
||||
(void) fprintf(stderr, "device (-d) incompatible with "
|
||||
"data error injection\n");
|
||||
usage();
|
||||
@ -839,7 +851,7 @@ main(int argc, char **argv)
|
||||
|
||||
} else if (raw != NULL) {
|
||||
if (range != NULL || type != TYPE_INVAL || level != 0 ||
|
||||
record.zi_func[0] != '\0' || record.zi_duration != 0) {
|
||||
record.zi_cmd != ZINJECT_UNINITIALIZED) {
|
||||
(void) fprintf(stderr, "raw (-b) format with "
|
||||
"any other options\n");
|
||||
usage();
|
||||
@ -862,13 +874,14 @@ main(int argc, char **argv)
|
||||
return (1);
|
||||
}
|
||||
|
||||
record.zi_cmd = ZINJECT_DATA_FAULT;
|
||||
if (translate_raw(raw, &record) != 0)
|
||||
return (1);
|
||||
if (!error)
|
||||
error = EIO;
|
||||
} else if (record.zi_func[0] != '\0') {
|
||||
} else if (record.zi_cmd == ZINJECT_PANIC) {
|
||||
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
|
||||
level != 0 || device != NULL || record.zi_duration != 0) {
|
||||
level != 0 || device != NULL) {
|
||||
(void) fprintf(stderr, "panic (-p) incompatible with "
|
||||
"other options\n");
|
||||
usage();
|
||||
@ -886,7 +899,7 @@ main(int argc, char **argv)
|
||||
if (argv[1] != NULL)
|
||||
record.zi_type = atoi(argv[1]);
|
||||
dataset[0] = '\0';
|
||||
} else if (record.zi_duration != 0) {
|
||||
} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
|
||||
if (nowrites == 0) {
|
||||
(void) fprintf(stderr, "-s or -g meaningless "
|
||||
"without -I (ignore writes)\n");
|
||||
@ -940,6 +953,7 @@ main(int argc, char **argv)
|
||||
return (1);
|
||||
}
|
||||
|
||||
record.zi_cmd = ZINJECT_DATA_FAULT;
|
||||
if (translate_record(type, argv[0], range, level, &record, pool,
|
||||
dataset) != 0)
|
||||
return (1);
|
||||
|
@ -45,6 +45,9 @@ int aok;
|
||||
uint64_t physmem;
|
||||
vnode_t *rootdir = (vnode_t *)0xabcd1234;
|
||||
char hw_serial[HW_HOSTID_LEN];
|
||||
#ifdef illumos
|
||||
kmutex_t cpu_lock;
|
||||
#endif
|
||||
|
||||
struct utsname utsname = {
|
||||
"userland", "libzpool", "1", "1", "na"
|
||||
@ -842,6 +845,28 @@ ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)
|
||||
return (0);
|
||||
}
|
||||
|
||||
#ifdef illumos
|
||||
/* ARGSUSED */
|
||||
cyclic_id_t
|
||||
cyclic_add(cyc_handler_t *hdlr, cyc_time_t *when)
|
||||
{
|
||||
return (1);
|
||||
}
|
||||
|
||||
/* ARGSUSED */
|
||||
void
|
||||
cyclic_remove(cyclic_id_t id)
|
||||
{
|
||||
}
|
||||
|
||||
/* ARGSUSED */
|
||||
int
|
||||
cyclic_reprogram(cyclic_id_t id, hrtime_t expiration)
|
||||
{
|
||||
return (1);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* =========================================================================
|
||||
* kernel emulation setup & teardown
|
||||
@ -875,6 +900,10 @@ kernel_init(int mode)
|
||||
|
||||
system_taskq_init();
|
||||
|
||||
#ifdef illumos
|
||||
mutex_init(&cpu_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
#endif
|
||||
|
||||
spa_init(mode);
|
||||
}
|
||||
|
||||
|
@ -457,6 +457,9 @@ extern vnode_t *rootdir;
|
||||
|
||||
extern void delay(clock_t ticks);
|
||||
|
||||
#define SEC_TO_TICK(sec) ((sec) * hz)
|
||||
/* Convert nanoseconds to ticks by dividing by the nanoseconds per tick (NANOSEC / hz). */
#define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz))
|
||||
|
||||
#define gethrestime_sec() time(NULL)
|
||||
#define gethrestime(t) \
|
||||
do {\
|
||||
@ -624,6 +627,36 @@ typedef uint32_t idmap_rid_t;
|
||||
#define ERESTART (-1)
|
||||
#endif
|
||||
|
||||
#ifdef illumos
|
||||
/*
|
||||
* Cyclic information
|
||||
*/
|
||||
extern kmutex_t cpu_lock;
|
||||
|
||||
typedef uintptr_t cyclic_id_t;
|
||||
typedef uint16_t cyc_level_t;
|
||||
typedef void (*cyc_func_t)(void *);
|
||||
|
||||
#define CY_LOW_LEVEL 0
|
||||
#define CY_INFINITY INT64_MAX
|
||||
#define CYCLIC_NONE ((cyclic_id_t)0)
|
||||
|
||||
typedef struct cyc_time {
|
||||
hrtime_t cyt_when;
|
||||
hrtime_t cyt_interval;
|
||||
} cyc_time_t;
|
||||
|
||||
typedef struct cyc_handler {
|
||||
cyc_func_t cyh_func;
|
||||
void *cyh_arg;
|
||||
cyc_level_t cyh_level;
|
||||
} cyc_handler_t;
|
||||
|
||||
extern cyclic_id_t cyclic_add(cyc_handler_t *, cyc_time_t *);
|
||||
extern void cyclic_remove(cyclic_id_t);
|
||||
extern int cyclic_reprogram(cyclic_id_t, hrtime_t);
|
||||
#endif /* illumos */
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -46,6 +46,9 @@ typedef longlong_t hrtime_t;
|
||||
((ts)->tv_sec < INT64_MIN || (ts)->tv_sec > INT64_MAX)
|
||||
#endif
|
||||
|
||||
#define SEC_TO_TICK(sec) ((sec) * hz)
|
||||
/* Convert nanoseconds to ticks by dividing by the nanoseconds per tick (NANOSEC / hz). */
#define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz))
|
||||
|
||||
#ifdef _KERNEL
|
||||
static __inline hrtime_t
|
||||
gethrtime(void) {
|
||||
|
@ -22,6 +22,7 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
|
||||
*/
|
||||
|
||||
/*
|
||||
@ -141,6 +142,10 @@ uint_t zio_taskq_basedc = 80; /* base duty cycle */
|
||||
boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
|
||||
extern int zfs_sync_pass_deferred_free;
|
||||
|
||||
#ifndef illumos
|
||||
extern void spa_deadman(void *arg);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* This (illegal) pool name is used when temporarily importing a spa_t in order
|
||||
* to get the vdev stats associated with the imported devices.
|
||||
@ -6258,6 +6263,17 @@ spa_sync(spa_t *spa, uint64_t txg)
|
||||
|
||||
tx = dmu_tx_create_assigned(dp, txg);
|
||||
|
||||
spa->spa_sync_starttime = gethrtime();
|
||||
#ifdef illumos
|
||||
VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
|
||||
spa->spa_sync_starttime + spa->spa_deadman_synctime));
|
||||
#else /* FreeBSD */
|
||||
#ifdef _KERNEL
|
||||
callout_reset(&spa->spa_deadman_cycid,
|
||||
hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/*
|
||||
* If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
|
||||
* set spa_deflate if we have no raid-z vdevs.
|
||||
@ -6386,6 +6402,14 @@ spa_sync(spa_t *spa, uint64_t txg)
|
||||
}
|
||||
dmu_tx_commit(tx);
|
||||
|
||||
#ifdef illumos
|
||||
VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
|
||||
#else /* FreeBSD */
|
||||
#ifdef _KERNEL
|
||||
callout_drain(&spa->spa_deadman_cycid);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Clear the dirty config list.
|
||||
*/
|
||||
|
@ -22,10 +22,12 @@
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012 by Delphix. All rights reserved.
|
||||
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
||||
* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/spa_impl.h>
|
||||
#include <sys/spa_boot.h>
|
||||
#include <sys/zio.h>
|
||||
#include <sys/zio_checksum.h>
|
||||
#include <sys/zio_compress.h>
|
||||
@ -253,6 +255,52 @@ TUNABLE_INT("vfs.zfs.recover", &zfs_recover);
|
||||
SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0,
|
||||
"Try to recover from otherwise-fatal errors.");
|
||||
|
||||
extern int zfs_txg_synctime_ms;
|
||||
|
||||
/*
|
||||
* Expiration time in units of zfs_txg_synctime_ms. This value has two
|
||||
* meanings. First it is used to determine when the spa_deadman logic
|
||||
* should fire. By default the spa_deadman will fire if spa_sync has
|
||||
* not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds).
|
||||
* Secondly, the value determines if an I/O is considered "hung".
|
||||
* Any I/O that has not completed in zfs_deadman_synctime is considered
|
||||
* "hung" resulting in a system panic.
|
||||
* The default is 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds).
|
||||
*/
|
||||
uint64_t zfs_deadman_synctime = 1000ULL;
|
||||
TUNABLE_QUAD("vfs.zfs.deadman_synctime", &zfs_deadman_synctime);
|
||||
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime, CTLFLAG_RDTUN,
|
||||
&zfs_deadman_synctime, 0,
|
||||
"Stalled ZFS I/O expiration time in units of vfs.zfs.txg_synctime_ms");
|
||||
|
||||
/*
|
||||
* Default value of -1 for zfs_deadman_enabled is resolved in
|
||||
* zfs_deadman_init()
|
||||
*/
|
||||
int zfs_deadman_enabled = -1;
|
||||
TUNABLE_INT("vfs.zfs.deadman_enabled", &zfs_deadman_enabled);
|
||||
SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN,
|
||||
&zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O");
|
||||
|
||||
#ifndef illumos
|
||||
#ifdef _KERNEL
|
||||
static void
|
||||
zfs_deadman_init()
|
||||
{
|
||||
/*
|
||||
* If we are not i386 or amd64 or in a virtual machine,
|
||||
* disable ZFS deadman thread by default
|
||||
*/
|
||||
if (zfs_deadman_enabled == -1) {
|
||||
#if defined(__amd64__) || defined(__i386__)
|
||||
zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0;
|
||||
#else
|
||||
zfs_deadman_enabled = 0;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#endif /* _KERNEL */
|
||||
#endif /* !illumos */
|
||||
|
||||
/*
|
||||
* ==========================================================================
|
||||
@ -421,6 +469,23 @@ spa_lookup(const char *name)
|
||||
return (spa);
|
||||
}
|
||||
|
||||
/*
|
||||
* Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
|
||||
* If the zfs_deadman_enabled flag is set then it inspects all vdev queues
|
||||
* looking for potentially hung I/Os.
|
||||
*/
|
||||
void
|
||||
spa_deadman(void *arg)
|
||||
{
|
||||
spa_t *spa = arg;
|
||||
|
||||
zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
|
||||
(gethrtime() - spa->spa_sync_starttime) / NANOSEC,
|
||||
++spa->spa_deadman_calls);
|
||||
if (zfs_deadman_enabled)
|
||||
vdev_deadman(spa->spa_root_vdev);
|
||||
}
|
||||
|
||||
/*
|
||||
* Create an uninitialized spa_t with the given name. Requires
|
||||
* spa_namespace_lock. The caller must ensure that the spa_t doesn't already
|
||||
@ -431,6 +496,10 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
|
||||
{
|
||||
spa_t *spa;
|
||||
spa_config_dirent_t *dp;
|
||||
#ifdef illumos
|
||||
cyc_handler_t hdlr;
|
||||
cyc_time_t when;
|
||||
#endif
|
||||
|
||||
ASSERT(MUTEX_HELD(&spa_namespace_lock));
|
||||
|
||||
@ -462,6 +531,32 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
|
||||
spa->spa_proc = &p0;
|
||||
spa->spa_proc_state = SPA_PROC_NONE;
|
||||
|
||||
#ifdef illumos
|
||||
hdlr.cyh_func = spa_deadman;
|
||||
hdlr.cyh_arg = spa;
|
||||
hdlr.cyh_level = CY_LOW_LEVEL;
|
||||
#endif
|
||||
|
||||
spa->spa_deadman_synctime = zfs_deadman_synctime *
|
||||
zfs_txg_synctime_ms * MICROSEC;
|
||||
|
||||
#ifdef illumos
|
||||
/*
|
||||
* This determines how often we need to check for hung I/Os after
|
||||
* the cyclic has already fired. Since checking for hung I/Os is
|
||||
* an expensive operation we don't want to check too frequently.
|
||||
* Instead wait for 5 synctimes before checking again.
|
||||
*/
|
||||
when.cyt_interval = 5ULL * zfs_txg_synctime_ms * MICROSEC;
|
||||
when.cyt_when = CY_INFINITY;
|
||||
mutex_enter(&cpu_lock);
|
||||
spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
|
||||
mutex_exit(&cpu_lock);
|
||||
#else /* !illumos */
|
||||
#ifdef _KERNEL
|
||||
callout_init(&spa->spa_deadman_cycid, CALLOUT_MPSAFE);
|
||||
#endif
|
||||
#endif
|
||||
refcount_create(&spa->spa_refcount);
|
||||
spa_config_lock_init(spa);
|
||||
|
||||
@ -544,6 +639,18 @@ spa_remove(spa_t *spa)
|
||||
nvlist_free(spa->spa_load_info);
|
||||
spa_config_set(spa, NULL);
|
||||
|
||||
#ifdef illumos
|
||||
mutex_enter(&cpu_lock);
|
||||
if (spa->spa_deadman_cycid != CYCLIC_NONE)
|
||||
cyclic_remove(spa->spa_deadman_cycid);
|
||||
mutex_exit(&cpu_lock);
|
||||
spa->spa_deadman_cycid = CYCLIC_NONE;
|
||||
#else /* !illumos */
|
||||
#ifdef _KERNEL
|
||||
callout_drain(&spa->spa_deadman_cycid);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
refcount_destroy(&spa->spa_refcount);
|
||||
|
||||
spa_config_lock_destroy(spa);
|
||||
@ -1510,6 +1617,12 @@ spa_prev_software_version(spa_t *spa)
|
||||
return (spa->spa_prev_software_version);
|
||||
}
|
||||
|
||||
uint64_t
|
||||
spa_deadman_synctime(spa_t *spa)
|
||||
{
|
||||
return (spa->spa_deadman_synctime);
|
||||
}
|
||||
|
||||
uint64_t
|
||||
dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
|
||||
{
|
||||
@ -1605,7 +1718,9 @@ spa_init(int mode)
|
||||
spa_mode_global = mode;
|
||||
|
||||
#ifdef illumos
|
||||
#ifndef _KERNEL
|
||||
#ifdef _KERNEL
|
||||
spa_arch_init();
|
||||
#else
|
||||
if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
|
||||
arc_procfd = open("/proc/self/ctl", O_WRONLY);
|
||||
if (arc_procfd == -1) {
|
||||
@ -1629,6 +1744,11 @@ spa_init(int mode)
|
||||
zpool_feature_init();
|
||||
spa_config_load();
|
||||
l2arc_start();
|
||||
#ifndef illumos
|
||||
#ifdef _KERNEL
|
||||
zfs_deadman_init();
|
||||
#endif
|
||||
#endif /* !illumos */
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -599,6 +599,7 @@ extern boolean_t spa_suspended(spa_t *spa);
|
||||
extern uint64_t spa_bootfs(spa_t *spa);
|
||||
extern uint64_t spa_delegation(spa_t *spa);
|
||||
extern objset_t *spa_meta_objset(spa_t *spa);
|
||||
extern uint64_t spa_deadman_synctime(spa_t *spa);
|
||||
|
||||
/* Miscellaneous support routines */
|
||||
extern void spa_activate_mos_feature(spa_t *spa, const char *feature);
|
||||
|
@ -23,6 +23,10 @@
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2012 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_SPA_BOOT_H
|
||||
#define _SYS_SPA_BOOT_H
|
||||
|
||||
@ -35,6 +39,8 @@ extern "C" {
|
||||
extern char *spa_get_bootprop(char *prop);
|
||||
extern void spa_free_bootprop(char *prop);
|
||||
|
||||
extern void spa_arch_init(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -22,6 +22,7 @@
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012 by Delphix. All rights reserved.
|
||||
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
||||
* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_SPA_IMPL_H
|
||||
@ -230,6 +231,16 @@ struct spa {
|
||||
uint64_t spa_feat_for_write_obj; /* required to write to pool */
|
||||
uint64_t spa_feat_for_read_obj; /* required to read from pool */
|
||||
uint64_t spa_feat_desc_obj; /* Feature descriptions */
|
||||
#ifdef illumos
|
||||
cyclic_id_t spa_deadman_cycid; /* cyclic id */
|
||||
#else /* FreeBSD */
|
||||
#ifdef _KERNEL
|
||||
struct callout spa_deadman_cycid; /* callout id */
|
||||
#endif
|
||||
#endif /* illumos */
|
||||
uint64_t spa_deadman_calls; /* number of deadman calls */
|
||||
uint64_t spa_sync_starttime; /* starting time of spa_sync */
|
||||
uint64_t spa_deadman_synctime; /* deadman expiration timer */
|
||||
/*
|
||||
* spa_refcnt & spa_config_lock must be the last elements
|
||||
* because refcount_t changes size based on compilation options.
|
||||
|
@ -80,6 +80,7 @@ extern void vdev_metaslab_fini(vdev_t *vd);
|
||||
extern void vdev_metaslab_set_size(vdev_t *);
|
||||
extern void vdev_expand(vdev_t *vd, uint64_t txg);
|
||||
extern void vdev_split(vdev_t *vd);
|
||||
extern void vdev_deadman(vdev_t *vd);
|
||||
|
||||
|
||||
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
|
||||
|
@ -104,6 +104,8 @@ struct vdev_queue {
|
||||
avl_tree_t vq_read_tree;
|
||||
avl_tree_t vq_write_tree;
|
||||
avl_tree_t vq_pending_tree;
|
||||
uint64_t vq_io_complete_ts;
|
||||
uint64_t vq_io_delta_ts;
|
||||
kmutex_t vq_lock;
|
||||
};
|
||||
|
||||
|
@ -22,6 +22,10 @@
|
||||
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
/*
|
||||
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_ZFS_CONTEXT_H
|
||||
#define _SYS_ZFS_CONTEXT_H
|
||||
@ -88,6 +92,11 @@ extern "C" {
|
||||
#include <sys/u8_textprep.h>
|
||||
#include <sys/fm/util.h>
|
||||
#include <sys/sunddi.h>
|
||||
#ifdef illumos
|
||||
#include <sys/cyclic.h>
|
||||
#else /* FreeBSD */
|
||||
#include <sys/callout.h>
|
||||
#endif
|
||||
|
||||
#include <machine/stdarg.h>
|
||||
|
||||
|
@ -246,12 +246,24 @@ typedef struct zinject_record {
|
||||
uint32_t zi_iotype;
|
||||
int32_t zi_duration;
|
||||
uint64_t zi_timer;
|
||||
uint32_t zi_cmd;
|
||||
uint32_t zi_pad;
|
||||
} zinject_record_t;
|
||||
|
||||
#define ZINJECT_NULL 0x1
|
||||
#define ZINJECT_FLUSH_ARC 0x2
|
||||
#define ZINJECT_UNLOAD_SPA 0x4
|
||||
|
||||
/*
 * Injection command carried in zinject_record_t.zi_cmd.  It identifies
 * which kind of injection a record describes, so handlers can select the
 * records that apply to them by command.
 */
typedef enum zinject_type {
	ZINJECT_UNINITIALIZED = 0,	/* no command selected */
	ZINJECT_DATA_FAULT = 1,		/* data error injection */
	ZINJECT_DEVICE_FAULT = 2,	/* generic device fault */
	ZINJECT_LABEL_FAULT = 3,	/* vdev label fault */
	ZINJECT_IGNORED_WRITES = 4,	/* ignored writes */
	ZINJECT_PANIC = 5,		/* panic injection */
	ZINJECT_DELAY_IO = 6		/* delayed or hanging I/O */
} zinject_type_t;
|
||||
|
||||
typedef struct zfs_share {
|
||||
uint64_t z_exportdata;
|
||||
uint64_t z_sharedata;
|
||||
|
@ -443,6 +443,7 @@ struct zio {
|
||||
|
||||
uint64_t io_offset;
|
||||
uint64_t io_deadline;
|
||||
uint64_t io_timestamp;
|
||||
avl_node_t io_offset_node;
|
||||
avl_node_t io_deadline_node;
|
||||
avl_tree_t *io_vdev_tree;
|
||||
@ -596,6 +597,7 @@ extern int zio_handle_fault_injection(zio_t *zio, int error);
|
||||
extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
|
||||
extern int zio_handle_label_injection(zio_t *zio, int error);
|
||||
extern void zio_handle_ignored_writes(zio_t *zio);
|
||||
extern uint64_t zio_handle_io_delay(zio_t *zio);
|
||||
|
||||
/*
|
||||
* Checksum ereport functions
|
||||
|
@ -3173,3 +3173,41 @@ vdev_split(vdev_t *vd)
|
||||
}
|
||||
vdev_propagate_state(cvd);
|
||||
}
|
||||
|
||||
void
|
||||
vdev_deadman(vdev_t *vd)
|
||||
{
|
||||
for (int c = 0; c < vd->vdev_children; c++) {
|
||||
vdev_t *cvd = vd->vdev_child[c];
|
||||
|
||||
vdev_deadman(cvd);
|
||||
}
|
||||
|
||||
if (vd->vdev_ops->vdev_op_leaf) {
|
||||
vdev_queue_t *vq = &vd->vdev_queue;
|
||||
|
||||
mutex_enter(&vq->vq_lock);
|
||||
if (avl_numnodes(&vq->vq_pending_tree) > 0) {
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
zio_t *fio;
|
||||
uint64_t delta;
|
||||
|
||||
/*
|
||||
* Look at the head of all the pending queues,
|
||||
* if any I/O has been outstanding for longer than
|
||||
* the spa_deadman_synctime we panic the system.
|
||||
*/
|
||||
fio = avl_first(&vq->vq_pending_tree);
|
||||
delta = ddi_get_lbolt64() - fio->io_timestamp;
|
||||
if (delta > NSEC_TO_TICK(spa_deadman_synctime(spa))) {
|
||||
zfs_dbgmsg("SLOW IO: zio timestamp %llu, "
|
||||
"delta %llu, last io %llu",
|
||||
fio->io_timestamp, delta,
|
||||
vq->vq_io_complete_ts);
|
||||
fm_panic("I/O to pool '%s' appears to be "
|
||||
"hung.", spa_name(spa));
|
||||
}
|
||||
}
|
||||
mutex_exit(&vq->vq_lock);
|
||||
}
|
||||
}
|
||||
|
@ -23,6 +23,10 @@
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2012 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/vdev_impl.h>
|
||||
#include <sys/zio.h>
|
||||
@ -315,6 +319,7 @@ again:
|
||||
zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
|
||||
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
|
||||
vdev_queue_agg_io_done, NULL);
|
||||
aio->io_timestamp = fio->io_timestamp;
|
||||
|
||||
nio = fio;
|
||||
do {
|
||||
@ -386,7 +391,8 @@ vdev_queue_io(zio_t *zio)
|
||||
|
||||
mutex_enter(&vq->vq_lock);
|
||||
|
||||
zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) +
|
||||
zio->io_timestamp = ddi_get_lbolt64();
|
||||
zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
|
||||
zio->io_priority;
|
||||
|
||||
vdev_queue_io_add(vq, zio);
|
||||
@ -411,10 +417,16 @@ vdev_queue_io_done(zio_t *zio)
|
||||
{
|
||||
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
|
||||
|
||||
if (zio_injection_enabled)
|
||||
delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
|
||||
|
||||
mutex_enter(&vq->vq_lock);
|
||||
|
||||
avl_remove(&vq->vq_pending_tree, zio);
|
||||
|
||||
vq->vq_io_complete_ts = ddi_get_lbolt64();
|
||||
vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;
|
||||
|
||||
for (int i = 0; i < zfs_vdev_ramp_rate; i++) {
|
||||
zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
|
||||
if (nio == NULL)
|
||||
|
@ -20,6 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
/*
|
||||
@ -147,14 +148,8 @@ zio_handle_fault_injection(zio_t *zio, int error)
|
||||
for (handler = list_head(&inject_handlers); handler != NULL;
|
||||
handler = list_next(&inject_handlers, handler)) {
|
||||
|
||||
/* Ignore errors not destined for this pool */
|
||||
if (zio->io_spa != handler->zi_spa)
|
||||
continue;
|
||||
|
||||
/* Ignore device errors and panic injection */
|
||||
if (handler->zi_record.zi_guid != 0 ||
|
||||
handler->zi_record.zi_func[0] != '\0' ||
|
||||
handler->zi_record.zi_duration != 0)
|
||||
if (zio->io_spa != handler->zi_spa ||
|
||||
handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
|
||||
continue;
|
||||
|
||||
/* If this handler matches, return EIO */
|
||||
@ -197,10 +192,7 @@ zio_handle_label_injection(zio_t *zio, int error)
|
||||
uint64_t start = handler->zi_record.zi_start;
|
||||
uint64_t end = handler->zi_record.zi_end;
|
||||
|
||||
/* Ignore device only faults or panic injection */
|
||||
if (handler->zi_record.zi_start == 0 ||
|
||||
handler->zi_record.zi_func[0] != '\0' ||
|
||||
handler->zi_record.zi_duration != 0)
|
||||
if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
|
||||
continue;
|
||||
|
||||
/*
|
||||
@ -246,13 +238,7 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
|
||||
for (handler = list_head(&inject_handlers); handler != NULL;
|
||||
handler = list_next(&inject_handlers, handler)) {
|
||||
|
||||
/*
|
||||
* Ignore label specific faults, panic injection
|
||||
* or fake writes
|
||||
*/
|
||||
if (handler->zi_record.zi_start != 0 ||
|
||||
handler->zi_record.zi_func[0] != '\0' ||
|
||||
handler->zi_record.zi_duration != 0)
|
||||
if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
|
||||
continue;
|
||||
|
||||
if (vd->vdev_guid == handler->zi_record.zi_guid) {
|
||||
@ -316,10 +302,8 @@ zio_handle_ignored_writes(zio_t *zio)
|
||||
handler = list_next(&inject_handlers, handler)) {
|
||||
|
||||
/* Ignore errors not destined for this pool */
|
||||
if (zio->io_spa != handler->zi_spa)
|
||||
continue;
|
||||
|
||||
if (handler->zi_record.zi_duration == 0)
|
||||
if (zio->io_spa != handler->zi_spa ||
|
||||
handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
|
||||
continue;
|
||||
|
||||
/*
|
||||
@ -355,11 +339,8 @@ spa_handle_ignored_writes(spa_t *spa)
|
||||
for (handler = list_head(&inject_handlers); handler != NULL;
|
||||
handler = list_next(&inject_handlers, handler)) {
|
||||
|
||||
/* Ignore errors not destined for this pool */
|
||||
if (spa != handler->zi_spa)
|
||||
continue;
|
||||
|
||||
if (handler->zi_record.zi_duration == 0)
|
||||
if (spa != handler->zi_spa ||
|
||||
handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
|
||||
continue;
|
||||
|
||||
if (handler->zi_record.zi_duration > 0) {
|
||||
@ -379,6 +360,34 @@ spa_handle_ignored_writes(spa_t *spa)
|
||||
rw_exit(&inject_lock);
|
||||
}
|
||||
|
||||
uint64_t
|
||||
zio_handle_io_delay(zio_t *zio)
|
||||
{
|
||||
vdev_t *vd = zio->io_vd;
|
||||
inject_handler_t *handler;
|
||||
uint64_t seconds = 0;
|
||||
|
||||
if (zio_injection_enabled == 0)
|
||||
return (0);
|
||||
|
||||
rw_enter(&inject_lock, RW_READER);
|
||||
|
||||
for (handler = list_head(&inject_handlers); handler != NULL;
|
||||
handler = list_next(&inject_handlers, handler)) {
|
||||
|
||||
if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
|
||||
continue;
|
||||
|
||||
if (vd->vdev_guid == handler->zi_record.zi_guid) {
|
||||
seconds = handler->zi_record.zi_timer;
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
rw_exit(&inject_lock);
|
||||
return (seconds);
|
||||
}
|
||||
|
||||
/*
|
||||
* Create a new handler for the given record. We add it to the list, adding
|
||||
* a reference to the spa_t in the process. We increment zio_injection_enabled,
|
||||
|
Loading…
x
Reference in New Issue
Block a user