Properly handle I/O issued with B_FAILFAST

Retry failed I/O once with ZIO_FLAG_TRYHARD before declaring a pool faulted

OpenSolaris revision and Bug IDs:

9725:0bf7402e8022
6843014 ZFS B_FAILFAST handling is broken

Approved by:	delphij (mentor)
Obtained from:	OpenSolaris (Bug ID 6843014)
MFC after:	3 weeks
This commit is contained in:
Martin Matuska 2010-09-27 09:42:31 +00:00
parent 96a1a6a568
commit aa007a9f0e
11 changed files with 105 additions and 45 deletions

View File

@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* ZFS Fault Injector
*
@ -227,7 +225,7 @@ usage(void)
"\t\tClear the particular record (if given a numeric ID), or\n"
"\t\tall records if 'all' is specificed.\n"
"\n"
"\tzinject -d device [-e errno] [-L <nvlist|uber>] pool\n"
"\tzinject -d device [-e errno] [-L <nvlist|uber>] [-F] pool\n"
"\t\tInject a fault into a particular device or the device's\n"
"\t\tlabel. Label injection can either be 'nvlist' or 'uber'.\n"
"\t\t'errno' can either be 'nxio' (the default) or 'io'.\n"
@ -519,7 +517,7 @@ main(int argc, char **argv)
return (0);
}
while ((c = getopt(argc, argv, ":ab:d:f:qhc:t:l:mr:e:uL:")) != -1) {
while ((c = getopt(argc, argv, ":ab:d:f:Fqhc:t:l:mr:e:uL:")) != -1) {
switch (c) {
case 'a':
flags |= ZINJECT_FLUSH_ARC;
@ -556,6 +554,9 @@ main(int argc, char **argv)
return (1);
}
break;
case 'F':
record.zi_failfast = B_TRUE;
break;
case 'h':
usage();
return (0);

View File

@ -4252,10 +4252,16 @@ spa_sync(spa_t *spa, uint64_t txg)
if (svdcount == SPA_DVAS_PER_BP)
break;
}
error = vdev_config_sync(svd, svdcount, txg);
error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
if (error != 0)
error = vdev_config_sync(svd, svdcount, txg,
B_TRUE);
} else {
error = vdev_config_sync(rvd->vdev_child,
rvd->vdev_children, txg);
rvd->vdev_children, txg, B_FALSE);
if (error != 0)
error = vdev_config_sync(rvd->vdev_child,
rvd->vdev_children, txg, B_TRUE);
}
spa_config_exit(spa, SCL_STATE, FTAG);

View File

@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@ -113,7 +113,8 @@ extern void vdev_queue_io_done(zio_t *zio);
extern void vdev_config_dirty(vdev_t *vd);
extern void vdev_config_clean(vdev_t *vd);
extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg,
boolean_t);
extern void vdev_state_dirty(vdev_t *vd);
extern void vdev_state_clean(vdev_t *vd);

View File

@ -118,7 +118,7 @@ typedef struct zinject_record {
uint32_t zi_error;
uint64_t zi_type;
uint32_t zi_freq;
uint32_t zi_pad; /* pad out to 64 bit alignment */
uint32_t zi_failfast;
} zinject_record_t;
#define ZINJECT_NULL 0x1

View File

@ -117,31 +117,33 @@ enum zio_compress {
#define ZIO_PRIORITY_SCRUB (zio_priority_table[10])
#define ZIO_PRIORITY_TABLE_SIZE 11
#define ZIO_FLAG_MUSTSUCCEED 0x00000
#define ZIO_FLAG_CANFAIL 0x00001
#define ZIO_FLAG_SPECULATIVE 0x00002
#define ZIO_FLAG_CONFIG_WRITER 0x00004
#define ZIO_FLAG_DONT_RETRY 0x00008
#define ZIO_FLAG_MUSTSUCCEED 0x000000
#define ZIO_FLAG_CANFAIL 0x000001
#define ZIO_FLAG_SPECULATIVE 0x000002
#define ZIO_FLAG_CONFIG_WRITER 0x000004
#define ZIO_FLAG_DONT_RETRY 0x000008
#define ZIO_FLAG_DONT_CACHE 0x00010
#define ZIO_FLAG_DONT_QUEUE 0x00020
#define ZIO_FLAG_DONT_AGGREGATE 0x00040
#define ZIO_FLAG_DONT_PROPAGATE 0x00080
#define ZIO_FLAG_DONT_CACHE 0x000010
#define ZIO_FLAG_DONT_QUEUE 0x000020
#define ZIO_FLAG_DONT_AGGREGATE 0x000040
#define ZIO_FLAG_DONT_PROPAGATE 0x000080
#define ZIO_FLAG_IO_BYPASS 0x00100
#define ZIO_FLAG_IO_REPAIR 0x00200
#define ZIO_FLAG_IO_RETRY 0x00400
#define ZIO_FLAG_IO_REWRITE 0x00800
#define ZIO_FLAG_IO_BYPASS 0x000100
#define ZIO_FLAG_IO_REPAIR 0x000200
#define ZIO_FLAG_IO_RETRY 0x000400
#define ZIO_FLAG_IO_REWRITE 0x000800
#define ZIO_FLAG_SELF_HEAL 0x01000
#define ZIO_FLAG_RESILVER 0x02000
#define ZIO_FLAG_SCRUB 0x04000
#define ZIO_FLAG_SCRUB_THREAD 0x08000
#define ZIO_FLAG_SELF_HEAL 0x001000
#define ZIO_FLAG_RESILVER 0x002000
#define ZIO_FLAG_SCRUB 0x004000
#define ZIO_FLAG_SCRUB_THREAD 0x008000
#define ZIO_FLAG_PROBE 0x10000
#define ZIO_FLAG_GANG_CHILD 0x20000
#define ZIO_FLAG_RAW 0x40000
#define ZIO_FLAG_GODFATHER 0x80000
#define ZIO_FLAG_PROBE 0x010000
#define ZIO_FLAG_GANG_CHILD 0x020000
#define ZIO_FLAG_RAW 0x040000
#define ZIO_FLAG_GODFATHER 0x080000
#define ZIO_FLAG_TRYHARD 0x100000
#define ZIO_FLAG_GANG_INHERIT \
(ZIO_FLAG_CANFAIL | \
@ -159,7 +161,8 @@ enum zio_compress {
(ZIO_FLAG_GANG_INHERIT | \
ZIO_FLAG_IO_REPAIR | \
ZIO_FLAG_IO_RETRY | \
ZIO_FLAG_PROBE)
ZIO_FLAG_PROBE | \
ZIO_FLAG_TRYHARD)
#define ZIO_FLAG_AGG_INHERIT \
(ZIO_FLAG_DONT_AGGREGATE | \
@ -440,7 +443,7 @@ extern int zio_inject_list_next(int *id, char *name, size_t buflen,
struct zinject_record *record);
extern int zio_clear_fault(int id);
extern int zio_handle_fault_injection(zio_t *zio, int error);
extern int zio_handle_device_injection(vdev_t *vd, int error);
extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
extern int zio_handle_label_injection(zio_t *zio, int error);
#ifdef __cplusplus

View File

@ -928,7 +928,7 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
ZIO_FLAG_DONT_RETRY;
ZIO_FLAG_TRYHARD;
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
@ -1025,7 +1025,7 @@ vdev_open(vdev_t *vd)
error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
if (zio_injection_enabled && error == 0)
error = zio_handle_device_injection(vd, ENXIO);
error = zio_handle_device_injection(vd, NULL, ENXIO);
if (error) {
if (vd->vdev_removed &&
@ -2207,6 +2207,16 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
if (flags & ZIO_FLAG_SPECULATIVE)
return;
/*
* If this is an I/O error that is going to be retried, then ignore the
* error. Otherwise, the user may interpret B_FAILFAST I/O errors as
* hard errors, when in reality they can happen for any number of
* innocuous reasons (bus resets, MPxIO link failure, etc).
*/
if (zio->io_error == EIO &&
!(zio->io_flags & ZIO_FLAG_IO_RETRY))
return;
mutex_enter(&vd->vdev_stat_lock);
if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
if (zio->io_error == ECKSUM)

View File

@ -401,8 +401,9 @@ vdev_disk_io_start(zio_t *zio)
bioinit(bp);
bp->b_flags = B_BUSY | B_NOCACHE |
(zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE) |
((zio->io_flags & ZIO_FLAG_IO_RETRY) ? 0 : B_FAILFAST);
(zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
bp->b_flags |= B_FAILFAST;
bp->b_bcount = zio->io_size;
bp->b_un.b_addr = zio->io_data;
bp->b_lblkno = lbtodb(zio->io_offset);

View File

@ -339,8 +339,8 @@ vdev_label_read_config(vdev_t *vd)
nvlist_t *config = NULL;
vdev_phys_t *vp;
zio_t *zio;
int flags =
ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
ZIO_FLAG_SPECULATIVE;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
@ -349,6 +349,7 @@ vdev_label_read_config(vdev_t *vd)
vp = zio_buf_alloc(sizeof (vdev_phys_t));
retry:
for (int l = 0; l < VDEV_LABELS; l++) {
zio = zio_root(spa, NULL, NULL, flags);
@ -368,6 +369,11 @@ vdev_label_read_config(vdev_t *vd)
}
}
if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) {
flags |= ZIO_FLAG_TRYHARD;
goto retry;
}
zio_buf_free(vp, sizeof (vdev_phys_t));
return (config);
@ -648,6 +654,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
/*
* Write everything in parallel.
*/
retry:
zio = zio_root(spa, NULL, NULL, flags);
for (int l = 0; l < VDEV_LABELS; l++) {
@ -674,6 +681,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
error = zio_wait(zio);
if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
flags |= ZIO_FLAG_TRYHARD;
goto retry;
}
nvlist_free(label);
zio_buf_free(pad2, VDEV_PAD_SIZE);
zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd));
@ -760,8 +772,8 @@ vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
{
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
int flags =
ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
if (vd == rvd) {
ASSERT(zio == NULL);
@ -999,7 +1011,7 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
* at any time, you can just call it again, and it will resume its work.
*/
int
vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard)
{
spa_t *spa = svd[0]->vdev_spa;
uberblock_t *ub = &spa->spa_uberblock;
@ -1008,6 +1020,16 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
int error;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
/*
* Normally, we don't want to try too hard to write every label and
* uberblock. If there is a flaky disk, we don't want the rest of the
* sync process to block while we retry. But if we can't write a
* single label out, we should retry with ZIO_FLAG_TRYHARD before
* bailing out and declaring the pool faulted.
*/
if (tryhard)
flags |= ZIO_FLAG_TRYHARD;
ASSERT(ub->ub_txg <= txg);
/*

View File

@ -134,6 +134,15 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
return;
/*
* If this is an EIO failure on an I/O that is not a retry I/O, don't
* post an ereport. Otherwise, we risk making bad diagnoses based on
* B_FAILFAST I/Os.
*/
if (zio->io_error == EIO &&
!(zio->io_flags & ZIO_FLAG_IO_RETRY))
return;
if (vd != NULL) {
/*
* If the vdev has already been marked as failing due

View File

@ -1870,7 +1870,8 @@ zio_vdev_io_done(zio_t *zio)
vdev_cache_write(zio);
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_device_injection(vd, EIO);
zio->io_error = zio_handle_device_injection(vd,
zio, EIO);
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_label_injection(zio, EIO);

View File

@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@ -195,7 +195,7 @@ zio_handle_label_injection(zio_t *zio, int error)
int
zio_handle_device_injection(vdev_t *vd, int error)
zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
{
inject_handler_t *handler;
int ret = 0;
@ -210,6 +210,12 @@ zio_handle_device_injection(vdev_t *vd, int error)
continue;
if (vd->vdev_guid == handler->zi_record.zi_guid) {
if (handler->zi_record.zi_failfast &&
(zio == NULL || (zio->io_flags &
(ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
continue;
}
if (handler->zi_record.zi_error == error) {
/*
* For a failed open, pretend like the device