MFV r268714:

Improve extreme rewind import.

When doing an "extreme rewind" import ("zpool import -XF"), we attempt
to verify all data in the pool, essentially scrubbing the entire pool.
The problem is that spa_load_verify_cb() issues an unbounded number of
concurrent scrub i/os.  This can lead to all of memory being used for
these zio's, wedging the system. Like normal scrub, we need to put a
cap on the number of outstanding i/os, and have the traverse thread
block when we reach this cap.

For this purpose the cap can be very large (10,000) to optimize the
elevator algorithm.  Three kernel tunables have been added:

	vfs.zfs.spa_load_verify_maxinflight
	vfs.zfs.spa_load_verify_metadata
	vfs.zfs.spa_load_verify_data

The latter two tunables controls whether metadata and/or user data
when doing extreme rewind.

Make 'zpool import -T' imply scrub.

Make zpool import -T <txg> accept hexadecimal values for the txg when
prefixed with 0x.

Skip txg's for which there is no uberblock when doing extreme rewind.

Skip reading all user data twice by skipping prefetches when doing
extreme rewinds as we do not access via the ARC.

Illumos issues:
  4970 need controls on i/o issued by zpool import -XF
  4971 zpool import -T should accept hex values
  4972 zpool import -T implies extreme rewind, and thus a scrub
  4973 spa_load_retry retries the same txg
  4974 spa_load_verify() reads all data twice

MFC after:	2 weeks
This commit is contained in:
Xin LI 2014-07-15 22:44:04 +00:00
commit 7079d5877c
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=268720
2 changed files with 63 additions and 15 deletions

View File

@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012 by Frederik Wessels. All rights reserved.
* Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved.
@ -2033,7 +2033,7 @@ zpool_do_import(int argc, char **argv)
break;
case 'T':
errno = 0;
txg = strtoull(optarg, &endptr, 10);
txg = strtoull(optarg, &endptr, 0);
if (errno != 0 || *endptr != '\0') {
(void) fprintf(stderr,
gettext("invalid txg value\n"));

View File

@ -1873,6 +1873,7 @@ spa_load_verify_done(zio_t *zio)
spa_load_error_t *sle = zio->io_private;
dmu_object_type_t type = BP_GET_TYPE(bp);
int error = zio->io_error;
spa_t *spa = zio->io_spa;
if (error) {
if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
@ -1882,23 +1883,65 @@ spa_load_verify_done(zio_t *zio)
atomic_add_64(&sle->sle_data_count, 1);
}
zio_data_buf_free(zio->io_data, zio->io_size);
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
cv_broadcast(&spa->spa_scrub_io_cv);
mutex_exit(&spa->spa_scrub_lock);
}
/*
* Maximum number of concurrent scrub i/os to create while verifying
* a pool while importing it.
*/
int spa_load_verify_maxinflight = 10000;
boolean_t spa_load_verify_metadata = B_TRUE;
boolean_t spa_load_verify_data = B_TRUE;
SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN,
&spa_load_verify_maxinflight, 0,
"Maximum number of concurrent scrub I/Os to create while verifying a "
"pool while importing it");
SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN,
&spa_load_verify_metadata, 0,
"Check metadata on import?");
SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN,
&spa_load_verify_data, 0,
"Check user data on import?");
/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
zio_t *rio = arg;
size_t size = BP_GET_PSIZE(bp);
void *data = zio_data_buf_alloc(size);
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0);
/*
* Note: normally this routine will not be called if
* spa_load_verify_metadata is not set. However, it may be useful
* to manually set the flag after the traversal has begun.
*/
if (!spa_load_verify_metadata)
return (0);
if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
return (0);
zio_nowait(zio_read(rio, spa, bp, data, size,
spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
}
zio_t *rio = arg;
size_t size = BP_GET_PSIZE(bp);
void *data = zio_data_buf_alloc(size);
mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
spa->spa_scrub_inflight++;
mutex_exit(&spa->spa_scrub_lock);
zio_nowait(zio_read(rio, spa, bp, data, size,
spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
return (0);
}
@ -1909,7 +1952,7 @@ spa_load_verify(spa_t *spa)
spa_load_error_t sle = { 0 };
zpool_rewind_policy_t policy;
boolean_t verify_ok = B_FALSE;
int error;
int error = 0;
zpool_get_rewind_policy(spa->spa_config, &policy);
@ -1919,8 +1962,11 @@ spa_load_verify(spa_t *spa)
rio = zio_root(spa, NULL, &sle,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
error = traverse_pool(spa, spa->spa_verify_min_txg,
TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
if (spa_load_verify_metadata) {
error = traverse_pool(spa, spa->spa_verify_min_txg,
TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
spa_load_verify_cb, rio);
}
(void) zio_wait(rio);
@ -2795,7 +2841,7 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
spa_unload(spa);
spa_deactivate(spa);
spa->spa_load_max_txg--;
spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
spa_activate(spa, mode);
spa_async_suspend(spa);
@ -2825,6 +2871,8 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
spa_set_log_state(spa, SPA_LOG_CLEAR);
} else {
spa->spa_load_max_txg = max_request;
if (max_request != UINT64_MAX)
spa->spa_extreme_rewind = B_TRUE;
}
load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,