MFV r329793, r329795:

9075 Improve ZFS pool import/load process and corrupted pool recovery

illumos/illumos-gate@6f7938128a

Some work has been done lately to improve the debugability of the ZFS pool
load (and import) process. This includes:

https://www.illumos.org/issues/7638: Refactor spa_load_impl into several functions
https://www.illumos.org/issues/8961: SPA load/import should tell us why it failed
https://www.illumos.org/issues/7277: zdb should be able to print zfs_dbgmsg's

To iterate on top of that, there's a few changes that were made to make the
import process more resilient and crash free. One of the first tasks during the
pool load process is to parse a config provided from userland that describes
what devices the pool is composed of. A vdev tree is generated from that config,
and then all the vdevs are opened.

The Meta Object Set (MOS) of the pool is accessed, and several metadata objects
that are necessary to load the pool are read. The exact configuration of the
pool is also stored inside the MOS. Since the configuration provided from
userland is external and might not accurately describe the vdev tree
of the pool at the txg that is being loaded, it cannot be relied upon to safely
operate the pool. For that reason, the configuration in the MOS is read early
on. In the past, the two configurations were compared together and if there was
a mismatch then the load process was aborted and an error was returned.

The latter was a good way to ensure a pool does not get corrupted, however it
made the pool load process needlessly fragile in cases where the vdev
configuration changed or the userland configuration was outdated. Since the MOS
is stored in 3 copies, the configuration provided by userland doesn't have to be
perfect in order to read its contents. Hence, a new approach has been adopted:
The pool is first opened with the untrusted userland configuration just so that
the real configuration can be read from the MOS. The trusted MOS configuration
is then used to generate a new vdev tree and the pool is re-opened.

When the pool is opened with an untrusted configuration, writes are disabled
to avoid accidentally damaging it. During reads, some sanity checks are
performed on block pointers to see if each DVA points to a known vdev;
when the configuration is untrusted, instead of panicking the system if those
checks fail we simply avoid issuing reads to the invalid DVAs.

This new two-step pool load process now allows rewinding pools accross
vdev tree changes such as device replacement, addition, etc. Loading a pool
from an external config file in a clustering environment also becomes much
safer now since the pool will import even if the config is outdated and didn't,
for instance, register a recent device addition.

With this code in place, it became relatively easy to implement a
long-sought-after feature: the ability to import a pool with missing top level
(i.e. non-redundant) devices. Note that since this almost guarantees some loss
Of data, this feature is for now restricted to a read-only import.

Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andrew Stormont <andyjstormont@gmail.com>
Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Author: Pavel Zakharov <pavel.zakharov@delphix.com>
This commit is contained in:
Alexander Motin 2018-02-22 03:15:35 +00:00
commit 1ea10a60f9
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=329798
19 changed files with 1128 additions and 558 deletions

View File

@ -1582,6 +1582,10 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
(void) printf(gettext("split into new pool"));
break;
case VDEV_AUX_CHILDREN_OFFLINE:
(void) printf(gettext("all children offline"));
break;
default:
(void) printf(gettext("corrupted data"));
break;
@ -1675,6 +1679,10 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
(void) printf(gettext("too many errors"));
break;
case VDEV_AUX_CHILDREN_OFFLINE:
(void) printf(gettext("all children offline"));
break;
default:
(void) printf(gettext("corrupted data"));
break;
@ -2328,6 +2336,7 @@ zpool_do_import(int argc, char **argv)
idata.poolname = searchname;
idata.guid = searchguid;
idata.cachefile = cachefile;
idata.policy = policy;
pools = zpool_search_import(g_zfs, &idata);

View File

@ -391,6 +391,7 @@ typedef struct importargs {
int can_be_active : 1; /* can the pool be active? */
int unique : 1; /* does 'poolname' already exist? */
int exists : 1; /* set on return if pool already exists */
nvlist_t *policy; /* rewind policy (rewind txg, etc.) */
} importargs_t;
extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *);

View File

@ -440,7 +440,8 @@ vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
* return to the user.
*/
static nvlist_t *
get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok,
nvlist_t *policy)
{
pool_entry_t *pe;
vdev_entry_t *ve;
@ -774,6 +775,12 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
continue;
}
if (policy != NULL) {
if (nvlist_add_nvlist(config, ZPOOL_REWIND_POLICY,
policy) != 0)
goto nomem;
}
if ((nvl = refresh_config(hdl, config)) == NULL) {
nvlist_free(config);
config = NULL;
@ -1412,7 +1419,7 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
goto error;
}
ret = get_configs(hdl, &pools, iarg->can_be_active);
ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy);
error:
for (pe = pools.pools; pe != NULL; pe = penext) {
@ -1542,6 +1549,14 @@ zpool_find_import_cached(libzfs_handle_t *hdl, const char *cachefile,
if (active)
continue;
if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE,
cachefile) != 0) {
(void) no_memory(hdl);
nvlist_free(raw);
nvlist_free(pools);
return (NULL);
}
if ((dst = refresh_config(hdl, src)) == NULL) {
nvlist_free(raw);
nvlist_free(pools);

View File

@ -1809,8 +1809,9 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
nvlist_lookup_nvlist(nvinfo,
ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) {
(void) printf(dgettext(TEXT_DOMAIN,
"The devices below are missing, use "
"'-m' to import the pool anyway:\n"));
"The devices below are missing or "
"corrupted, use '-m' to import the pool "
"anyway:\n"));
print_vdev_tree(hdl, NULL, missing, 2);
(void) printf("\n");
}

View File

@ -1007,6 +1007,16 @@ kernel_fini(void)
urandom_fd = -1;
}
/* ARGSUSED */
uint32_t
zone_get_hostid(void *zonep)
{
/*
* We're emulating the system's hostid in userland.
*/
return (strtoul(hw_serial, NULL, 10));
}
int
z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen)
{

View File

@ -609,6 +609,7 @@ typedef struct callb_cpr {
#define zone_dataset_visible(x, y) (1)
#define INGLOBALZONE(z) (1)
extern uint32_t zone_get_hostid(void *zonep);
extern char *kmem_asprintf(const char *fmt, ...);
#define strfree(str) kmem_free((str), strlen(str) + 1)

File diff suppressed because it is too large Load Diff

View File

@ -362,7 +362,8 @@ void
spa_config_set(spa_t *spa, nvlist_t *config)
{
mutex_enter(&spa->spa_props_lock);
nvlist_free(spa->spa_config);
if (spa->spa_config != NULL && spa->spa_config != config)
nvlist_free(spa->spa_config);
spa->spa_config = config;
mutex_exit(&spa->spa_props_lock);
}
@ -409,15 +410,8 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
spa->spa_comment);
}
#ifdef _KERNEL
hostid = zone_get_hostid(NULL);
#else /* _KERNEL */
/*
* We're emulating the system's hostid in userland, so we can't use
* zone_get_hostid().
*/
(void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
#endif /* _KERNEL */
if (hostid != 0) {
fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid);
}

View File

@ -442,7 +442,8 @@ spa_load_failed(spa_t *spa, const char *fmt, ...)
(void) vsnprintf(buf, sizeof (buf), fmt, adx);
va_end(adx);
zfs_dbgmsg("spa_load(%s): FAILED: %s", spa->spa_name, buf);
zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name,
spa->spa_trust_config ? "trusted" : "untrusted", buf);
}
/*PRINTFLIKE2*/
@ -456,7 +457,8 @@ spa_load_note(spa_t *spa, const char *fmt, ...)
(void) vsnprintf(buf, sizeof (buf), fmt, adx);
va_end(adx);
zfs_dbgmsg("spa_load(%s): %s", spa->spa_name, buf);
zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name,
spa->spa_trust_config ? "trusted" : "untrusted", buf);
}
/*
@ -718,6 +720,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_load_max_txg = UINT64_MAX;
spa->spa_proc = &p0;
spa->spa_proc_state = SPA_PROC_NONE;
spa->spa_trust_config = B_TRUE;
#ifdef illumos
hdlr.cyh_func = spa_deadman;
@ -2150,7 +2153,7 @@ spa_is_root(spa_t *spa)
boolean_t
spa_writeable(spa_t *spa)
{
return (!!(spa->spa_mode & FWRITE));
return (!!(spa->spa_mode & FWRITE) && spa->spa_trust_config);
}
/*
@ -2298,3 +2301,21 @@ spa_get_last_removal_txg(spa_t *spa)
return (ret);
}
boolean_t
spa_trust_config(spa_t *spa)
{
return (spa->spa_trust_config);
}
uint64_t
spa_missing_tvds_allowed(spa_t *spa)
{
return (spa->spa_missing_tvds_allowed);
}
void
spa_set_missing_tvds(spa_t *spa, uint64_t missing)
{
spa->spa_missing_tvds = missing;
}

View File

@ -338,6 +338,7 @@ typedef enum bp_embedded_type {
#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
#define SPA_SYNC_MIN_VDEVS 3 /* min vdevs to update during sync */
/*
* A block is a hole when it has either 1) never been written to, or
@ -856,11 +857,16 @@ extern boolean_t spa_writeable(spa_t *spa);
extern boolean_t spa_has_pending_synctask(spa_t *spa);
extern int spa_maxblocksize(spa_t *spa);
extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva,
const blkptr_t *bp);
typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size,
void *arg);
extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp,
spa_remap_cb_t callback, void *arg);
extern uint64_t spa_get_last_removal_txg(spa_t *spa);
extern boolean_t spa_trust_config(spa_t *spa);
extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
extern int spa_mode(spa_t *spa);
extern uint64_t zfs_strtonum(const char *str, char **nptr);

View File

@ -182,6 +182,15 @@ typedef enum spa_all_vdev_zap_action {
AVZ_ACTION_INITIALIZE
} spa_avz_action_t;
typedef enum spa_config_source {
SPA_CONFIG_SRC_NONE = 0,
SPA_CONFIG_SRC_SCAN, /* scan of path (default: /dev/dsk) */
SPA_CONFIG_SRC_CACHEFILE, /* any cachefile */
SPA_CONFIG_SRC_TRYIMPORT, /* returned from call to tryimport */
SPA_CONFIG_SRC_SPLIT, /* new pool in a pool split */
SPA_CONFIG_SRC_MOS /* MOS, but not always from right txg */
} spa_config_source_t;
struct spa {
/*
* Fields protected by spa_namespace_lock.
@ -200,6 +209,8 @@ struct spa {
uint8_t spa_sync_on; /* sync threads are running */
spa_load_state_t spa_load_state; /* current load operation */
boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */
boolean_t spa_trust_config; /* do we trust vdev tree? */
spa_config_source_t spa_config_source; /* where config comes from? */
uint64_t spa_import_flags; /* import specific flags */
spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
dsl_pool_t *spa_dsl_pool;
@ -261,6 +272,8 @@ struct spa {
int spa_async_suspended; /* async tasks suspended */
kcondvar_t spa_async_cv; /* wait for thread_exit() */
uint16_t spa_async_tasks; /* async task mask */
uint64_t spa_missing_tvds; /* unopenable tvds on load */
uint64_t spa_missing_tvds_allowed; /* allow loading spa? */
spa_removing_phys_t spa_removing_phys;
spa_vdev_removal_t *spa_vdev_removal;

View File

@ -49,10 +49,13 @@ extern boolean_t zfs_nocacheflush;
extern boolean_t zfs_trim_enabled;
extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...);
extern void vdev_dbgmsg_print_tree(vdev_t *, int);
extern int vdev_open(vdev_t *);
extern void vdev_open_children(vdev_t *);
extern boolean_t vdev_uses_zvols(vdev_t *);
extern int vdev_validate(vdev_t *, boolean_t);
extern int vdev_validate(vdev_t *);
extern int vdev_copy_path_strict(vdev_t *, vdev_t *);
extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *);
extern void vdev_close(vdev_t *);
extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
extern void vdev_reopen(vdev_t *);
@ -101,6 +104,7 @@ extern void vdev_scan_stat_init(vdev_t *vd);
extern void vdev_propagate_state(vdev_t *vd);
extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
vdev_aux_t aux);
extern boolean_t vdev_children_are_offline(vdev_t *vd);
extern void vdev_space_update(vdev_t *vd,
int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
@ -145,7 +149,8 @@ typedef enum vdev_config_flag {
VDEV_CONFIG_SPARE = 1 << 0,
VDEV_CONFIG_L2CACHE = 1 << 1,
VDEV_CONFIG_REMOVING = 1 << 2,
VDEV_CONFIG_MOS = 1 << 3
VDEV_CONFIG_MOS = 1 << 3,
VDEV_CONFIG_MISSING = 1 << 4
} vdev_config_flag_t;
extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);

View File

@ -429,7 +429,6 @@ extern void vdev_remove_parent(vdev_t *cvd);
/*
* vdev sync load and sync
*/
extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
extern boolean_t vdev_log_state_valid(vdev_t *vd);
extern int vdev_load(vdev_t *vd);
extern int vdev_dtl_load(vdev_t *vd);

View File

@ -171,6 +171,8 @@ SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, metaslabs_per_vdev, CTLFLAG_RDTUN,
&metaslabs_per_vdev, 0,
"When a vdev is added, how many metaslabs the vdev should be divided into");
boolean_t vdev_validate_skip = B_FALSE;
/*PRINTFLIKE2*/
void
vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
@ -193,6 +195,57 @@ vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
}
}
void
vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
{
char state[20];
if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id,
vd->vdev_ops->vdev_op_type);
return;
}
switch (vd->vdev_state) {
case VDEV_STATE_UNKNOWN:
(void) snprintf(state, sizeof (state), "unknown");
break;
case VDEV_STATE_CLOSED:
(void) snprintf(state, sizeof (state), "closed");
break;
case VDEV_STATE_OFFLINE:
(void) snprintf(state, sizeof (state), "offline");
break;
case VDEV_STATE_REMOVED:
(void) snprintf(state, sizeof (state), "removed");
break;
case VDEV_STATE_CANT_OPEN:
(void) snprintf(state, sizeof (state), "can't open");
break;
case VDEV_STATE_FAULTED:
(void) snprintf(state, sizeof (state), "faulted");
break;
case VDEV_STATE_DEGRADED:
(void) snprintf(state, sizeof (state), "degraded");
break;
case VDEV_STATE_HEALTHY:
(void) snprintf(state, sizeof (state), "healthy");
break;
default:
(void) snprintf(state, sizeof (state), "<state %u>",
(uint_t)vd->vdev_state);
}
zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
"", vd->vdev_id, vd->vdev_ops->vdev_op_type,
vd->vdev_islog ? " (log)" : "",
(u_longlong_t)vd->vdev_guid,
vd->vdev_path ? vd->vdev_path : "N/A", state);
for (uint64_t i = 0; i < vd->vdev_children; i++)
vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
}
/*
* Given a vdev type, return the appropriate ops vector.
*/
@ -1388,8 +1441,13 @@ vdev_open(vdev_t *vd)
vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
vd->vdev_removed = B_FALSE;
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
vd->vdev_stat.vs_aux);
if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
vd->vdev_stat.vs_aux);
} else {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
vd->vdev_stat.vs_aux);
}
return (error);
}
@ -1564,29 +1622,29 @@ vdev_open(vdev_t *vd)
/*
* Called once the vdevs are all opened, this routine validates the label
* contents. This needs to be done before vdev_load() so that we don't
* contents. This needs to be done before vdev_load() so that we don't
* inadvertently do repair I/Os to the wrong device.
*
* If 'strict' is false ignore the spa guid check. This is necessary because
* if the machine crashed during a re-guid the new guid might have been written
* to all of the vdev labels, but not the cached config. The strict check
* will be performed when the pool is opened again using the mos config.
*
* This function will only return failure if one of the vdevs indicates that it
* has since been destroyed or exported. This is only possible if
* /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
* will be updated but the function will return 0.
*/
int
vdev_validate(vdev_t *vd, boolean_t strict)
vdev_validate(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
nvlist_t *label;
uint64_t guid = 0, top_guid;
uint64_t guid = 0, aux_guid = 0, top_guid;
uint64_t state;
nvlist_t *nvl;
uint64_t txg;
for (int c = 0; c < vd->vdev_children; c++)
if (vdev_validate(vd->vdev_child[c], strict) != 0)
if (vdev_validate_skip)
return (0);
for (uint64_t c = 0; c < vd->vdev_children; c++)
if (vdev_validate(vd->vdev_child[c]) != 0)
return (SET_ERROR(EBADF));
/*
@ -1594,115 +1652,276 @@ vdev_validate(vdev_t *vd, boolean_t strict)
* any further validation. Otherwise, label I/O will fail and we will
* overwrite the previous state.
*/
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
uint64_t aux_guid = 0;
nvlist_t *nvl;
uint64_t txg = spa_last_synced_txg(spa) != 0 ?
spa_last_synced_txg(spa) : -1ULL;
if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
return (0);
if ((label = vdev_label_read_config(vd, txg)) == NULL) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LABEL);
vdev_dbgmsg(vd, "vdev_validate: failed reading config");
return (0);
}
/*
* If we are performing an extreme rewind, we allow for a label that
* was modified at a point after the current txg.
*/
if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0)
txg = UINT64_MAX;
else
txg = spa_last_synced_txg(spa);
/*
* Determine if this vdev has been split off into another
* pool. If so, then refuse to open it.
*/
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
&aux_guid) == 0 && aux_guid == spa_guid(spa)) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_SPLIT_POOL);
nvlist_free(label);
vdev_dbgmsg(vd, "vdev_validate: vdev split into other "
"pool");
return (0);
}
if (strict && (nvlist_lookup_uint64(label,
ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
guid != spa_guid(spa))) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid "
"doesn't match config (%llu != %llu)",
(u_longlong_t)guid,
(u_longlong_t)spa_guid(spa));
return (0);
}
if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
!= 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
&aux_guid) != 0)
aux_guid = 0;
/*
* If this vdev just became a top-level vdev because its
* sibling was detached, it will have adopted the parent's
* vdev guid -- but the label may or may not be on disk yet.
* Fortunately, either version of the label will have the
* same top guid, so if we're a top-level vdev, we can
* safely compare to that instead.
*
* If we split this vdev off instead, then we also check the
* original pool's guid. We don't want to consider the vdev
* corrupt if it is partway through a split operation.
*/
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
&guid) != 0 ||
nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
&top_guid) != 0 ||
((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
(vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
vdev_dbgmsg(vd, "vdev_validate: config guid doesn't "
"match label guid (%llu != %llu)",
(u_longlong_t)vd->vdev_guid, (u_longlong_t)guid);
return (0);
}
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
&state) != 0) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
vdev_dbgmsg(vd, "vdev_validate: '%s' missing",
ZPOOL_CONFIG_POOL_STATE);
return (0);
}
nvlist_free(label);
/*
* If this is a verbatim import, no need to check the
* state of the pool.
*/
if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
spa_load_state(spa) == SPA_LOAD_OPEN &&
state != POOL_STATE_ACTIVE) {
vdev_dbgmsg(vd, "vdev_validate: invalid pool state "
"(%llu) for spa %s", (u_longlong_t)state,
spa->spa_name);
return (SET_ERROR(EBADF));
}
/*
* If we were able to open and validate a vdev that was
* previously marked permanently unavailable, clear that state
* now.
*/
if (vd->vdev_not_present)
vd->vdev_not_present = 0;
if ((label = vdev_label_read_config(vd, txg)) == NULL) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LABEL);
vdev_dbgmsg(vd, "vdev_validate: failed reading config");
return (0);
}
/*
* Determine if this vdev has been split off into another
* pool. If so, then refuse to open it.
*/
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
&aux_guid) == 0 && aux_guid == spa_guid(spa)) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_SPLIT_POOL);
nvlist_free(label);
vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
return (0);
}
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
ZPOOL_CONFIG_POOL_GUID);
return (0);
}
/*
* If config is not trusted then ignore the spa guid check. This is
* necessary because if the machine crashed during a re-guid the new
* guid might have been written to all of the vdev labels, but not the
* cached config. The check will be performed again once we have the
* trusted config from the MOS.
*/
if (spa->spa_trust_config && guid != spa_guid(spa)) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
"match config (%llu != %llu)", (u_longlong_t)guid,
(u_longlong_t)spa_guid(spa));
return (0);
}
if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
!= 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
&aux_guid) != 0)
aux_guid = 0;
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
ZPOOL_CONFIG_GUID);
return (0);
}
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
!= 0) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
ZPOOL_CONFIG_TOP_GUID);
return (0);
}
/*
* If this vdev just became a top-level vdev because its sibling was
* detached, it will have adopted the parent's vdev guid -- but the
* label may or may not be on disk yet. Fortunately, either version
* of the label will have the same top guid, so if we're a top-level
* vdev, we can safely compare to that instead.
* However, if the config comes from a cachefile that failed to update
* after the detach, a top-level vdev will appear as a non top-level
* vdev in the config. Also relax the constraints if we perform an
* extreme rewind.
*
* If we split this vdev off instead, then we also check the
* original pool's guid. We don't want to consider the vdev
* corrupt if it is partway through a split operation.
*/
if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
boolean_t mismatch = B_FALSE;
if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
mismatch = B_TRUE;
} else {
if (vd->vdev_guid != top_guid &&
vd->vdev_top->vdev_guid != guid)
mismatch = B_TRUE;
}
if (mismatch) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
vdev_dbgmsg(vd, "vdev_validate: config guid "
"doesn't match label guid");
vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
(u_longlong_t)vd->vdev_guid,
(u_longlong_t)vd->vdev_top->vdev_guid);
vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
"aux_guid %llu", (u_longlong_t)guid,
(u_longlong_t)top_guid, (u_longlong_t)aux_guid);
return (0);
}
}
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
&state) != 0) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
ZPOOL_CONFIG_POOL_STATE);
return (0);
}
nvlist_free(label);
/*
* If this is a verbatim import, no need to check the
* state of the pool.
*/
if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
spa_load_state(spa) == SPA_LOAD_OPEN &&
state != POOL_STATE_ACTIVE) {
vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
"for spa %s", (u_longlong_t)state, spa->spa_name);
return (SET_ERROR(EBADF));
}
/*
* If we were able to open and validate a vdev that was
* previously marked permanently unavailable, clear that state
* now.
*/
if (vd->vdev_not_present)
vd->vdev_not_present = 0;
return (0);
}
static void
vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
{
if (svd->vdev_path != NULL && dvd->vdev_path != NULL) {
if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) {
zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed "
"from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
dvd->vdev_path, svd->vdev_path);
spa_strfree(dvd->vdev_path);
dvd->vdev_path = spa_strdup(svd->vdev_path);
}
} else if (svd->vdev_path != NULL) {
dvd->vdev_path = spa_strdup(svd->vdev_path);
zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
(u_longlong_t)dvd->vdev_guid, dvd->vdev_path);
}
}
/*
* Recursively copy vdev paths from one vdev to another. Source and destination
* vdev trees must have same geometry otherwise return error. Intended to copy
* paths from userland config into MOS config.
*/
int
vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd)
{
if ((svd->vdev_ops == &vdev_missing_ops) ||
(svd->vdev_ishole && dvd->vdev_ishole) ||
(dvd->vdev_ops == &vdev_indirect_ops))
return (0);
if (svd->vdev_ops != dvd->vdev_ops) {
vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s",
svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type);
return (SET_ERROR(EINVAL));
}
if (svd->vdev_guid != dvd->vdev_guid) {
vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != "
"%llu)", (u_longlong_t)svd->vdev_guid,
(u_longlong_t)dvd->vdev_guid);
return (SET_ERROR(EINVAL));
}
if (svd->vdev_children != dvd->vdev_children) {
vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: "
"%llu != %llu", (u_longlong_t)svd->vdev_children,
(u_longlong_t)dvd->vdev_children);
return (SET_ERROR(EINVAL));
}
for (uint64_t i = 0; i < svd->vdev_children; i++) {
int error = vdev_copy_path_strict(svd->vdev_child[i],
dvd->vdev_child[i]);
if (error != 0)
return (error);
}
if (svd->vdev_ops->vdev_op_leaf)
vdev_copy_path_impl(svd, dvd);
return (0);
}
static void
vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd)
{
ASSERT(stvd->vdev_top == stvd);
ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id);
for (uint64_t i = 0; i < dvd->vdev_children; i++) {
vdev_copy_path_search(stvd, dvd->vdev_child[i]);
}
if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd))
return;
/*
* The idea here is that while a vdev can shift positions within
* a top vdev (when replacing, attaching mirror, etc.) it cannot
* step outside of it.
*/
vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid);
if (vd == NULL || vd->vdev_ops != dvd->vdev_ops)
return;
ASSERT(vd->vdev_ops->vdev_op_leaf);
vdev_copy_path_impl(vd, dvd);
}
/*
* Recursively copy vdev paths from one root vdev to another. Source and
* destination vdev trees may differ in geometry. For each destination leaf
* vdev, search a vdev with the same guid and top vdev id in the source.
* Intended to copy paths from userland config into MOS config.
*/
void
vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd)
{
uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children);
ASSERT(srvd->vdev_ops == &vdev_root_ops);
ASSERT(drvd->vdev_ops == &vdev_root_ops);
for (uint64_t i = 0; i < children; i++) {
vdev_copy_path_search(srvd->vdev_child[i],
drvd->vdev_child[i]);
}
}
/*
* Close a virtual device.
*/
@ -1801,7 +2020,7 @@ vdev_reopen(vdev_t *vd)
!l2arc_vdev_present(vd))
l2arc_add_vdev(spa, vd);
} else {
(void) vdev_validate(vd, B_TRUE);
(void) vdev_validate(vd);
}
/*
@ -3705,6 +3924,19 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
vdev_propagate_state(vd->vdev_parent);
}
boolean_t
vdev_children_are_offline(vdev_t *vd)
{
ASSERT(!vd->vdev_ops->vdev_op_leaf);
for (uint64_t i = 0; i < vd->vdev_children; i++) {
if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE)
return (B_FALSE);
}
return (B_TRUE);
}
/*
* Check the vdev configuration to ensure that it's capable of supporting
* a root pool. We do not support partial configuration.
@ -3748,35 +3980,6 @@ vdev_is_concrete(vdev_t *vd)
}
}
/*
* Load the state from the original vdev tree (ovd) which
* we've retrieved from the MOS config object. If the original
* vdev was offline or faulted then we transfer that state to the
* device in the current vdev tree (nvd).
*/
void
vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
{
spa_t *spa = nvd->vdev_spa;
ASSERT(nvd->vdev_top->vdev_islog);
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
for (int c = 0; c < nvd->vdev_children; c++)
vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
if (nvd->vdev_ops->vdev_op_leaf) {
/*
* Restore the persistent vdev state
*/
nvd->vdev_offline = ovd->vdev_offline;
nvd->vdev_faulted = ovd->vdev_faulted;
nvd->vdev_degraded = ovd->vdev_degraded;
nvd->vdev_removed = ovd->vdev_removed;
}
}
/*
* Determine if a log device has valid content. If the vdev was
* removed or faulted in the MOS config then we know that

View File

@ -271,7 +271,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
vd->vdev_wholedisk);
if (vd->vdev_not_present)
if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
if (vd->vdev_isspare)
@ -1117,6 +1117,11 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
"txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg);
*config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
if (*config == NULL && spa->spa_extreme_rewind) {
vdev_dbgmsg(cb.ubl_vd, "failed to read label config. "
"Trying again without txg restrictions.");
*config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX);
}
if (*config == NULL) {
vdev_dbgmsg(cb.ubl_vd, "failed to read label config");
}
@ -1143,7 +1148,7 @@ vdev_uberblock_sync_done(zio_t *zio)
static void
vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
{
for (int c = 0; c < vd->vdev_children; c++)
for (uint64_t c = 0; c < vd->vdev_children; c++)
vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags);
if (!vd->vdev_ops->vdev_op_leaf)

View File

@ -211,9 +211,34 @@ vdev_mirror_map_init(zio_t *zio)
if (vd == NULL) {
dva_t *dva = zio->io_bp->blk_dva;
spa_t *spa = zio->io_spa;
dva_t dva_copy[SPA_DVAS_PER_BP];
c = BP_GET_NDVAS(zio->io_bp);
/*
* If we do not trust the pool config, some DVAs might be
* invalid or point to vdevs that do not exist. We skip them.
*/
if (!spa_trust_config(spa)) {
ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
int j = 0;
for (int i = 0; i < c; i++) {
if (zfs_dva_valid(spa, &dva[i], zio->io_bp))
dva_copy[j++] = dva[i];
}
if (j == 0) {
zio->io_vsd = NULL;
zio->io_error = ENXIO;
return (NULL);
}
if (j < c) {
dva = dva_copy;
c = j;
}
}
mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE,
B_TRUE);
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
@ -296,7 +321,10 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
}
if (numerrors == vd->vdev_children) {
vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
if (vdev_children_are_offline(vd))
vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE;
else
vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
return (lasterror);
}
@ -480,6 +508,13 @@ vdev_mirror_io_start(zio_t *zio)
mm = vdev_mirror_map_init(zio);
if (mm == NULL) {
ASSERT(!spa_trust_config(zio->io_spa));
ASSERT(zio->io_type == ZIO_TYPE_READ);
zio_execute(zio);
return;
}
if (zio->io_type == ZIO_TYPE_READ) {
if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering &&
mm->mm_children > 1) {
@ -552,6 +587,9 @@ vdev_mirror_io_done(zio_t *zio)
int good_copies = 0;
int unexpected_errors = 0;
if (mm == NULL)
return;
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
@ -659,13 +697,19 @@ vdev_mirror_io_done(zio_t *zio)
static void
vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
{
if (faulted == vd->vdev_children)
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_NO_REPLICAS);
else if (degraded + faulted != 0)
if (faulted == vd->vdev_children) {
if (vdev_children_are_offline(vd)) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE,
VDEV_AUX_CHILDREN_OFFLINE);
} else {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_NO_REPLICAS);
}
} else if (degraded + faulted != 0) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
else
} else {
vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
}
vdev_ops_t vdev_mirror_ops = {

View File

@ -37,6 +37,23 @@
* Virtual device vector for the pool's root vdev.
*/
static uint64_t
vdev_root_core_tvds(vdev_t *vd)
{
uint64_t tvds = 0;
for (uint64_t c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
if (!cvd->vdev_ishole && !cvd->vdev_islog &&
cvd->vdev_ops != &vdev_indirect_ops) {
tvds++;
}
}
return (tvds);
}
/*
* We should be able to tolerate one failure with absolutely no damage
* to our metadata. Two failures will take out space maps, a bunch of
@ -46,17 +63,28 @@
* probably fine. Adding bean counters during alloc/free can make this
* future guesswork more accurate.
*/
static int
too_many_errors(vdev_t *vd, int numerrors)
static boolean_t
too_many_errors(vdev_t *vd, uint64_t numerrors)
{
ASSERT3U(numerrors, <=, vd->vdev_children);
return (numerrors > 0);
uint64_t tvds;
if (numerrors == 0)
return (B_FALSE);
tvds = vdev_root_core_tvds(vd);
ASSERT3U(numerrors, <=, tvds);
if (numerrors == tvds)
return (B_TRUE);
return (numerrors > spa_missing_tvds_allowed(vd->vdev_spa));
}
static int
vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
uint64_t *logical_ashift, uint64_t *physical_ashift)
{
spa_t *spa = vd->vdev_spa;
int lasterror = 0;
int numerrors = 0;
@ -76,6 +104,9 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
}
}
if (spa_load_state(spa) != SPA_LOAD_NONE)
spa_set_missing_tvds(spa, numerrors);
if (too_many_errors(vd, numerrors)) {
vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
return (lasterror);
@ -102,7 +133,7 @@ vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
if (too_many_errors(vd, faulted)) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_NO_REPLICAS);
} else if (degraded) {
} else if (degraded || faulted) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
} else {
vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);

View File

@ -762,6 +762,13 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
}
}
/*
* Do not verify individual DVAs if the config is not trusted. This
* will be done once the zio is executed in vdev_mirror_map_alloc.
*/
if (!spa->spa_trust_config)
return;
/*
* Pool-specific checks.
*
@ -811,6 +818,36 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
}
}
boolean_t
zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp)
{
uint64_t vdevid = DVA_GET_VDEV(dva);
if (vdevid >= spa->spa_root_vdev->vdev_children)
return (B_FALSE);
vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
if (vd == NULL)
return (B_FALSE);
if (vd->vdev_ops == &vdev_hole_ops)
return (B_FALSE);
if (vd->vdev_ops == &vdev_missing_ops) {
return (B_FALSE);
}
uint64_t offset = DVA_GET_OFFSET(dva);
uint64_t asize = DVA_GET_ASIZE(dva);
if (BP_IS_GANG(bp))
asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
if (offset + asize > vd->vdev_asize)
return (B_FALSE);
return (B_TRUE);
}
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
@ -3148,10 +3185,14 @@ zio_vdev_io_start(zio_t *zio)
}
ASSERT3P(zio->io_logical, !=, zio);
if (zio->io_type == ZIO_TYPE_WRITE && zio->io_vd->vdev_removing) {
ASSERT(zio->io_flags &
(ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
ZIO_FLAG_INDUCE_DAMAGE));
if (zio->io_type == ZIO_TYPE_WRITE) {
ASSERT(spa->spa_trust_config);
if (zio->io_vd->vdev_removing) {
ASSERT(zio->io_flags &
(ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
ZIO_FLAG_INDUCE_DAMAGE));
}
}
/*

View File

@ -588,6 +588,7 @@ typedef struct zpool_rewind_policy {
#define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top"
#define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf"
#define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps"
#define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */
/*
* The persistent vdev state is stored as separate values rather than a single
* 'vdev_state' entry. This is because a device can be in multiple states, such
@ -689,7 +690,8 @@ typedef enum vdev_aux {
VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */
VDEV_AUX_EXTERNAL, /* external diagnosis */
VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */
VDEV_AUX_ASHIFT_TOO_BIG /* vdev's min block size is too large */
VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */
VDEV_AUX_CHILDREN_OFFLINE /* all children are offline */
} vdev_aux_t;
/*