3206 lines
84 KiB
C
Raw Normal View History

2008-11-20 12:01:55 -08:00
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
2008-11-20 12:01:55 -08:00
*/
#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
2009-07-02 15:44:48 -07:00
#include <sys/zil.h>
#include <sys/dsl_scan.h>
#include <sys/zvol.h>
2008-11-20 12:01:55 -08:00
/*
* Virtual device management.
*/
static vdev_ops_t *vdev_ops_table[] = {
&vdev_root_ops,
&vdev_raidz_ops,
&vdev_mirror_ops,
&vdev_replacing_ops,
&vdev_spare_ops,
&vdev_disk_ops,
&vdev_file_ops,
&vdev_missing_ops,
&vdev_hole_ops,
2008-11-20 12:01:55 -08:00
NULL
};
/* maximum scrub/resilver I/O queue per leaf vdev */
int zfs_scrub_limit = 10;
2008-11-20 12:01:55 -08:00
/*
* Given a vdev type, return the appropriate ops vector.
*/
static vdev_ops_t *
vdev_getops(const char *type)
{
vdev_ops_t *ops, **opspp;
for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
if (strcmp(ops->vdev_op_type, type) == 0)
break;
return (ops);
}
/*
* Default asize function: return the MAX of psize with the asize of
* all children. This is what's used by anything other than RAID-Z.
*/
uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
{
uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
uint64_t csize;
int c;
2008-11-20 12:01:55 -08:00
for (c = 0; c < vd->vdev_children; c++) {
2008-11-20 12:01:55 -08:00
csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
asize = MAX(asize, csize);
}
return (asize);
}
/*
2009-07-02 15:44:48 -07:00
* Get the minimum allocatable size. We define the allocatable size as
* the vdev's asize rounded to the nearest metaslab. This allows us to
* replace or attach devices which don't have the same physical size but
* can still satisfy the same number of allocations.
2008-11-20 12:01:55 -08:00
*/
uint64_t
2009-07-02 15:44:48 -07:00
vdev_get_min_asize(vdev_t *vd)
2008-11-20 12:01:55 -08:00
{
2009-07-02 15:44:48 -07:00
vdev_t *pvd = vd->vdev_parent;
2008-11-20 12:01:55 -08:00
2009-07-02 15:44:48 -07:00
/*
* If our parent is NULL (inactive spare or cache) or is the root,
2009-07-02 15:44:48 -07:00
* just return our own asize.
*/
if (pvd == NULL)
return (vd->vdev_asize);
2008-11-20 12:01:55 -08:00
/*
2009-07-02 15:44:48 -07:00
* The top-level vdev just returns the allocatable size rounded
* to the nearest metaslab.
2008-11-20 12:01:55 -08:00
*/
2009-07-02 15:44:48 -07:00
if (vd == vd->vdev_top)
return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
2008-11-20 12:01:55 -08:00
2009-07-02 15:44:48 -07:00
/*
* The allocatable space for a raidz vdev is N * sizeof(smallest child),
* so each child must provide at least 1/Nth of its asize.
*/
if (pvd->vdev_ops == &vdev_raidz_ops)
return (pvd->vdev_min_asize / pvd->vdev_children);
2008-11-20 12:01:55 -08:00
2009-07-02 15:44:48 -07:00
return (pvd->vdev_min_asize);
}
void
vdev_set_min_asize(vdev_t *vd)
{
int c;
2009-07-02 15:44:48 -07:00
vd->vdev_min_asize = vdev_get_min_asize(vd);
2008-11-20 12:01:55 -08:00
for (c = 0; c < vd->vdev_children; c++)
2009-07-02 15:44:48 -07:00
vdev_set_min_asize(vd->vdev_child[c]);
2008-11-20 12:01:55 -08:00
}
vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
vdev_t *rvd = spa->spa_root_vdev;
ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
2008-11-20 12:01:55 -08:00
if (vdev < rvd->vdev_children) {
ASSERT(rvd->vdev_child[vdev] != NULL);
2008-11-20 12:01:55 -08:00
return (rvd->vdev_child[vdev]);
}
2008-11-20 12:01:55 -08:00
return (NULL);
}
vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
vdev_t *mvd;
int c;
2008-11-20 12:01:55 -08:00
if (vd->vdev_guid == guid)
return (vd);
for (c = 0; c < vd->vdev_children; c++)
2008-11-20 12:01:55 -08:00
if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
NULL)
return (mvd);
return (NULL);
}
void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
size_t oldsize, newsize;
uint64_t id = cvd->vdev_id;
vdev_t **newchild;
ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2008-11-20 12:01:55 -08:00
ASSERT(cvd->vdev_parent == NULL);
cvd->vdev_parent = pvd;
if (pvd == NULL)
return;
ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
oldsize = pvd->vdev_children * sizeof (vdev_t *);
pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
newsize = pvd->vdev_children * sizeof (vdev_t *);
newchild = kmem_zalloc(newsize, KM_PUSHPAGE);
2008-11-20 12:01:55 -08:00
if (pvd->vdev_child != NULL) {
bcopy(pvd->vdev_child, newchild, oldsize);
kmem_free(pvd->vdev_child, oldsize);
}
pvd->vdev_child = newchild;
pvd->vdev_child[id] = cvd;
cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
/*
* Walk up all ancestors to update guid sum.
*/
for (; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum += cvd->vdev_guid_sum;
}
void
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
{
int c;
uint_t id = cvd->vdev_id;
ASSERT(cvd->vdev_parent == pvd);
if (pvd == NULL)
return;
ASSERT(id < pvd->vdev_children);
ASSERT(pvd->vdev_child[id] == cvd);
pvd->vdev_child[id] = NULL;
cvd->vdev_parent = NULL;
for (c = 0; c < pvd->vdev_children; c++)
if (pvd->vdev_child[c])
break;
if (c == pvd->vdev_children) {
kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
pvd->vdev_child = NULL;
pvd->vdev_children = 0;
}
/*
* Walk up all ancestors to update guid sum.
*/
for (; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
}
/*
* Remove any holes in the child array.
*/
void
vdev_compact_children(vdev_t *pvd)
{
vdev_t **newchild, *cvd;
int oldc = pvd->vdev_children;
2009-07-02 15:44:48 -07:00
int newc;
int c;
2008-11-20 12:01:55 -08:00
ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2008-11-20 12:01:55 -08:00
for (c = newc = 0; c < oldc; c++)
2008-11-20 12:01:55 -08:00
if (pvd->vdev_child[c])
newc++;
newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_PUSHPAGE);
2008-11-20 12:01:55 -08:00
for (c = newc = 0; c < oldc; c++) {
2008-11-20 12:01:55 -08:00
if ((cvd = pvd->vdev_child[c]) != NULL) {
newchild[newc] = cvd;
cvd->vdev_id = newc++;
}
}
kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
pvd->vdev_child = newchild;
pvd->vdev_children = newc;
}
/*
* Allocate and minimally initialize a vdev_t.
*/
vdev_t *
2008-11-20 12:01:55 -08:00
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
vdev_t *vd;
int t;
2008-11-20 12:01:55 -08:00
vd = kmem_zalloc(sizeof (vdev_t), KM_PUSHPAGE);
2008-11-20 12:01:55 -08:00
if (spa->spa_root_vdev == NULL) {
ASSERT(ops == &vdev_root_ops);
spa->spa_root_vdev = vd;
spa->spa_load_guid = spa_generate_guid(NULL);
2008-11-20 12:01:55 -08:00
}
if (guid == 0 && ops != &vdev_hole_ops) {
2008-11-20 12:01:55 -08:00
if (spa->spa_root_vdev == vd) {
/*
* The root vdev's guid will also be the pool guid,
* which must be unique among all pools.
*/
guid = spa_generate_guid(NULL);
2008-11-20 12:01:55 -08:00
} else {
/*
* Any other vdev's guid must be unique within the pool.
*/
guid = spa_generate_guid(spa);
2008-11-20 12:01:55 -08:00
}
ASSERT(!spa_guid_exists(spa_guid(spa), guid));
}
vd->vdev_spa = spa;
vd->vdev_id = id;
vd->vdev_guid = guid;
vd->vdev_guid_sum = guid;
vd->vdev_ops = ops;
vd->vdev_state = VDEV_STATE_CLOSED;
vd->vdev_ishole = (ops == &vdev_hole_ops);
2008-11-20 12:01:55 -08:00
list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node);
2008-11-20 12:01:55 -08:00
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
for (t = 0; t < DTL_TYPES; t++) {
2009-01-15 13:59:39 -08:00
space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
&vd->vdev_dtl_lock);
}
2008-11-20 12:01:55 -08:00
txg_list_create(&vd->vdev_ms_list,
offsetof(struct metaslab, ms_txg_node));
txg_list_create(&vd->vdev_dtl_list,
offsetof(struct vdev, vdev_dtl_node));
vd->vdev_stat.vs_timestamp = gethrtime();
vdev_queue_init(vd);
vdev_cache_init(vd);
return (vd);
}
/*
* Allocate a new vdev. The 'alloctype' is used to control whether we are
* creating a new vdev or loading an existing one - the behavior is slightly
* different for each case.
*/
int
vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
int alloctype)
{
vdev_ops_t *ops;
char *type;
uint64_t guid = 0, islog, nparity;
vdev_t *vd;
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2008-11-20 12:01:55 -08:00
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
return (EINVAL);
if ((ops = vdev_getops(type)) == NULL)
return (EINVAL);
/*
* If this is a load, get the vdev guid from the nvlist.
* Otherwise, vdev_alloc_common() will generate one for us.
*/
if (alloctype == VDEV_ALLOC_LOAD) {
uint64_t label_id;
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
label_id != id)
return (EINVAL);
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
return (EINVAL);
} else if (alloctype == VDEV_ALLOC_SPARE) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
return (EINVAL);
} else if (alloctype == VDEV_ALLOC_L2CACHE) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
return (EINVAL);
2009-07-02 15:44:48 -07:00
} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
return (EINVAL);
2008-11-20 12:01:55 -08:00
}
/*
* The first allocated vdev must be of type 'root'.
*/
if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
return (EINVAL);
/*
* Determine whether we're a log vdev.
*/
islog = 0;
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
return (ENOTSUP);
if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
return (ENOTSUP);
2008-11-20 12:01:55 -08:00
/*
* Set the nparity property for RAID-Z vdevs.
*/
nparity = -1ULL;
if (ops == &vdev_raidz_ops) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
&nparity) == 0) {
if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
2008-11-20 12:01:55 -08:00
return (EINVAL);
/*
2009-08-18 11:43:27 -07:00
* Previous versions could only support 1 or 2 parity
* device.
2008-11-20 12:01:55 -08:00
*/
2009-08-18 11:43:27 -07:00
if (nparity > 1 &&
spa_version(spa) < SPA_VERSION_RAIDZ2)
return (ENOTSUP);
if (nparity > 2 &&
spa_version(spa) < SPA_VERSION_RAIDZ3)
2008-11-20 12:01:55 -08:00
return (ENOTSUP);
} else {
/*
* We require the parity to be specified for SPAs that
* support multiple parity levels.
*/
2009-08-18 11:43:27 -07:00
if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
2008-11-20 12:01:55 -08:00
return (EINVAL);
/*
* Otherwise, we default to 1 parity device for RAID-Z.
*/
nparity = 1;
}
} else {
nparity = 0;
}
ASSERT(nparity != -1ULL);
vd = vdev_alloc_common(spa, id, guid, ops);
vd->vdev_islog = islog;
vd->vdev_nparity = nparity;
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
vd->vdev_path = spa_strdup(vd->vdev_path);
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
vd->vdev_devid = spa_strdup(vd->vdev_devid);
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
&vd->vdev_physpath) == 0)
vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
2009-07-02 15:44:48 -07:00
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
vd->vdev_fru = spa_strdup(vd->vdev_fru);
2008-11-20 12:01:55 -08:00
/*
* Set the whole_disk property. If it's not specified, leave the value
* as -1.
*/
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
&vd->vdev_wholedisk) != 0)
vd->vdev_wholedisk = -1ULL;
/*
* Look for the 'not present' flag. This will only be set if the device
* was not present at the time of import.
*/
2009-07-02 15:44:48 -07:00
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
&vd->vdev_not_present);
2008-11-20 12:01:55 -08:00
/*
* Get the alignment requirement.
*/
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
/*
* Retrieve the vdev creation time.
*/
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
&vd->vdev_crtxg);
2008-11-20 12:01:55 -08:00
/*
* If we're a top-level vdev, try to load the allocation parameters.
*/
if (parent && !parent->vdev_parent &&
(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
2008-11-20 12:01:55 -08:00
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
&vd->vdev_ms_array);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
&vd->vdev_ms_shift);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
&vd->vdev_asize);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
&vd->vdev_removing);
}
if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
ASSERT(alloctype == VDEV_ALLOC_LOAD ||
alloctype == VDEV_ALLOC_ADD ||
alloctype == VDEV_ALLOC_SPLIT ||
alloctype == VDEV_ALLOC_ROOTPOOL);
vd->vdev_mg = metaslab_group_create(islog ?
spa_log_class(spa) : spa_normal_class(spa), vd);
2008-11-20 12:01:55 -08:00
}
/*
* If we're a leaf vdev, try to load the DTL object and other state.
*/
if (vd->vdev_ops->vdev_op_leaf &&
2009-07-02 15:44:48 -07:00
(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
alloctype == VDEV_ALLOC_ROOTPOOL)) {
if (alloctype == VDEV_ALLOC_LOAD) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
2009-01-15 13:59:39 -08:00
&vd->vdev_dtl_smo.smo_object);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
&vd->vdev_unspare);
}
2009-07-02 15:44:48 -07:00
if (alloctype == VDEV_ALLOC_ROOTPOOL) {
uint64_t spare = 0;
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
&spare) == 0 && spare)
spa_spare_add(vd);
}
2008-11-20 12:01:55 -08:00
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
&vd->vdev_offline);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING,
&vd->vdev_resilvering);
2008-11-20 12:01:55 -08:00
/*
* When importing a pool, we want to ignore the persistent fault
* state, as the diagnosis made on another system may not be
* valid in the current context. Local vdevs will
* remain in the faulted state.
2008-11-20 12:01:55 -08:00
*/
if (spa_load_state(spa) == SPA_LOAD_OPEN) {
2008-11-20 12:01:55 -08:00
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
&vd->vdev_faulted);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
&vd->vdev_degraded);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
&vd->vdev_removed);
if (vd->vdev_faulted || vd->vdev_degraded) {
char *aux;
vd->vdev_label_aux =
VDEV_AUX_ERR_EXCEEDED;
if (nvlist_lookup_string(nv,
ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
strcmp(aux, "external") == 0)
vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
}
2008-11-20 12:01:55 -08:00
}
}
/*
* Add ourselves to the parent's list of children.
*/
vdev_add_child(parent, vd);
*vdp = vd;
return (0);
}
void
vdev_free(vdev_t *vd)
{
int c, t;
2008-11-20 12:01:55 -08:00
spa_t *spa = vd->vdev_spa;
/*
* vdev_free() implies closing the vdev first. This is simpler than
* trying to ensure complicated semantics for all callers.
*/
vdev_close(vd);
ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
2008-11-20 12:01:55 -08:00
/*
* Free all children.
*/
for (c = 0; c < vd->vdev_children; c++)
2008-11-20 12:01:55 -08:00
vdev_free(vd->vdev_child[c]);
ASSERT(vd->vdev_child == NULL);
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
/*
* Discard allocation state.
*/
if (vd->vdev_mg != NULL) {
2008-11-20 12:01:55 -08:00
vdev_metaslab_fini(vd);
metaslab_group_destroy(vd->vdev_mg);
}
2008-11-20 12:01:55 -08:00
ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
/*
* Remove this vdev from its parent's child list.
*/
vdev_remove_child(vd->vdev_parent, vd);
ASSERT(vd->vdev_parent == NULL);
/*
* Clean up vdev structure.
*/
vdev_queue_fini(vd);
vdev_cache_fini(vd);
if (vd->vdev_path)
spa_strfree(vd->vdev_path);
if (vd->vdev_devid)
spa_strfree(vd->vdev_devid);
if (vd->vdev_physpath)
spa_strfree(vd->vdev_physpath);
2009-07-02 15:44:48 -07:00
if (vd->vdev_fru)
spa_strfree(vd->vdev_fru);
2008-11-20 12:01:55 -08:00
if (vd->vdev_isspare)
spa_spare_remove(vd);
if (vd->vdev_isl2cache)
spa_l2cache_remove(vd);
txg_list_destroy(&vd->vdev_ms_list);
txg_list_destroy(&vd->vdev_dtl_list);
2009-01-15 13:59:39 -08:00
2008-11-20 12:01:55 -08:00
mutex_enter(&vd->vdev_dtl_lock);
for (t = 0; t < DTL_TYPES; t++) {
2009-01-15 13:59:39 -08:00
space_map_unload(&vd->vdev_dtl[t]);
space_map_destroy(&vd->vdev_dtl[t]);
}
2008-11-20 12:01:55 -08:00
mutex_exit(&vd->vdev_dtl_lock);
2009-01-15 13:59:39 -08:00
2008-11-20 12:01:55 -08:00
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
2008-11-20 12:01:55 -08:00
if (vd == spa->spa_root_vdev)
spa->spa_root_vdev = NULL;
kmem_free(vd, sizeof (vdev_t));
}
/*
* Transfer top-level vdev state from svd to tvd.
*/
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
spa_t *spa = svd->vdev_spa;
metaslab_t *msp;
vdev_t *vd;
int t;
ASSERT(tvd == tvd->vdev_top);
tvd->vdev_ms_array = svd->vdev_ms_array;
tvd->vdev_ms_shift = svd->vdev_ms_shift;
tvd->vdev_ms_count = svd->vdev_ms_count;
svd->vdev_ms_array = 0;
svd->vdev_ms_shift = 0;
svd->vdev_ms_count = 0;
if (tvd->vdev_mg)
ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
2008-11-20 12:01:55 -08:00
tvd->vdev_mg = svd->vdev_mg;
tvd->vdev_ms = svd->vdev_ms;
svd->vdev_mg = NULL;
svd->vdev_ms = NULL;
if (tvd->vdev_mg != NULL)
tvd->vdev_mg->mg_vd = tvd;
tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
svd->vdev_stat.vs_alloc = 0;
svd->vdev_stat.vs_space = 0;
svd->vdev_stat.vs_dspace = 0;
for (t = 0; t < TXG_SIZE; t++) {
while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
}
if (list_link_active(&svd->vdev_config_dirty_node)) {
2008-11-20 12:01:55 -08:00
vdev_config_clean(svd);
vdev_config_dirty(tvd);
}
if (list_link_active(&svd->vdev_state_dirty_node)) {
vdev_state_clean(svd);
vdev_state_dirty(tvd);
}
2008-11-20 12:01:55 -08:00
tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
svd->vdev_deflate_ratio = 0;
tvd->vdev_islog = svd->vdev_islog;
svd->vdev_islog = 0;
}
static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
int c;
2008-11-20 12:01:55 -08:00
if (vd == NULL)
return;
vd->vdev_top = tvd;
for (c = 0; c < vd->vdev_children; c++)
2008-11-20 12:01:55 -08:00
vdev_top_update(tvd, vd->vdev_child[c]);
}
/*
* Add a mirror/replacing vdev above an existing vdev.
*/
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
spa_t *spa = cvd->vdev_spa;
vdev_t *pvd = cvd->vdev_parent;
vdev_t *mvd;
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2008-11-20 12:01:55 -08:00
mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
mvd->vdev_asize = cvd->vdev_asize;
2009-07-02 15:44:48 -07:00
mvd->vdev_min_asize = cvd->vdev_min_asize;
mvd->vdev_max_asize = cvd->vdev_max_asize;
2008-11-20 12:01:55 -08:00
mvd->vdev_ashift = cvd->vdev_ashift;
mvd->vdev_state = cvd->vdev_state;
mvd->vdev_crtxg = cvd->vdev_crtxg;
2008-11-20 12:01:55 -08:00
vdev_remove_child(pvd, cvd);
vdev_add_child(pvd, mvd);
cvd->vdev_id = mvd->vdev_children;
vdev_add_child(mvd, cvd);
vdev_top_update(cvd->vdev_top, cvd->vdev_top);
if (mvd == mvd->vdev_top)
vdev_top_transfer(cvd, mvd);
return (mvd);
}
/*
* Remove a 1-way mirror/replacing vdev from the tree.
*/
void
vdev_remove_parent(vdev_t *cvd)
{
vdev_t *mvd = cvd->vdev_parent;
vdev_t *pvd = mvd->vdev_parent;
ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2008-11-20 12:01:55 -08:00
ASSERT(mvd->vdev_children == 1);
ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
mvd->vdev_ops == &vdev_replacing_ops ||
mvd->vdev_ops == &vdev_spare_ops);
cvd->vdev_ashift = mvd->vdev_ashift;
vdev_remove_child(mvd, cvd);
vdev_remove_child(pvd, mvd);
2009-01-15 13:59:39 -08:00
2008-11-20 12:01:55 -08:00
/*
* If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
* Otherwise, we could have detached an offline device, and when we
* go to import the pool we'll think we have two top-level vdevs,
* instead of a different version of the same top-level vdev.
2008-11-20 12:01:55 -08:00
*/
2009-01-15 13:59:39 -08:00
if (mvd->vdev_top == mvd) {
uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
cvd->vdev_orig_guid = cvd->vdev_guid;
2009-01-15 13:59:39 -08:00
cvd->vdev_guid += guid_delta;
cvd->vdev_guid_sum += guid_delta;
}
cvd->vdev_id = mvd->vdev_id;
vdev_add_child(pvd, cvd);
2008-11-20 12:01:55 -08:00
vdev_top_update(cvd->vdev_top, cvd->vdev_top);
if (cvd == cvd->vdev_top)
vdev_top_transfer(mvd, cvd);
ASSERT(mvd->vdev_children == 0);
vdev_free(mvd);
}
int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_meta_objset;
uint64_t m;
uint64_t oldc = vd->vdev_ms_count;
uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
metaslab_t **mspp;
int error;
ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
/*
* This vdev is not being allocated from yet or is a hole.
*/
if (vd->vdev_ms_shift == 0)
2008-11-20 12:01:55 -08:00
return (0);
ASSERT(!vd->vdev_ishole);
2009-07-02 15:44:48 -07:00
/*
* Compute the raidz-deflation ratio. Note, we hard-code
* in 128k (1 << 17) because it is the current "typical" blocksize.
* Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
* or we will inconsistently account for existing bp's.
*/
vd->vdev_deflate_ratio = (1 << 17) /
(vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
2008-11-20 12:01:55 -08:00
ASSERT(oldc <= newc);
mspp = kmem_zalloc(newc * sizeof (*mspp), KM_PUSHPAGE | KM_NODEBUG);
2008-11-20 12:01:55 -08:00
if (oldc != 0) {
bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
}
vd->vdev_ms = mspp;
vd->vdev_ms_count = newc;
for (m = oldc; m < newc; m++) {
space_map_obj_t smo = { 0, 0, 0 };
if (txg == 0) {
uint64_t object = 0;
error = dmu_read(mos, vd->vdev_ms_array,
2009-07-02 15:44:48 -07:00
m * sizeof (uint64_t), sizeof (uint64_t), &object,
DMU_READ_PREFETCH);
2008-11-20 12:01:55 -08:00
if (error)
return (error);
if (object != 0) {
dmu_buf_t *db;
error = dmu_bonus_hold(mos, object, FTAG, &db);
if (error)
return (error);
ASSERT3U(db->db_size, >=, sizeof (smo));
bcopy(db->db_data, &smo, sizeof (smo));
ASSERT3U(smo.smo_object, ==, object);
dmu_buf_rele(db, FTAG);
}
}
vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
}
if (txg == 0)
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
/*
* If the vdev is being removed we don't activate
* the metaslabs since we want to ensure that no new
* allocations are performed on this device.
*/
if (oldc == 0 && !vd->vdev_removing)
metaslab_group_activate(vd->vdev_mg);
if (txg == 0)
spa_config_exit(spa, SCL_ALLOC, FTAG);
2008-11-20 12:01:55 -08:00
return (0);
}
void
vdev_metaslab_fini(vdev_t *vd)
{
uint64_t m;
uint64_t count = vd->vdev_ms_count;
if (vd->vdev_ms != NULL) {
metaslab_group_passivate(vd->vdev_mg);
2008-11-20 12:01:55 -08:00
for (m = 0; m < count; m++)
if (vd->vdev_ms[m] != NULL)
metaslab_fini(vd->vdev_ms[m]);
kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
vd->vdev_ms = NULL;
}
Add FASTWRITE algorithm for synchronous writes. Currently, ZIL blocks are spread over vdevs using hint block pointers managed by the ZIL commit code and passed to metaslab_alloc(). Spreading log blocks accross vdevs is important for performance: indeed, using mutliple disks in parallel decreases the ZIL commit latency, which is the main performance metric for synchronous writes. However, the current implementation suffers from the following issues: 1) It would be best if the ZIL module was not aware of such low-level details. They should be handled by the ZIO and metaslab modules; 2) Because the hint block pointer is managed per log, simultaneous commits from multiple logs might use the same vdevs at the same time, which is inefficient; 3) Because dmu_write() does not honor the block pointer hint, indirect writes are not spread. The naive solution of rotating the metaslab rotor each time a block is allocated for the ZIL or dmu_sync() doesn't work in practice because the first ZIL block to be written is actually allocated during the previous commit. Consequently, when metaslab_alloc() decides the vdev for this block, it will do so while a bunch of other allocations are happening at the same time (from dmu_sync() and other ZILs). This means the vdev for this block is chosen more or less at random. When the next commit happens, there is a high chance (especially when the number of blocks per commit is slightly less than the number of the disks) that one disk will have to write two blocks (with a potential seek) while other disks are sitting idle, which defeats spreading and increases the commit latency. This commit introduces a new concept in the metaslab allocator: fastwrites. Basically, each top-level vdev maintains a counter indicating the number of synchronous writes (from dmu_sync() and the ZIL) which have been allocated but not yet completed. When the metaslab is called with the FASTWRITE flag, it will choose the vdev with the least amount of pending synchronous writes. If there are multiple vdevs with the same value, the first matching vdev (starting from the rotor) is used. Once metaslab_alloc() has decided which vdev the block is allocated to, it updates the fastwrite counter for this vdev. The rationale goes like this: when an allocation is done with FASTWRITE, it "reserves" the vdev until the data is written. Until then, all future allocations will naturally avoid this vdev, even after a full rotation of the rotor. As a result, pending synchronous writes at a given point in time will be nicely spread over all vdevs. This contrasts with the previous algorithm, which is based on the implicit assumption that blocks are written instantaneously after they're allocated. metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to manually increase or decrease fastwrite counters, respectively. They should be used with caution, as there is no per-BP tracking of fastwrite information, so leaks and "double-unmarks" are possible. There is, however, an assert in the vdev teardown code which will fire if the fastwrite counters are not zero when the pool is exported or the vdev removed. Note that as stated above, marking is also done implictly by metaslab_alloc(). ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to the metaslab when allocating (assuming ZIO does the allocation, which is only true in the case of dmu_sync). This flag will also trigger an unmark when zio_done() fires. A side-effect of the new algorithm is that when a ZIL stops being used, its last block can stay in the pending state (allocated but not yet written) for a long time, polluting the fastwrite counters. To avoid that, I've implemented a somewhat crude but working solution which unmarks these pending blocks in zil_sync(), thus guaranteeing that linguering fastwrites will get pruned at each sync event. The best performance improvements are observed with pools using a large number of top-level vdevs and heavy synchronous write workflows (especially indirect writes and concurrent writes from multiple ZILs). Real-life testing shows a 200% to 300% performance increase with indirect writes and various commit sizes. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #1013
2012-06-27 15:20:20 +02:00
ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
2008-11-20 12:01:55 -08:00
}
typedef struct vdev_probe_stats {
boolean_t vps_readable;
boolean_t vps_writeable;
int vps_flags;
} vdev_probe_stats_t;
static void
vdev_probe_done(zio_t *zio)
2008-11-20 12:01:55 -08:00
{
2009-01-15 13:59:39 -08:00
spa_t *spa = zio->io_spa;
2009-02-18 12:51:31 -08:00
vdev_t *vd = zio->io_vd;
vdev_probe_stats_t *vps = zio->io_private;
2009-02-18 12:51:31 -08:00
ASSERT(vd->vdev_probe_zio != NULL);
if (zio->io_type == ZIO_TYPE_READ) {
if (zio->io_error == 0)
vps->vps_readable = 1;
2009-01-15 13:59:39 -08:00
if (zio->io_error == 0 && spa_writeable(spa)) {
2009-02-18 12:51:31 -08:00
zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
zio->io_offset, zio->io_size, zio->io_data,
ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
} else {
zio_buf_free(zio->io_data, zio->io_size);
}
} else if (zio->io_type == ZIO_TYPE_WRITE) {
if (zio->io_error == 0)
vps->vps_writeable = 1;
zio_buf_free(zio->io_data, zio->io_size);
} else if (zio->io_type == ZIO_TYPE_NULL) {
2009-02-18 12:51:31 -08:00
zio_t *pio;
vd->vdev_cant_read |= !vps->vps_readable;
vd->vdev_cant_write |= !vps->vps_writeable;
if (vdev_readable(vd) &&
2009-01-15 13:59:39 -08:00
(vdev_writeable(vd) || !spa_writeable(spa))) {
zio->io_error = 0;
} else {
ASSERT(zio->io_error != 0);
zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
2009-01-15 13:59:39 -08:00
spa, vd, NULL, 0, 0);
zio->io_error = ENXIO;
}
2009-02-18 12:51:31 -08:00
mutex_enter(&vd->vdev_probe_lock);
ASSERT(vd->vdev_probe_zio == zio);
vd->vdev_probe_zio = NULL;
mutex_exit(&vd->vdev_probe_lock);
while ((pio = zio_walk_parents(zio)) != NULL)
if (!vdev_accessible(vd, pio))
pio->io_error = ENXIO;
kmem_free(vps, sizeof (*vps));
}
}
2008-11-20 12:01:55 -08:00
/*
* Determine whether this device is accessible by reading and writing
* to several known locations: the pad regions of each vdev label
* but the first (which we leave alone in case it contains a VTOC).
*/
zio_t *
2009-02-18 12:51:31 -08:00
vdev_probe(vdev_t *vd, zio_t *zio)
{
spa_t *spa = vd->vdev_spa;
2009-02-18 12:51:31 -08:00
vdev_probe_stats_t *vps = NULL;
zio_t *pio;
int l;
2009-02-18 12:51:31 -08:00
ASSERT(vd->vdev_ops->vdev_op_leaf);
2008-11-20 12:01:55 -08:00
2009-02-18 12:51:31 -08:00
/*
* Don't probe the probe.
*/
if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
return (NULL);
2009-02-18 12:51:31 -08:00
/*
* To prevent 'probe storms' when a device fails, we create
* just one probe i/o at a time. All zios that want to probe
* this vdev will become parents of the probe io.
*/
mutex_enter(&vd->vdev_probe_lock);
2009-02-18 12:51:31 -08:00
if ((pio = vd->vdev_probe_zio) == NULL) {
vps = kmem_zalloc(sizeof (*vps), KM_PUSHPAGE);
2009-02-18 12:51:31 -08:00
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
2009-07-02 15:44:48 -07:00
ZIO_FLAG_TRYHARD;
2009-02-18 12:51:31 -08:00
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
* vdev_cant_read and vdev_cant_write can only
* transition from TRUE to FALSE when we have the
* SCL_ZIO lock as writer; otherwise they can only
* transition from FALSE to TRUE. This ensures that
* any zio looking at these values can assume that
* failures persist for the life of the I/O. That's
* important because when a device has intermittent
* connectivity problems, we want to ensure that
* they're ascribed to the device (ENXIO) and not
* the zio (EIO).
*
* Since we hold SCL_ZIO as writer here, clear both
* values so the probe can reevaluate from first
* principles.
*/
vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
vd->vdev_cant_read = B_FALSE;
vd->vdev_cant_write = B_FALSE;
}
vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
vdev_probe_done, vps,
vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
/*
* We can't change the vdev state in this context, so we
* kick off an async task to do it on our behalf.
*/
2009-02-18 12:51:31 -08:00
if (zio != NULL) {
vd->vdev_probe_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_PROBE);
}
}
2009-02-18 12:51:31 -08:00
if (zio != NULL)
zio_add_child(zio, pio);
2009-02-18 12:51:31 -08:00
mutex_exit(&vd->vdev_probe_lock);
2009-02-18 12:51:31 -08:00
if (vps == NULL) {
ASSERT(zio != NULL);
return (NULL);
}
for (l = 1; l < VDEV_LABELS; l++) {
2009-02-18 12:51:31 -08:00
zio_nowait(zio_read_phys(pio, vd,
vdev_label_offset(vd->vdev_psize, l,
2009-07-02 15:44:48 -07:00
offsetof(vdev_label_t, vl_pad2)),
VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
}
2009-02-18 12:51:31 -08:00
if (zio == NULL)
return (pio);
zio_nowait(pio);
return (NULL);
2008-11-20 12:01:55 -08:00
}
2009-08-18 11:43:27 -07:00
static void
vdev_open_child(void *arg)
{
vdev_t *vd = arg;
vd->vdev_open_thread = curthread;
vd->vdev_open_error = vdev_open(vd);
vd->vdev_open_thread = NULL;
}
static boolean_t
vdev_uses_zvols(vdev_t *vd)
{
int c;
#ifdef _KERNEL
if (zvol_is_zvol(vd->vdev_path))
return (B_TRUE);
#endif
for (c = 0; c < vd->vdev_children; c++)
if (vdev_uses_zvols(vd->vdev_child[c]))
return (B_TRUE);
return (B_FALSE);
}
2009-08-18 11:43:27 -07:00
void
vdev_open_children(vdev_t *vd)
{
taskq_t *tq;
int children = vd->vdev_children;
int c;
2009-08-18 11:43:27 -07:00
/*
* in order to handle pools on top of zvols, do the opens
* in a single thread so that the same thread holds the
* spa_namespace_lock
*/
if (vdev_uses_zvols(vd)) {
for (c = 0; c < children; c++)
vd->vdev_child[c]->vdev_open_error =
vdev_open(vd->vdev_child[c]);
return;
}
2009-08-18 11:43:27 -07:00
tq = taskq_create("vdev_open", children, minclsyspri,
children, children, TASKQ_PREPOPULATE);
for (c = 0; c < children; c++)
2009-08-18 11:43:27 -07:00
VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
TQ_SLEEP) != 0);
2009-08-18 11:43:27 -07:00
taskq_destroy(tq);
}
2008-11-20 12:01:55 -08:00
/*
* Prepare a virtual device for access.
*/
int
vdev_open(vdev_t *vd)
{
2009-01-15 13:59:39 -08:00
spa_t *spa = vd->vdev_spa;
2008-11-20 12:01:55 -08:00
int error;
uint64_t osize = 0;
uint64_t max_osize = 0;
uint64_t asize, max_asize, psize;
2008-11-20 12:01:55 -08:00
uint64_t ashift = 0;
int c;
2008-11-20 12:01:55 -08:00
2009-08-18 11:43:27 -07:00
ASSERT(vd->vdev_open_thread == curthread ||
spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2008-11-20 12:01:55 -08:00
ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
vd->vdev_state == VDEV_STATE_CANT_OPEN ||
vd->vdev_state == VDEV_STATE_OFFLINE);
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
2009-07-02 15:44:48 -07:00
vd->vdev_cant_read = B_FALSE;
vd->vdev_cant_write = B_FALSE;
vd->vdev_min_asize = vdev_get_min_asize(vd);
2008-11-20 12:01:55 -08:00
/*
* If this vdev is not removed, check its fault status. If it's
* faulted, bail out of the open.
*/
2008-11-20 12:01:55 -08:00
if (!vd->vdev_removed && vd->vdev_faulted) {
ASSERT(vd->vdev_children == 0);
ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
2008-11-20 12:01:55 -08:00
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
vd->vdev_label_aux);
2008-11-20 12:01:55 -08:00
return (ENXIO);
} else if (vd->vdev_offline) {
ASSERT(vd->vdev_children == 0);
vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
return (ENXIO);
}
error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);
2008-11-20 12:01:55 -08:00
/*
* Reset the vdev_reopening flag so that we actually close
* the vdev on error.
*/
vd->vdev_reopening = B_FALSE;
2008-11-20 12:01:55 -08:00
if (zio_injection_enabled && error == 0)
2009-07-02 15:44:48 -07:00
error = zio_handle_device_injection(vd, NULL, ENXIO);
2008-11-20 12:01:55 -08:00
if (error) {
if (vd->vdev_removed &&
vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
vd->vdev_removed = B_FALSE;
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
vd->vdev_stat.vs_aux);
return (error);
}
vd->vdev_removed = B_FALSE;
/*
* Recheck the faulted flag now that we have confirmed that
* the vdev is accessible. If we're faulted, bail.
*/
if (vd->vdev_faulted) {
ASSERT(vd->vdev_children == 0);
ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
vd->vdev_label_aux);
return (ENXIO);
}
2008-11-20 12:01:55 -08:00
if (vd->vdev_degraded) {
ASSERT(vd->vdev_children == 0);
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
VDEV_AUX_ERR_EXCEEDED);
} else {
vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
2008-11-20 12:01:55 -08:00
}
/*
* For hole or missing vdevs we just return success.
*/
if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
return (0);
for (c = 0; c < vd->vdev_children; c++) {
2008-11-20 12:01:55 -08:00
if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
VDEV_AUX_NONE);
break;
}
2009-07-02 15:44:48 -07:00
}
2008-11-20 12:01:55 -08:00
osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
2008-11-20 12:01:55 -08:00
if (vd->vdev_children == 0) {
if (osize < SPA_MINDEVSIZE) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_TOO_SMALL);
return (EOVERFLOW);
}
psize = osize;
asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
max_asize = max_osize - (VDEV_LABEL_START_SIZE +
VDEV_LABEL_END_SIZE);
2008-11-20 12:01:55 -08:00
} else {
if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_TOO_SMALL);
return (EOVERFLOW);
}
psize = 0;
asize = osize;
max_asize = max_osize;
2008-11-20 12:01:55 -08:00
}
vd->vdev_psize = psize;
2009-07-02 15:44:48 -07:00
/*
* Make sure the allocatable size hasn't shrunk.
*/
if (asize < vd->vdev_min_asize) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LABEL);
return (EINVAL);
}
2008-11-20 12:01:55 -08:00
if (vd->vdev_asize == 0) {
/*
* This is the first-ever open, so use the computed values.
* For testing purposes, a higher ashift can be requested.
*/
vd->vdev_asize = asize;
vd->vdev_max_asize = max_asize;
2008-11-20 12:01:55 -08:00
vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
} else {
/*
* Detect if the alignment requirement has increased.
* We don't want to make the pool unavailable, just
* post an event instead.
2008-11-20 12:01:55 -08:00
*/
if (ashift > vd->vdev_top->vdev_ashift &&
vd->vdev_ops->vdev_op_leaf) {
zfs_ereport_post(FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
spa, vd, NULL, 0, 0);
2008-11-20 12:01:55 -08:00
}
vd->vdev_max_asize = max_asize;
2009-07-02 15:44:48 -07:00
}
2008-11-20 12:01:55 -08:00
2009-07-02 15:44:48 -07:00
/*
* If all children are healthy and the asize has increased,
* then we've experienced dynamic LUN growth. If automatic
* expansion is enabled then use the additional space.
*/
if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
(vd->vdev_expanding || spa->spa_autoexpand))
vd->vdev_asize = asize;
2008-11-20 12:01:55 -08:00
2009-07-02 15:44:48 -07:00
vdev_set_min_asize(vd);
2008-11-20 12:01:55 -08:00
/*
* Ensure we can issue some IO before declaring the
* vdev open for business.
*/
if (vd->vdev_ops->vdev_op_leaf &&
(error = zio_wait(vdev_probe(vd, NULL))) != 0) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
VDEV_AUX_ERR_EXCEEDED);
2008-11-20 12:01:55 -08:00
return (error);
}
/*
* If a leaf vdev has a DTL, and seems healthy, then kick off a
2009-01-15 13:59:39 -08:00
* resilver. But don't do this if we are doing a reopen for a scrub,
* since this would just restart the scrub we are already doing.
2008-11-20 12:01:55 -08:00
*/
2009-01-15 13:59:39 -08:00
if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
vdev_resilver_needed(vd, NULL, NULL))
spa_async_request(spa, SPA_ASYNC_RESILVER);
2008-11-20 12:01:55 -08:00
return (0);
}
/*
* Called once the vdevs are all opened, this routine validates the label
* contents. This needs to be done before vdev_load() so that we don't
* inadvertently do repair I/Os to the wrong device.
*
* If 'strict' is false ignore the spa guid check. This is necessary because
* if the machine crashed during a re-guid the new guid might have been written
* to all of the vdev labels, but not the cached config. The strict check
* will be performed when the pool is opened again using the mos config.
*
2008-11-20 12:01:55 -08:00
* This function will only return failure if one of the vdevs indicates that it
* has since been destroyed or exported. This is only possible if
* /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
* will be updated but the function will return 0.
*/
int
vdev_validate(vdev_t *vd, boolean_t strict)
2008-11-20 12:01:55 -08:00
{
spa_t *spa = vd->vdev_spa;
nvlist_t *label;
uint64_t guid = 0, top_guid;
2008-11-20 12:01:55 -08:00
uint64_t state;
int c;
2008-11-20 12:01:55 -08:00
for (c = 0; c < vd->vdev_children; c++)
if (vdev_validate(vd->vdev_child[c], strict) != 0)
2008-11-20 12:01:55 -08:00
return (EBADF);
/*
* If the device has already failed, or was marked offline, don't do
* any further validation. Otherwise, label I/O will fail and we will
* overwrite the previous state.
*/
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
uint64_t aux_guid = 0;
nvlist_t *nvl;
2008-11-20 12:01:55 -08:00
if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) ==
NULL) {
2008-11-20 12:01:55 -08:00
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LABEL);
return (0);
}
/*
* Determine if this vdev has been split off into another
* pool. If so, then refuse to open it.
*/
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
&aux_guid) == 0 && aux_guid == spa_guid(spa)) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_SPLIT_POOL);
nvlist_free(label);
return (0);
}
if (strict && (nvlist_lookup_uint64(label,
ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
guid != spa_guid(spa))) {
2008-11-20 12:01:55 -08:00
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
return (0);
}
if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
!= 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
&aux_guid) != 0)
aux_guid = 0;
/*
* If this vdev just became a top-level vdev because its
* sibling was detached, it will have adopted the parent's
* vdev guid -- but the label may or may not be on disk yet.
* Fortunately, either version of the label will have the
* same top guid, so if we're a top-level vdev, we can
* safely compare to that instead.
*
* If we split this vdev off instead, then we also check the
* original pool's guid. We don't want to consider the vdev
* corrupt if it is partway through a split operation.
*/
2008-11-20 12:01:55 -08:00
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
&guid) != 0 ||
nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
&top_guid) != 0 ||
((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
(vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
2008-11-20 12:01:55 -08:00
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
return (0);
}
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
&state) != 0) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
return (0);
}
nvlist_free(label);
2009-08-18 11:43:27 -07:00
/*
* If this is a verbatim import, no need to check the
2009-08-18 11:43:27 -07:00
* state of the pool.
*/
if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
spa_load_state(spa) == SPA_LOAD_OPEN &&
2008-11-20 12:01:55 -08:00
state != POOL_STATE_ACTIVE)
return (EBADF);
/*
* If we were able to open and validate a vdev that was
* previously marked permanently unavailable, clear that state
* now.
*/
if (vd->vdev_not_present)
vd->vdev_not_present = 0;
}
2008-11-20 12:01:55 -08:00
return (0);
}
/*
* Close a virtual device.
*/
void
vdev_close(vdev_t *vd)
{
vdev_t *pvd = vd->vdev_parent;
ASSERTV(spa_t *spa = vd->vdev_spa);
2009-01-15 13:59:39 -08:00
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
/*
* If our parent is reopening, then we are as well, unless we are
* going offline.
*/
if (pvd != NULL && pvd->vdev_reopening)
vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
2008-11-20 12:01:55 -08:00
vd->vdev_ops->vdev_op_close(vd);
vdev_cache_purge(vd);
/*
2009-07-02 15:44:48 -07:00
* We record the previous state before we close it, so that if we are
2008-11-20 12:01:55 -08:00
* doing a reopen(), we don't generate FMA ereports if we notice that
* it's still faulted.
*/
vd->vdev_prevstate = vd->vdev_state;
if (vd->vdev_offline)
vd->vdev_state = VDEV_STATE_OFFLINE;
else
vd->vdev_state = VDEV_STATE_CLOSED;
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}
void
vdev_hold(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
int c;
ASSERT(spa_is_root(spa));
if (spa->spa_state == POOL_STATE_UNINITIALIZED)
return;
for (c = 0; c < vd->vdev_children; c++)
vdev_hold(vd->vdev_child[c]);
if (vd->vdev_ops->vdev_op_leaf)
vd->vdev_ops->vdev_op_hold(vd);
}
void
vdev_rele(vdev_t *vd)
{
int c;
ASSERT(spa_is_root(vd->vdev_spa));
for (c = 0; c < vd->vdev_children; c++)
vdev_rele(vd->vdev_child[c]);
if (vd->vdev_ops->vdev_op_leaf)
vd->vdev_ops->vdev_op_rele(vd);
}
/*
* Reopen all interior vdevs and any unopened leaves. We don't actually
* reopen leaf vdevs which had previously been opened as they might deadlock
* on the spa_config_lock. Instead we only obtain the leaf's physical size.
* If the leaf has never been opened then open it, as usual.
*/
2008-11-20 12:01:55 -08:00
void
vdev_reopen(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2008-11-20 12:01:55 -08:00
/* set the reopening flag unless we're taking the vdev offline */
vd->vdev_reopening = !vd->vdev_offline;
2008-11-20 12:01:55 -08:00
vdev_close(vd);
(void) vdev_open(vd);
/*
* Call vdev_validate() here to make sure we have the same device.
* Otherwise, a device with an invalid label could be successfully
* opened in response to vdev_reopen().
*/
if (vd->vdev_aux) {
(void) vdev_validate_aux(vd);
if (vdev_readable(vd) && vdev_writeable(vd) &&
2009-07-02 15:44:48 -07:00
vd->vdev_aux == &spa->spa_l2cache &&
!l2arc_vdev_present(vd))
l2arc_add_vdev(spa, vd);
} else {
(void) vdev_validate(vd, B_TRUE);
}
2008-11-20 12:01:55 -08:00
/*
* Reassess parent vdev's health.
*/
vdev_propagate_state(vd);
}
int
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
{
int error;
/*
* Normally, partial opens (e.g. of a mirror) are allowed.
* For a create, however, we want to fail the request if
* there are any components we can't open.
*/
error = vdev_open(vd);
if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
vdev_close(vd);
return (error ? error : ENXIO);
}
/*
* Recursively initialize all labels.
*/
if ((error = vdev_label_init(vd, txg, isreplacing ?
VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
vdev_close(vd);
return (error);
}
return (0);
}
void
2009-07-02 15:44:48 -07:00
vdev_metaslab_set_size(vdev_t *vd)
2008-11-20 12:01:55 -08:00
{
/*
* Aim for roughly 200 metaslabs per vdev.
*/
vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
}
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
ASSERT(vd == vd->vdev_top);
ASSERT(!vd->vdev_ishole);
2008-11-20 12:01:55 -08:00
ASSERT(ISP2(flags));
ASSERT(spa_writeable(vd->vdev_spa));
2008-11-20 12:01:55 -08:00
if (flags & VDD_METASLAB)
(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
if (flags & VDD_DTL)
(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}
2009-01-15 13:59:39 -08:00
/*
* DTLs.
*
* A vdev's DTL (dirty time log) is the set of transaction groups for which
* the vdev has less than perfect replication. There are four kinds of DTL:
2009-01-15 13:59:39 -08:00
*
* DTL_MISSING: txgs for which the vdev has no valid copies of the data
*
* DTL_PARTIAL: txgs for which data is available, but not fully replicated
*
* DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
* scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
* txgs that was scrubbed.
*
* DTL_OUTAGE: txgs which cannot currently be read, whether due to
* persistent errors or just some device being offline.
* Unlike the other three, the DTL_OUTAGE map is not generally
* maintained; it's only computed when needed, typically to
* determine whether a device can be detached.
*
* For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
* either has the data or it doesn't.
*
* For interior vdevs such as mirror and RAID-Z the picture is more complex.
* A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
* if any child is less than fully replicated, then so is its parent.
* A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
* comprising only those txgs which appear in 'maxfaults' or more children;
* those are the txgs we don't have enough replication to read. For example,
* double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
* thus, its DTL_MISSING consists of the set of txgs that appear in more than
* two child DTL_MISSING maps.
*
* It should be clear from the above that to compute the DTLs and outage maps
* for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
* Therefore, that is all we keep on disk. When loading the pool, or after
* a configuration change, we generate all other DTLs from first principles.
*/
2008-11-20 12:01:55 -08:00
void
2009-01-15 13:59:39 -08:00
vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
2008-11-20 12:01:55 -08:00
{
2009-01-15 13:59:39 -08:00
space_map_t *sm = &vd->vdev_dtl[t];
ASSERT(t < DTL_TYPES);
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
ASSERT(spa_writeable(vd->vdev_spa));
2009-01-15 13:59:39 -08:00
2008-11-20 12:01:55 -08:00
mutex_enter(sm->sm_lock);
if (!space_map_contains(sm, txg, size))
space_map_add(sm, txg, size);
mutex_exit(sm->sm_lock);
}
2009-01-15 13:59:39 -08:00
boolean_t
vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
2008-11-20 12:01:55 -08:00
{
2009-01-15 13:59:39 -08:00
space_map_t *sm = &vd->vdev_dtl[t];
boolean_t dirty = B_FALSE;
2008-11-20 12:01:55 -08:00
2009-01-15 13:59:39 -08:00
ASSERT(t < DTL_TYPES);
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
2008-11-20 12:01:55 -08:00
mutex_enter(sm->sm_lock);
2009-01-15 13:59:39 -08:00
if (sm->sm_space != 0)
dirty = space_map_contains(sm, txg, size);
2008-11-20 12:01:55 -08:00
mutex_exit(sm->sm_lock);
return (dirty);
}
2009-01-15 13:59:39 -08:00
boolean_t
vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
{
space_map_t *sm = &vd->vdev_dtl[t];
boolean_t empty;
mutex_enter(sm->sm_lock);
empty = (sm->sm_space == 0);
mutex_exit(sm->sm_lock);
return (empty);
}
2008-11-20 12:01:55 -08:00
/*
* Reassess DTLs after a config change or scrub completion.
*/
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
spa_t *spa = vd->vdev_spa;
2009-01-15 13:59:39 -08:00
avl_tree_t reftree;
int c, t, minref;
2008-11-20 12:01:55 -08:00
2009-01-15 13:59:39 -08:00
ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
2008-11-20 12:01:55 -08:00
for (c = 0; c < vd->vdev_children; c++)
2009-01-15 13:59:39 -08:00
vdev_dtl_reassess(vd->vdev_child[c], txg,
scrub_txg, scrub_done);
if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
2009-01-15 13:59:39 -08:00
return;
if (vd->vdev_ops->vdev_op_leaf) {
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
2008-11-20 12:01:55 -08:00
mutex_enter(&vd->vdev_dtl_lock);
if (scrub_txg != 0 &&
(spa->spa_scrub_started ||
(scn && scn->scn_phys.scn_errors == 0))) {
/*
* We completed a scrub up to scrub_txg. If we
* did it without rebooting, then the scrub dtl
* will be valid, so excise the old region and
* fold in the scrub dtl. Otherwise, leave the
* dtl as-is if there was an error.
2009-01-15 13:59:39 -08:00
*
* There's little trick here: to excise the beginning
* of the DTL_MISSING map, we put it into a reference
* tree and then add a segment with refcnt -1 that
* covers the range [0, scrub_txg). This means
* that each txg in that range has refcnt -1 or 0.
* We then add DTL_SCRUB with a refcnt of 2, so that
* entries in the range [0, scrub_txg) will have a
* positive refcnt -- either 1 or 2. We then convert
* the reference tree into the new DTL_MISSING map.
*/
2009-01-15 13:59:39 -08:00
space_map_ref_create(&reftree);
space_map_ref_add_map(&reftree,
&vd->vdev_dtl[DTL_MISSING], 1);
space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
space_map_ref_add_map(&reftree,
&vd->vdev_dtl[DTL_SCRUB], 2);
space_map_ref_generate_map(&reftree,
&vd->vdev_dtl[DTL_MISSING], 1);
space_map_ref_destroy(&reftree);
2008-11-20 12:01:55 -08:00
}
2009-01-15 13:59:39 -08:00
space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
space_map_walk(&vd->vdev_dtl[DTL_MISSING],
space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
2008-11-20 12:01:55 -08:00
if (scrub_done)
2009-01-15 13:59:39 -08:00
space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
if (!vdev_readable(vd))
space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
else
space_map_walk(&vd->vdev_dtl[DTL_MISSING],
space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
2008-11-20 12:01:55 -08:00
mutex_exit(&vd->vdev_dtl_lock);
2008-11-20 12:01:55 -08:00
if (txg != 0)
vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
return;
}
mutex_enter(&vd->vdev_dtl_lock);
for (t = 0; t < DTL_TYPES; t++) {
/* account for child's outage in parent's missing map */
int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
2009-01-15 13:59:39 -08:00
if (t == DTL_SCRUB)
continue; /* leaf vdevs only */
if (t == DTL_PARTIAL)
minref = 1; /* i.e. non-zero */
else if (vd->vdev_nparity != 0)
minref = vd->vdev_nparity + 1; /* RAID-Z */
else
minref = vd->vdev_children; /* any kind of mirror */
space_map_ref_create(&reftree);
for (c = 0; c < vd->vdev_children; c++) {
2009-01-15 13:59:39 -08:00
vdev_t *cvd = vd->vdev_child[c];
mutex_enter(&cvd->vdev_dtl_lock);
space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
2009-01-15 13:59:39 -08:00
mutex_exit(&cvd->vdev_dtl_lock);
}
space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
space_map_ref_destroy(&reftree);
2008-11-20 12:01:55 -08:00
}
2009-01-15 13:59:39 -08:00
mutex_exit(&vd->vdev_dtl_lock);
2008-11-20 12:01:55 -08:00
}
static int
vdev_dtl_load(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
2009-01-15 13:59:39 -08:00
space_map_obj_t *smo = &vd->vdev_dtl_smo;
2008-11-20 12:01:55 -08:00
objset_t *mos = spa->spa_meta_objset;
dmu_buf_t *db;
int error;
ASSERT(vd->vdev_children == 0);
if (smo->smo_object == 0)
return (0);
ASSERT(!vd->vdev_ishole);
2008-11-20 12:01:55 -08:00
if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
return (error);
ASSERT3U(db->db_size, >=, sizeof (*smo));
bcopy(db->db_data, smo, sizeof (*smo));
dmu_buf_rele(db, FTAG);
mutex_enter(&vd->vdev_dtl_lock);
2009-01-15 13:59:39 -08:00
error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
NULL, SM_ALLOC, smo, mos);
2008-11-20 12:01:55 -08:00
mutex_exit(&vd->vdev_dtl_lock);
return (error);
}
void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
2009-01-15 13:59:39 -08:00
space_map_obj_t *smo = &vd->vdev_dtl_smo;
space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
2008-11-20 12:01:55 -08:00
objset_t *mos = spa->spa_meta_objset;
space_map_t smsync;
kmutex_t smlock;
dmu_buf_t *db;
dmu_tx_t *tx;
ASSERT(!vd->vdev_ishole);
2008-11-20 12:01:55 -08:00
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
if (vd->vdev_detached) {
if (smo->smo_object != 0) {
VERIFY(0 == dmu_object_free(mos, smo->smo_object, tx));
2008-11-20 12:01:55 -08:00
smo->smo_object = 0;
}
dmu_tx_commit(tx);
return;
}
if (smo->smo_object == 0) {
ASSERT(smo->smo_objsize == 0);
ASSERT(smo->smo_alloc == 0);
smo->smo_object = dmu_object_alloc(mos,
DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
ASSERT(smo->smo_object != 0);
vdev_config_dirty(vd->vdev_top);
}
mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
&smlock);
mutex_enter(&smlock);
mutex_enter(&vd->vdev_dtl_lock);
space_map_walk(sm, space_map_add, &smsync);
mutex_exit(&vd->vdev_dtl_lock);
space_map_truncate(smo, mos, tx);
space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
space_map_destroy(&smsync);
mutex_exit(&smlock);
mutex_destroy(&smlock);
VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
ASSERT3U(db->db_size, >=, sizeof (*smo));
bcopy(smo, db->db_data, sizeof (*smo));
dmu_buf_rele(db, FTAG);
dmu_tx_commit(tx);
}
2009-01-15 13:59:39 -08:00
/*
* Determine whether the specified vdev can be offlined/detached/removed
* without losing data.
*/
boolean_t
vdev_dtl_required(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
vdev_t *tvd = vd->vdev_top;
uint8_t cant_read = vd->vdev_cant_read;
boolean_t required;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
if (vd == spa->spa_root_vdev || vd == tvd)
return (B_TRUE);
/*
* Temporarily mark the device as unreadable, and then determine
* whether this results in any DTL outages in the top-level vdev.
* If not, we can safely offline/detach/remove the device.
*/
vd->vdev_cant_read = B_TRUE;
vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
vd->vdev_cant_read = cant_read;
vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
if (!required && zio_injection_enabled)
required = !!zio_handle_device_injection(vd, NULL, ECHILD);
2009-01-15 13:59:39 -08:00
return (required);
}
/*
* Determine if resilver is needed, and if so the txg range.
*/
boolean_t
vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
{
boolean_t needed = B_FALSE;
uint64_t thismin = UINT64_MAX;
uint64_t thismax = 0;
int c;
if (vd->vdev_children == 0) {
mutex_enter(&vd->vdev_dtl_lock);
2009-01-15 13:59:39 -08:00
if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
vdev_writeable(vd)) {
space_seg_t *ss;
2009-01-15 13:59:39 -08:00
ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
thismin = ss->ss_start - 1;
2009-01-15 13:59:39 -08:00
ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
thismax = ss->ss_end;
needed = B_TRUE;
}
mutex_exit(&vd->vdev_dtl_lock);
} else {
for (c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
uint64_t cmin, cmax;
if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
thismin = MIN(thismin, cmin);
thismax = MAX(thismax, cmax);
needed = B_TRUE;
}
}
}
if (needed && minp) {
*minp = thismin;
*maxp = thismax;
}
return (needed);
}
2008-11-20 12:01:55 -08:00
void
vdev_load(vdev_t *vd)
{
int c;
2008-11-20 12:01:55 -08:00
/*
* Recursively load all children.
*/
for (c = 0; c < vd->vdev_children; c++)
2008-11-20 12:01:55 -08:00
vdev_load(vd->vdev_child[c]);
/*
* If this is a top-level vdev, initialize its metaslabs.
*/
if (vd == vd->vdev_top && !vd->vdev_ishole &&
2008-11-20 12:01:55 -08:00
(vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
vdev_metaslab_init(vd, 0) != 0))
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
/*
* If this is a leaf vdev, load its DTL.
*/
if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
}
/*
* The special vdev case is used for hot spares and l2cache devices. Its
* sole purpose it to set the vdev state for the associated vdev. To do this,
* we make sure that we can open the underlying device, then try to read the
* label, and make sure that the label is sane and that it hasn't been
* repurposed to another pool.
*/
int
vdev_validate_aux(vdev_t *vd)
{
nvlist_t *label;
uint64_t guid, version;
uint64_t state;
if (!vdev_readable(vd))
return (0);
if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL) {
2008-11-20 12:01:55 -08:00
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
return (-1);
}
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
!SPA_VERSION_IS_SUPPORTED(version) ||
2008-11-20 12:01:55 -08:00
nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
guid != vd->vdev_guid ||
nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
return (-1);
}
/*
* We don't actually check the pool state here. If it's in fact in
* use by another pool, we update this fact on the fly when requested.
*/
nvlist_free(label);
return (0);
}
void
vdev_remove(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_meta_objset;
dmu_tx_t *tx;
int m;
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
if (vd->vdev_dtl_smo.smo_object) {
ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0);
(void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
vd->vdev_dtl_smo.smo_object = 0;
}
if (vd->vdev_ms != NULL) {
for (m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
if (msp == NULL || msp->ms_smo.smo_object == 0)
continue;
ASSERT3U(msp->ms_smo.smo_alloc, ==, 0);
(void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
msp->ms_smo.smo_object = 0;
}
}
if (vd->vdev_ms_array) {
(void) dmu_object_free(mos, vd->vdev_ms_array, tx);
vd->vdev_ms_array = 0;
vd->vdev_ms_shift = 0;
}
dmu_tx_commit(tx);
}
2008-11-20 12:01:55 -08:00
void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
metaslab_t *msp;
boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
ASSERT(!vd->vdev_ishole);
2008-11-20 12:01:55 -08:00
while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))))
2008-11-20 12:01:55 -08:00
metaslab_sync_done(msp, txg);
if (reassess)
metaslab_sync_reassess(vd->vdev_mg);
2008-11-20 12:01:55 -08:00
}
void
vdev_sync(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
vdev_t *lvd;
metaslab_t *msp;
dmu_tx_t *tx;
ASSERT(!vd->vdev_ishole);
2008-11-20 12:01:55 -08:00
if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
ASSERT(vd == vd->vdev_top);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
ASSERT(vd->vdev_ms_array != 0);
vdev_config_dirty(vd);
dmu_tx_commit(tx);
}
/*
* Remove the metadata associated with this vdev once it's empty.
*/
if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
vdev_remove(vd, txg);
2008-11-20 12:01:55 -08:00
while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
metaslab_sync(msp, txg);
(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
}
while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
vdev_dtl_sync(lvd, txg);
(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
}
uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
return (vd->vdev_ops->vdev_op_asize(vd, psize));
}
/*
* Mark the given vdev faulted. A faulted vdev behaves as if the device could
* not be opened, and no I/O is attempted.
*/
int
vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
2008-11-20 12:01:55 -08:00
{
vdev_t *vd, *tvd;
2008-11-20 12:01:55 -08:00
spa_vdev_state_enter(spa, SCL_NONE);
2008-11-20 12:01:55 -08:00
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
2008-11-20 12:01:55 -08:00
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2008-11-20 12:01:55 -08:00
tvd = vd->vdev_top;
/*
* We don't directly use the aux state here, but if we do a
* vdev_reopen(), we need this value to be present to remember why we
* were faulted.
*/
vd->vdev_label_aux = aux;
2008-11-20 12:01:55 -08:00
/*
* Faulted state takes precedence over degraded.
*/
vd->vdev_delayed_close = B_FALSE;
2008-11-20 12:01:55 -08:00
vd->vdev_faulted = 1ULL;
vd->vdev_degraded = 0ULL;
vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
2008-11-20 12:01:55 -08:00
/*
* If this device has the only valid copy of the data, then
* back off and simply mark the vdev as degraded instead.
2008-11-20 12:01:55 -08:00
*/
if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
2008-11-20 12:01:55 -08:00
vd->vdev_degraded = 1ULL;
vd->vdev_faulted = 0ULL;
/*
* If we reopen the device and it's not dead, only then do we
* mark it degraded.
*/
vdev_reopen(tvd);
2008-11-20 12:01:55 -08:00
if (vdev_readable(vd))
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
2008-11-20 12:01:55 -08:00
}
return (spa_vdev_state_exit(spa, vd, 0));
2008-11-20 12:01:55 -08:00
}
/*
* Mark the given vdev degraded. A degraded vdev is purely an indication to the
* user that something is wrong. The vdev continues to operate as normal as far
* as I/O is concerned.
*/
int
vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
2008-11-20 12:01:55 -08:00
{
vdev_t *vd;
2008-11-20 12:01:55 -08:00
spa_vdev_state_enter(spa, SCL_NONE);
2008-11-20 12:01:55 -08:00
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
2008-11-20 12:01:55 -08:00
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2008-11-20 12:01:55 -08:00
/*
* If the vdev is already faulted, then don't do anything.
*/
if (vd->vdev_faulted || vd->vdev_degraded)
return (spa_vdev_state_exit(spa, NULL, 0));
2008-11-20 12:01:55 -08:00
vd->vdev_degraded = 1ULL;
if (!vdev_is_dead(vd))
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
aux);
2008-11-20 12:01:55 -08:00
return (spa_vdev_state_exit(spa, vd, 0));
2008-11-20 12:01:55 -08:00
}
/*
* Online the given vdev. If 'unspare' is set, it implies two things. First,
* any attached spare device should be detached when the device finishes
* resilvering. Second, the online should be treated like a 'test' online case,
* so no FMA events are generated if the device fails to open.
*/
int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
2008-11-20 12:01:55 -08:00
{
2009-07-02 15:44:48 -07:00
vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
2008-11-20 12:01:55 -08:00
spa_vdev_state_enter(spa, SCL_NONE);
2008-11-20 12:01:55 -08:00
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
2008-11-20 12:01:55 -08:00
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2008-11-20 12:01:55 -08:00
2009-07-02 15:44:48 -07:00
tvd = vd->vdev_top;
2008-11-20 12:01:55 -08:00
vd->vdev_offline = B_FALSE;
vd->vdev_tmpoffline = B_FALSE;
vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
2009-07-02 15:44:48 -07:00
/* XXX - L2ARC 1.0 does not support expansion */
if (!vd->vdev_aux) {
for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
}
vdev_reopen(tvd);
2008-11-20 12:01:55 -08:00
vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
2009-07-02 15:44:48 -07:00
if (!vd->vdev_aux) {
for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
pvd->vdev_expanding = B_FALSE;
}
2008-11-20 12:01:55 -08:00
if (newstate)
*newstate = vd->vdev_state;
if ((flags & ZFS_ONLINE_UNSPARE) &&
!vdev_is_dead(vd) && vd->vdev_parent &&
vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
vd->vdev_parent->vdev_child[0] == vd)
vd->vdev_unspare = B_TRUE;
2009-07-02 15:44:48 -07:00
if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
/* XXX - L2ARC 1.0 does not support expansion */
if (vd->vdev_aux)
return (spa_vdev_state_exit(spa, vd, ENOTSUP));
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
2009-01-15 13:59:39 -08:00
return (spa_vdev_state_exit(spa, vd, 0));
2008-11-20 12:01:55 -08:00
}
static int
vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
2008-11-20 12:01:55 -08:00
{
2009-07-02 15:44:48 -07:00
vdev_t *vd, *tvd;
int error = 0;
uint64_t generation;
metaslab_group_t *mg;
2008-11-20 12:01:55 -08:00
top:
spa_vdev_state_enter(spa, SCL_ALLOC);
2008-11-20 12:01:55 -08:00
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
2008-11-20 12:01:55 -08:00
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2008-11-20 12:01:55 -08:00
2009-07-02 15:44:48 -07:00
tvd = vd->vdev_top;
mg = tvd->vdev_mg;
generation = spa->spa_config_generation + 1;
2009-07-02 15:44:48 -07:00
2008-11-20 12:01:55 -08:00
/*
* If the device isn't already offline, try to offline it.
*/
if (!vd->vdev_offline) {
/*
2009-01-15 13:59:39 -08:00
* If this device has the only valid copy of some data,
2009-07-02 15:44:48 -07:00
* don't allow it to be offlined. Log devices are always
* expendable.
2008-11-20 12:01:55 -08:00
*/
2009-07-02 15:44:48 -07:00
if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
vdev_dtl_required(vd))
return (spa_vdev_state_exit(spa, NULL, EBUSY));
2008-11-20 12:01:55 -08:00
/*
* If the top-level is a slog and it has had allocations
* then proceed. We check that the vdev's metaslab group
* is not NULL since it's possible that we may have just
* added this vdev but not yet initialized its metaslabs.
*/
if (tvd->vdev_islog && mg != NULL) {
/*
* Prevent any future allocations.
*/
metaslab_group_passivate(mg);
(void) spa_vdev_state_exit(spa, vd, 0);
error = spa_offline_log(spa);
spa_vdev_state_enter(spa, SCL_ALLOC);
/*
* Check to see if the config has changed.
*/
if (error || generation != spa->spa_config_generation) {
metaslab_group_activate(mg);
if (error)
return (spa_vdev_state_exit(spa,
vd, error));
(void) spa_vdev_state_exit(spa, vd, 0);
goto top;
}
ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0);
}
2008-11-20 12:01:55 -08:00
/*
* Offline this device and reopen its top-level vdev.
2009-07-02 15:44:48 -07:00
* If the top-level vdev is a log device then just offline
* it. Otherwise, if this action results in the top-level
* vdev becoming unusable, undo it and fail the request.
2008-11-20 12:01:55 -08:00
*/
vd->vdev_offline = B_TRUE;
2009-07-02 15:44:48 -07:00
vdev_reopen(tvd);
if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
vdev_is_dead(tvd)) {
2008-11-20 12:01:55 -08:00
vd->vdev_offline = B_FALSE;
2009-07-02 15:44:48 -07:00
vdev_reopen(tvd);
return (spa_vdev_state_exit(spa, NULL, EBUSY));
2008-11-20 12:01:55 -08:00
}
/*
* Add the device back into the metaslab rotor so that
* once we online the device it's open for business.
*/
if (tvd->vdev_islog && mg != NULL)
metaslab_group_activate(mg);
2008-11-20 12:01:55 -08:00
}
vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
2008-11-20 12:01:55 -08:00
return (spa_vdev_state_exit(spa, vd, 0));
}
2009-07-02 15:44:48 -07:00
int
vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
{
int error;
2009-07-02 15:44:48 -07:00
mutex_enter(&spa->spa_vdev_top_lock);
error = vdev_offline_locked(spa, guid, flags);
mutex_exit(&spa->spa_vdev_top_lock);
return (error);
2008-11-20 12:01:55 -08:00
}
/*
* Clear the error counts associated with this vdev. Unlike vdev_online() and
* vdev_offline(), we assume the spa config is locked. We also clear all
* children. If 'vd' is NULL, then the user wants to clear all vdevs.
*/
void
vdev_clear(spa_t *spa, vdev_t *vd)
2008-11-20 12:01:55 -08:00
{
vdev_t *rvd = spa->spa_root_vdev;
int c;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2008-11-20 12:01:55 -08:00
if (vd == NULL)
vd = rvd;
2008-11-20 12:01:55 -08:00
vd->vdev_stat.vs_read_errors = 0;
vd->vdev_stat.vs_write_errors = 0;
vd->vdev_stat.vs_checksum_errors = 0;
for (c = 0; c < vd->vdev_children; c++)
vdev_clear(spa, vd->vdev_child[c]);
2008-11-20 12:01:55 -08:00
/*
* If we're in the FAULTED state or have experienced failed I/O, then
* clear the persistent state and attempt to reopen the device. We
* also mark the vdev config dirty, so that the new faulted state is
* written out to disk.
2008-11-20 12:01:55 -08:00
*/
if (vd->vdev_faulted || vd->vdev_degraded ||
!vdev_readable(vd) || !vdev_writeable(vd)) {
/*
* When reopening in reponse to a clear event, it may be due to
* a fmadm repair request. In this case, if the device is
* still broken, we want to still post the ereport again.
*/
vd->vdev_forcefault = B_TRUE;
vd->vdev_faulted = vd->vdev_degraded = 0ULL;
vd->vdev_cant_read = B_FALSE;
vd->vdev_cant_write = B_FALSE;
vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
2008-11-20 12:01:55 -08:00
vd->vdev_forcefault = B_FALSE;
if (vd != rvd && vdev_writeable(vd->vdev_top))
vdev_state_dirty(vd->vdev_top);
if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
2008-11-20 12:01:55 -08:00
spa_async_request(spa, SPA_ASYNC_RESILVER);
Add linux events This topic branch leverages the Solaris style FMA call points in ZFS to create a user space visible event notification system under Linux. This new system is called zevent and it unifies all previous Solaris style ereports and sysevent notifications. Under this Linux specific scheme when a sysevent or ereport event occurs an nvlist describing the event is created which looks almost exactly like a Solaris ereport. These events are queued up in the kernel when they occur and conditionally logged to the console. It is then up to a user space application to consume the events and do whatever it likes with them. To make this possible the existing /dev/zfs ABI has been extended with two new ioctls which behave as follows. * ZFS_IOC_EVENTS_NEXT Get the next pending event. The kernel will keep track of the last event consumed by the file descriptor and provide the next one if available. If no new events are available the ioctl() will block waiting for the next event. This ioctl may also be called in a non-blocking mode by setting zc.zc_guid = ZEVENT_NONBLOCK. In the non-blocking case if no events are available ENOENT will be returned. It is possible that ESHUTDOWN will be returned if the ioctl() is called while module unloading is in progress. And finally ENOMEM may occur if the provided nvlist buffer is not large enough to contain the entire event. * ZFS_IOC_EVENTS_CLEAR Clear are events queued by the kernel. The kernel will keep a fairly large number of recent events queued, use this ioctl to clear the in kernel list. This will effect all user space processes consuming events. The zpool command has been extended to use this events ABI with the 'events' subcommand. You may run 'zpool events -v' to output a verbose log of all recent events. This is very similar to the Solaris 'fmdump -ev' command with the key difference being it also includes what would be considered sysevents under Solaris. You may also run in follow mode with the '-f' option. To clear the in kernel event queue use the '-c' option. $ sudo cmd/zpool/zpool events -fv TIME CLASS May 13 2010 16:31:15.777711000 ereport.fs.zfs.config.sync class = "ereport.fs.zfs.config.sync" ena = 0x40982b7897700001 detector = (embedded nvlist) version = 0x0 scheme = "zfs" pool = 0xed976600de75dfa6 (end detector) time = 0x4bec8bc3 0x2e5aed98 pool = "zpios" pool_guid = 0xed976600de75dfa6 pool_context = 0x0 While the 'zpool events' command is handy for interactive debugging it is not expected to be the primary consumer of zevents. This ABI was primarily added to facilitate the addition of a user space monitoring daemon. This daemon would consume all events posted by the kernel and based on the type of event perform an action. For most events simply forwarding them on to syslog is likely enough. But this interface also cleanly allows for more sophisticated actions to be taken such as generating an email for a failed drive. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
2010-08-26 11:42:43 -07:00
spa_event_notify(spa, vd, FM_EREPORT_ZFS_DEVICE_CLEAR);
2008-11-20 12:01:55 -08:00
}
/*
* When clearing a FMA-diagnosed fault, we always want to
* unspare the device, as we assume that the original spare was
* done in response to the FMA fault.
*/
if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
vd->vdev_parent->vdev_child[0] == vd)
vd->vdev_unspare = B_TRUE;
2008-11-20 12:01:55 -08:00
}
boolean_t
vdev_is_dead(vdev_t *vd)
{
/*
* Holes and missing devices are always considered "dead".
* This simplifies the code since we don't have to check for
* these types of devices in the various code paths.
* Instead we rely on the fact that we skip over dead devices
* before issuing I/O to them.
*/
return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
vd->vdev_ops == &vdev_missing_ops);
}
boolean_t
2008-11-20 12:01:55 -08:00
vdev_readable(vdev_t *vd)
{
return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
2008-11-20 12:01:55 -08:00
}
boolean_t
2008-11-20 12:01:55 -08:00
vdev_writeable(vdev_t *vd)
{
return (!vdev_is_dead(vd) && !vd->vdev_cant_write);
2008-11-20 12:01:55 -08:00
}
boolean_t
vdev_allocatable(vdev_t *vd)
2008-11-20 12:01:55 -08:00
{
2009-01-15 13:59:39 -08:00
uint64_t state = vd->vdev_state;
/*
2009-01-15 13:59:39 -08:00
* We currently allow allocations from vdevs which may be in the
* process of reopening (i.e. VDEV_STATE_CLOSED). If the device
* fails to reopen then we'll catch it later when we're holding
2009-01-15 13:59:39 -08:00
* the proper locks. Note that we have to get the vdev state
* in a local variable because although it changes atomically,
* we're asking two separate questions about it.
*/
2009-01-15 13:59:39 -08:00
return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
!vd->vdev_cant_write && !vd->vdev_ishole);
2008-11-20 12:01:55 -08:00
}
boolean_t
vdev_accessible(vdev_t *vd, zio_t *zio)
2008-11-20 12:01:55 -08:00
{
ASSERT(zio->io_vd == vd);
2008-11-20 12:01:55 -08:00
if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
return (B_FALSE);
2008-11-20 12:01:55 -08:00
if (zio->io_type == ZIO_TYPE_READ)
return (!vd->vdev_cant_read);
2008-11-20 12:01:55 -08:00
if (zio->io_type == ZIO_TYPE_WRITE)
return (!vd->vdev_cant_write);
2008-11-20 12:01:55 -08:00
return (B_TRUE);
2008-11-20 12:01:55 -08:00
}
/*
* Get statistics for the given vdev.
*/
void
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
int c, t;
2008-11-20 12:01:55 -08:00
mutex_enter(&vd->vdev_stat_lock);
bcopy(&vd->vdev_stat, vs, sizeof (*vs));
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state;
2009-07-02 15:44:48 -07:00
vs->vs_rsize = vdev_get_min_asize(vd);
if (vd->vdev_ops->vdev_op_leaf)
vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
2008-11-20 12:01:55 -08:00
mutex_exit(&vd->vdev_stat_lock);
/*
* If we're getting stats on the root vdev, aggregate the I/O counts
* over all top-level vdevs (i.e. the direct children of the root).
*/
if (vd == rvd) {
for (c = 0; c < rvd->vdev_children; c++) {
2008-11-20 12:01:55 -08:00
vdev_t *cvd = rvd->vdev_child[c];
vdev_stat_t *cvs = &cvd->vdev_stat;
mutex_enter(&vd->vdev_stat_lock);
for (t = 0; t < ZIO_TYPES; t++) {
2008-11-20 12:01:55 -08:00
vs->vs_ops[t] += cvs->vs_ops[t];
vs->vs_bytes[t] += cvs->vs_bytes[t];
}
cvs->vs_scan_removing = cvd->vdev_removing;
2008-11-20 12:01:55 -08:00
mutex_exit(&vd->vdev_stat_lock);
}
}
}
void
vdev_clear_stats(vdev_t *vd)
{
mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_space = 0;
vd->vdev_stat.vs_dspace = 0;
vd->vdev_stat.vs_alloc = 0;
mutex_exit(&vd->vdev_stat_lock);
}
void
vdev_scan_stat_init(vdev_t *vd)
{
vdev_stat_t *vs = &vd->vdev_stat;
int c;
for (c = 0; c < vd->vdev_children; c++)
vdev_scan_stat_init(vd->vdev_child[c]);
mutex_enter(&vd->vdev_stat_lock);
vs->vs_scan_processed = 0;
mutex_exit(&vd->vdev_stat_lock);
}
2008-11-20 12:01:55 -08:00
void
vdev_stat_update(zio_t *zio, uint64_t psize)
2008-11-20 12:01:55 -08:00
{
2009-01-15 13:59:39 -08:00
spa_t *spa = zio->io_spa;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
2008-11-20 12:01:55 -08:00
vdev_t *pvd;
uint64_t txg = zio->io_txg;
vdev_stat_t *vs = &vd->vdev_stat;
zio_type_t type = zio->io_type;
int flags = zio->io_flags;
/*
* If this i/o is a gang leader, it didn't do any actual work.
*/
if (zio->io_gang_tree)
return;
2008-11-20 12:01:55 -08:00
if (zio->io_error == 0) {
/*
* If this is a root i/o, don't count it -- we've already
* counted the top-level vdevs, and vdev_get_stats() will
* aggregate them when asked. This reduces contention on
* the root vdev_stat_lock and implicitly handles blocks
* that compress away to holes, for which there is no i/o.
* (Holes never create vdev children, so all the counters
* remain zero, which is what we want.)
*
* Note: this only applies to successful i/o (io_error == 0)
* because unlike i/o counts, errors are not additive.
* When reading a ditto block, for example, failure of
* one top-level vdev does not imply a root-level error.
*/
if (vd == rvd)
return;
ASSERT(vd == zio->io_vd);
2009-01-15 13:59:39 -08:00
if (flags & ZIO_FLAG_IO_BYPASS)
return;
mutex_enter(&vd->vdev_stat_lock);
if (flags & ZIO_FLAG_IO_REPAIR) {
if (flags & ZIO_FLAG_SCAN_THREAD) {
dsl_scan_phys_t *scn_phys =
&spa->spa_dsl_pool->dp_scan->scn_phys;
uint64_t *processed = &scn_phys->scn_processed;
/* XXX cleanup? */
if (vd->vdev_ops->vdev_op_leaf)
atomic_add_64(processed, psize);
vs->vs_scan_processed += psize;
}
2009-01-15 13:59:39 -08:00
if (flags & ZIO_FLAG_SELF_HEAL)
vs->vs_self_healed += psize;
2008-11-20 12:01:55 -08:00
}
2009-01-15 13:59:39 -08:00
vs->vs_ops[type]++;
vs->vs_bytes[type] += psize;
mutex_exit(&vd->vdev_stat_lock);
2008-11-20 12:01:55 -08:00
return;
}
if (flags & ZIO_FLAG_SPECULATIVE)
return;
2009-07-02 15:44:48 -07:00
/*
* If this is an I/O error that is going to be retried, then ignore the
* error. Otherwise, the user may interpret B_FAILFAST I/O errors as
* hard errors, when in reality they can happen for any number of
* innocuous reasons (bus resets, MPxIO link failure, etc).
*/
if (zio->io_error == EIO &&
!(zio->io_flags & ZIO_FLAG_IO_RETRY))
return;
/*
* Intent logs writes won't propagate their error to the root
* I/O so don't mark these types of failures as pool-level
* errors.
*/
if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
return;
mutex_enter(&vd->vdev_stat_lock);
2009-07-02 15:44:48 -07:00
if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
if (zio->io_error == ECKSUM)
vs->vs_checksum_errors++;
else
vs->vs_read_errors++;
2008-11-20 12:01:55 -08:00
}
2009-07-02 15:44:48 -07:00
if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
vs->vs_write_errors++;
mutex_exit(&vd->vdev_stat_lock);
2008-11-20 12:01:55 -08:00
2009-01-15 13:59:39 -08:00
if (type == ZIO_TYPE_WRITE && txg != 0 &&
(!(flags & ZIO_FLAG_IO_REPAIR) ||
(flags & ZIO_FLAG_SCAN_THREAD) ||
spa->spa_claiming)) {
2009-01-15 13:59:39 -08:00
/*
* This is either a normal write (not a repair), or it's
* a repair induced by the scrub thread, or it's a repair
* made by zil_claim() during spa_load() in the first txg.
* In the normal case, we commit the DTL change in the same
* txg as the block was born. In the scrub-induced repair
* case, we know that scrubs run in first-pass syncing context,
* so we commit the DTL change in spa_syncing_txg(spa).
* In the zil_claim() case, we commit in spa_first_txg(spa).
2009-01-15 13:59:39 -08:00
*
* We currently do not make DTL entries for failed spontaneous
* self-healing writes triggered by normal (non-scrubbing)
* reads, because we have no transactional context in which to
* do so -- and it's not clear that it'd be desirable anyway.
*/
if (vd->vdev_ops->vdev_op_leaf) {
uint64_t commit_txg = txg;
if (flags & ZIO_FLAG_SCAN_THREAD) {
2009-01-15 13:59:39 -08:00
ASSERT(flags & ZIO_FLAG_IO_REPAIR);
ASSERT(spa_sync_pass(spa) == 1);
vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
commit_txg = spa_syncing_txg(spa);
} else if (spa->spa_claiming) {
ASSERT(flags & ZIO_FLAG_IO_REPAIR);
commit_txg = spa_first_txg(spa);
2009-01-15 13:59:39 -08:00
}
ASSERT(commit_txg >= spa_syncing_txg(spa));
2009-01-15 13:59:39 -08:00
if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
2008-11-20 12:01:55 -08:00
return;
2009-01-15 13:59:39 -08:00
for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
2008-11-20 12:01:55 -08:00
}
2009-01-15 13:59:39 -08:00
if (vd != rvd)
vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
2008-11-20 12:01:55 -08:00
}
}
/*
* Update the in-core space usage stats for this vdev, its metaslab class,
* and the root vdev.
2008-11-20 12:01:55 -08:00
*/
void
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
int64_t space_delta)
2008-11-20 12:01:55 -08:00
{
int64_t dspace_delta = space_delta;
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
metaslab_group_t *mg = vd->vdev_mg;
metaslab_class_t *mc = mg ? mg->mg_class : NULL;
2008-11-20 12:01:55 -08:00
ASSERT(vd == vd->vdev_top);
/*
* Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
* factor. We must calculate this here and not at the root vdev
* because the root vdev's psize-to-asize is simply the max of its
* childrens', thus not accurate enough for us.
*/
ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
2009-07-02 15:44:48 -07:00
ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
2008-11-20 12:01:55 -08:00
dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
vd->vdev_deflate_ratio;
mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_alloc += alloc_delta;
vd->vdev_stat.vs_space += space_delta;
2008-11-20 12:01:55 -08:00
vd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&vd->vdev_stat_lock);
if (mc == spa_normal_class(spa)) {
2008-11-20 12:01:55 -08:00
mutex_enter(&rvd->vdev_stat_lock);
rvd->vdev_stat.vs_alloc += alloc_delta;
rvd->vdev_stat.vs_space += space_delta;
2008-11-20 12:01:55 -08:00
rvd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&rvd->vdev_stat_lock);
}
if (mc != NULL) {
ASSERT(rvd == vd->vdev_parent);
ASSERT(vd->vdev_ms_count != 0);
metaslab_class_space_update(mc,
alloc_delta, defer_delta, space_delta, dspace_delta);
}
2008-11-20 12:01:55 -08:00
}
/*
* Mark a top-level vdev's config as dirty, placing it on the dirty list
* so that it will be written out next time the vdev configuration is synced.
* If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
*/
void
vdev_config_dirty(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
int c;
ASSERT(spa_writeable(spa));
2008-11-20 12:01:55 -08:00
/*
2009-07-02 15:44:48 -07:00
* If this is an aux vdev (as with l2cache and spare devices), then we
* update the vdev config manually and set the sync flag.
*/
if (vd->vdev_aux != NULL) {
spa_aux_vdev_t *sav = vd->vdev_aux;
nvlist_t **aux;
uint_t naux;
for (c = 0; c < sav->sav_count; c++) {
if (sav->sav_vdevs[c] == vd)
break;
}
if (c == sav->sav_count) {
/*
* We're being removed. There's nothing more to do.
*/
ASSERT(sav->sav_sync == B_TRUE);
return;
}
sav->sav_sync = B_TRUE;
2009-07-02 15:44:48 -07:00
if (nvlist_lookup_nvlist_array(sav->sav_config,
ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
}
ASSERT(c < naux);
/*
* Setting the nvlist in the middle if the array is a little
* sketchy, but it will work.
*/
nvlist_free(aux[c]);
aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
return;
}
/*
* The dirty list is protected by the SCL_CONFIG lock. The caller
* must either hold SCL_CONFIG as writer, or must be the sync thread
* (which holds SCL_CONFIG as reader). There's only one sync thread,
2008-11-20 12:01:55 -08:00
* so this is sufficient to ensure mutual exclusion.
*/
ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
(dsl_pool_sync_context(spa_get_dsl(spa)) &&
spa_config_held(spa, SCL_CONFIG, RW_READER)));
2008-11-20 12:01:55 -08:00
if (vd == rvd) {
for (c = 0; c < rvd->vdev_children; c++)
vdev_config_dirty(rvd->vdev_child[c]);
} else {
ASSERT(vd == vd->vdev_top);
if (!list_link_active(&vd->vdev_config_dirty_node) &&
!vd->vdev_ishole)
list_insert_head(&spa->spa_config_dirty_list, vd);
2008-11-20 12:01:55 -08:00
}
}
void
vdev_config_clean(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
(dsl_pool_sync_context(spa_get_dsl(spa)) &&
spa_config_held(spa, SCL_CONFIG, RW_READER)));
2008-11-20 12:01:55 -08:00
ASSERT(list_link_active(&vd->vdev_config_dirty_node));
list_remove(&spa->spa_config_dirty_list, vd);
2008-11-20 12:01:55 -08:00
}
/*
* Mark a top-level vdev's state as dirty, so that the next pass of
* spa_sync() can convert this into vdev_config_dirty(). We distinguish
* the state changes from larger config changes because they require
* much less locking, and are often needed for administrative actions.
*/
void
vdev_state_dirty(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
ASSERT(spa_writeable(spa));
ASSERT(vd == vd->vdev_top);
/*
* The state list is protected by the SCL_STATE lock. The caller
* must either hold SCL_STATE as writer, or must be the sync thread
* (which holds SCL_STATE as reader). There's only one sync thread,
* so this is sufficient to ensure mutual exclusion.
*/
ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
(dsl_pool_sync_context(spa_get_dsl(spa)) &&
spa_config_held(spa, SCL_STATE, RW_READER)));
if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
list_insert_head(&spa->spa_state_dirty_list, vd);
}
void
vdev_state_clean(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
(dsl_pool_sync_context(spa_get_dsl(spa)) &&
spa_config_held(spa, SCL_STATE, RW_READER)));
ASSERT(list_link_active(&vd->vdev_state_dirty_node));
list_remove(&spa->spa_state_dirty_list, vd);
}
/*
* Propagate vdev state up from children to parent.
*/
2008-11-20 12:01:55 -08:00
void
vdev_propagate_state(vdev_t *vd)
{
2009-01-15 13:59:39 -08:00
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
2008-11-20 12:01:55 -08:00
int degraded = 0, faulted = 0;
int corrupted = 0;
vdev_t *child;
int c;
2008-11-20 12:01:55 -08:00
if (vd->vdev_children > 0) {
for (c = 0; c < vd->vdev_children; c++) {
2008-11-20 12:01:55 -08:00
child = vd->vdev_child[c];
/*
* Don't factor holes into the decision.
*/
if (child->vdev_ishole)
continue;
if (!vdev_readable(child) ||
2009-01-15 13:59:39 -08:00
(!vdev_writeable(child) && spa_writeable(spa))) {
/*
* Root special: if there is a top-level log
* device, treat the root vdev as if it were
* degraded.
*/
if (child->vdev_islog && vd == rvd)
degraded++;
else
faulted++;
} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
2008-11-20 12:01:55 -08:00
degraded++;
}
2008-11-20 12:01:55 -08:00
if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
corrupted++;
}
vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
/*
* Root special: if there is a top-level vdev that cannot be
2008-11-20 12:01:55 -08:00
* opened due to corrupted metadata, then propagate the root
* vdev's aux state as 'corrupt' rather than 'insufficient
* replicas'.
*/
if (corrupted && vd == rvd &&
rvd->vdev_state == VDEV_STATE_CANT_OPEN)
vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
}
if (vd->vdev_parent)
2008-11-20 12:01:55 -08:00
vdev_propagate_state(vd->vdev_parent);
}
/*
* Set a vdev's state. If this is during an open, we don't update the parent
* state, because we're in the process of opening children depth-first.
* Otherwise, we propagate the change to the parent.
*
* If this routine places a device in a faulted state, an appropriate ereport is
* generated.
*/
void
vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
uint64_t save_state;
spa_t *spa = vd->vdev_spa;
2008-11-20 12:01:55 -08:00
if (state == vd->vdev_state) {
vd->vdev_stat.vs_aux = aux;
return;
}
save_state = vd->vdev_state;
vd->vdev_state = state;
vd->vdev_stat.vs_aux = aux;
/*
* If we are setting the vdev state to anything but an open state, then
* always close the underlying device unless the device has requested
* a delayed close (i.e. we're about to remove or fault the device).
* Otherwise, we keep accessible but invalid devices open forever.
* We don't call vdev_close() itself, because that implies some extra
* checks (offline, etc) that we don't want here. This is limited to
* leaf devices, because otherwise closing the device will affect other
* children.
2008-11-20 12:01:55 -08:00
*/
if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
vd->vdev_ops->vdev_op_leaf)
2008-11-20 12:01:55 -08:00
vd->vdev_ops->vdev_op_close(vd);
/*
* If we have brought this vdev back into service, we need
* to notify fmd so that it can gracefully repair any outstanding
* cases due to a missing device. We do this in all cases, even those
* that probably don't correlate to a repaired fault. This is sure to
* catch all cases, and we let the zfs-retire agent sort it out. If
* this is a transient state it's OK, as the retire agent will
* double-check the state of the vdev before repairing it.
*/
if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
vd->vdev_prevstate != state)
zfs_post_state_change(spa, vd);
2008-11-20 12:01:55 -08:00
if (vd->vdev_removed &&
state == VDEV_STATE_CANT_OPEN &&
(aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
/*
* If the previous state is set to VDEV_STATE_REMOVED, then this
* device was previously marked removed and someone attempted to
* reopen it. If this failed due to a nonexistent device, then
* keep the device in the REMOVED state. We also let this be if
* it is one of our special test online cases, which is only
* attempting to online the device and shouldn't generate an FMA
* fault.
*/
vd->vdev_state = VDEV_STATE_REMOVED;
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
} else if (state == VDEV_STATE_REMOVED) {
vd->vdev_removed = B_TRUE;
} else if (state == VDEV_STATE_CANT_OPEN) {
/*
* If we fail to open a vdev during an import or recovery, we
* mark it as "not available", which signifies that it was
* never there to begin with. Failure to open such a device
* is not considered an error.
2008-11-20 12:01:55 -08:00
*/
if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
spa_load_state(spa) == SPA_LOAD_RECOVER) &&
2008-11-20 12:01:55 -08:00
vd->vdev_ops->vdev_op_leaf)
vd->vdev_not_present = 1;
/*
* Post the appropriate ereport. If the 'prevstate' field is
* set to something other than VDEV_STATE_UNKNOWN, it indicates
* that this is part of a vdev_reopen(). In this case, we don't
* want to post the ereport if the device was already in the
* CANT_OPEN state beforehand.
*
* If the 'checkremove' flag is set, then this is an attempt to
* online the device in response to an insertion event. If we
* hit this case, then we have detected an insertion event for a
* faulted or offline device that wasn't in the removed state.
* In this scenario, we don't post an ereport because we are
* about to replace the device, or attempt an online with
* vdev_forcefault, which will generate the fault for us.
*/
if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
!vd->vdev_not_present && !vd->vdev_checkremove &&
vd != spa->spa_root_vdev) {
2008-11-20 12:01:55 -08:00
const char *class;
switch (aux) {
case VDEV_AUX_OPEN_FAILED:
class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
break;
case VDEV_AUX_CORRUPT_DATA:
class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
break;
case VDEV_AUX_NO_REPLICAS:
class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
break;
case VDEV_AUX_BAD_GUID_SUM:
class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
break;
case VDEV_AUX_TOO_SMALL:
class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
break;
case VDEV_AUX_BAD_LABEL:
class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
break;
default:
class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
}
zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
2008-11-20 12:01:55 -08:00
}
/* Erase any notion of persistent removed state */
vd->vdev_removed = B_FALSE;
} else {
vd->vdev_removed = B_FALSE;
}
2009-07-02 15:44:48 -07:00
if (!isopen && vd->vdev_parent)
vdev_propagate_state(vd->vdev_parent);
2008-11-20 12:01:55 -08:00
}
/*
* Check the vdev configuration to ensure that it's capable of supporting
* a root pool.
*/
boolean_t
vdev_is_bootable(vdev_t *vd)
{
#if defined(__sun__) || defined(__sun)
/*
* Currently, we do not support RAID-Z or partial configuration.
* In addition, only a single top-level vdev is allowed and none of the
* leaves can be wholedisks.
*/
int c;
if (!vd->vdev_ops->vdev_op_leaf) {
char *vdev_type = vd->vdev_ops->vdev_op_type;
if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
vd->vdev_children > 1) {
return (B_FALSE);
} else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
return (B_FALSE);
}
} else if (vd->vdev_wholedisk == 1) {
return (B_FALSE);
}
for (c = 0; c < vd->vdev_children; c++) {
if (!vdev_is_bootable(vd->vdev_child[c]))
return (B_FALSE);
}
#endif /* __sun__ || __sun */
return (B_TRUE);
}
2009-07-02 15:44:48 -07:00
/*
* Load the state from the original vdev tree (ovd) which
* we've retrieved from the MOS config object. If the original
* vdev was offline or faulted then we transfer that state to the
* device in the current vdev tree (nvd).
*/
2009-07-02 15:44:48 -07:00
void
vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
2009-07-02 15:44:48 -07:00
{
int c;
2009-07-02 15:44:48 -07:00
ASSERT(nvd->vdev_top->vdev_islog);
ASSERT(spa_config_held(nvd->vdev_spa,
SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
2009-07-02 15:44:48 -07:00
for (c = 0; c < nvd->vdev_children; c++)
vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
2009-07-02 15:44:48 -07:00
if (nvd->vdev_ops->vdev_op_leaf) {
2009-07-02 15:44:48 -07:00
/*
* Restore the persistent vdev state
2009-07-02 15:44:48 -07:00
*/
nvd->vdev_offline = ovd->vdev_offline;
nvd->vdev_faulted = ovd->vdev_faulted;
nvd->vdev_degraded = ovd->vdev_degraded;
nvd->vdev_removed = ovd->vdev_removed;
2009-07-02 15:44:48 -07:00
}
}
/*
* Determine if a log device has valid content. If the vdev was
* removed or faulted in the MOS config then we know that
* the content on the log device has already been written to the pool.
*/
boolean_t
vdev_log_state_valid(vdev_t *vd)
{
int c;
if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
!vd->vdev_removed)
return (B_TRUE);
for (c = 0; c < vd->vdev_children; c++)
if (vdev_log_state_valid(vd->vdev_child[c]))
return (B_TRUE);
return (B_FALSE);
}
2009-07-02 15:44:48 -07:00
/*
* Expand a vdev if possible.
*/
void
vdev_expand(vdev_t *vd, uint64_t txg)
{
ASSERT(vd->vdev_top == vd);
ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
VERIFY(vdev_metaslab_init(vd, txg) == 0);
vdev_config_dirty(vd);
}
}
/*
* Split a vdev.
*/
void
vdev_split(vdev_t *vd)
{
vdev_t *cvd, *pvd = vd->vdev_parent;
vdev_remove_child(pvd, vd);
vdev_compact_children(pvd);
cvd = pvd->vdev_child[0];
if (pvd->vdev_children == 1) {
vdev_remove_parent(cvd);
cvd->vdev_splitting = B_TRUE;
}
vdev_propagate_state(cvd);
}
#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(vdev_fault);
EXPORT_SYMBOL(vdev_degrade);
EXPORT_SYMBOL(vdev_online);
EXPORT_SYMBOL(vdev_offline);
EXPORT_SYMBOL(vdev_clear);
Add missing ZFS tunables This commit adds module options for all existing zfs tunables. Ideally the average user should never need to modify any of these values. However, in practice sometimes you do need to tweak these values for one reason or another. In those cases it's nice not to have to resort to rebuilding from source. All tunables are visable to modinfo and the list is as follows: $ modinfo module/zfs/zfs.ko filename: module/zfs/zfs.ko license: CDDL author: Sun Microsystems/Oracle, Lawrence Livermore National Laboratory description: ZFS srcversion: 8EAB1D71DACE05B5AA61567 depends: spl,znvpair,zcommon,zunicode,zavl vermagic: 2.6.32-131.0.5.el6.x86_64 SMP mod_unload modversions parm: zvol_major:Major number for zvol device (uint) parm: zvol_threads:Number of threads for zvol device (uint) parm: zio_injection_enabled:Enable fault injection (int) parm: zio_bulk_flags:Additional flags to pass to bulk buffers (int) parm: zio_delay_max:Max zio millisec delay before posting event (int) parm: zio_requeue_io_start_cut_in_line:Prioritize requeued I/O (bool) parm: zil_replay_disable:Disable intent logging replay (int) parm: zfs_nocacheflush:Disable cache flushes (bool) parm: zfs_read_chunk_size:Bytes to read per chunk (long) parm: zfs_vdev_max_pending:Max pending per-vdev I/Os (int) parm: zfs_vdev_min_pending:Min pending per-vdev I/Os (int) parm: zfs_vdev_aggregation_limit:Max vdev I/O aggregation size (int) parm: zfs_vdev_time_shift:Deadline time shift for vdev I/O (int) parm: zfs_vdev_ramp_rate:Exponential I/O issue ramp-up rate (int) parm: zfs_vdev_read_gap_limit:Aggregate read I/O over gap (int) parm: zfs_vdev_write_gap_limit:Aggregate write I/O over gap (int) parm: zfs_vdev_scheduler:I/O scheduler (charp) parm: zfs_vdev_cache_max:Inflate reads small than max (int) parm: zfs_vdev_cache_size:Total size of the per-disk cache (int) parm: zfs_vdev_cache_bshift:Shift size to inflate reads too (int) parm: zfs_scrub_limit:Max scrub/resilver I/O per leaf vdev (int) parm: zfs_recover:Set to attempt to recover from fatal errors (int) parm: spa_config_path:SPA config file (/etc/zfs/zpool.cache) (charp) parm: zfs_zevent_len_max:Max event queue length (int) parm: zfs_zevent_cols:Max event column width (int) parm: zfs_zevent_console:Log events to the console (int) parm: zfs_top_maxinflight:Max I/Os per top-level (int) parm: zfs_resilver_delay:Number of ticks to delay resilver (int) parm: zfs_scrub_delay:Number of ticks to delay scrub (int) parm: zfs_scan_idle:Idle window in clock ticks (int) parm: zfs_scan_min_time_ms:Min millisecs to scrub per txg (int) parm: zfs_free_min_time_ms:Min millisecs to free per txg (int) parm: zfs_resilver_min_time_ms:Min millisecs to resilver per txg (int) parm: zfs_no_scrub_io:Set to disable scrub I/O (bool) parm: zfs_no_scrub_prefetch:Set to disable scrub prefetching (bool) parm: zfs_txg_timeout:Max seconds worth of delta per txg (int) parm: zfs_no_write_throttle:Disable write throttling (int) parm: zfs_write_limit_shift:log2(fraction of memory) per txg (int) parm: zfs_txg_synctime_ms:Target milliseconds between tgx sync (int) parm: zfs_write_limit_min:Min tgx write limit (ulong) parm: zfs_write_limit_max:Max tgx write limit (ulong) parm: zfs_write_limit_inflated:Inflated tgx write limit (ulong) parm: zfs_write_limit_override:Override tgx write limit (ulong) parm: zfs_prefetch_disable:Disable all ZFS prefetching (int) parm: zfetch_max_streams:Max number of streams per zfetch (uint) parm: zfetch_min_sec_reap:Min time before stream reclaim (uint) parm: zfetch_block_cap:Max number of blocks to fetch at a time (uint) parm: zfetch_array_rd_sz:Number of bytes in a array_read (ulong) parm: zfs_pd_blks_max:Max number of blocks to prefetch (int) parm: zfs_dedup_prefetch:Enable prefetching dedup-ed blks (int) parm: zfs_arc_min:Min arc size (ulong) parm: zfs_arc_max:Max arc size (ulong) parm: zfs_arc_meta_limit:Meta limit for arc size (ulong) parm: zfs_arc_reduce_dnlc_percent:Meta reclaim percentage (int) parm: zfs_arc_grow_retry:Seconds before growing arc size (int) parm: zfs_arc_shrink_shift:log2(fraction of arc to reclaim) (int) parm: zfs_arc_p_min_shift:arc_c shift to calc min/max arc_p (int)
2011-05-03 15:09:28 -07:00
module_param(zfs_scrub_limit, int, 0644);
MODULE_PARM_DESC(zfs_scrub_limit, "Max scrub/resilver I/O per leaf vdev");
#endif