loader: rewrite zfs vdev initialization

In some cases the pool discovery will get stuck in infinite loop while setting
up the vdev children.

To fix, we split the vdev setup into two parts, first we create vdevs based on
configuration we do get from pool label, then, we process pool config from MOS
and update the pool config if needed.

Testing done: confirm previously hung loader is not hung any more.

MFC after:	1 week
This commit is contained in:
Toomas Soome 2019-12-15 21:52:40 +00:00
parent 686bcb5c14
commit 3c2db0ef43
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=355786
3 changed files with 451 additions and 268 deletions

View File

@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$");
#include <sys/stat.h>
#include <sys/stdint.h>
#include <sys/list.h>
#include <machine/_inttypes.h>
#include "zfsimpl.h"
#include "zfssubr.c"
@ -494,12 +495,12 @@ vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
}
rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
if (rc)
return (rc);
if (rc == 0) {
if (bp != NULL)
return (zio_checksum_verify(vdev->spa, bp, buf));
rc = zio_checksum_verify(vdev->v_spa, bp, buf);
}
return (0);
return (rc);
}
typedef struct remap_segment {
@ -567,6 +568,7 @@ vdev_indirect_mapping_open(spa_t *spa, objset_phys_t *os,
vim->vim_havecounts =
(vim->vim_dn->dn_bonuslen > VDEV_INDIRECT_MAPPING_SIZE_V0);
return (vim);
}
@ -777,8 +779,10 @@ static vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
vdev_t *rvd;
vdev_list_t *vlist;
STAILQ_FOREACH(rvd, &spa->spa_vdevs, v_childlink)
vlist = &spa->spa_root_vdev->v_children;
STAILQ_FOREACH(rvd, vlist, v_childlink)
if (rvd->v_id == vdev)
break;
@ -841,7 +845,7 @@ static void
vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg)
{
list_t stack;
spa_t *spa = vd->spa;
spa_t *spa = vd->v_spa;
zio_t *zio = arg;
remap_segment_t *rs;
@ -899,7 +903,6 @@ vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg)
*/
if (zio->io_error != 0)
break;
rs->rs_offset += inner_size;
rs->rs_asize -= inner_size;
rs->rs_split_offset += inner_size;
@ -935,19 +938,20 @@ static int
vdev_indirect_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
off_t offset, size_t bytes)
{
zio_t zio = { 0 };
spa_t *spa = vdev->spa;
indirect_vsd_t *iv = malloc(sizeof (*iv));
zio_t zio;
spa_t *spa = vdev->v_spa;
indirect_vsd_t *iv;
indirect_split_t *first;
int rc = EIO;
iv = calloc(1, sizeof(*iv));
if (iv == NULL)
return (ENOMEM);
bzero(iv, sizeof (*iv));
list_create(&iv->iv_splits,
sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
bzero(&zio, sizeof(zio));
zio.io_spa = spa;
zio.io_bp = (blkptr_t *)bp;
zio.io_data = buf;
@ -1086,40 +1090,72 @@ vdev_create(uint64_t guid, vdev_read_t *_read)
vdev_t *vdev;
vdev_indirect_config_t *vic;
vdev = malloc(sizeof(vdev_t));
memset(vdev, 0, sizeof(vdev_t));
vdev = calloc(1, sizeof(vdev_t));
if (vdev != NULL) {
STAILQ_INIT(&vdev->v_children);
vdev->v_guid = guid;
vdev->v_state = VDEV_STATE_OFFLINE;
vdev->v_read = _read;
/*
* root vdev has no read function.
* We only point root vdev from spa.
*/
if (_read != NULL) {
vic = &vdev->vdev_indirect_config;
vic->vic_prev_indirect_vdev = UINT64_MAX;
STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
}
}
return (vdev);
}
static int
vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
vdev_t **vdevp, int is_newer)
static void
vdev_set_initial_state(vdev_t *vdev, const unsigned char *nvlist)
{
int rc;
uint64_t guid, id, ashift, asize, nparity;
const char *type;
const char *path;
vdev_t *vdev, *kid;
const unsigned char *kids;
int nkids, i, is_new;
uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
uint64_t is_log;
if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
NULL, &guid)
|| nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id)
|| nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
is_log = 0;
(void) nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
&is_offline);
(void) nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
&is_removed);
(void) nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
&is_faulted);
(void) nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64,
NULL, &is_degraded);
(void) nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64,
NULL, &isnt_present);
(void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL,
&is_log);
if (is_offline != 0)
vdev->v_state = VDEV_STATE_OFFLINE;
else if (is_removed != 0)
vdev->v_state = VDEV_STATE_REMOVED;
else if (is_faulted != 0)
vdev->v_state = VDEV_STATE_FAULTED;
else if (is_degraded != 0)
vdev->v_state = VDEV_STATE_DEGRADED;
else if (isnt_present != 0)
vdev->v_state = VDEV_STATE_CANT_OPEN;
vdev->v_islog = is_log == 1;
}
static int
vdev_init(uint64_t guid, const unsigned char *nvlist, vdev_t **vdevp)
{
uint64_t id, ashift, asize, nparity;
const char *path;
const char *type;
vdev_t *vdev;
if (nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id) ||
nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
NULL, &type)) {
printf("ZFS: can't find vdev details\n");
return (ENOENT);
}
@ -1136,26 +1172,6 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
return (EIO);
}
is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
is_log = 0;
nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
&is_offline);
nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
&is_removed);
nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
&is_faulted);
nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, NULL,
&is_degraded);
nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, NULL,
&isnt_present);
nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL,
&is_log);
vdev = vdev_find(guid);
if (!vdev) {
is_new = 1;
if (strcmp(type, VDEV_TYPE_MIRROR) == 0)
vdev = vdev_create(guid, vdev_mirror_read);
else if (strcmp(type, VDEV_TYPE_RAIDZ) == 0)
@ -1166,40 +1182,46 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
vdev_indirect_config_t *vic;
vdev = vdev_create(guid, vdev_indirect_read);
if (vdev != NULL) {
vdev->v_state = VDEV_STATE_HEALTHY;
vic = &vdev->vdev_indirect_config;
nvlist_find(nvlist,
ZPOOL_CONFIG_INDIRECT_OBJECT, DATA_TYPE_UINT64,
ZPOOL_CONFIG_INDIRECT_OBJECT,
DATA_TYPE_UINT64,
NULL, &vic->vic_mapping_object);
nvlist_find(nvlist,
ZPOOL_CONFIG_INDIRECT_BIRTHS, DATA_TYPE_UINT64,
ZPOOL_CONFIG_INDIRECT_BIRTHS,
DATA_TYPE_UINT64,
NULL, &vic->vic_births_object);
nvlist_find(nvlist,
ZPOOL_CONFIG_PREV_INDIRECT_VDEV, DATA_TYPE_UINT64,
ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
DATA_TYPE_UINT64,
NULL, &vic->vic_prev_indirect_vdev);
} else
vdev = vdev_create(guid, vdev_disk_read);
vdev->v_id = id;
vdev->v_top = pvdev != NULL ? pvdev : vdev;
if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
DATA_TYPE_UINT64, NULL, &ashift) == 0) {
vdev->v_ashift = ashift;
} else {
vdev->v_ashift = 0;
}
} else {
vdev = vdev_create(guid, vdev_disk_read);
}
if (vdev == NULL)
return (ENOMEM);
vdev_set_initial_state(vdev, nvlist);
vdev->v_id = id;
if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
DATA_TYPE_UINT64, NULL, &ashift) == 0)
vdev->v_ashift = ashift;
if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
DATA_TYPE_UINT64, NULL, &asize) == 0) {
vdev->v_psize = asize +
VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
}
if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
DATA_TYPE_UINT64, NULL, &nparity) == 0) {
DATA_TYPE_UINT64, NULL, &nparity) == 0)
vdev->v_nparity = nparity;
} else {
vdev->v_nparity = 0;
}
if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
DATA_TYPE_STRING, NULL, &path) == 0) {
if (strncmp(path, "/dev/", 5) == 0)
@ -1208,6 +1230,7 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
} else {
char *name;
name = NULL;
if (strcmp(type, "raidz") == 0) {
if (vdev->v_nparity < 1 ||
vdev->v_nparity > 3) {
@ -1216,61 +1239,155 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
"vdevs\n");
return (EIO);
}
rc = asprintf(&name, "%s%d-%jd", type,
(void) asprintf(&name, "%s%d-%" PRIu64, type,
vdev->v_nparity, id);
} else {
rc = asprintf(&name, "%s-%jd", type, id);
(void) asprintf(&name, "%s-%" PRIu64, type, id);
}
if (rc < 0)
return (ENOMEM);
vdev->v_name = name;
}
vdev->v_islog = is_log == 1;
} else {
is_new = 0;
*vdevp = vdev;
return (0);
}
if (is_new || is_newer) {
/*
* This is either new vdev or we've already seen this vdev,
* but from an older vdev label, so let's refresh its state
* from the newer label.
* Find slot for vdev. We return either NULL to signal to use
* STAILQ_INSERT_HEAD, or we return link element to be used with
* STAILQ_INSERT_AFTER.
*/
if (is_offline)
vdev->v_state = VDEV_STATE_OFFLINE;
else if (is_removed)
vdev->v_state = VDEV_STATE_REMOVED;
else if (is_faulted)
vdev->v_state = VDEV_STATE_FAULTED;
else if (is_degraded)
vdev->v_state = VDEV_STATE_DEGRADED;
else if (isnt_present)
vdev->v_state = VDEV_STATE_CANT_OPEN;
static vdev_t *
vdev_find_previous(vdev_t *top_vdev, vdev_t *vdev)
{
vdev_t *v, *previous;
if (STAILQ_EMPTY(&top_vdev->v_children))
return (NULL);
previous = NULL;
STAILQ_FOREACH(v, &top_vdev->v_children, v_childlink) {
if (v->v_id > vdev->v_id)
return (previous);
if (v->v_id == vdev->v_id)
return (v);
if (v->v_id < vdev->v_id)
previous = v;
}
return (previous);
}
static size_t
vdev_child_count(vdev_t *vdev)
{
vdev_t *v;
size_t count;
count = 0;
STAILQ_FOREACH(v, &vdev->v_children, v_childlink) {
count++;
}
return (count);
}
/*
* Insert vdev into top_vdev children list. List is ordered by v_id.
*/
static void
vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
{
vdev_t *previous;
size_t count;
/*
* The top level vdev can appear in random order, depending how
* the firmware is presenting the disk devices.
* However, we will insert vdev to create list ordered by v_id,
* so we can use either STAILQ_INSERT_HEAD or STAILQ_INSERT_AFTER
* as STAILQ does not have insert before.
*/
previous = vdev_find_previous(top_vdev, vdev);
if (previous == NULL) {
STAILQ_INSERT_HEAD(&top_vdev->v_children, vdev, v_childlink);
count = vdev_child_count(top_vdev);
if (top_vdev->v_nchildren < count)
top_vdev->v_nchildren = count;
return;
}
if (previous->v_id == vdev->v_id)
return;
STAILQ_INSERT_AFTER(&top_vdev->v_children, previous, vdev, v_childlink);
count = vdev_child_count(top_vdev);
if (top_vdev->v_nchildren < count)
top_vdev->v_nchildren = count;
}
static int
vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const unsigned char *nvlist)
{
vdev_t *top_vdev, *vdev;
const unsigned char *kids;
int rc, nkids;
/* Get top vdev. */
top_vdev = vdev_find(top_guid);
if (top_vdev == NULL) {
rc = vdev_init(top_guid, nvlist, &top_vdev);
if (rc != 0)
return (rc);
top_vdev->v_spa = spa;
top_vdev->v_top = top_vdev;
vdev_insert(spa->spa_root_vdev, top_vdev);
}
/* Add children if there are any. */
rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
&nkids, &kids);
/*
* Its ok if we don't have any kids.
*/
if (rc == 0) {
vdev->v_nchildren = nkids;
for (i = 0; i < nkids; i++) {
rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
if (rc)
for (int i = 0; i < nkids; i++) {
uint64_t guid;
rc = nvlist_find(kids, ZPOOL_CONFIG_GUID,
DATA_TYPE_UINT64, NULL, &guid);
if (rc != 0)
return (rc);
if (is_new)
STAILQ_INSERT_TAIL(&vdev->v_children, kid,
v_childlink);
rc = vdev_init(guid, kids, &vdev);
if (rc != 0)
return (rc);
vdev->v_spa = spa;
vdev->v_top = top_vdev;
vdev_insert(top_vdev, vdev);
kids = nvlist_next(kids);
}
} else {
vdev->v_nchildren = 0;
rc = 0;
}
if (vdevp)
*vdevp = vdev;
return (0);
return (rc);
}
static int
vdev_init_from_label(spa_t *spa, const unsigned char *nvlist)
{
uint64_t pool_guid, top_guid;
const unsigned char *vdevs;
if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
NULL, &pool_guid) ||
nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
NULL, &top_guid) ||
nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
NULL, &vdevs)) {
printf("ZFS: can't find vdev details\n");
return (ENOENT);
}
return (vdev_from_nvlist(spa, top_guid, vdevs));
}
static void
@ -1280,6 +1397,10 @@ vdev_set_state(vdev_t *vdev)
int good_kids;
int bad_kids;
STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
vdev_set_state(kid);
}
/*
* A mirror or raidz is healthy if all its kids are healthy. A
* mirror is degraded if any of its kids is healthy; a raidz
@ -1314,6 +1435,104 @@ vdev_set_state(vdev_t *vdev)
}
}
static int
vdev_update_from_nvlist(uint64_t top_guid, const unsigned char *nvlist)
{
vdev_t *vdev;
const unsigned char *kids;
int rc, nkids;
/* Update top vdev. */
vdev = vdev_find(top_guid);
if (vdev != NULL)
vdev_set_initial_state(vdev, nvlist);
/* Update children if there are any. */
rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
&nkids, &kids);
if (rc == 0) {
for (int i = 0; i < nkids; i++) {
uint64_t guid;
rc = nvlist_find(kids, ZPOOL_CONFIG_GUID,
DATA_TYPE_UINT64, NULL, &guid);
if (rc != 0)
break;
vdev = vdev_find(guid);
if (vdev != NULL)
vdev_set_initial_state(vdev, kids);
kids = nvlist_next(kids);
}
} else {
rc = 0;
}
return (rc);
}
static int
vdev_init_from_nvlist(spa_t *spa, const unsigned char *nvlist)
{
uint64_t pool_guid, vdev_children;
const unsigned char *vdevs, *kids;
int rc, nkids;
if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
NULL, &pool_guid) ||
nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN, DATA_TYPE_UINT64,
NULL, &vdev_children) ||
nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
NULL, &vdevs)) {
printf("ZFS: can't find vdev details\n");
return (ENOENT);
}
/* Wrong guid?! */
if (spa->spa_guid != pool_guid)
return (EIO);
spa->spa_root_vdev->v_nchildren = vdev_children;
rc = nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
&nkids, &kids);
/*
* MOS config has at least one child for root vdev.
*/
if (rc != 0)
return (EIO);
for (int i = 0; i < nkids; i++) {
uint64_t guid;
vdev_t *vdev;
rc = nvlist_find(kids, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
NULL, &guid);
if (rc != 0)
break;
vdev = vdev_find(guid);
/*
* Top level vdev is missing, create it.
*/
if (vdev == NULL)
rc = vdev_from_nvlist(spa, guid, kids);
else
rc = vdev_update_from_nvlist(guid, kids);
if (rc != 0)
break;
kids = nvlist_next(kids);
}
/*
* Re-evaluate top-level vdev state.
*/
vdev_set_state(spa->spa_root_vdev);
return (rc);
}
static spa_t *
spa_find_by_guid(uint64_t guid)
{
@ -1356,7 +1575,7 @@ spa_get_primary_vdev(const spa_t *spa)
spa = spa_get_primary();
if (spa == NULL)
return (NULL);
vdev = STAILQ_FIRST(&spa->spa_vdevs);
vdev = spa->spa_root_vdev;
if (vdev == NULL)
return (NULL);
for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL;
@ -1377,8 +1596,14 @@ spa_create(uint64_t guid, const char *name)
free(spa);
return (NULL);
}
STAILQ_INIT(&spa->spa_vdevs);
spa->spa_guid = guid;
spa->spa_root_vdev = vdev_create(guid, NULL);
if (spa->spa_root_vdev == NULL) {
free(spa->spa_name);
free(spa);
return (NULL);
}
spa->spa_root_vdev->v_name = strdup("root");
STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
return (spa);
@ -1413,9 +1638,8 @@ pager_printf(const char *fmt, ...)
va_list args;
va_start(args, fmt);
vsprintf(line, fmt, args);
vsnprintf(line, sizeof(line), fmt, args);
va_end(args);
return (pager_output(line));
}
@ -1426,14 +1650,13 @@ pager_printf(const char *fmt, ...)
static int
print_state(int indent, const char *name, vdev_state_t state)
{
char buf[512];
int i;
char buf[512];
buf[0] = 0;
for (i = 0; i < indent; i++)
strcat(buf, " ");
strcat(buf, name);
return (pager_printf(STATUS_FORMAT, buf, state_name(state)));
}
@ -1465,6 +1688,7 @@ spa_status(spa_t *spa)
{
static char bootfs[ZFS_MAXNAMELEN];
uint64_t rootid;
vdev_list_t *vlist;
vdev_t *vdev;
int good_kids, bad_kids, degraded_kids, ret;
vdev_state_t state;
@ -1493,7 +1717,8 @@ spa_status(spa_t *spa)
good_kids = 0;
degraded_kids = 0;
bad_kids = 0;
STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
vlist = &spa->spa_root_vdev->v_children;
STAILQ_FOREACH(vdev, vlist, v_childlink) {
if (vdev->v_state == VDEV_STATE_HEALTHY)
good_kids++;
else if (vdev->v_state == VDEV_STATE_DEGRADED)
@ -1511,7 +1736,8 @@ spa_status(spa_t *spa)
ret = print_state(0, spa->spa_name, state);
if (ret != 0)
return (ret);
STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
STAILQ_FOREACH(vdev, vlist, v_childlink) {
ret = vdev_status(vdev, 1);
if (ret != 0)
return (ret);
@ -1703,15 +1929,14 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
{
vdev_t vtmp;
spa_t *spa;
vdev_t *vdev, *top_vdev, *pool_vdev;
vdev_t *vdev;
unsigned char *nvlist;
uint64_t val;
uint64_t guid;
uint64_t guid, vdev_children;
uint64_t pool_txg, pool_guid;
const char *pool_name;
const unsigned char *vdevs;
const unsigned char *features;
int rc, is_newer;
int rc;
/*
* Load the vdev label and figure out which
@ -1783,18 +2008,17 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
*/
spa = spa_find_by_guid(pool_guid);
if (spa == NULL) {
nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN,
DATA_TYPE_UINT64, NULL, &vdev_children);
spa = spa_create(pool_guid, pool_name);
if (spa == NULL) {
free(nvlist);
return (ENOMEM);
}
spa->spa_root_vdev->v_nchildren = vdev_children;
}
if (pool_txg > spa->spa_txg) {
if (pool_txg > spa->spa_txg)
spa->spa_txg = pool_txg;
is_newer = 1;
} else {
is_newer = 0;
}
/*
* Get the vdev tree and create our in-core copy of it.
@ -1814,39 +2038,25 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
return (EIO);
}
if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
NULL, &vdevs)) {
free(nvlist);
return (EIO);
}
rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
rc = vdev_init_from_label(spa, nvlist);
free(nvlist);
if (rc != 0)
return (rc);
/*
* Add the toplevel vdev to the pool if its not already there.
*/
STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
if (top_vdev == pool_vdev)
break;
if (!pool_vdev && top_vdev) {
top_vdev->spa = spa;
STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
}
/*
* We should already have created an incomplete vdev for this
* vdev. Find it and initialise it with our read proc.
*/
vdev = vdev_find(guid);
if (vdev) {
if (vdev != NULL) {
vdev->v_phys_read = _read;
vdev->v_read_priv = read_priv;
vdev->v_state = VDEV_STATE_HEALTHY;
vdev->v_psize = vtmp.v_psize;
/*
* If no other state is set, mark vdev healthy.
*/
if (vdev->v_state == VDEV_STATE_UNKNOWN)
vdev->v_state = VDEV_STATE_HEALTHY;
} else {
printf("ZFS: inconsistent nvlist contents\n");
return (EIO);
@ -1858,7 +2068,7 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
/*
* Re-evaluate top-level vdev state.
*/
vdev_set_state(top_vdev);
vdev_set_state(vdev->v_top);
/*
* Ok, we are happy with the pool so far. Lets find
@ -1867,7 +2077,6 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
*/
vdev_uberblock_load(vdev, &spa->spa_uberblock);
vdev->spa = spa;
if (spap != NULL)
*spap = spa;
return (0);
@ -1962,7 +2171,8 @@ zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
for (i = 0; i < SPA_DVAS_PER_BP; i++) {
const dva_t *dva = &bp->blk_dva[i];
vdev_t *vdev;
int vdevid;
vdev_list_t *vlist;
uint64_t vdevid;
off_t offset;
if (!dva->dva_word[0] && !dva->dva_word[1])
@ -1970,7 +2180,8 @@ zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
vdevid = DVA_GET_VDEV(dva);
offset = DVA_GET_OFFSET(dva);
STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
vlist = &spa->spa_root_vdev->v_children;
STAILQ_FOREACH(vdev, vlist, v_childlink) {
if (vdev->v_id == vdevid)
break;
}
@ -1979,7 +2190,7 @@ zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
size = BP_GET_PSIZE(bp);
if (vdev->v_read == vdev_raidz_read) {
align = 1ULL << vdev->v_top->v_ashift;
align = 1ULL << vdev->v_ashift;
if (P2PHASE(size, align) != 0)
size = P2ROUNDUP(size, align);
}
@ -2222,6 +2433,26 @@ fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
}
}
static int
fzap_check_size(uint64_t integer_size, uint64_t num_integers)
{
switch (integer_size) {
case 1:
case 2:
case 4:
case 8:
break;
default:
return (EINVAL);
}
if (integer_size * num_integers > ZAP_MAXVALUELEN)
return (E2BIG);
return (0);
}
/*
* Lookup a value in a fatzap directory. Assumes that the zap scratch
* buffer contains the directory header.
@ -2240,6 +2471,9 @@ fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
if (zh.zap_magic != ZAP_MAGIC)
return (EIO);
if ((rc = fzap_check_size(integer_size, num_integers)) != 0)
return (rc);
z.zap_block_shift = ilog2(bsize);
z.zap_phys = (zap_phys_t *)zap_scratch;
@ -2388,7 +2622,7 @@ fzap_list(const spa_t *spa, const dnode_phys_t *dnode,
zap_leaf_t zl;
zl.l_bs = z.zap_block_shift;
for (i = 0; i < zh.zap_num_leafs; i++) {
off_t off = (i + 1) << zl.l_bs;
off_t off = ((off_t)(i + 1)) << zl.l_bs;
char name[256], *p;
uint64_t value;
@ -2554,7 +2788,7 @@ fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name,
zap_leaf_t zl;
zl.l_bs = z.zap_block_shift;
for (i = 0; i < zh.zap_num_leafs; i++) {
off_t off = (i + 1) << zl.l_bs;
off_t off = ((off_t)(i + 1)) << zl.l_bs;
if (dnode_read(spa, dnode, off, zap_scratch, bsize))
return (EIO);
@ -2844,8 +3078,7 @@ zfs_get_root(const spa_t *spa, uint64_t *objid)
sizeof(props), 1, &props) == 0 &&
objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0 &&
zap_lookup(spa, &propdir, "bootfs",
sizeof(bootfs), 1, &bootfs) == 0 &&
bootfs != 0) {
sizeof(bootfs), 1, &bootfs) == 0 && bootfs != 0) {
*objid = bootfs;
return (0);
}
@ -2995,9 +3228,7 @@ zfs_spa_init(spa_t *spa)
dnode_phys_t dir;
uint64_t config_object;
unsigned char *nvlist;
char *type;
const unsigned char *nv;
int nkids, rc;
int rc;
if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
@ -3036,61 +3267,7 @@ zfs_spa_init(spa_t *spa)
return (rc);
/* Update vdevs from MOS config. */
if (nvlist_find(nvlist + 4, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
NULL, &nv)) {
rc = EIO;
goto done;
}
if (nvlist_find(nv, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, NULL, &type)) {
printf("ZFS: can't find vdev details\n");
rc = ENOENT;
goto done;
}
if (strcmp(type, VDEV_TYPE_ROOT) != 0) {
rc = ENOENT;
goto done;
}
rc = nvlist_find(nv, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
&nkids, &nv);
if (rc != 0)
goto done;
for (int i = 0; i < nkids; i++) {
vdev_t *vd, *prev, *kid = NULL;
rc = vdev_init_from_nvlist(nv, NULL, &kid, 0);
if (rc != 0) {
printf("vdev_init_from_nvlist: %d\n", rc);
break;
}
kid->spa = spa;
prev = NULL;
STAILQ_FOREACH(vd, &spa->spa_vdevs, v_childlink) {
/* Already present? */
if (kid->v_id == vd->v_id) {
kid = NULL;
break;
}
if (vd->v_id > kid->v_id) {
if (prev == NULL) {
STAILQ_INSERT_HEAD(&spa->spa_vdevs,
kid, v_childlink);
} else {
STAILQ_INSERT_AFTER(&spa->spa_vdevs,
prev, kid, v_childlink);
}
kid = NULL;
break;
}
prev = vd;
}
if (kid != NULL)
STAILQ_INSERT_TAIL(&spa->spa_vdevs, kid, v_childlink);
nv = nvlist_next(nv);
}
rc = 0;
done:
rc = vdev_init_from_nvlist(spa, nvlist + 4);
free(nvlist);
return (rc);
}

View File

@ -760,6 +760,7 @@ typedef enum {
#define ZPOOL_CONFIG_IS_LOG "is_log"
#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
#define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read"
#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children"
/*
* The persistent vdev state is stored as separate values rather than a single
@ -1173,6 +1174,8 @@ typedef enum dmu_objset_type {
DMU_OST_NUMTYPES
} dmu_objset_type_t;
#define ZAP_MAXVALUELEN (1024 * 8)
/*
* header for all bonus and spill buffers.
* The header has a fixed portion with a variable number
@ -1755,13 +1758,13 @@ typedef struct vdev {
int v_ashift; /* offset to block shift */
int v_nparity; /* # parity for raidz */
struct vdev *v_top; /* parent vdev */
int v_nchildren; /* # children */
size_t v_nchildren; /* # children */
vdev_state_t v_state; /* current state */
vdev_phys_read_t *v_phys_read; /* read from raw leaf vdev */
vdev_read_t *v_read; /* read from vdev */
void *v_read_priv; /* private data for read function */
boolean_t v_islog;
struct spa *spa; /* link to spa */
struct spa *v_spa; /* link to spa */
/*
* Values stored in the config for an indirect or removing vdev.
*/
@ -1780,11 +1783,10 @@ typedef struct spa {
uint64_t spa_guid; /* pool guid */
uint64_t spa_txg; /* most recent transaction */
struct uberblock spa_uberblock; /* best uberblock so far */
vdev_list_t spa_vdevs; /* list of all toplevel vdevs */
vdev_t *spa_root_vdev; /* toplevel vdev container */
objset_phys_t spa_mos; /* MOS for this pool */
zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */
void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS];
int spa_inited; /* initialized */
boolean_t spa_with_log; /* this pool has log */
} spa_t;

View File

@ -1637,8 +1637,11 @@ vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
* any errors.
*/
if (total_errors <= rm->rm_firstdatacol - parity_untried) {
int rv;
if (data_errors == 0) {
if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
if (rv == 0) {
/*
* If we read parity information (unnecessarily
* as it happens since no reconstruction was
@ -1683,7 +1686,8 @@ vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
code = vdev_raidz_reconstruct(rm, tgts, n);
if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
if (rv == 0) {
/*
* If we read more parity disks than were used
* for reconstruction, confirm that the other
@ -1757,7 +1761,7 @@ vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
if (total_errors > rm->rm_firstdatacol) {
error = EIO;
} else if (total_errors < rm->rm_firstdatacol &&
(code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes,
(code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
total_errors, data_errors)) != 0) {
/*
* If we didn't use all the available parity for the