loader: zfs reader should check all labels

The current zfs reader is only checking first label from each device, however,
we do have 4 labels on device and we should check all 4 to be protected
against disk failures and incomplete label updates.

The difficulty is about the fact that 2 label copies are in front of the
pool data, and 2 are at the end, which means, we have to know the size of
the pool data area.

Since we have now the mechanism from common/disk.c to use the partition
information, it does help us in this task; however, there are still some
corner cases.

Namely, if the pool is created without partition, directly on the disk,
and firmware will give us the wrong size for the disk, we only can check
the first two label copies.

Reviewed by:	allanjude
Differential Revision:	https://reviews.freebsd.org/D10203
This commit is contained in:
Toomas Soome 2017-04-06 18:17:29 +00:00
parent 8844e55d4a
commit e41fab8d40
7 changed files with 184 additions and 95 deletions

View File

@ -40,6 +40,15 @@
static dev_info_t *devices;
uint64_t
ldi_get_size(void *priv)
{
dev_info_t *devinfo = priv;
return (devinfo->dev->Media->BlockSize *
(devinfo->dev->Media->LastBlock + 1));
}
static int
vdev_read(vdev_t *vdev, void *priv, off_t off, void *buf, size_t bytes)
{

View File

@ -28,6 +28,7 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/disk.h>
#include <sys/param.h>
#include <sys/reboot.h>
#include <sys/boot.h>
@ -838,4 +839,14 @@ efi_zfs_probe(void)
}
}
}
uint64_t
ldi_get_size(void *priv)
{
int fd = (uintptr_t) priv;
uint64_t size;
ioctl(fd, DIOCGMEDIASIZE, &size);
return (size);
}
#endif

View File

@ -36,7 +36,7 @@ struct dsk {
unsigned int slice;
int part;
daddr_t start;
int init;
uint64_t size;
};
int drvread(struct dsk *dskp, void *buf, daddr_t lba, unsigned nblk);

View File

@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$");
#include <machine/bootinfo.h>
#include <machine/cpufunc.h>
#include <machine/psl.h>
#include <sys/disk.h>
#include <sys/reboot.h>
#include <common/drv.h>
@ -463,4 +464,14 @@ i386_zfs_probe(void)
zfs_probe_dev(devname, NULL);
}
}
uint64_t
ldi_get_size(void *priv)
{
int fd = (uintptr_t) priv;
uint64_t size;
ioctl(fd, DIOCGMEDIASIZE, &size);
return (size);
}
#endif

View File

@ -468,6 +468,23 @@ copy_dsk(struct dsk *dsk)
return (newdsk);
}
/*
* The "layered" ioctl to read disk/partition size. Unfortunately
* the zfsboot case is hardest, because we do not have full software
* stack available, so we need to do some manual work here.
*/
uint64_t
ldi_get_size(void *priv)
{
struct dsk *dskp = priv;
uint64_t size = dskp->size;
if (dskp->start == 0)
size = drvsize(dskp);
return (size * DEV_BSIZE);
}
static void
probe_drive(struct dsk *dsk)
{
@ -549,6 +566,7 @@ probe_drive(struct dsk *dsk)
if (memcmp(&ent->ent_type, &freebsd_zfs_uuid,
sizeof(uuid_t)) == 0) {
dsk->start = ent->ent_lba_start;
dsk->size = ent->ent_lba_end - ent->ent_lba_start + 1;
dsk->slice = part + 1;
dsk->part = 255;
if (vdev_probe(vdev_read, dsk, NULL) == 0) {
@ -593,6 +611,7 @@ probe_drive(struct dsk *dsk)
if (!dp[i].dp_typ)
continue;
dsk->start = dp[i].dp_start;
dsk->size = dp[i].dp_size;
dsk->slice = i + 1;
if (vdev_probe(vdev_read, dsk, NULL) == 0) {
dsk = copy_dsk(dsk);
@ -648,7 +667,7 @@ main(void)
dsk->slice = *(uint8_t *)PTOV(ARGS + 1) + 1;
dsk->part = 0;
dsk->start = 0;
dsk->init = 0;
dsk->size = 0;
bootinfo.bi_version = BOOTINFO_VERSION;
bootinfo.bi_size = sizeof(bootinfo);
@ -699,7 +718,7 @@ main(void)
dsk->slice = 0;
dsk->part = 0;
dsk->start = 0;
dsk->init = 0;
dsk->size = 0;
probe_drive(dsk);
}

View File

@ -81,6 +81,7 @@ int zfs_parsedev(struct zfs_devdesc *dev, const char *devspec,
char *zfs_fmtdev(void *vdev);
int zfs_probe_dev(const char *devname, uint64_t *pool_guid);
int zfs_list(const char *name);
uint64_t ldi_get_size(void *);
void init_zfs_bootenv(char *currdev);
int zfs_bootenv(const char *name);
int zfs_belist_add(const char *name, uint64_t __unused);

View File

@ -68,7 +68,7 @@ static const char *features_for_read[] = {
*/
static spa_list_t zfs_pools;
static const dnode_phys_t *dnode_cache_obj = NULL;
static const dnode_phys_t *dnode_cache_obj;
static uint64_t dnode_cache_bn;
static char *dnode_cache_buf;
static char *zap_scratch;
@ -523,12 +523,11 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
int nkids, i, is_new;
uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
DATA_TYPE_UINT64, 0, &guid)
|| nvlist_find(nvlist, ZPOOL_CONFIG_ID,
DATA_TYPE_UINT64, 0, &id)
|| nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
DATA_TYPE_STRING, 0, &type)) {
if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
NULL, &guid)
|| nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id)
|| nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
NULL, &type)) {
printf("ZFS: can't find vdev details\n");
return (ENOENT);
}
@ -546,15 +545,15 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
&is_offline);
nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
&is_removed);
nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
&is_faulted);
nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, NULL,
&is_degraded);
nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 0,
nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, NULL,
&isnt_present);
vdev = vdev_find(guid);
@ -573,17 +572,19 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
vdev->v_id = id;
vdev->v_top = pvdev != NULL ? pvdev : vdev;
if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
DATA_TYPE_UINT64, 0, &ashift) == 0)
DATA_TYPE_UINT64, NULL, &ashift) == 0) {
vdev->v_ashift = ashift;
else
} else {
vdev->v_ashift = 0;
}
if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
DATA_TYPE_UINT64, 0, &nparity) == 0)
DATA_TYPE_UINT64, NULL, &nparity) == 0) {
vdev->v_nparity = nparity;
else
} else {
vdev->v_nparity = 0;
}
if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
DATA_TYPE_STRING, 0, &path) == 0) {
DATA_TYPE_STRING, NULL, &path) == 0) {
if (strncmp(path, "/dev/", 5) == 0)
path += 5;
vdev->v_name = strdup(path);
@ -625,8 +626,8 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
vdev->v_state = VDEV_STATE_CANT_OPEN;
}
rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
&nkids, &kids);
/*
* Its ok if we don't have any kids.
*/
@ -744,12 +745,17 @@ spa_get_primary_vdev(const spa_t *spa)
#endif
static spa_t *
spa_create(uint64_t guid)
spa_create(uint64_t guid, const char *name)
{
spa_t *spa;
spa = malloc(sizeof(spa_t));
if ((spa = malloc(sizeof(spa_t))) == NULL)
return (NULL);
memset(spa, 0, sizeof(spa_t));
if ((spa->spa_name = strdup(name)) == NULL) {
free(spa);
return (NULL);
}
STAILQ_INIT(&spa->spa_vdevs);
spa->spa_guid = guid;
STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
@ -905,24 +911,39 @@ spa_all_status(void)
return (ret);
}
uint64_t
vdev_label_offset(uint64_t psize, int l, uint64_t offset)
{
uint64_t label_offset;
if (l < VDEV_LABELS / 2)
label_offset = 0;
else
label_offset = psize - VDEV_LABELS * sizeof (vdev_label_t);
return (offset + l * sizeof (vdev_label_t) + label_offset);
}
static int
vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
{
vdev_t vtmp;
vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
vdev_phys_t *tmp_label = zfs_alloc(sizeof(vdev_phys_t));
spa_t *spa;
vdev_t *vdev, *top_vdev, *pool_vdev;
off_t off;
blkptr_t bp;
const unsigned char *nvlist;
const unsigned char *nvlist = NULL;
uint64_t val;
uint64_t guid;
uint64_t best_txg = 0;
uint64_t pool_txg, pool_guid;
uint64_t is_log;
uint64_t psize;
const char *pool_name;
const unsigned char *vdevs;
const unsigned char *features;
int i, rc, is_newer;
int i, l, rc, is_newer;
char *upbuf;
const struct uberblock *up;
@ -933,26 +954,47 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
memset(&vtmp, 0, sizeof(vtmp));
vtmp.v_phys_read = _read;
vtmp.v_read_priv = read_priv;
off = offsetof(vdev_label_t, vl_vdev_phys);
BP_ZERO(&bp);
BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
return (EIO);
psize = P2ALIGN(ldi_get_size(read_priv),
(uint64_t)sizeof (vdev_label_t));
if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
return (EIO);
for (l = 0; l < VDEV_LABELS; l++) {
off = vdev_label_offset(psize, l,
offsetof(vdev_label_t, vl_vdev_phys));
BP_ZERO(&bp);
BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
if (vdev_read_phys(&vtmp, &bp, tmp_label, off, 0))
continue;
if (tmp_label->vp_nvlist[0] != NV_ENCODE_XDR)
continue;
nvlist = (const unsigned char *) tmp_label->vp_nvlist + 4;
if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
DATA_TYPE_UINT64, NULL, &pool_txg) != 0)
continue;
if (best_txg <= pool_txg) {
best_txg = pool_txg;
memcpy(vdev_label, tmp_label, sizeof (vdev_phys_t));
}
}
zfs_free(tmp_label, sizeof (vdev_phys_t));
if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR)
return (EIO);
nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
if (nvlist_find(nvlist,
ZPOOL_CONFIG_VERSION,
DATA_TYPE_UINT64, 0, &val)) {
if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
NULL, &val) != 0) {
return (EIO);
}
@ -963,15 +1005,14 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
}
/* Check ZFS features for read */
if (nvlist_find(nvlist,
ZPOOL_CONFIG_FEATURES_FOR_READ,
DATA_TYPE_NVLIST, 0, &features) == 0
&& nvlist_check_features_for_read(features) != 0)
if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ,
DATA_TYPE_NVLIST, NULL, &features) == 0 &&
nvlist_check_features_for_read(features) != 0) {
return (EIO);
}
if (nvlist_find(nvlist,
ZPOOL_CONFIG_POOL_STATE,
DATA_TYPE_UINT64, 0, &val)) {
if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
NULL, &val) != 0) {
return (EIO);
}
@ -980,15 +1021,12 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
return (EIO);
}
if (nvlist_find(nvlist,
ZPOOL_CONFIG_POOL_TXG,
DATA_TYPE_UINT64, 0, &pool_txg)
|| nvlist_find(nvlist,
ZPOOL_CONFIG_POOL_GUID,
DATA_TYPE_UINT64, 0, &pool_guid)
|| nvlist_find(nvlist,
ZPOOL_CONFIG_POOL_NAME,
DATA_TYPE_STRING, 0, &pool_name)) {
if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
NULL, &pool_txg) != 0 ||
nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
NULL, &pool_guid) != 0 ||
nvlist_find(nvlist, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING,
NULL, &pool_name) != 0) {
/*
* Cache and spare devices end up here - just ignore
* them.
@ -997,25 +1035,26 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
return (EIO);
}
is_log = 0;
(void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, 0,
&is_log);
if (is_log)
if (nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64,
NULL, &val) == 0 && val != 0) {
return (EIO);
}
/*
* Create the pool if this is the first time we've seen it.
*/
spa = spa_find_by_guid(pool_guid);
if (!spa) {
spa = spa_create(pool_guid);
spa->spa_name = strdup(pool_name);
if (spa == NULL) {
spa = spa_create(pool_guid, pool_name);
if (spa == NULL)
return (ENOMEM);
}
if (pool_txg > spa->spa_txg) {
spa->spa_txg = pool_txg;
is_newer = 1;
} else
} else {
is_newer = 0;
}
/*
* Get the vdev tree and create our in-core copy of it.
@ -1023,23 +1062,21 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
* be some kind of alias (overlapping slices, dangerously dedicated
* disks etc).
*/
if (nvlist_find(nvlist,
ZPOOL_CONFIG_GUID,
DATA_TYPE_UINT64, 0, &guid)) {
if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
NULL, &guid) != 0) {
return (EIO);
}
vdev = vdev_find(guid);
if (vdev && vdev->v_phys_read) /* Has this vdev already been inited? */
return (EIO);
if (nvlist_find(nvlist,
ZPOOL_CONFIG_VDEV_TREE,
DATA_TYPE_NVLIST, 0, &vdevs)) {
if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
NULL, &vdevs)) {
return (EIO);
}
rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
if (rc)
if (rc != 0)
return (rc);
/*
@ -1079,36 +1116,37 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
*/
upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
up = (const struct uberblock *)upbuf;
for (i = 0;
i < VDEV_UBERBLOCK_COUNT(vdev);
i++) {
off = VDEV_UBERBLOCK_OFFSET(vdev, i);
BP_ZERO(&bp);
DVA_SET_OFFSET(&bp.blk_dva[0], off);
BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
for (l = 0; l < VDEV_LABELS; l++) {
for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdev); i++) {
off = vdev_label_offset(psize, l,
VDEV_UBERBLOCK_OFFSET(vdev, i));
BP_ZERO(&bp);
DVA_SET_OFFSET(&bp.blk_dva[0], off);
BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
continue;
if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
continue;
if (up->ub_magic != UBERBLOCK_MAGIC)
continue;
if (up->ub_txg < spa->spa_txg)
continue;
if (up->ub_txg > spa->spa_uberblock.ub_txg) {
spa->spa_uberblock = *up;
} else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
if (up->ub_magic != UBERBLOCK_MAGIC)
continue;
if (up->ub_txg < spa->spa_txg)
continue;
if (up->ub_txg > spa->spa_uberblock.ub_txg ||
(up->ub_txg == spa->spa_uberblock.ub_txg &&
up->ub_timestamp >
spa->spa_uberblock.ub_timestamp)) {
spa->spa_uberblock = *up;
}
}
}
zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
vdev->spa = spa;
if (spap)
if (spap != NULL)
*spap = spa;
return (0);
}