loader: factor out label and uberblock load from vdev_probe, add MMP checks

Clean up the label read.
Toomas Soome 2019-11-03 21:19:52 +00:00
parent 371d37d89f
commit 79a4bf8975
2 changed files with 261 additions and 97 deletions

File 1 of 2:

@@ -1548,27 +1548,164 @@ vdev_label_offset(uint64_t psize, int l, uint64_t offset)
return (offset + l * sizeof (vdev_label_t) + label_offset);
}
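For orientation: a vdev_label_t is 256 KiB (two 8 KiB pad blocks, the 112 KiB vdev_phys_t, and the 128 KiB uberblock ring), and each device carries four labels, two at the front and two at the end. Below is a minimal sketch of the resulting placement, assuming the usual branch of vdev_label_offset() whose body is cut off above; the 1 GiB device size is hypothetical:

#include <stdint.h>
#include <stdio.h>

#define VDEV_LABELS 4
#define LABEL_SIZE  (256 * 1024ULL)  /* sizeof (vdev_label_t) */

/* Labels 0 and 1 sit at the front of the device; labels 2 and 3 are
 * packed against the end, mirroring vdev_label_offset() above. */
static uint64_t
label_offset(uint64_t psize, int l, uint64_t offset)
{
    uint64_t label_offset = (l < VDEV_LABELS / 2) ? 0 :
        psize - VDEV_LABELS * LABEL_SIZE;

    return (offset + l * LABEL_SIZE + label_offset);
}

int
main(void)
{
    uint64_t psize = 1ULL << 30;  /* hypothetical 1 GiB device */

    for (int l = 0; l < VDEV_LABELS; l++)
        printf("label %d starts at %ju\n", l,
            (uintmax_t)label_offset(psize, l, 0));
    /* Prints 0, 262144, psize - 524288, psize - 262144. */
    return (0);
}

Duplicating the labels at both edges of the device is what lets the new vdev_label_read_config() below survive damage to either end.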
+static int
+vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
+{
+unsigned int seq1 = 0;
+unsigned int seq2 = 0;
+int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);
+if (cmp != 0)
+return (cmp);
+cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
+if (cmp != 0)
+return (cmp);
+if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
+seq1 = MMP_SEQ(ub1);
+if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
+seq2 = MMP_SEQ(ub2);
+return (AVL_CMP(seq1, seq2));
+}
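The MMP sequence number is the tiebreaker of last resort here: ub_timestamp has only one-second resolution, while a multihost-enabled pool can rewrite its uberblock several times per second, so two uberblocks that tie on txg and timestamp are ordered by the sub-second sequence carried in ub_mmp_config (see the layout in the header file below).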
+static int
+uberblock_verify(uberblock_t *ub)
+{
+if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) {
+byteswap_uint64_array(ub, sizeof (uberblock_t));
+}
+if (ub->ub_magic != UBERBLOCK_MAGIC ||
+!SPA_VERSION_IS_SUPPORTED(ub->ub_version))
+return (EINVAL);
+return (0);
+}
+static int
+vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset,
+size_t size)
+{
+blkptr_t bp;
+off_t off;
+off = vdev_label_offset(vd->v_psize, l, offset);
+BP_ZERO(&bp);
+BP_SET_LSIZE(&bp, size);
+BP_SET_PSIZE(&bp, size);
+BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
+BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
+ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
+return (vdev_read_phys(vd, &bp, buf, off, size));
+}
+static unsigned char *
+vdev_label_read_config(vdev_t *vd, uint64_t txg)
+{
+vdev_phys_t *label;
+uint64_t best_txg = 0;
+uint64_t label_txg = 0;
+uint64_t asize;
+unsigned char *nvl;
+size_t nvl_size;
+int error;
+label = malloc(sizeof (vdev_phys_t));
+if (label == NULL)
+return (NULL);
+nvl_size = VDEV_PHYS_SIZE - sizeof (zio_eck_t) - 4;
+nvl = malloc(nvl_size);
+if (nvl == NULL) {
+free(label);
+return (NULL);
+}
+for (int l = 0; l < VDEV_LABELS; l++) {
+const unsigned char *nvlist;
+if (vdev_label_read(vd, l, label,
+offsetof(vdev_label_t, vl_vdev_phys),
+sizeof (vdev_phys_t)))
+continue;
+if (label->vp_nvlist[0] != NV_ENCODE_XDR)
+continue;
+nvlist = (const unsigned char *) label->vp_nvlist + 4;
+error = nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
+DATA_TYPE_UINT64, NULL, &label_txg);
+if (error != 0 || label_txg == 0)
+return (nvl);
+if (label_txg <= txg && label_txg > best_txg) {
+best_txg = label_txg;
+memcpy(nvl, nvlist, nvl_size);
+/*
+ * Use asize from pool config. We need this
+ * because we can get bad value from BIOS.
+ */
+if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
+DATA_TYPE_UINT64, NULL, &asize) == 0) {
+vd->v_psize = asize +
+VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+}
+}
+}
+if (best_txg == 0) {
+free(nvl);
+nvl = NULL;
+}
+return (nvl);
+}
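Note that the txg argument is an upper bound: the function returns the config whose ZPOOL_CONFIG_POOL_TXG is the largest value not exceeding it, so passing UINT64_MAX, as the reworked vdev_probe() below does, simply selects the newest config present in any of the four labels.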
+static void
+vdev_uberblock_load(vdev_t *vd, uberblock_t *ub)
+{
+uberblock_t *buf;
+buf = malloc(VDEV_UBERBLOCK_SIZE(vd));
+if (buf == NULL)
+return;
+for (int l = 0; l < VDEV_LABELS; l++) {
+for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+if (vdev_label_read(vd, l, buf,
+VDEV_UBERBLOCK_OFFSET(vd, n),
+VDEV_UBERBLOCK_SIZE(vd)))
+continue;
+if (uberblock_verify(buf) != 0)
+continue;
+if (vdev_uberblock_compare(buf, ub) > 0)
+*ub = *buf;
+}
+}
+free(buf);
+}
static int
vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
{
vdev_t vtmp;
-vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
-vdev_phys_t *tmp_label;
spa_t *spa;
vdev_t *vdev, *top_vdev, *pool_vdev;
-off_t off;
-blkptr_t bp;
-const unsigned char *nvlist = NULL;
+unsigned char *nvlist;
uint64_t val;
uint64_t guid;
-uint64_t best_txg = 0;
uint64_t pool_txg, pool_guid;
const char *pool_name;
const unsigned char *vdevs;
const unsigned char *features;
-int i, l, rc, is_newer;
-char *upbuf;
-const struct uberblock *up;
+int rc, is_newer;
/*
* Load the vdev label and figure out which
@@ -1580,71 +1717,24 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv),
(uint64_t)sizeof (vdev_label_t));
-/* Test for minimum pool size. */
+/* Test for minimum device size. */
if (vtmp.v_psize < SPA_MINDEVSIZE)
return (EIO);
-tmp_label = zfs_alloc(sizeof(vdev_phys_t));
-for (l = 0; l < VDEV_LABELS; l++) {
-off = vdev_label_offset(vtmp.v_psize, l,
-offsetof(vdev_label_t, vl_vdev_phys));
-BP_ZERO(&bp);
-BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
-BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
-BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
-BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
-DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
-ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
-if (vdev_read_phys(&vtmp, &bp, tmp_label, off, 0))
-continue;
-if (tmp_label->vp_nvlist[0] != NV_ENCODE_XDR)
-continue;
-nvlist = (const unsigned char *) tmp_label->vp_nvlist + 4;
-if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
-DATA_TYPE_UINT64, NULL, &pool_txg) != 0)
-continue;
-if (best_txg <= pool_txg) {
-uint64_t asize;
-best_txg = pool_txg;
-memcpy(vdev_label, tmp_label, sizeof (vdev_phys_t));
-/*
- * Use asize from pool config. We need this
- * because we can get bad value from BIOS.
- */
-if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
-DATA_TYPE_UINT64, NULL, &asize) == 0) {
-vtmp.v_psize = asize +
-VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
-}
-}
-}
-zfs_free(tmp_label, sizeof (vdev_phys_t));
-if (best_txg == 0)
+nvlist = vdev_label_read_config(&vtmp, UINT64_MAX);
+if (nvlist == NULL)
return (EIO);
-if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR)
-return (EIO);
-nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
NULL, &val) != 0) {
+free(nvlist);
return (EIO);
}
if (!SPA_VERSION_IS_SUPPORTED(val)) {
printf("ZFS: unsupported ZFS version %u (should be %u)\n",
(unsigned) val, (unsigned) SPA_VERSION);
+free(nvlist);
return (EIO);
}
@@ -1652,16 +1742,19 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ,
DATA_TYPE_NVLIST, NULL, &features) == 0 &&
nvlist_check_features_for_read(features) != 0) {
+free(nvlist);
return (EIO);
}
if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
NULL, &val) != 0) {
+free(nvlist);
return (EIO);
}
if (val == POOL_STATE_DESTROYED) {
/* We don't boot only from destroyed pools. */
+free(nvlist);
return (EIO);
}
@@ -1675,12 +1768,13 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
* Cache and spare devices end up here - just ignore
* them.
*/
/*printf("ZFS: can't find pool details\n");*/
+free(nvlist);
return (EIO);
}
if (nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64,
NULL, &val) == 0 && val != 0) {
+free(nvlist);
return (EIO);
}
@@ -1690,8 +1784,10 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
spa = spa_find_by_guid(pool_guid);
if (spa == NULL) {
spa = spa_create(pool_guid, pool_name);
-if (spa == NULL)
+if (spa == NULL) {
+free(nvlist);
return (ENOMEM);
+}
}
if (pool_txg > spa->spa_txg) {
spa->spa_txg = pool_txg;
@@ -1708,18 +1804,24 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
*/
if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
NULL, &guid) != 0) {
+free(nvlist);
return (EIO);
}
vdev = vdev_find(guid);
-if (vdev && vdev->v_phys_read) /* Has this vdev already been inited? */
+/* Has this vdev already been inited? */
+if (vdev && vdev->v_phys_read) {
+free(nvlist);
return (EIO);
+}
if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
NULL, &vdevs)) {
+free(nvlist);
return (EIO);
}
rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
+free(nvlist);
if (rc != 0)
return (rc);
@@ -1729,6 +1831,7 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
if (top_vdev == pool_vdev)
break;
if (!pool_vdev && top_vdev) {
top_vdev->spa = spa;
STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
@@ -1765,36 +1868,7 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
* the best uberblock and then we can actually access
* the contents of the pool.
*/
-upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
-up = (const struct uberblock *)upbuf;
-for (l = 0; l < VDEV_LABELS; l++) {
-for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdev); i++) {
-off = vdev_label_offset(vdev->v_psize, l,
-VDEV_UBERBLOCK_OFFSET(vdev, i));
-BP_ZERO(&bp);
-DVA_SET_OFFSET(&bp.blk_dva[0], off);
-BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
-BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
-BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
-BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
-ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
-if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
-continue;
-if (up->ub_magic != UBERBLOCK_MAGIC)
-continue;
-if (up->ub_txg < spa->spa_txg)
-continue;
-if (up->ub_txg > spa->spa_uberblock.ub_txg ||
-(up->ub_txg == spa->spa_uberblock.ub_txg &&
-up->ub_timestamp >
-spa->spa_uberblock.ub_timestamp)) {
-spa->spa_uberblock = *up;
-}
-}
-}
-zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
+vdev_uberblock_load(vdev, &spa->spa_uberblock);
vdev->spa = spa;
if (spap != NULL)

File 2 of 2:

@@ -63,6 +63,14 @@
#define _NOTE(s)
+/*
+ * AVL comparator helpers
+ */
+#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0))
+#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b)))
+#define AVL_PCMP(a, b) \
+(((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))
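These helpers reduce a three-way comparison to -1, 0, or 1 without branching; vdev_uberblock_compare() in the first file relies on exactly that contract. A tiny self-check, with made-up values:

#include <assert.h>

#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b)))

int
main(void)
{
    /* (a > b) and (a < b) each evaluate to 0 or 1, so the
     * difference is 1, -1, or 0. */
    assert(AVL_CMP(7, 3) == 1);
    assert(AVL_CMP(3, 7) == -1);
    assert(AVL_CMP(5, 5) == 0);
    return (0);
}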
typedef enum { B_FALSE, B_TRUE } boolean_t;
/* CRC64 table */
@@ -490,8 +498,16 @@ typedef struct zio_gbh {
#define VDEV_PHYS_SIZE (112 << 10)
#define VDEV_UBERBLOCK_RING (128 << 10)
+/*
+ * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock
+ * ring when MMP is enabled.
+ */
+#define MMP_BLOCKS_PER_LABEL 1
+/* The largest uberblock we support is 8k. */
+#define MAX_UBERBLOCK_SHIFT (13)
#define VDEV_UBERBLOCK_SHIFT(vd) \
-MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT)
+MIN(MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT), MAX_UBERBLOCK_SHIFT)
#define VDEV_UBERBLOCK_COUNT(vd) \
(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
#define VDEV_UBERBLOCK_OFFSET(vd, n) \
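The new MIN() cap changes the ring geometry only for very large ashift values: the 128 KiB ring is fixed, so the slot size chosen by VDEV_UBERBLOCK_SHIFT decides how many uberblocks fit, and without the cap an ashift above 13 would imply slots bigger than the 8 KiB maximum uberblock the loader supports. A worked sketch of the arithmetic (the ashift values are hypothetical):

#include <stdio.h>

#define UBERBLOCK_SHIFT     10           /* up to 1K */
#define MAX_UBERBLOCK_SHIFT 13           /* up to 8K */
#define UBERBLOCK_RING      (128 << 10)
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

int
main(void)
{
    int ashifts[] = { 9, 12, 16 };  /* 512B, 4K, 64K sectors */

    for (int i = 0; i < 3; i++) {
        /* The same clamp as the new VDEV_UBERBLOCK_SHIFT(). */
        int shift = MIN(MAX(ashifts[i], UBERBLOCK_SHIFT),
            MAX_UBERBLOCK_SHIFT);
        printf("ashift %2d: %5d-byte slots, %3d per ring\n",
            ashifts[i], 1 << shift, UBERBLOCK_RING >> shift);
    }
    /* ashift 9 -> 1024-byte slots, 128 per ring; 12 -> 4096, 32;
     * 16 is clamped to 8192-byte slots, 16 per ring. */
    return (0);
}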
@@ -841,14 +857,88 @@ typedef enum pool_state {
#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */
#define UBERBLOCK_SHIFT 10 /* up to 1K */
-struct uberblock {
+#define MMP_MAGIC 0xa11cea11 /* all-see-all */
+#define MMP_INTERVAL_VALID_BIT 0x01
+#define MMP_SEQ_VALID_BIT 0x02
+#define MMP_FAIL_INT_VALID_BIT 0x04
+#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \
+ubp->ub_mmp_magic == MMP_MAGIC)
+#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+MMP_INTERVAL_VALID_BIT))
+#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+MMP_SEQ_VALID_BIT))
+#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+MMP_FAIL_INT_VALID_BIT))
+#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \
+>> 8)
+#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \
+>> 32)
+#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \
+>> 48)
+typedef struct uberblock {
uint64_t ub_magic; /* UBERBLOCK_MAGIC */
uint64_t ub_version; /* SPA_VERSION */
uint64_t ub_txg; /* txg of last sync */
uint64_t ub_guid_sum; /* sum of all vdev guids */
uint64_t ub_timestamp; /* UTC time of last sync */
blkptr_t ub_rootbp; /* MOS objset_phys_t */
-};
+/* highest SPA_VERSION supported by software that wrote this txg */
+uint64_t ub_software_version;
+/* Maybe missing in uberblocks we read, but always written */
+uint64_t ub_mmp_magic;
+/*
+ * If ub_mmp_delay == 0 and ub_mmp_magic is valid, MMP is off.
+ * Otherwise, nanosec since last MMP write.
+ */
+uint64_t ub_mmp_delay;
+/*
+ * The ub_mmp_config contains the multihost write interval, multihost
+ * fail intervals, sequence number for sub-second granularity, and
+ * valid bit mask. This layout is as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | Fail Intervals| Seq | Write Interval (ms) | VALID |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * This allows a write_interval of (2^24/1000)s, over 4.5 hours
+ *
+ * VALID Bits:
+ * - 0x01 - Write Interval (ms)
+ * - 0x02 - Sequence number exists
+ * - 0x04 - Fail Intervals
+ * - 0xf8 - Reserved
+ */
+uint64_t ub_mmp_config;
+/*
+ * ub_checkpoint_txg indicates two things about the current uberblock:
+ *
+ * 1] If it is not zero then this uberblock is a checkpoint. If it is
+ * zero, then this uberblock is not a checkpoint.
+ *
+ * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is
+ * the ub_txg that the uberblock had at the time we moved it to
+ * the MOS config.
+ *
+ * The field is set when we checkpoint the uberblock and continues to
+ * hold that value even after we've rewound (unlike the ub_txg that
+ * is reset to a higher value).
+ *
+ * Besides checks used to determine whether we are reopening the
+ * pool from a checkpointed uberblock [see spa_ld_select_uberblock()],
+ * the value of the field is used to determine which ZIL blocks have
+ * been allocated according to the ms_sm when we are rewinding to a
+ * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then
+ * the ZIL block is not allocated [see uses of spa_min_claim_txg()].
+ */
+uint64_t ub_checkpoint_txg;
+} uberblock_t;
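Given the bit layout documented above, decoding ub_mmp_config is plain masking and shifting, mirroring the MMP_INTERVAL/MMP_SEQ/MMP_FAIL_INT macros. A standalone sketch with a fabricated config value (not taken from a real pool):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    /* Hypothetical config: fail_intervals = 10, seq = 3, a 1000 ms
     * write interval, and the interval + seq valid bits set. */
    uint64_t mmp_config = (0x000aULL << 48) | (0x0003ULL << 32) |
        (1000ULL << 8) | 0x03;

    uint64_t valid    = mmp_config & 0xFF;
    uint64_t write_ms = (mmp_config & 0x00000000FFFFFF00ULL) >> 8;
    uint64_t seq      = (mmp_config & 0x0000FFFF00000000ULL) >> 32;
    uint64_t fails    = (mmp_config & 0xFFFF000000000000ULL) >> 48;

    if (valid & 0x01)   /* MMP_INTERVAL_VALID_BIT */
        printf("write interval: %ju ms\n", (uintmax_t)write_ms);
    if (valid & 0x02)   /* MMP_SEQ_VALID_BIT */
        printf("seq: %ju\n", (uintmax_t)seq);
    if (valid & 0x04)   /* MMP_FAIL_INT_VALID_BIT (unset here) */
        printf("fail intervals: %ju\n", (uintmax_t)fails);
    return (0);
}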
/*
* Flags.