Add support for booting from raidz1 and raidz2 pools.

Doug Rabson 2009-05-16 10:48:20 +00:00
parent e27fb776f3
commit e1899ef6c8
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=192194
5 changed files with 908 additions and 82 deletions

View File

@ -413,6 +413,20 @@ int13probe(int drive)
return(0);
}
/*
* We call this when we find a ZFS vdev - ZFS consumes the dsk
* structure so we must make a new one.
*/
static struct dsk *
copy_dsk(struct dsk *dsk)
{
struct dsk *newdsk;
newdsk = malloc(sizeof(struct dsk));
*newdsk = *dsk;
return (newdsk);
}
static void
probe_drive(struct dsk *dsk, spa_t **spap)
{
@ -426,9 +440,6 @@ probe_drive(struct dsk *dsk, spa_t **spap)
char *sec;
unsigned i;
if (!int13probe(dsk->drive))
return;
/*
* If we find a vdev on the whole disk, stop here. Otherwise dig
* out the MBR and probe each slice in turn for a vdev.
@ -473,7 +484,7 @@ probe_drive(struct dsk *dsk, spa_t **spap)
if (vdev_probe(vdev_read, dsk, spap) == 0) {
/*
* We record the first pool we find (we will try
* to boot from that one.
* to boot from that one).
*/
spap = 0;
@ -481,10 +492,7 @@ probe_drive(struct dsk *dsk, spa_t **spap)
* This slice had a vdev. We need a new dsk
* structure now since the vdev now owns this one.
*/
struct dsk *newdsk;
newdsk = malloc(sizeof(struct dsk));
*newdsk = *dsk;
dsk = newdsk;
dsk = copy_dsk(dsk);
}
break;
}
@ -514,10 +522,7 @@ probe_drive(struct dsk *dsk, spa_t **spap)
* This slice had a vdev. We need a new dsk structure now
* since the vdev now owns this one.
*/
struct dsk *newdsk;
newdsk = malloc(sizeof(struct dsk));
*newdsk = *dsk;
dsk = newdsk;
dsk = copy_dsk(dsk);
}
}
}
@ -569,10 +574,13 @@ main(void)
* will find any other available pools and it may fill in missing
* vdevs for the boot pool.
*/
for (i = 0; i < 4; i++) {
for (i = 0; i < 128; i++) {
if ((i | DRV_HARD) == *(uint8_t *)PTOV(ARGS))
continue;
if (!int13probe(i | DRV_HARD))
break;
dsk = malloc(sizeof(struct dsk));
dsk->drive = i | DRV_HARD;
dsk->type = dsk->drive & TYPE_AD;
@ -944,7 +952,7 @@ static int
drvread(struct dsk *dsk, void *buf, unsigned lba, unsigned nblk)
{
#ifdef GPT
static unsigned c = 0x2d5c7c2f;
if (!OPT_CHECK(RBX_QUIET))
printf("%c\b", c = c << 8 | c >> 24);

View File

@ -45,16 +45,13 @@ static vdev_list_t zfs_vdevs;
static spa_list_t zfs_pools;
static uint64_t zfs_crc64_table[256];
static char *zfs_decomp_buf;
static const dnode_phys_t *dnode_cache_obj = 0;
static uint64_t dnode_cache_bn;
static char *dnode_cache_buf;
static char *zap_scratch;
static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
/*
* Forward declarations.
*/
static int zio_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset);
#define TEMP_SIZE (1*SPA_MAXBLOCKSIZE)
static void
zfs_init(void)
@ -62,13 +59,37 @@ zfs_init(void)
STAILQ_INIT(&zfs_vdevs);
STAILQ_INIT(&zfs_pools);
zfs_decomp_buf = malloc(128*1024);
dnode_cache_buf = malloc(128*1024);
zap_scratch = malloc(128*1024);
zfs_temp_buf = malloc(TEMP_SIZE);
zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
zfs_temp_ptr = zfs_temp_buf;
dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
zap_scratch = malloc(SPA_MAXBLOCKSIZE);
zfs_init_crc();
}
static char *
zfs_alloc_temp(size_t sz)
{
char *p;
if (zfs_temp_ptr + sz > zfs_temp_end) {
printf("ZFS: out of temporary buffer space\n");
for (;;) ;
}
p = zfs_temp_ptr;
zfs_temp_ptr += sz;
return (p);
}
static void
zfs_reset_temp(void)
{
zfs_temp_ptr = zfs_temp_buf;
}
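/*
* Note: zfs_alloc_temp() is a simple bump allocator over a single
* SPA_MAXBLOCKSIZE arena. zio_read() resets it at the start of each
* top-level read, so scratch memory handed out here (for example to
* the raidz reconstruction code) is valid only for the duration of
* that read and is never freed individually.
*/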
static int
xdr_int(const unsigned char **xdr, int *ip)
{
@ -299,7 +320,41 @@ nvlist_print(const unsigned char *nvlist, unsigned int indent)
#endif
static int
vdev_mirror_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t size)
vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
off_t offset, size_t size)
{
size_t psize;
int rc;
if (bp) {
psize = BP_GET_PSIZE(bp);
} else {
psize = size;
}
/*printf("ZFS: reading %d bytes at 0x%llx to %p\n", psize, offset, buf);*/
rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
if (rc)
return (rc);
if (bp && zio_checksum_error(bp, buf))
return (EIO);
return (0);
}
static int
vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
off_t offset, size_t bytes)
{
return (vdev_read_phys(vdev, bp, buf,
offset + VDEV_LABEL_START_SIZE, bytes));
}
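/*
* Note: a NULL bp makes vdev_read_phys() trust the caller-supplied
* size and skip checksum verification; the raidz code uses this to
* read raw columns. The VDEV_LABEL_START_SIZE adjustment here covers
* the two front vdev labels and the boot block that precede a leaf
* vdev's allocatable space -- DVA offsets are relative to that point,
* and zio_read() no longer adds it itself.
*/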
static int
vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
off_t offset, size_t bytes)
{
vdev_t *kid;
int rc;
@ -308,7 +363,7 @@ vdev_mirror_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t size)
STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
if (kid->v_state != VDEV_STATE_HEALTHY)
continue;
rc = kid->v_read(kid, kid->v_read_priv, offset, buf, size);
rc = kid->v_read(kid, bp, buf, offset, bytes);
if (!rc)
return (0);
}
@ -329,7 +384,7 @@ vdev_find(uint64_t guid)
}
static vdev_t *
vdev_create(uint64_t guid, vdev_read_t *read, void *read_priv)
vdev_create(uint64_t guid, vdev_read_t *read)
{
vdev_t *vdev;
@ -339,7 +394,8 @@ vdev_create(uint64_t guid, vdev_read_t *read, void *read_priv)
vdev->v_guid = guid;
vdev->v_state = VDEV_STATE_OFFLINE;
vdev->v_read = read;
vdev->v_read_priv = read_priv;
vdev->v_phys_read = 0;
vdev->v_read_priv = 0;
STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
return (vdev);
@ -349,7 +405,7 @@ static int
vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp)
{
int rc;
uint64_t guid, id;
uint64_t guid, id, ashift, nparity;
const char *type;
const char *path;
vdev_t *vdev, *kid;
@ -378,17 +434,30 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp)
}
if (strcmp(type, VDEV_TYPE_MIRROR)
&& strcmp(type, VDEV_TYPE_DISK)) {
printf("ZFS: can only boot from disk or mirror vdevs\n");
&& strcmp(type, VDEV_TYPE_DISK)
&& strcmp(type, VDEV_TYPE_RAIDZ)) {
printf("ZFS: can only boot from disk, mirror or raidz vdevs\n");
return (EIO);
}
if (!strcmp(type, VDEV_TYPE_MIRROR))
vdev = vdev_create(guid, vdev_mirror_read, 0);
vdev = vdev_create(guid, vdev_mirror_read);
else if (!strcmp(type, VDEV_TYPE_RAIDZ))
vdev = vdev_create(guid, vdev_raidz_read);
else
vdev = vdev_create(guid, 0, 0);
vdev = vdev_create(guid, vdev_disk_read);
vdev->v_id = id;
if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
DATA_TYPE_UINT64, 0, &ashift) == 0)
vdev->v_ashift = ashift;
else
vdev->v_ashift = 0;
if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
DATA_TYPE_UINT64, 0, &nparity) == 0)
vdev->v_nparity = nparity;
else
vdev->v_nparity = 0;
if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
DATA_TYPE_STRING, 0, &path) == 0) {
if (strlen(path) > 5
@ -400,15 +469,22 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp)
path += 5;
vdev->v_name = strdup(path);
} else {
vdev->v_name = strdup(type);
if (!strcmp(type, "raidz")) {
if (vdev->v_nparity == 1)
vdev->v_name = "raidz1";
else
vdev->v_name = "raidz2";
} else {
vdev->v_name = strdup(type);
}
}
vdev->v_id = id;
rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
/*
* It's ok if we don't have any kids.
*/
if (rc == 0) {
vdev->v_nchildren = nkids;
for (i = 0; i < nkids; i++) {
rc = vdev_init_from_nvlist(kids, &kid);
if (rc)
@ -416,6 +492,8 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp)
STAILQ_INSERT_TAIL(&vdev->v_children, kid, v_childlink);
kids = nvlist_next(kids);
}
} else {
vdev->v_nchildren = 0;
}
if (vdevp)
@ -431,11 +509,10 @@ vdev_set_state(vdev_t *vdev)
int bad_kids;
/*
* We assume that if we have kids, we are a mirror. A mirror
* is healthy if all its kids are healthy. It's degraded (but
* working) if at least one kid is healthy.
* A mirror or raidz is healthy if all its kids are healthy. A
* mirror is degraded if at least one of its kids is healthy; a
* raidz is degraded if at most nparity kids are offline.
*/
if (STAILQ_FIRST(&vdev->v_children)) {
good_kids = 0;
bad_kids = 0;
@ -445,13 +522,22 @@ vdev_set_state(vdev_t *vdev)
else
bad_kids++;
}
if (good_kids) {
if (!bad_kids && good_kids)
vdev->v_state = VDEV_STATE_HEALTHY;
else
vdev->v_state = VDEV_STATE_DEGRADED;
if (bad_kids == 0) {
vdev->v_state = VDEV_STATE_HEALTHY;
} else {
vdev->v_state = VDEV_STATE_OFFLINE;
if (vdev->v_read == vdev_mirror_read) {
if (good_kids) {
vdev->v_state = VDEV_STATE_DEGRADED;
} else {
vdev->v_state = VDEV_STATE_OFFLINE;
}
} else if (vdev->v_read == vdev_raidz_read) {
if (bad_kids > vdev->v_nparity) {
vdev->v_state = VDEV_STATE_OFFLINE;
} else {
vdev->v_state = VDEV_STATE_DEGRADED;
}
}
}
}
}
@ -609,7 +695,7 @@ spa_all_status(void)
}
static int
vdev_probe(vdev_read_t *read, void *read_priv, spa_t **spap)
vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
{
vdev_t vtmp;
vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
@ -632,7 +718,7 @@ vdev_probe(vdev_read_t *read, void *read_priv, spa_t **spap)
* uberblock is most current.
*/
memset(&vtmp, 0, sizeof(vtmp));
vtmp.v_read = read;
vtmp.v_phys_read = read;
vtmp.v_read_priv = read_priv;
off = offsetof(vdev_label_t, vl_vdev_phys);
BP_ZERO(&bp);
@ -641,7 +727,7 @@ vdev_probe(vdev_read_t *read, void *read_priv, spa_t **spap)
BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
if (zio_read_phys(&vtmp, &bp, vdev_label, off))
if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
return (EIO);
if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
@ -668,6 +754,7 @@ vdev_probe(vdev_read_t *read, void *read_priv, spa_t **spap)
return (EIO);
}
#ifndef TEST
if (val != POOL_STATE_ACTIVE) {
/*
* Don't print a message here. If we happen to reboot
@ -677,6 +764,7 @@ vdev_probe(vdev_read_t *read, void *read_priv, spa_t **spap)
/*printf("ZFS: pool is not active\n");*/
return (EIO);
}
#endif
if (nvlist_find(nvlist,
ZPOOL_CONFIG_POOL_TXG,
@ -687,7 +775,11 @@ vdev_probe(vdev_read_t *read, void *read_priv, spa_t **spap)
|| nvlist_find(nvlist,
ZPOOL_CONFIG_POOL_NAME,
DATA_TYPE_STRING, 0, &pool_name)) {
printf("ZFS: can't find pool details\n");
/*
* Cache and spare devices end up here - just ignore
* them.
*/
/*printf("ZFS: can't find pool details\n");*/
return (EIO);
}
@ -742,7 +834,7 @@ vdev_probe(vdev_read_t *read, void *read_priv, spa_t **spap)
*/
vdev = vdev_find(guid);
if (vdev) {
vdev->v_read = read;
vdev->v_phys_read = read;
vdev->v_read_priv = read_priv;
vdev->v_state = VDEV_STATE_HEALTHY;
} else {
@ -772,7 +864,7 @@ vdev_probe(vdev_read_t *read, void *read_priv, spa_t **spap)
BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
if (zio_read_phys(vdev, &bp, upbuf, off))
if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
continue;
up = (const struct uberblock *) upbuf;
@ -805,39 +897,20 @@ ilog2(int n)
}
static int
zio_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset)
zio_read(spa_t *spa, const blkptr_t *bp, void *buf)
{
int cpfunc = BP_GET_COMPRESS(bp);
size_t lsize = BP_GET_LSIZE(bp);
size_t psize = BP_GET_PSIZE(bp);
int rc;
/*printf("ZFS: reading %d bytes at 0x%llx to %p\n", psize, offset, buf);*/
if (cpfunc != ZIO_COMPRESS_OFF) {
rc = vdev->v_read(vdev, vdev->v_read_priv, offset, zfs_decomp_buf, psize);
if (rc)
return (rc);
if (zio_checksum_error(bp, zfs_decomp_buf))
return (EIO);
if (zio_decompress_data(cpfunc, zfs_decomp_buf, psize,
buf, lsize))
return (EIO);
} else {
rc = vdev->v_read(vdev, vdev->v_read_priv, offset, buf, psize);
if (rc)
return (rc);
if (zio_checksum_error(bp, buf))
return (EIO);
}
return (0);
}
static int
zio_read(spa_t *spa, const blkptr_t *bp, void *buf)
{
void *pbuf;
int i;
zfs_reset_temp();
if (cpfunc != ZIO_COMPRESS_OFF)
pbuf = zfs_alloc_temp(psize);
else
pbuf = buf;
for (i = 0; i < SPA_DVAS_PER_BP; i++) {
const dva_t *dva = &bp->blk_dva[i];
vdev_t *vdev;
@ -848,15 +921,21 @@ zio_read(spa_t *spa, const blkptr_t *bp, void *buf)
continue;
vdevid = DVA_GET_VDEV(dva);
offset = DVA_GET_OFFSET(dva) + VDEV_LABEL_START_SIZE;
offset = DVA_GET_OFFSET(dva);
STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink)
if (vdev->v_id == vdevid)
break;
if (!vdev || !vdev->v_read)
continue;
if (zio_read_phys(vdev, bp, buf, offset))
if (vdev->v_read(vdev, bp, pbuf, offset, psize))
continue;
if (cpfunc != ZIO_COMPRESS_OFF) {
if (zio_decompress_data(cpfunc, pbuf, psize,
buf, lsize))
return (EIO);
}
return (0);
}
printf("ZFS: i/o error - all block copies unavailable\n");

View File

@ -6,7 +6,7 @@ are used by the ZFS bootstrap:
fletcher.c checksum support
sha256.c checksum support
lzjb.c compression support
zfssubr.c mostly checksum and compression support
zfssubr.c checksum, compression and raidz support
zfsimpl.h mostly describing the physical layout
The files fletcher.c, lzjb.c and sha256.c are largely identical to the

View File

@ -1137,7 +1137,10 @@ typedef struct znode_phys {
* In-core vdev representation.
*/
struct vdev;
typedef int vdev_read_t(struct vdev *vdev, void *priv, off_t offset, void *buf, size_t bytes);
typedef int vdev_phys_read_t(struct vdev *vdev, void *priv,
off_t offset, void *buf, size_t bytes);
typedef int vdev_read_t(struct vdev *vdev, const blkptr_t *bp,
void *buf, off_t offset, size_t bytes);
typedef STAILQ_HEAD(vdev_list, vdev) vdev_list_t;
@ -1148,8 +1151,12 @@ typedef struct vdev {
char *v_name; /* vdev name */
uint64_t v_guid; /* vdev guid */
int v_id; /* index in parent */
int v_ashift; /* offset to block shift */
int v_nparity; /* # parity for raidz */
int v_nchildren; /* # children */
vdev_state_t v_state; /* current state */
vdev_read_t *v_read; /* function to read from this vdev */
vdev_phys_read_t *v_phys_read; /* read from raw leaf vdev */
vdev_read_t *v_read; /* read from vdev */
void *v_read_priv; /* private data for read function */
} vdev_t;

View File

@ -191,3 +191,735 @@ zap_hash(uint64_t salt, const char *name)
return (crc);
}
static char *zfs_alloc_temp(size_t sz);
typedef struct raidz_col {
uint64_t rc_devidx; /* child device index for I/O */
uint64_t rc_offset; /* device offset */
uint64_t rc_size; /* I/O size */
void *rc_data; /* I/O data */
int rc_error; /* I/O error for this device */
uint8_t rc_tried; /* Did we attempt this I/O column? */
uint8_t rc_skipped; /* Did we skip this I/O column? */
} raidz_col_t;
#define VDEV_RAIDZ_P 0
#define VDEV_RAIDZ_Q 1
static void
vdev_raidz_reconstruct_p(raidz_col_t *cols, int nparity, int acols, int x)
{
uint64_t *dst, *src, xcount, ccount, count, i;
int c;
xcount = cols[x].rc_size / sizeof (src[0]);
//ASSERT(xcount <= cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
//ASSERT(xcount > 0);
src = cols[VDEV_RAIDZ_P].rc_data;
dst = cols[x].rc_data;
for (i = 0; i < xcount; i++, dst++, src++) {
*dst = *src;
}
for (c = nparity; c < acols; c++) {
src = cols[c].rc_data;
dst = cols[x].rc_data;
if (c == x)
continue;
ccount = cols[c].rc_size / sizeof (src[0]);
count = MIN(ccount, xcount);
for (i = 0; i < count; i++, dst++, src++) {
*dst ^= *src;
}
}
}
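/*
* (P is the plain XOR of the data columns, so a single missing data
* column x is recovered above by XORing P with every surviving data
* column.)
*/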
/*
* These two tables represent powers and logs of 2 in the Galois field
* GF(2^8) used by raidz. These values were computed by repeatedly
* multiplying by 2, reducing modulo the polynomial 0x11d.
*/
static const uint8_t vdev_raidz_pow2[256] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
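/*
* Illustrative sketch, not part of this commit: the tables above can
* be regenerated by repeated multiplication by 2 in GF(2^8) with the
* reducing polynomial 0x11d, whose low byte is the 0x1d used in the
* SWAR masks below. The helper name is hypothetical.
*/
static void
vdev_raidz_gen_tables(uint8_t pow2[256], uint8_t log2[256])
{
uint8_t a = 1;
int i;
for (i = 0; i < 255; i++) {
pow2[i] = a;
log2[a] = i;
/* multiply by 2: shift left, fold in 0x1d on overflow */
a = (a << 1) ^ ((a & 0x80) ? 0x1d : 0);
}
pow2[255] = pow2[0]; /* 2^255 == 1, closing the cycle */
log2[0] = 0; /* log(0) is undefined; the table stores 0 */
}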
/*
* Multiply a given number by 2 raised to the given power.
*/
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
if (a == 0)
return (0);
//ASSERT(exp >= 0);
//ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
exp += vdev_raidz_log2[a];
if (exp > 255)
exp -= 255;
return (vdev_raidz_pow2[exp]);
}
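/*
* Worked example: vdev_raidz_exp2(0x02, 3) yields
* vdev_raidz_pow2[3 + vdev_raidz_log2[0x02]] == vdev_raidz_pow2[4]
* == 0x10, i.e. 2 * 2^3 == 16 in GF(2^8).
*/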
static void
vdev_raidz_generate_parity_pq(raidz_col_t *cols, int nparity, int acols)
{
uint64_t *q, *p, *src, pcount, ccount, mask, i;
int c;
pcount = cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
//ASSERT(cols[VDEV_RAIDZ_P].rc_size == cols[VDEV_RAIDZ_Q].rc_size);
for (c = nparity; c < acols; c++) {
src = cols[c].rc_data;
p = cols[VDEV_RAIDZ_P].rc_data;
q = cols[VDEV_RAIDZ_Q].rc_data;
ccount = cols[c].rc_size / sizeof (src[0]);
if (c == nparity) {
//ASSERT(ccount == pcount || ccount == 0);
for (i = 0; i < ccount; i++, p++, q++, src++) {
*q = *src;
*p = *src;
}
for (; i < pcount; i++, p++, q++, src++) {
*q = 0;
*p = 0;
}
} else {
//ASSERT(ccount <= pcount);
/*
* Rather than multiplying each byte
* individually (one GF(2^8) multiply-by-2
* per byte), we are able to handle 8 at
* once by generating a mask based on the
* high bit in each byte and using that to
* conditionally XOR in 0x1d.
*/
for (i = 0; i < ccount; i++, p++, q++, src++) {
mask = *q & 0x8080808080808080ULL;
mask = (mask << 1) - (mask >> 7);
*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
(mask & 0x1d1d1d1d1d1d1d1dULL);
*q ^= *src;
*p ^= *src;
}
/*
* Treat short columns as though they are full of 0s.
*/
for (; i < pcount; i++, q++) {
mask = *q & 0x8080808080808080ULL;
mask = (mask << 1) - (mask >> 7);
*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
(mask & 0x1d1d1d1d1d1d1d1dULL);
}
}
}
}
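/*
* Illustrative sketch, not part of this commit: each 64-bit step in
* the loops above multiplies eight packed GF(2^8) bytes by 2 at once.
* The equivalent byte-at-a-time operation is (hypothetical helper):
*/
static uint8_t
vdev_raidz_mul2(uint8_t b)
{
return ((b << 1) ^ ((b & 0x80) ? 0x1d : 0));
}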
static void
vdev_raidz_reconstruct_q(raidz_col_t *cols, int nparity, int acols, int x)
{
uint64_t *dst, *src, xcount, ccount, count, mask, i;
uint8_t *b;
int c, j, exp;
xcount = cols[x].rc_size / sizeof (src[0]);
//ASSERT(xcount <= cols[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
for (c = nparity; c < acols; c++) {
src = cols[c].rc_data;
dst = cols[x].rc_data;
if (c == x)
ccount = 0;
else
ccount = cols[c].rc_size / sizeof (src[0]);
count = MIN(ccount, xcount);
if (c == nparity) {
for (i = 0; i < count; i++, dst++, src++) {
*dst = *src;
}
for (; i < xcount; i++, dst++) {
*dst = 0;
}
} else {
/*
* For an explanation of this, see the comment in
* vdev_raidz_generate_parity_pq() above.
*/
for (i = 0; i < count; i++, dst++, src++) {
mask = *dst & 0x8080808080808080ULL;
mask = (mask << 1) - (mask >> 7);
*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
(mask & 0x1d1d1d1d1d1d1d1dULL);
*dst ^= *src;
}
for (; i < xcount; i++, dst++) {
mask = *dst & 0x8080808080808080ULL;
mask = (mask << 1) - (mask >> 7);
*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
(mask & 0x1d1d1d1d1d1d1d1dULL);
}
}
}
src = cols[VDEV_RAIDZ_Q].rc_data;
dst = cols[x].rc_data;
exp = 255 - (acols - 1 - x);
for (i = 0; i < xcount; i++, dst++, src++) {
*dst ^= *src;
for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
*b = vdev_raidz_exp2(*b, exp);
}
}
}
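/*
* (Note on the final loop above: before the Q column is XORed in,
* dst holds Q + 2^(acols - 1 - x) * D_x; XORing Q leaves the scaled
* D_x, and since 2^255 == 1 in GF(2^8), multiplying each byte by
* 2^(255 - (acols - 1 - x)) divides out the factor and recovers D_x.)
*/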
static void
vdev_raidz_reconstruct_pq(raidz_col_t *cols, int nparity, int acols,
int x, int y)
{
uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
void *pdata, *qdata;
uint64_t xsize, ysize, i;
//ASSERT(x < y);
//ASSERT(x >= nparity);
//ASSERT(y < acols);
//ASSERT(cols[x].rc_size >= cols[y].rc_size);
/*
* Move the parity data aside -- we're going to compute parity as
* though columns x and y were full of zeros -- Pxy and Qxy. We want to
* reuse the parity generation mechanism without trashing the actual
* parity so we make those columns appear to be full of zeros by
* setting their lengths to zero.
*/
pdata = cols[VDEV_RAIDZ_P].rc_data;
qdata = cols[VDEV_RAIDZ_Q].rc_data;
xsize = cols[x].rc_size;
ysize = cols[y].rc_size;
cols[VDEV_RAIDZ_P].rc_data =
zfs_alloc_temp(cols[VDEV_RAIDZ_P].rc_size);
cols[VDEV_RAIDZ_Q].rc_data =
zfs_alloc_temp(cols[VDEV_RAIDZ_Q].rc_size);
cols[x].rc_size = 0;
cols[y].rc_size = 0;
vdev_raidz_generate_parity_pq(cols, nparity, acols);
cols[x].rc_size = xsize;
cols[y].rc_size = ysize;
p = pdata;
q = qdata;
pxy = cols[VDEV_RAIDZ_P].rc_data;
qxy = cols[VDEV_RAIDZ_Q].rc_data;
xd = cols[x].rc_data;
yd = cols[y].rc_data;
/*
* We now have:
* Pxy = P + D_x + D_y
* Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
*
* We can then solve for D_x:
* D_x = A * (P + Pxy) + B * (Q + Qxy)
* where
* A = 2^(x - y) * (2^(x - y) + 1)^-1
* B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
*
* With D_x in hand, we can easily solve for D_y:
* D_y = P + Pxy + D_x
*/
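/*
* (In the algebra above '+' is GF(2^8) addition, i.e. XOR, so
* P + Pxy cancels P and leaves D_x + D_y; the inverse
* (2^(x - y) + 1)^-1 is computed below via the log/exp tables.)
*/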
a = vdev_raidz_pow2[255 + x - y];
b = vdev_raidz_pow2[255 - (acols - 1 - x)];
tmp = 255 - vdev_raidz_log2[a ^ 1];
aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
vdev_raidz_exp2(*q ^ *qxy, bexp);
if (i < ysize)
*yd = *p ^ *pxy ^ *xd;
}
/*
* Restore the saved parity data.
*/
cols[VDEV_RAIDZ_P].rc_data = pdata;
cols[VDEV_RAIDZ_Q].rc_data = qdata;
}
static int
vdev_raidz_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
off_t offset, size_t bytes)
{
size_t psize = BP_GET_PSIZE(bp);
vdev_t *kid;
int unit_shift = vdev->v_ashift;
int dcols = vdev->v_nchildren;
int nparity = vdev->v_nparity;
int missingdata, missingparity;
int parity_errors, data_errors, unexpected_errors, total_errors;
int parity_untried;
uint64_t b = offset >> unit_shift;
uint64_t s = psize >> unit_shift;
uint64_t f = b % dcols;
uint64_t o = (b / dcols) << unit_shift;
int q, r, c, c1, bc, col, acols, coff, devidx, asize, n;
static raidz_col_t cols[16];
raidz_col_t *rc, *rc1;
q = s / (dcols - nparity);
r = s - q * (dcols - nparity);
bc = (r == 0 ? 0 : r + nparity);
acols = (q == 0 ? bc : dcols);
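/*
* Worked example (illustrative): for a raidz1 of dcols = 4 children
* with unit_shift = 9 and psize = 1536, s = 3 data sectors, so
* q = 3 / (4 - 1) = 1, r = 0, bc = 0 and acols = 4: one full row of
* one parity column plus three data columns, each
* (q + (c < bc)) << unit_shift = 512 bytes.
*/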
asize = 0;
for (c = 0; c < acols; c++) {
col = f + c;
coff = o;
if (col >= dcols) {
col -= dcols;
coff += 1ULL << unit_shift;
}
cols[c].rc_devidx = col;
cols[c].rc_offset = coff;
cols[c].rc_size = (q + (c < bc)) << unit_shift;
cols[c].rc_data = NULL;
cols[c].rc_error = 0;
cols[c].rc_tried = 0;
cols[c].rc_skipped = 0;
asize += cols[c].rc_size;
}
asize = roundup(asize, (nparity + 1) << unit_shift);
for (c = 0; c < nparity; c++) {
cols[c].rc_data = zfs_alloc_temp(cols[c].rc_size);
}
cols[c].rc_data = buf;
for (c = c + 1; c < acols; c++)
cols[c].rc_data = (char *)cols[c - 1].rc_data +
cols[c - 1].rc_size;
/*
* If all data stored spans all columns, there's a danger that
* parity will always be on the same device and, since parity
* isn't read during normal operation, that that device's I/O
* bandwidth won't be used effectively. We therefore switch
* the parity every 1MB.
*
* ... at least that was, ostensibly, the theory. As a
* practical matter unless we juggle the parity between all
* devices evenly, we won't see any benefit. Further,
* occasional writes that aren't a multiple of the LCM of the
* number of children and the minimum stripe width are
* sufficient to avoid pessimal behavior. Unfortunately, this
* decision created an implicit on-disk format requirement
* that we need to support for all eternity, but only for
* single-parity RAID-Z.
*/
//ASSERT(acols >= 2);
//ASSERT(cols[0].rc_size == cols[1].rc_size);
if (nparity == 1 && (offset & (1ULL << 20))) {
devidx = cols[0].rc_devidx;
o = cols[0].rc_offset;
cols[0].rc_devidx = cols[1].rc_devidx;
cols[0].rc_offset = cols[1].rc_offset;
cols[1].rc_devidx = devidx;
cols[1].rc_offset = o;
}
/*
* Iterate over the columns in reverse order so that we hit
* the parity last -- any errors along the way will force us
* to read the parity data.
*/
missingdata = 0;
missingparity = 0;
for (c = acols - 1; c >= 0; c--) {
rc = &cols[c];
devidx = rc->rc_devidx;
STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
if (kid->v_id == devidx)
break;
if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
if (c >= nparity)
missingdata++;
else
missingparity++;
rc->rc_error = ENXIO;
rc->rc_tried = 1; /* don't even try */
rc->rc_skipped = 1;
continue;
}
#if 0
/*
* Too hard for the bootcode
*/
if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
if (c >= nparity)
rm->rm_missingdata++;
else
rm->rm_missingparity++;
rc->rc_error = ESTALE;
rc->rc_skipped = 1;
continue;
}
#endif
if (c >= nparity || missingdata > 0) {
if (rc->rc_data)
rc->rc_error = kid->v_read(kid, NULL,
rc->rc_data, rc->rc_offset, rc->rc_size);
else
rc->rc_error = ENXIO;
rc->rc_tried = 1;
rc->rc_skipped = 0;
}
}
reconstruct:
parity_errors = 0;
data_errors = 0;
unexpected_errors = 0;
total_errors = 0;
parity_untried = 0;
for (c = 0; c < acols; c++) {
rc = &cols[c];
if (rc->rc_error) {
if (c < nparity)
parity_errors++;
else
data_errors++;
if (!rc->rc_skipped)
unexpected_errors++;
total_errors++;
} else if (c < nparity && !rc->rc_tried) {
parity_untried++;
}
}
/*
* There are three potential phases for a read:
* 1. produce valid data from the columns read
* 2. read all disks and try again
* 3. perform combinatorial reconstruction
*
* Each phase is progressively both more expensive and less
* likely to occur. If we encounter more errors than we can
* repair or all phases fail, we have no choice but to return
* an error.
*/
/*
* If the number of errors we saw was correctable -- less than
* or equal to the number of parity disks read -- attempt to
* produce data that has a valid checksum. Naturally, this
* case applies in the absence of any errors.
*/
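/*
* For example (illustrative): on raidz2 with both parity columns read
* (parity_untried == 0), up to two failed columns can be repaired in
* this phase; if one parity column was never tried, only a single
* error is correctable here.
*/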
if (total_errors <= nparity - parity_untried) {
switch (data_errors) {
case 0:
if (zio_checksum_error(bp, buf) == 0)
return (0);
break;
case 1:
/*
* We either attempt to read all the parity columns or
* none of them. If we didn't try to read parity, we
* wouldn't be here in the correctable case. There must
* also have been fewer parity errors than parity
* columns or, again, we wouldn't be in this code path.
*/
//ASSERT(parity_untried == 0);
//ASSERT(parity_errors < nparity);
/*
* Find the column that reported the error.
*/
for (c = nparity; c < acols; c++) {
rc = &cols[c];
if (rc->rc_error != 0)
break;
}
//ASSERT(c != acols);
//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);
if (cols[VDEV_RAIDZ_P].rc_error == 0) {
vdev_raidz_reconstruct_p(cols, nparity,
acols, c);
} else {
//ASSERT(nparity > 1);
vdev_raidz_reconstruct_q(cols, nparity,
acols, c);
}
if (zio_checksum_error(bp, buf) == 0)
return (0);
break;
case 2:
/*
* Two data column errors require double parity.
*/
//ASSERT(nparity == 2);
/*
* Find the two columns that reported errors.
*/
for (c = nparity; c < acols; c++) {
rc = &cols[c];
if (rc->rc_error != 0)
break;
}
//ASSERT(c != acols);
//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);
for (c1 = c++; c < acols; c++) {
rc = &cols[c];
if (rc->rc_error != 0)
break;
}
//ASSERT(c != acols);
//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);
vdev_raidz_reconstruct_pq(cols, nparity, acols,
c1, c);
if (zio_checksum_error(bp, buf) == 0)
return (0);
break;
default:
//ASSERT(nparity <= 2);
//ASSERT(0);
break;
}
}
/*
* This isn't a typical situation -- either we got a read
* error or a child silently returned bad data. Read every
* block so we can try again with as much data and parity as
* we can track down. If we've already been through once
* before, all children will be marked as tried so we'll
* proceed to combinatorial reconstruction.
*/
n = 0;
for (c = 0; c < acols; c++) {
rc = &cols[c];
if (rc->rc_tried)
continue;
devidx = rc->rc_devidx;
STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
if (kid->v_id == devidx)
break;
if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
rc->rc_error = ENXIO;
rc->rc_tried = 1; /* don't even try */
rc->rc_skipped = 1;
continue;
}
if (rc->rc_data)
rc->rc_error = kid->v_read(kid, NULL,
rc->rc_data, rc->rc_offset, rc->rc_size);
else
rc->rc_error = ENXIO;
if (rc->rc_error == 0)
n++;
rc->rc_tried = 1;
rc->rc_skipped = 0;
}
/*
* If we managed to read anything more, retry the
* reconstruction.
*/
if (n)
goto reconstruct;
/*
* At this point we've attempted to reconstruct the data given the
* errors we detected, and we've attempted to read all columns. There
* must, therefore, be one or more additional problems -- silent errors
* resulting in invalid data rather than explicit I/O errors resulting
* in absent data. Before we attempt combinatorial reconstruction make
* sure we have a chance of coming up with the right answer.
*/
if (total_errors >= nparity) {
return (EIO);
}
asize = 0;
for (c = 0; c < acols; c++) {
rc = &cols[c];
if (rc->rc_size > asize)
asize = rc->rc_size;
}
if (cols[VDEV_RAIDZ_P].rc_error == 0) {
/*
* Attempt to reconstruct the data from parity P.
*/
void *orig;
orig = zfs_alloc_temp(asize);
for (c = nparity; c < acols; c++) {
rc = &cols[c];
memcpy(orig, rc->rc_data, rc->rc_size);
vdev_raidz_reconstruct_p(cols, nparity, acols, c);
if (zio_checksum_error(bp, buf) == 0)
return (0);
memcpy(rc->rc_data, orig, rc->rc_size);
}
}
if (nparity > 1 && cols[VDEV_RAIDZ_Q].rc_error == 0) {
/*
* Attempt to reconstruct the data from parity Q.
*/
void *orig;
orig = zfs_alloc_temp(asize);
for (c = nparity; c < acols; c++) {
rc = &cols[c];
memcpy(orig, rc->rc_data, rc->rc_size);
vdev_raidz_reconstruct_q(cols, nparity, acols, c);
if (zio_checksum_error(bp, buf) == 0)
return (0);
memcpy(rc->rc_data, orig, rc->rc_size);
}
}
if (nparity > 1 &&
cols[VDEV_RAIDZ_P].rc_error == 0 &&
cols[VDEV_RAIDZ_Q].rc_error == 0) {
/*
* Attempt to reconstruct the data from both P and Q.
*/
void *orig, *orig1;
orig = zfs_alloc_temp(asize);
orig1 = zfs_alloc_temp(asize);
for (c = nparity; c < acols - 1; c++) {
rc = &cols[c];
memcpy(orig, rc->rc_data, rc->rc_size);
for (c1 = c + 1; c1 < acols; c1++) {
rc1 = &cols[c1];
memcpy(orig1, rc1->rc_data, rc1->rc_size);
vdev_raidz_reconstruct_pq(cols, nparity,
acols, c, c1);
if (zio_checksum_error(bp, buf) == 0)
return (0);
memcpy(rc1->rc_data, orig1, rc1->rc_size);
}
memcpy(rc->rc_data, orig, rc->rc_size);
}
}
return (EIO);
}