9485 Optimize possible split block search space
illumos/illumos-gate@a21fe34979a21fe34979
https://www.illumos.org/issues/9485 Port this commit from ZoL:4589f3ae4c
Author: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
parent
0a48ec6f62
commit
66c1f9ba9c
@ -196,6 +196,7 @@ extern boolean_t zfs_compressed_arc_enabled;
|
|||||||
extern boolean_t zfs_abd_scatter_enabled;
|
extern boolean_t zfs_abd_scatter_enabled;
|
||||||
extern int dmu_object_alloc_chunk_shift;
|
extern int dmu_object_alloc_chunk_shift;
|
||||||
extern boolean_t zfs_force_some_double_word_sm_entries;
|
extern boolean_t zfs_force_some_double_word_sm_entries;
|
||||||
|
extern unsigned long zfs_reconstruct_indirect_damage_fraction;
|
||||||
|
|
||||||
static ztest_shared_opts_t *ztest_shared_opts;
|
static ztest_shared_opts_t *ztest_shared_opts;
|
||||||
static ztest_shared_opts_t ztest_opts;
|
static ztest_shared_opts_t ztest_opts;
|
||||||
@ -5697,7 +5698,8 @@ ztest_run_zdb(char *pool)
|
|||||||
isa = strdup(isa);
|
isa = strdup(isa);
|
||||||
/* LINTED */
|
/* LINTED */
|
||||||
(void) sprintf(bin,
|
(void) sprintf(bin,
|
||||||
"/usr/sbin%.*s/zdb -bcc%s%s -G -d -U %s %s",
|
"/usr/sbin%.*s/zdb -bcc%s%s -G -d -U %s "
|
||||||
|
"-o zfs_reconstruct_indirect_combinations_max=65536 %s",
|
||||||
isalen,
|
isalen,
|
||||||
isa,
|
isa,
|
||||||
ztest_opts.zo_verbose >= 3 ? "s" : "",
|
ztest_opts.zo_verbose >= 3 ? "s" : "",
|
||||||
@ -6648,6 +6650,13 @@ main(int argc, char **argv)
|
|||||||
*/
|
*/
|
||||||
zfs_force_some_double_word_sm_entries = B_TRUE;
|
zfs_force_some_double_word_sm_entries = B_TRUE;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Verify that even extensively damaged split blocks with many
|
||||||
|
* segments can be reconstructed in a reasonable amount of time
|
||||||
|
* when reconstruction is known to be possible.
|
||||||
|
*/
|
||||||
|
zfs_reconstruct_indirect_damage_fraction = 4;
|
||||||
|
|
||||||
ztest_fd_rand = open("/dev/urandom", O_RDONLY);
|
ztest_fd_rand = open("/dev/urandom", O_RDONLY);
|
||||||
ASSERT3S(ztest_fd_rand, >=, 0);
|
ASSERT3S(ztest_fd_rand, >=, 0);
|
||||||
|
|
||||||
|
@ -206,17 +206,21 @@ uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;
|
|||||||
int zfs_condense_indirect_commit_entry_delay_ticks = 0;
|
int zfs_condense_indirect_commit_entry_delay_ticks = 0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If a split block contains more than this many segments, consider it too
|
* If an indirect split block contains more than this many possible unique
|
||||||
* computationally expensive to check all (2^num_segments) possible
|
* combinations when being reconstructed, consider it too computationally
|
||||||
* combinations. Instead, try at most 2^_segments_max randomly-selected
|
* expensive to check them all. Instead, try at most 100 randomly-selected
|
||||||
* combinations.
|
* combinations each time the block is accessed. This allows all segment
|
||||||
*
|
* copies to participate fairly in the reconstruction when all combinations
|
||||||
* This is reasonable if only a few segment copies are damaged and the
|
* cannot be checked and prevents repeated use of one bad copy.
|
||||||
* majority of segment copies are good. This allows all the segment copies to
|
|
||||||
* participate fairly in the reconstruction and prevents the repeated use of
|
|
||||||
* one bad copy.
|
|
||||||
*/
|
*/
|
||||||
int zfs_reconstruct_indirect_segments_max = 10;
|
int zfs_reconstruct_indirect_combinations_max = 256;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Enable to simulate damaged segments and validate reconstruction.
|
||||||
|
* Used by ztest
|
||||||
|
*/
|
||||||
|
unsigned long zfs_reconstruct_indirect_damage_fraction = 0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The indirect_child_t represents the vdev that we will read from, when we
|
* The indirect_child_t represents the vdev that we will read from, when we
|
||||||
@ -228,6 +232,13 @@ int zfs_reconstruct_indirect_segments_max = 10;
|
|||||||
typedef struct indirect_child {
|
typedef struct indirect_child {
|
||||||
abd_t *ic_data;
|
abd_t *ic_data;
|
||||||
vdev_t *ic_vdev;
|
vdev_t *ic_vdev;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ic_duplicate is NULL when the ic_data contents are unique, when it
|
||||||
|
* is determined to be a duplicate it references the primary child.
|
||||||
|
*/
|
||||||
|
struct indirect_child *ic_duplicate;
|
||||||
|
list_node_t ic_node; /* node on is_unique_child */
|
||||||
} indirect_child_t;
|
} indirect_child_t;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -249,12 +260,14 @@ typedef struct indirect_split {
|
|||||||
uint64_t is_target_offset; /* offset on is_vdev */
|
uint64_t is_target_offset; /* offset on is_vdev */
|
||||||
uint64_t is_size;
|
uint64_t is_size;
|
||||||
int is_children; /* number of entries in is_child[] */
|
int is_children; /* number of entries in is_child[] */
|
||||||
|
int is_unique_children; /* number of entries in is_unique_child */
|
||||||
|
list_t is_unique_child;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* is_good_child is the child that we are currently using to
|
* is_good_child is the child that we are currently using to
|
||||||
* attempt reconstruction.
|
* attempt reconstruction.
|
||||||
*/
|
*/
|
||||||
int is_good_child;
|
indirect_child_t *is_good_child;
|
||||||
|
|
||||||
indirect_child_t is_child[1]; /* variable-length */
|
indirect_child_t is_child[1]; /* variable-length */
|
||||||
} indirect_split_t;
|
} indirect_split_t;
|
||||||
@ -266,6 +279,9 @@ typedef struct indirect_split {
|
|||||||
typedef struct indirect_vsd {
|
typedef struct indirect_vsd {
|
||||||
boolean_t iv_split_block;
|
boolean_t iv_split_block;
|
||||||
boolean_t iv_reconstruct;
|
boolean_t iv_reconstruct;
|
||||||
|
uint64_t iv_unique_combinations;
|
||||||
|
uint64_t iv_attempts;
|
||||||
|
uint64_t iv_attempts_max;
|
||||||
|
|
||||||
list_t iv_splits; /* list of indirect_split_t's */
|
list_t iv_splits; /* list of indirect_split_t's */
|
||||||
} indirect_vsd_t;
|
} indirect_vsd_t;
|
||||||
@ -283,6 +299,13 @@ vdev_indirect_map_free(zio_t *zio)
|
|||||||
abd_free(ic->ic_data);
|
abd_free(ic->ic_data);
|
||||||
}
|
}
|
||||||
list_remove(&iv->iv_splits, is);
|
list_remove(&iv->iv_splits, is);
|
||||||
|
|
||||||
|
indirect_child_t *ic;
|
||||||
|
while ((ic = list_head(&is->is_unique_child)) != NULL)
|
||||||
|
list_remove(&is->is_unique_child, ic);
|
||||||
|
|
||||||
|
list_destroy(&is->is_unique_child);
|
||||||
|
|
||||||
kmem_free(is,
|
kmem_free(is,
|
||||||
offsetof(indirect_split_t, is_child[is->is_children]));
|
offsetof(indirect_split_t, is_child[is->is_children]));
|
||||||
}
|
}
|
||||||
@ -1177,6 +1200,8 @@ vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
|
|||||||
is->is_split_offset = split_offset;
|
is->is_split_offset = split_offset;
|
||||||
is->is_target_offset = offset;
|
is->is_target_offset = offset;
|
||||||
is->is_vdev = vd;
|
is->is_vdev = vd;
|
||||||
|
list_create(&is->is_unique_child, sizeof (indirect_child_t),
|
||||||
|
offsetof(indirect_child_t, ic_node));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Note that we only consider multiple copies of the data for
|
* Note that we only consider multiple copies of the data for
|
||||||
@ -1187,6 +1212,7 @@ vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
|
|||||||
if (vd->vdev_ops == &vdev_mirror_ops) {
|
if (vd->vdev_ops == &vdev_mirror_ops) {
|
||||||
for (int i = 0; i < n; i++) {
|
for (int i = 0; i < n; i++) {
|
||||||
is->is_child[i].ic_vdev = vd->vdev_child[i];
|
is->is_child[i].ic_vdev = vd->vdev_child[i];
|
||||||
|
list_link_init(&is->is_child[i].ic_node);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
is->is_child[0].ic_vdev = vd;
|
is->is_child[0].ic_vdev = vd;
|
||||||
@ -1239,6 +1265,7 @@ vdev_indirect_read_all(zio_t *zio)
|
|||||||
|
|
||||||
ic->ic_data = abd_alloc_sametype(zio->io_abd,
|
ic->ic_data = abd_alloc_sametype(zio->io_abd,
|
||||||
is->is_size);
|
is->is_size);
|
||||||
|
ic->ic_duplicate = NULL;
|
||||||
|
|
||||||
zio_nowait(zio_vdev_child_io(zio, NULL,
|
zio_nowait(zio_vdev_child_io(zio, NULL,
|
||||||
ic->ic_vdev, is->is_target_offset, ic->ic_data,
|
ic->ic_vdev, is->is_target_offset, ic->ic_data,
|
||||||
@ -1350,7 +1377,7 @@ vdev_indirect_checksum_error(zio_t *zio,
|
|||||||
|
|
||||||
zio_bad_cksum_t zbc = { 0 };
|
zio_bad_cksum_t zbc = { 0 };
|
||||||
void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size);
|
void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size);
|
||||||
abd_t *good_abd = is->is_child[is->is_good_child].ic_data;
|
abd_t *good_abd = is->is_good_child->ic_data;
|
||||||
void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size);
|
void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size);
|
||||||
zfs_ereport_post_checksum(zio->io_spa, vd, zio,
|
zfs_ereport_post_checksum(zio->io_spa, vd, zio,
|
||||||
is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc);
|
is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc);
|
||||||
@ -1383,21 +1410,18 @@ vdev_indirect_repair(zio_t *zio)
|
|||||||
|
|
||||||
for (indirect_split_t *is = list_head(&iv->iv_splits);
|
for (indirect_split_t *is = list_head(&iv->iv_splits);
|
||||||
is != NULL; is = list_next(&iv->iv_splits, is)) {
|
is != NULL; is = list_next(&iv->iv_splits, is)) {
|
||||||
indirect_child_t *good_child = &is->is_child[is->is_good_child];
|
|
||||||
|
|
||||||
for (int c = 0; c < is->is_children; c++) {
|
for (int c = 0; c < is->is_children; c++) {
|
||||||
indirect_child_t *ic = &is->is_child[c];
|
indirect_child_t *ic = &is->is_child[c];
|
||||||
if (ic == good_child)
|
if (ic == is->is_good_child)
|
||||||
continue;
|
continue;
|
||||||
if (ic->ic_data == NULL)
|
if (ic->ic_data == NULL)
|
||||||
continue;
|
continue;
|
||||||
if (abd_cmp(good_child->ic_data, ic->ic_data,
|
if (ic->ic_duplicate == is->is_good_child)
|
||||||
is->is_size) == 0)
|
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
zio_nowait(zio_vdev_child_io(zio, NULL,
|
zio_nowait(zio_vdev_child_io(zio, NULL,
|
||||||
ic->ic_vdev, is->is_target_offset,
|
ic->ic_vdev, is->is_target_offset,
|
||||||
good_child->ic_data, is->is_size,
|
is->is_good_child->ic_data, is->is_size,
|
||||||
ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
|
ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
|
||||||
ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
|
ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
|
||||||
NULL, NULL));
|
NULL, NULL));
|
||||||
@ -1439,21 +1463,194 @@ vdev_indirect_all_checksum_errors(zio_t *zio)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copy data from all the splits to a main zio then validate the checksum.
|
||||||
|
* If then checksum is successfully validated return success.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio)
|
||||||
|
{
|
||||||
|
zio_bad_cksum_t zbc;
|
||||||
|
|
||||||
|
for (indirect_split_t *is = list_head(&iv->iv_splits);
|
||||||
|
is != NULL; is = list_next(&iv->iv_splits, is)) {
|
||||||
|
|
||||||
|
ASSERT3P(is->is_good_child->ic_data, !=, NULL);
|
||||||
|
ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL);
|
||||||
|
|
||||||
|
abd_copy_off(zio->io_abd, is->is_good_child->ic_data,
|
||||||
|
is->is_split_offset, 0, is->is_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
return (zio_checksum_error(zio, &zbc));
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* There are relatively few possible combinations making it feasible to
|
||||||
|
* deterministically check them all. We do this by setting the good_child
|
||||||
|
* to the next unique split version. If we reach the end of the list then
|
||||||
|
* "carry over" to the next unique split version (like counting in base
|
||||||
|
* is_unique_children, but each digit can have a different base).
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio)
|
||||||
|
{
|
||||||
|
boolean_t more = B_TRUE;
|
||||||
|
|
||||||
|
iv->iv_attempts = 0;
|
||||||
|
|
||||||
|
for (indirect_split_t *is = list_head(&iv->iv_splits);
|
||||||
|
is != NULL; is = list_next(&iv->iv_splits, is))
|
||||||
|
is->is_good_child = list_head(&is->is_unique_child);
|
||||||
|
|
||||||
|
while (more == B_TRUE) {
|
||||||
|
iv->iv_attempts++;
|
||||||
|
more = B_FALSE;
|
||||||
|
|
||||||
|
if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
|
for (indirect_split_t *is = list_head(&iv->iv_splits);
|
||||||
|
is != NULL; is = list_next(&iv->iv_splits, is)) {
|
||||||
|
is->is_good_child = list_next(&is->is_unique_child,
|
||||||
|
is->is_good_child);
|
||||||
|
if (is->is_good_child != NULL) {
|
||||||
|
more = B_TRUE;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
is->is_good_child = list_head(&is->is_unique_child);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations);
|
||||||
|
|
||||||
|
return (SET_ERROR(ECKSUM));
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* There are too many combinations to try all of them in a reasonable amount
|
||||||
|
* of time. So try a fixed number of random combinations from the unique
|
||||||
|
* split versions, after which we'll consider the block unrecoverable.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio)
|
||||||
|
{
|
||||||
|
iv->iv_attempts = 0;
|
||||||
|
|
||||||
|
while (iv->iv_attempts < iv->iv_attempts_max) {
|
||||||
|
iv->iv_attempts++;
|
||||||
|
|
||||||
|
for (indirect_split_t *is = list_head(&iv->iv_splits);
|
||||||
|
is != NULL; is = list_next(&iv->iv_splits, is)) {
|
||||||
|
indirect_child_t *ic = list_head(&is->is_unique_child);
|
||||||
|
int children = is->is_unique_children;
|
||||||
|
|
||||||
|
for (int i = spa_get_random(children); i > 0; i--)
|
||||||
|
ic = list_next(&is->is_unique_child, ic);
|
||||||
|
|
||||||
|
ASSERT3P(ic, !=, NULL);
|
||||||
|
is->is_good_child = ic;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
return (SET_ERROR(ECKSUM));
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This is a validation function for reconstruction. It randomly selects
|
||||||
|
* a good combination, if one can be found, and then it intentionally
|
||||||
|
* damages all other segment copes by zeroing them. This forces the
|
||||||
|
* reconstruction algorithm to locate the one remaining known good copy.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio)
|
||||||
|
{
|
||||||
|
/* Presume all the copies are unique for initial selection. */
|
||||||
|
for (indirect_split_t *is = list_head(&iv->iv_splits);
|
||||||
|
is != NULL; is = list_next(&iv->iv_splits, is)) {
|
||||||
|
is->is_unique_children = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < is->is_children; i++) {
|
||||||
|
indirect_child_t *ic = &is->is_child[i];
|
||||||
|
if (ic->ic_data != NULL) {
|
||||||
|
is->is_unique_children++;
|
||||||
|
list_insert_tail(&is->is_unique_child, ic);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Set each is_good_child to a randomly-selected child which
|
||||||
|
* is known to contain validated data.
|
||||||
|
*/
|
||||||
|
int error = vdev_indirect_splits_enumerate_randomly(iv, zio);
|
||||||
|
if (error)
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Damage all but the known good copy by zeroing it. This will
|
||||||
|
* result in two or less unique copies per indirect_child_t.
|
||||||
|
* Both may need to be checked in order to reconstruct the block.
|
||||||
|
* Set iv->iv_attempts_max such that all unique combinations will
|
||||||
|
* enumerated, but limit the damage to at most 16 indirect splits.
|
||||||
|
*/
|
||||||
|
iv->iv_attempts_max = 1;
|
||||||
|
|
||||||
|
for (indirect_split_t *is = list_head(&iv->iv_splits);
|
||||||
|
is != NULL; is = list_next(&iv->iv_splits, is)) {
|
||||||
|
for (int c = 0; c < is->is_children; c++) {
|
||||||
|
indirect_child_t *ic = &is->is_child[c];
|
||||||
|
|
||||||
|
if (ic == is->is_good_child)
|
||||||
|
continue;
|
||||||
|
if (ic->ic_data == NULL)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
abd_zero(ic->ic_data, ic->ic_data->abd_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
iv->iv_attempts_max *= 2;
|
||||||
|
if (iv->iv_attempts_max > (1ULL << 16)) {
|
||||||
|
iv->iv_attempts_max = UINT64_MAX;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out:
|
||||||
|
/* Empty the unique children lists so they can be reconstructed. */
|
||||||
|
for (indirect_split_t *is = list_head(&iv->iv_splits);
|
||||||
|
is != NULL; is = list_next(&iv->iv_splits, is)) {
|
||||||
|
indirect_child_t *ic;
|
||||||
|
while ((ic = list_head(&is->is_unique_child)) != NULL)
|
||||||
|
list_remove(&is->is_unique_child, ic);
|
||||||
|
|
||||||
|
is->is_unique_children = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (error);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This function is called when we have read all copies of the data and need
|
* This function is called when we have read all copies of the data and need
|
||||||
* to try to find a combination of copies that gives us the right checksum.
|
* to try to find a combination of copies that gives us the right checksum.
|
||||||
*
|
*
|
||||||
* If we pointed to any mirror vdevs, this effectively does the job of the
|
* If we pointed to any mirror vdevs, this effectively does the job of the
|
||||||
* mirror. The mirror vdev code can't do its own job because we don't know
|
* mirror. The mirror vdev code can't do its own job because we don't know
|
||||||
* the checksum of each split segment individually. We have to try every
|
* the checksum of each split segment individually.
|
||||||
* combination of copies of split segments, until we find one that checksums
|
|
||||||
* correctly. (Or until we have tried all combinations, or have tried
|
|
||||||
* 2^zfs_reconstruct_indirect_segments_max combinations. In these cases we
|
|
||||||
* set io_error to ECKSUM to propagate the error up to the user.)
|
|
||||||
*
|
*
|
||||||
* For example, if we have 3 segments in the split,
|
* We have to try every unique combination of copies of split segments, until
|
||||||
* and each points to a 2-way mirror, we will have the following pieces of
|
* we find one that checksums correctly. Duplicate segment copies are first
|
||||||
* data:
|
* identified and latter skipped during reconstruction. This optimization
|
||||||
|
* reduces the search space and ensures that of the remaining combinations
|
||||||
|
* at most one is correct.
|
||||||
|
*
|
||||||
|
* When the total number of combinations is small they can all be checked.
|
||||||
|
* For example, if we have 3 segments in the split, and each points to a
|
||||||
|
* 2-way mirror with unique copies, we will have the following pieces of data:
|
||||||
*
|
*
|
||||||
* | mirror child
|
* | mirror child
|
||||||
* split | [0] [1]
|
* split | [0] [1]
|
||||||
@ -1480,10 +1677,10 @@ vdev_indirect_all_checksum_errors(zio_t *zio)
|
|||||||
* data_A_1 data_B_1 data_C_1
|
* data_A_1 data_B_1 data_C_1
|
||||||
*
|
*
|
||||||
* Note that the split segments may be on the same or different top-level
|
* Note that the split segments may be on the same or different top-level
|
||||||
* vdevs. In either case, we try lots of combinations (see
|
* vdevs. In either case, we may need to try lots of combinations (see
|
||||||
* zfs_reconstruct_indirect_segments_max). This ensures that if a mirror has
|
* zfs_reconstruct_indirect_combinations_max). This ensures that if a mirror
|
||||||
* small silent errors on all of its children, we can still reconstruct the
|
* has small silent errors on all of its children, we can still reconstruct
|
||||||
* correct data, as long as those errors are at sufficiently-separated
|
* the correct data, as long as those errors are at sufficiently-separated
|
||||||
* offsets (specifically, separated by the largest block size - default of
|
* offsets (specifically, separated by the largest block size - default of
|
||||||
* 128KB, but up to 16MB).
|
* 128KB, but up to 16MB).
|
||||||
*/
|
*/
|
||||||
@ -1491,90 +1688,91 @@ static void
|
|||||||
vdev_indirect_reconstruct_io_done(zio_t *zio)
|
vdev_indirect_reconstruct_io_done(zio_t *zio)
|
||||||
{
|
{
|
||||||
indirect_vsd_t *iv = zio->io_vsd;
|
indirect_vsd_t *iv = zio->io_vsd;
|
||||||
uint64_t attempts = 0;
|
boolean_t known_good = B_FALSE;
|
||||||
uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max;
|
int error;
|
||||||
int segments = 0;
|
|
||||||
|
|
||||||
for (indirect_split_t *is = list_head(&iv->iv_splits);
|
iv->iv_unique_combinations = 1;
|
||||||
is != NULL; is = list_next(&iv->iv_splits, is))
|
iv->iv_attempts_max = UINT64_MAX;
|
||||||
segments++;
|
|
||||||
|
|
||||||
for (;;) {
|
if (zfs_reconstruct_indirect_combinations_max > 0)
|
||||||
/* copy data from splits to main zio */
|
iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max;
|
||||||
int ret;
|
|
||||||
for (indirect_split_t *is = list_head(&iv->iv_splits);
|
|
||||||
is != NULL; is = list_next(&iv->iv_splits, is)) {
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If this child failed, its ic_data will be NULL.
|
* If nonzero, every 1/x blocks will be damaged, in order to validate
|
||||||
* Skip this combination.
|
* reconstruction when there are split segments with damaged copies.
|
||||||
|
* Known_good will TRUE when reconstruction is known to be possible.
|
||||||
*/
|
*/
|
||||||
if (is->is_child[is->is_good_child].ic_data == NULL) {
|
if (zfs_reconstruct_indirect_damage_fraction != 0 &&
|
||||||
ret = EIO;
|
spa_get_random(zfs_reconstruct_indirect_damage_fraction) == 0)
|
||||||
goto next;
|
known_good = (vdev_indirect_splits_damage(iv, zio) == 0);
|
||||||
}
|
|
||||||
|
|
||||||
abd_copy_off(zio->io_abd,
|
|
||||||
is->is_child[is->is_good_child].ic_data,
|
|
||||||
is->is_split_offset, 0, is->is_size);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* See if this checksum matches. */
|
|
||||||
zio_bad_cksum_t zbc;
|
|
||||||
ret = zio_checksum_error(zio, &zbc);
|
|
||||||
if (ret == 0) {
|
|
||||||
/* Found a matching checksum. Issue repair i/os. */
|
|
||||||
vdev_indirect_repair(zio);
|
|
||||||
zio_checksum_verified(zio);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Checksum failed; try a different combination of split
|
* Determine the unique children for a split segment and add them
|
||||||
* children.
|
* to the is_unique_child list. By restricting reconstruction
|
||||||
*/
|
* to these children, only unique combinations will be considered.
|
||||||
boolean_t more;
|
* This can vastly reduce the search space when there are a large
|
||||||
next:
|
* number of indirect splits.
|
||||||
more = B_FALSE;
|
|
||||||
if (segments <= zfs_reconstruct_indirect_segments_max) {
|
|
||||||
/*
|
|
||||||
* There are relatively few segments, so
|
|
||||||
* deterministically check all combinations. We do
|
|
||||||
* this by by adding one to the first split's
|
|
||||||
* good_child. If it overflows, then "carry over" to
|
|
||||||
* the next split (like counting in base is_children,
|
|
||||||
* but each digit can have a different base).
|
|
||||||
*/
|
*/
|
||||||
for (indirect_split_t *is = list_head(&iv->iv_splits);
|
for (indirect_split_t *is = list_head(&iv->iv_splits);
|
||||||
is != NULL; is = list_next(&iv->iv_splits, is)) {
|
is != NULL; is = list_next(&iv->iv_splits, is)) {
|
||||||
is->is_good_child++;
|
is->is_unique_children = 0;
|
||||||
if (is->is_good_child < is->is_children) {
|
|
||||||
more = B_TRUE;
|
for (int i = 0; i < is->is_children; i++) {
|
||||||
break;
|
indirect_child_t *ic_i = &is->is_child[i];
|
||||||
|
|
||||||
|
if (ic_i->ic_data == NULL ||
|
||||||
|
ic_i->ic_duplicate != NULL)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
for (int j = i + 1; j < is->is_children; j++) {
|
||||||
|
indirect_child_t *ic_j = &is->is_child[j];
|
||||||
|
|
||||||
|
if (ic_j->ic_data == NULL ||
|
||||||
|
ic_j->ic_duplicate != NULL)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (abd_cmp(ic_i->ic_data, ic_j->ic_data,
|
||||||
|
is->is_size) == 0) {
|
||||||
|
ic_j->ic_duplicate = ic_i;
|
||||||
}
|
}
|
||||||
is->is_good_child = 0;
|
|
||||||
}
|
}
|
||||||
} else if (++attempts < attempts_max) {
|
|
||||||
/*
|
is->is_unique_children++;
|
||||||
* There are too many combinations to try all of them
|
list_insert_tail(&is->is_unique_child, ic_i);
|
||||||
* in a reasonable amount of time, so try a fixed
|
|
||||||
* number of random combinations, after which we'll
|
|
||||||
* consider the block unrecoverable.
|
|
||||||
*/
|
|
||||||
for (indirect_split_t *is = list_head(&iv->iv_splits);
|
|
||||||
is != NULL; is = list_next(&iv->iv_splits, is)) {
|
|
||||||
is->is_good_child =
|
|
||||||
spa_get_random(is->is_children);
|
|
||||||
}
|
}
|
||||||
more = B_TRUE;
|
|
||||||
}
|
/* Reconstruction is impossible, no valid children */
|
||||||
if (!more) {
|
EQUIV(list_is_empty(&is->is_unique_child),
|
||||||
/* All combinations failed. */
|
is->is_unique_children == 0);
|
||||||
zio->io_error = ret;
|
if (list_is_empty(&is->is_unique_child)) {
|
||||||
|
zio->io_error = EIO;
|
||||||
vdev_indirect_all_checksum_errors(zio);
|
vdev_indirect_all_checksum_errors(zio);
|
||||||
zio_checksum_verified(zio);
|
zio_checksum_verified(zio);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
iv->iv_unique_combinations *= is->is_unique_children;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (iv->iv_unique_combinations <= iv->iv_attempts_max)
|
||||||
|
error = vdev_indirect_splits_enumerate_all(iv, zio);
|
||||||
|
else
|
||||||
|
error = vdev_indirect_splits_enumerate_randomly(iv, zio);
|
||||||
|
|
||||||
|
if (error != 0) {
|
||||||
|
/* All attempted combinations failed. */
|
||||||
|
ASSERT3B(known_good, ==, B_FALSE);
|
||||||
|
zio->io_error = error;
|
||||||
|
vdev_indirect_all_checksum_errors(zio);
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* The checksum has been successfully validated. Issue
|
||||||
|
* repair I/Os to any copies of splits which don't match
|
||||||
|
* the validated version.
|
||||||
|
*/
|
||||||
|
ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio));
|
||||||
|
vdev_indirect_repair(zio);
|
||||||
|
zio_checksum_verified(zio);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user