diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c index 82d0a32d6baa..41afcd0c90bf 100644 --- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c +++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. */ @@ -77,8 +77,10 @@ dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ? \ dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN") #define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \ - (((idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA) ? \ - DMU_OT_ZAP_OTHER : DMU_OT_NUMTYPES)) + (idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ? \ + DMU_OT_ZAP_OTHER : \ + (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \ + DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES) #ifndef lint extern int reference_tracking_enable; @@ -667,8 +669,8 @@ get_metaslab_refcount(vdev_t *vd) { int refcount = 0; - if (vd->vdev_top == vd && !vd->vdev_removing) { - for (unsigned m = 0; m < vd->vdev_ms_count; m++) { + if (vd->vdev_top == vd) { + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { space_map_t *sm = vd->vdev_ms[m]->ms_sm; if (sm != NULL && @@ -682,6 +684,45 @@ get_metaslab_refcount(vdev_t *vd) return (refcount); } +static int +get_obsolete_refcount(vdev_t *vd) +{ + int refcount = 0; + + uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd); + if (vd->vdev_top == vd && obsolete_sm_obj != 0) { + dmu_object_info_t doi; + VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset, + obsolete_sm_obj, &doi)); + if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { + refcount++; + } + } else { + ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); + ASSERT3U(obsolete_sm_obj, ==, 0); + } + for (unsigned c = 0; c < vd->vdev_children; c++) { + refcount += get_obsolete_refcount(vd->vdev_child[c]); + } + + return (refcount); +} + +static int +get_prev_obsolete_spacemap_refcount(spa_t *spa) +{ + uint64_t prev_obj = + spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object; + if (prev_obj != 0) { + dmu_object_info_t doi; + VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi)); + if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { + return (1); + } + } + return (0); +} + static int verify_spacemap_refcounts(spa_t *spa) { @@ -693,6 +734,8 @@ verify_spacemap_refcounts(spa_t *spa) &expected_refcount); actual_refcount = get_dtl_refcount(spa->spa_root_vdev); actual_refcount += get_metaslab_refcount(spa->spa_root_vdev); + actual_refcount += get_obsolete_refcount(spa->spa_root_vdev); + actual_refcount += get_prev_obsolete_spacemap_refcount(spa); if (expected_refcount != actual_refcount) { (void) printf("space map refcount mismatch: expected %lld != " @@ -708,12 +751,19 @@ static void dump_spacemap(objset_t *os, space_map_t *sm) { uint64_t alloc, offset, entry; - const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID", - "INVALID", "INVALID", "INVALID", "INVALID" }; + char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID", + "INVALID", "INVALID", "INVALID", "INVALID" }; if (sm == NULL) return; + (void) printf("space map object %llu:\n", + (longlong_t)sm->sm_phys->smp_object); + (void) printf(" smp_objsize = 0x%llx\n", + (longlong_t)sm->sm_phys->smp_objsize); + (void) printf(" smp_alloc = 0x%llx\n", + (longlong_t)sm->sm_phys->smp_alloc); + /* * Print out the 
freelist entries in both encoded and decoded form. */ @@ -818,9 +868,7 @@ dump_metaslab(metaslab_t *msp) if (dump_opt['d'] > 5 || dump_opt['m'] > 3) { ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); - mutex_enter(&msp->ms_lock); dump_spacemap(spa->spa_meta_objset, msp->ms_sm); - mutex_exit(&msp->ms_lock); } } @@ -877,6 +925,78 @@ dump_metaslab_groups(spa_t *spa) dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } +static void +print_vdev_indirect(vdev_t *vd) +{ + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + vdev_indirect_births_t *vib = vd->vdev_indirect_births; + + if (vim == NULL) { + ASSERT3P(vib, ==, NULL); + return; + } + + ASSERT3U(vdev_indirect_mapping_object(vim), ==, + vic->vic_mapping_object); + ASSERT3U(vdev_indirect_births_object(vib), ==, + vic->vic_births_object); + + (void) printf("indirect births obj %llu:\n", + (longlong_t)vic->vic_births_object); + (void) printf(" vib_count = %llu\n", + (longlong_t)vdev_indirect_births_count(vib)); + for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) { + vdev_indirect_birth_entry_phys_t *cur_vibe = + &vib->vib_entries[i]; + (void) printf("\toffset %llx -> txg %llu\n", + (longlong_t)cur_vibe->vibe_offset, + (longlong_t)cur_vibe->vibe_phys_birth_txg); + } + (void) printf("\n"); + + (void) printf("indirect mapping obj %llu:\n", + (longlong_t)vic->vic_mapping_object); + (void) printf(" vim_max_offset = 0x%llx\n", + (longlong_t)vdev_indirect_mapping_max_offset(vim)); + (void) printf(" vim_bytes_mapped = 0x%llx\n", + (longlong_t)vdev_indirect_mapping_bytes_mapped(vim)); + (void) printf(" vim_count = %llu\n", + (longlong_t)vdev_indirect_mapping_num_entries(vim)); + + if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3) + return; + + uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim); + + for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { + vdev_indirect_mapping_entry_phys_t *vimep = + &vim->vim_entries[i]; + (void) printf("\t<%llx:%llx:%llx> -> " + "<%llx:%llx:%llx> (%x obsolete)\n", + (longlong_t)vd->vdev_id, + (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), + (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), + (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst), + (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst), + (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), + counts[i]); + } + (void) printf("\n"); + + uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd); + if (obsolete_sm_object != 0) { + objset_t *mos = vd->vdev_spa->spa_meta_objset; + (void) printf("obsolete space map object %llu:\n", + (u_longlong_t)obsolete_sm_object); + ASSERT(vd->vdev_obsolete_sm != NULL); + ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==, + obsolete_sm_object); + dump_spacemap(mos, vd->vdev_obsolete_sm); + (void) printf("\n"); + } +} + static void dump_metaslabs(spa_t *spa) { @@ -913,6 +1033,8 @@ dump_metaslabs(spa_t *spa) vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); + print_vdev_indirect(vd); + for (m = 0; m < vd->vdev_ms_count; m++) dump_metaslab(vd->vdev_ms[m]); (void) printf("\n"); @@ -1090,9 +1212,7 @@ dump_dtl(vdev_t *vd, int indent) continue; (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", indent + 2, "", name[t]); - mutex_enter(rt->rt_lock); range_tree_walk(rt, dump_dtl_seg, prefix); - mutex_exit(rt->rt_lock); if (dump_opt['d'] > 5 && vd->vdev_children == 0) dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm); } @@ -2107,8 +2227,15 @@ dump_dir(objset_t *os) if (dump_opt['i'] != 0 || verbosity >= 2) 
dump_intent_log(dmu_objset_zil(os)); - if (dmu_objset_ds(os) != NULL) - dump_deadlist(&dmu_objset_ds(os)->ds_deadlist); + if (dmu_objset_ds(os) != NULL) { + dsl_dataset_t *ds = dmu_objset_ds(os); + dump_deadlist(&ds->ds_deadlist); + + if (dsl_dataset_remap_deadlist_exists(ds)) { + (void) printf("ds_remap_deadlist:\n"); + dump_deadlist(&ds->ds_remap_deadlist); + } + } if (verbosity < 2) return; @@ -2452,6 +2579,7 @@ dump_label(const char *dev) } static uint64_t dataset_feature_count[SPA_FEATURES]; +static uint64_t remap_deadlist_count = 0; /*ARGSUSED*/ static int @@ -2472,6 +2600,10 @@ dump_one_dir(const char *dsname, void *arg) dataset_feature_count[f]++; } + if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) { + remap_deadlist_count++; + } + dump_dir(os); close_objset(os, FTAG); fuid_table_destroy(); @@ -2511,6 +2643,7 @@ static const char *zdb_ot_extname[] = { typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; + uint64_t zcb_removing_size; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; @@ -2523,6 +2656,7 @@ typedef struct zdb_cb { int zcb_readfails; int zcb_haderrors; spa_t *zcb_spa; + uint32_t **zcb_vd_obsolete_counts; } zdb_cb_t; static void @@ -2797,12 +2931,208 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) ASSERT(error == ENOENT); } +/* ARGSUSED */ +static void +claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + /* + * This callback was called through a remap from + * a device being removed. Therefore, the vdev that + * this callback is applied to is a concrete + * vdev. + */ + ASSERT(vdev_is_concrete(vd)); + + VERIFY0(metaslab_claim_impl(vd, offset, size, + spa_first_txg(vd->vdev_spa))); +} + +static void +claim_segment_cb(void *arg, uint64_t offset, uint64_t size) +{ + vdev_t *vd = arg; + + vdev_indirect_ops.vdev_op_remap(vd, offset, size, + claim_segment_impl_cb, NULL); +} + +/* + * After accounting for all allocated blocks that are directly referenced, + * we might have missed a reference to a block from a partially complete + * (and thus unused) indirect mapping object. We perform a secondary pass + * through the metaslabs we have already mapped and claim the destination + * blocks. + */ +static void +zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) +{ + if (spa->spa_vdev_removal == NULL) + return; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + vdev_t *vd = svr->svr_vdev; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { + metaslab_t *msp = vd->vdev_ms[msi]; + + if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) + break; + + ASSERT0(range_tree_space(svr->svr_allocd_segs)); + + if (msp->ms_sm != NULL) { + VERIFY0(space_map_load(msp->ms_sm, + svr->svr_allocd_segs, SM_ALLOC)); + + /* + * Clear everything past what has been synced, + * because we have not allocated mappings for it yet. + */ + range_tree_clear(svr->svr_allocd_segs, + vdev_indirect_mapping_max_offset(vim), + msp->ms_sm->sm_start + msp->ms_sm->sm_size - + vdev_indirect_mapping_max_offset(vim)); + } + + zcb->zcb_removing_size += + range_tree_space(svr->svr_allocd_segs); + range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +/* + * vm_idxp is an in-out parameter which (for indirect vdevs) is the + * index in vim_entries that has the first entry in this metaslab. 
On + * return, it will be set to the first entry after this metaslab. + */ +static void +zdb_leak_init_ms(metaslab_t *msp, uint64_t *vim_idxp) +{ + metaslab_group_t *mg = msp->ms_group; + vdev_t *vd = mg->mg_vd; + vdev_t *rvd = vd->vdev_spa->spa_root_vdev; + + mutex_enter(&msp->ms_lock); + metaslab_unload(msp); + + /* + * We don't want to spend the CPU manipulating the size-ordered + * tree, so clear the range_tree ops. + */ + msp->ms_tree->rt_ops = NULL; + + (void) fprintf(stderr, + "\rloading vdev %llu of %llu, metaslab %llu of %llu ...", + (longlong_t)vd->vdev_id, + (longlong_t)rvd->vdev_children, + (longlong_t)msp->ms_id, + (longlong_t)vd->vdev_ms_count); + + /* + * For leak detection, we overload the metaslab ms_tree to + * contain allocated segments instead of free segments. As a + * result, we can't use the normal metaslab_load/unload + * interfaces. + */ + if (vd->vdev_ops == &vdev_indirect_ops) { + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim); + (*vim_idxp)++) { + vdev_indirect_mapping_entry_phys_t *vimep = + &vim->vim_entries[*vim_idxp]; + uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); + uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst); + ASSERT3U(ent_offset, >=, msp->ms_start); + if (ent_offset >= msp->ms_start + msp->ms_size) + break; + + /* + * Mappings do not cross metaslab boundaries, + * because we create them by walking the metaslabs. + */ + ASSERT3U(ent_offset + ent_len, <=, + msp->ms_start + msp->ms_size); + range_tree_add(msp->ms_tree, ent_offset, ent_len); + } + } else if (msp->ms_sm != NULL) { + VERIFY0(space_map_load(msp->ms_sm, msp->ms_tree, SM_ALLOC)); + } + + if (!msp->ms_loaded) { + msp->ms_loaded = B_TRUE; + } + mutex_exit(&msp->ms_lock); +} + +/* ARGSUSED */ +static int +increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + zdb_cb_t *zcb = arg; + spa_t *spa = zcb->zcb_spa; + vdev_t *vd; + const dva_t *dva = &bp->blk_dva[0]; + + ASSERT(!dump_opt['L']); + ASSERT3U(BP_GET_NDVAS(bp), ==, 1); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva)); + ASSERT3P(vd, !=, NULL); + spa_config_exit(spa, SCL_VDEV, FTAG); + + ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); + ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL); + + vdev_indirect_mapping_increment_obsolete_count( + vd->vdev_indirect_mapping, + DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva), + zcb->zcb_vd_obsolete_counts[vd->vdev_id]); + + return (0); +} + +static uint32_t * +zdb_load_obsolete_counts(vdev_t *vd) +{ + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + spa_t *spa = vd->vdev_spa; + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + uint32_t *counts; + + EQUIV(vdev_obsolete_sm_object(vd) != 0, vd->vdev_obsolete_sm != NULL); + counts = vdev_indirect_mapping_load_obsolete_counts(vim); + if (vd->vdev_obsolete_sm != NULL) { + vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, + vd->vdev_obsolete_sm); + } + if (scip->scip_vdev == vd->vdev_id && + scip->scip_prev_obsolete_sm_object != 0) { + space_map_t *prev_obsolete_sm = NULL; + VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, + scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); + space_map_update(prev_obsolete_sm); + vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, + prev_obsolete_sm); + space_map_close(prev_obsolete_sm); + } + return (counts); +} + static void zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) 
{ zcb->zcb_spa = spa; if (!dump_opt['L']) { + dsl_pool_t *dp = spa->spa_dsl_pool; vdev_t *rvd = spa->spa_root_vdev; /* @@ -2813,50 +3143,51 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; spa->spa_log_class->mc_ops = &zdb_metaslab_ops; + zcb->zcb_vd_obsolete_counts = + umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), + UMEM_NOFAIL); + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; - metaslab_group_t *mg = vd->vdev_mg; - for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - ASSERT3P(msp->ms_group, ==, mg); - mutex_enter(&msp->ms_lock); - metaslab_unload(msp); + uint64_t vim_idx = 0; + + ASSERT3U(c, ==, vd->vdev_id); + + /* + * Note: we don't check for mapping leaks on + * removing vdevs because their ms_tree's are + * used to look for leaks in allocated space. + */ + if (vd->vdev_ops == &vdev_indirect_ops) { + zcb->zcb_vd_obsolete_counts[c] = + zdb_load_obsolete_counts(vd); /* - * For leak detection, we overload the metaslab - * ms_tree to contain allocated segments - * instead of free segments. As a result, - * we can't use the normal metaslab_load/unload - * interfaces. + * Normally, indirect vdevs don't have any + * metaslabs. We want to set them up for + * zio_claim(). */ - if (msp->ms_sm != NULL) { - (void) fprintf(stderr, - "\rloading space map for " - "vdev %llu of %llu, " - "metaslab %llu of %llu ...", - (longlong_t)c, - (longlong_t)rvd->vdev_children, - (longlong_t)m, - (longlong_t)vd->vdev_ms_count); + VERIFY0(vdev_metaslab_init(vd, 0)); + } - /* - * We don't want to spend the CPU - * manipulating the size-ordered - * tree, so clear the range_tree - * ops. - */ - msp->ms_tree->rt_ops = NULL; - VERIFY0(space_map_load(msp->ms_sm, - msp->ms_tree, SM_ALLOC)); - - if (!msp->ms_loaded) { - msp->ms_loaded = B_TRUE; - } - } - mutex_exit(&msp->ms_lock); + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + zdb_leak_init_ms(vd->vdev_ms[m], &vim_idx); + } + if (vd->vdev_ops == &vdev_indirect_ops) { + ASSERT3U(vim_idx, ==, + vdev_indirect_mapping_num_entries( + vd->vdev_indirect_mapping)); } } (void) fprintf(stderr, "\n"); + + if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_DEVICE_REMOVAL)); + (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, + increment_indirect_mapping_cb, zcb, NULL); + } } spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -2866,18 +3197,93 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) spa_config_exit(spa, SCL_CONFIG, FTAG); } -static void -zdb_leak_fini(spa_t *spa) +static boolean_t +zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) { + boolean_t leaks = B_FALSE; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t total_leaked = 0; + + ASSERT(vim != NULL); + + for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { + vdev_indirect_mapping_entry_phys_t *vimep = + &vim->vim_entries[i]; + uint64_t obsolete_bytes = 0; + uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); + metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + /* + * This is not very efficient but it's easy to + * verify correctness. 
+ */ + for (uint64_t inner_offset = 0; + inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst); + inner_offset += 1 << vd->vdev_ashift) { + if (range_tree_contains(msp->ms_tree, + offset + inner_offset, 1 << vd->vdev_ashift)) { + obsolete_bytes += 1 << vd->vdev_ashift; + } + } + + int64_t bytes_leaked = obsolete_bytes - + zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]; + ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=, + zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]); + if (bytes_leaked != 0 && + (vdev_obsolete_counts_are_precise(vd) || + dump_opt['d'] >= 5)) { + (void) printf("obsolete indirect mapping count " + "mismatch on %llu:%llx:%llx : %llx bytes leaked\n", + (u_longlong_t)vd->vdev_id, + (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), + (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), + (u_longlong_t)bytes_leaked); + } + total_leaked += ABS(bytes_leaked); + } + + if (!vdev_obsolete_counts_are_precise(vd) && total_leaked > 0) { + int pct_leaked = total_leaked * 100 / + vdev_indirect_mapping_bytes_mapped(vim); + (void) printf("cannot verify obsolete indirect mapping " + "counts of vdev %llu because precise feature was not " + "enabled when it was removed: %d%% (%llx bytes) of mapping " + "unreferenced\n", + (u_longlong_t)vd->vdev_id, pct_leaked, + (u_longlong_t)total_leaked); + } else if (total_leaked > 0) { + (void) printf("obsolete indirect mapping count mismatch " + "for vdev %llu -- %llx total bytes mismatched\n", + (u_longlong_t)vd->vdev_id, + (u_longlong_t)total_leaked); + leaks |= B_TRUE; + } + + vdev_indirect_mapping_free_obsolete_counts(vim, + zcb->zcb_vd_obsolete_counts[vd->vdev_id]); + zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL; + + return (leaks); +} + +static boolean_t +zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) +{ + boolean_t leaks = B_FALSE; if (!dump_opt['L']) { vdev_t *rvd = spa->spa_root_vdev; for (unsigned c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; metaslab_group_t *mg = vd->vdev_mg; - for (unsigned m = 0; m < vd->vdev_ms_count; m++) { + + if (zcb->zcb_vd_obsolete_counts[c] != NULL) { + leaks |= zdb_check_for_obsolete_leaks(vd, zcb); + } + + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; ASSERT3P(mg, ==, msp->ms_group); - mutex_enter(&msp->ms_lock); /* * The ms_tree has been overloaded to @@ -2887,18 +3293,30 @@ zdb_leak_fini(spa_t *spa) * represents an allocated block that we * did not claim during the traversal. * Claimed blocks would have been removed - * from the ms_tree. + * from the ms_tree. For indirect vdevs, + * space remaining in the tree represents + * parts of the mapping that are not + * referenced, which is not a bug. 
*/ - range_tree_vacate(msp->ms_tree, zdb_leak, vd); + if (vd->vdev_ops == &vdev_indirect_ops) { + range_tree_vacate(msp->ms_tree, + NULL, NULL); + } else { + range_tree_vacate(msp->ms_tree, + zdb_leak, vd); + } if (msp->ms_loaded) { msp->ms_loaded = B_FALSE; } - - mutex_exit(&msp->ms_lock); } } + + umem_free(zcb->zcb_vd_obsolete_counts, + rvd->vdev_children * sizeof (uint32_t *)); + zcb->zcb_vd_obsolete_counts = NULL; } + return (leaks); } /* ARGSUSED */ @@ -2949,10 +3367,14 @@ dump_block_stats(spa_t *spa) */ (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, count_block_cb, &zcb, NULL); + if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, count_block_cb, &zcb, NULL); } + + zdb_claim_removing(spa, &zcb); + if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, @@ -2994,7 +3416,7 @@ dump_block_stats(spa_t *spa) /* * Report any leaked segments. */ - zdb_leak_fini(spa); + leaks |= zdb_leak_fini(spa, &zcb); tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; @@ -3002,7 +3424,8 @@ dump_block_stats(spa_t *spa) norm_space = metaslab_class_get_space(spa_normal_class(spa)); total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)); - total_found = tzb->zb_asize - zcb.zcb_dedup_asize; + total_found = tzb->zb_asize - zcb.zcb_dedup_asize + + zcb.zcb_removing_size; if (total_found == total_alloc) { if (!dump_opt['L']) @@ -3069,6 +3492,24 @@ dump_block_stats(spa_t *spa) (longlong_t)tzb->zb_ditto_samevdev); } + for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[v]; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + if (vim == NULL) { + continue; + } + + char mem[32]; + zdb_nicenum(vdev_indirect_mapping_num_entries(vim), + mem, vdev_indirect_mapping_size(vim)); + + (void) printf("\tindirect vdev id %llu has %llu segments " + "(%s in memory)\n", + (longlong_t)vd->vdev_id, + (longlong_t)vdev_indirect_mapping_num_entries(vim), mem); + } + if (dump_opt['b'] >= 2) { int l, t, level; (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" @@ -3275,6 +3716,124 @@ dump_simulated_ddt(spa_t *spa) dump_dedup_ratio(&dds_total); } +static int +verify_device_removal_feature_counts(spa_t *spa) +{ + uint64_t dr_feature_refcount = 0; + uint64_t oc_feature_refcount = 0; + uint64_t indirect_vdev_count = 0; + uint64_t precise_vdev_count = 0; + uint64_t obsolete_counts_object_count = 0; + uint64_t obsolete_sm_count = 0; + uint64_t obsolete_counts_count = 0; + uint64_t scip_count = 0; + uint64_t obsolete_bpobj_count = 0; + int ret = 0; + + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + if (scip->scip_next_mapping_object != 0) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev]; + ASSERT(scip->scip_prev_obsolete_sm_object != 0); + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + + (void) printf("Condensing indirect vdev %llu: new mapping " + "object %llu, prev obsolete sm %llu\n", + (u_longlong_t)scip->scip_vdev, + (u_longlong_t)scip->scip_next_mapping_object, + (u_longlong_t)scip->scip_prev_obsolete_sm_object); + if (scip->scip_prev_obsolete_sm_object != 0) { + space_map_t *prev_obsolete_sm = NULL; + VERIFY0(space_map_open(&prev_obsolete_sm, + spa->spa_meta_objset, + scip->scip_prev_obsolete_sm_object, + 0, vd->vdev_asize, 0)); + space_map_update(prev_obsolete_sm); + dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm); + (void) 
printf("\n"); + space_map_close(prev_obsolete_sm); + } + + scip_count += 2; + } + + for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + if (vic->vic_mapping_object != 0) { + ASSERT(vd->vdev_ops == &vdev_indirect_ops || + vd->vdev_removing); + indirect_vdev_count++; + + if (vd->vdev_indirect_mapping->vim_havecounts) { + obsolete_counts_count++; + } + } + if (vdev_obsolete_counts_are_precise(vd)) { + ASSERT(vic->vic_mapping_object != 0); + precise_vdev_count++; + } + if (vdev_obsolete_sm_object(vd) != 0) { + ASSERT(vic->vic_mapping_object != 0); + obsolete_sm_count++; + } + } + + (void) feature_get_refcount(spa, + &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL], + &dr_feature_refcount); + (void) feature_get_refcount(spa, + &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS], + &oc_feature_refcount); + + if (dr_feature_refcount != indirect_vdev_count) { + ret = 1; + (void) printf("Number of indirect vdevs (%llu) " \ + "does not match feature count (%llu)\n", + (u_longlong_t)indirect_vdev_count, + (u_longlong_t)dr_feature_refcount); + } else { + (void) printf("Verified device_removal feature refcount " \ + "of %llu is correct\n", + (u_longlong_t)dr_feature_refcount); + } + + if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ) == 0) { + obsolete_bpobj_count++; + } + + + obsolete_counts_object_count = precise_vdev_count; + obsolete_counts_object_count += obsolete_sm_count; + obsolete_counts_object_count += obsolete_counts_count; + obsolete_counts_object_count += scip_count; + obsolete_counts_object_count += obsolete_bpobj_count; + obsolete_counts_object_count += remap_deadlist_count; + + if (oc_feature_refcount != obsolete_counts_object_count) { + ret = 1; + (void) printf("Number of obsolete counts objects (%llu) " \ + "does not match feature count (%llu)\n", + (u_longlong_t)obsolete_counts_object_count, + (u_longlong_t)oc_feature_refcount); + (void) printf("pv:%llu os:%llu oc:%llu sc:%llu " + "ob:%llu rd:%llu\n", + (u_longlong_t)precise_vdev_count, + (u_longlong_t)obsolete_sm_count, + (u_longlong_t)obsolete_counts_count, + (u_longlong_t)scip_count, + (u_longlong_t)obsolete_bpobj_count, + (u_longlong_t)remap_deadlist_count); + } else { + (void) printf("Verified indirect_refcount feature refcount " \ + "of %llu is correct\n", + (u_longlong_t)oc_feature_refcount); + } + return (ret); +} + static void dump_zpool(spa_t *spa) { @@ -3308,18 +3867,24 @@ dump_zpool(spa_t *spa) if (dump_opt['d'] || dump_opt['i']) { dump_dir(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { + dsl_pool_t *dp = spa->spa_dsl_pool; dump_full_bpobj(&spa->spa_deferred_bpobj, "Deferred frees", 0); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { - dump_full_bpobj( - &spa->spa_dsl_pool->dp_free_bpobj, + dump_full_bpobj(&dp->dp_free_bpobj, "Pool snapshot frees", 0); } + if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_DEVICE_REMOVAL)); + dump_full_bpobj(&dp->dp_obsolete_bpobj, + "Pool obsolete blocks", 0); + } if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { dump_bptree(spa->spa_meta_objset, - spa->spa_dsl_pool->dp_bptree_obj, + dp->dp_bptree_obj, "Pool dataset frees"); } dump_dtl(spa->spa_root_vdev, 0); @@ -3351,6 +3916,10 @@ dump_zpool(spa_t *spa) (longlong_t)refcount); } } + + if (rc == 0) { + rc = verify_device_removal_feature_counts(spa); + } } if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) rc = 
dump_block_stats(spa); @@ -3661,7 +4230,8 @@ zdb_read_block(char *thing, spa_t *spa) psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | - ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL)); + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL, + NULL, NULL)); } error = zio_wait(zio); diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c b/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c index be4df07abfd1..7939bced22a1 100644 --- a/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c +++ b/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c @@ -112,6 +112,7 @@ static int zfs_do_diff(int argc, char **argv); static int zfs_do_jail(int argc, char **argv); static int zfs_do_unjail(int argc, char **argv); static int zfs_do_bookmark(int argc, char **argv); +static int zfs_do_remap(int argc, char **argv); static int zfs_do_channel_program(int argc, char **argv); /* @@ -161,6 +162,7 @@ typedef enum { HELP_HOLDS, HELP_RELEASE, HELP_DIFF, + HELP_REMAP, HELP_BOOKMARK, HELP_CHANNEL_PROGRAM, } zfs_help_t; @@ -220,6 +222,7 @@ static zfs_command_t command_table[] = { { NULL }, { "jail", zfs_do_jail, HELP_JAIL }, { "unjail", zfs_do_unjail, HELP_UNJAIL }, + { "remap", zfs_do_remap, HELP_REMAP }, }; #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) @@ -342,6 +345,8 @@ get_usage(zfs_help_t idx) case HELP_DIFF: return (gettext("\tdiff [-FHt] <snapshot> " "[snapshot|filesystem]\n")); + case HELP_REMAP: + return (gettext("\tremap <filesystem | volume>\n")); case HELP_BOOKMARK: return (gettext("\tbookmark <snapshot> <bookmark>\n")); case HELP_CHANNEL_PROGRAM: @@ -4170,6 +4175,7 @@ zfs_do_receive(int argc, char **argv) #define ZFS_DELEG_PERM_RELEASE "release" #define ZFS_DELEG_PERM_DIFF "diff" #define ZFS_DELEG_PERM_BOOKMARK "bookmark" +#define ZFS_DELEG_PERM_REMAP "remap" #define ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE @@ -4190,6 +4196,7 @@ static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = { { ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, { ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, { ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK }, + { ZFS_DELEG_PERM_REMAP, ZFS_DELEG_NOTE_REMAP }, { ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, { ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED }, @@ -6978,7 +6985,7 @@ zfs_do_diff(int argc, char **argv) if (argc < 1) { (void) fprintf(stderr, - gettext("must provide at least one snapshot name\n")); + gettext("must provide at least one snapshot name\n")); usage(B_FALSE); } @@ -7019,6 +7026,22 @@ zfs_do_diff(int argc, char **argv) return (err != 0); } +static int +zfs_do_remap(int argc, char **argv) +{ + const char *fsname; + int err = 0; + if (argc != 2) { + (void) fprintf(stderr, gettext("wrong number of arguments\n")); + usage(B_FALSE); + } + + fsname = argv[1]; + err = zfs_remap_indirects(g_zfs, fsname); + + return (err); +} + /* * zfs bookmark <fs@snap> <fs#bmark> * diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c b/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c index b7b08251e45b..66cc98b99d5b 100644 --- a/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c +++ b/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. * Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved. * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. 
@@ -250,7 +250,7 @@ get_usage(zpool_help_t idx) return (gettext("\treplace [-f] <pool> <device> " "[new-device]\n")); case HELP_REMOVE: - return (gettext("\tremove <pool> <device> ...\n")); + return (gettext("\tremove [-nps] <pool> <device> ...\n")); case HELP_REOPEN: return (gettext("\treopen <pool>\n")); case HELP_SCRUB: @@ -599,8 +599,7 @@ zpool_do_add(int argc, char **argv) /* * zpool remove <pool> <vdev> ... * - * Removes the given vdev from the pool. Currently, this supports removing - * spares, cache, and log devices from the pool. + * Removes the given vdev from the pool. */ int zpool_do_remove(int argc, char **argv) { char *poolname; int i, ret = 0; zpool_handle_t *zhp; + boolean_t stop = B_FALSE; + boolean_t noop = B_FALSE; + boolean_t parsable = B_FALSE; + int c; - argc--; - argv++; + /* check options */ + while ((c = getopt(argc, argv, "nps")) != -1) { + switch (c) { + case 'n': + noop = B_TRUE; + break; + case 'p': + parsable = B_TRUE; + break; + case 's': + stop = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } - if (argc < 2) { - (void) fprintf(stderr, gettext("missing device\n")); - usage(B_FALSE); - } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); - for (i = 1; i < argc; i++) { - if (zpool_vdev_remove(zhp, argv[i]) != 0) + if (stop && noop) { + (void) fprintf(stderr, gettext("stop request ignored\n")); + return (0); + } + + if (stop) { + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + if (zpool_vdev_remove_cancel(zhp) != 0) ret = 1; + } else { + if (argc < 2) { + (void) fprintf(stderr, gettext("missing device\n")); + usage(B_FALSE); + } + + for (i = 1; i < argc; i++) { + if (noop) { + uint64_t size; + + if (zpool_vdev_indirect_size(zhp, argv[i], + &size) != 0) { + ret = 1; + break; + } + if (parsable) { + (void) printf("%s %llu\n", + argv[i], size); + } else { + char valstr[32]; + zfs_nicenum(size, valstr, + sizeof (valstr)); + (void) printf("Memory that will be " + "used after removing %s: %s\n", + argv[i], valstr); + } + } else { + if (zpool_vdev_remove(zhp, argv[i]) != 0) + ret = 1; + } + } } return (ret); @@ -1416,6 +1474,7 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, uint64_t ashift; spare_cbdata_t cb; const char *state; + char *type; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); + + if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) + return; + state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { /* @@ -2454,6 +2518,9 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, double scale; char *vname; + if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) + return; + if (oldnv != NULL) { verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0); @@ -3060,6 +3127,9 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, boolean_t toplevel = (vs->vs_space != 0); uint64_t cap; + if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) + return; + if 
(scripted) (void) printf("\t%s", name); else if (strlen(name) + depth > cb->cb_namewidth) @@ -3998,7 +4068,7 @@ typedef struct status_cbdata { /* * Print out detailed scrub status. */ -void +static void print_scan_status(pool_scan_stat_t *ps) { time_t start, end, pause; @@ -4124,6 +4194,111 @@ print_scan_status(pool_scan_stat_t *ps) } } +/* + * Print out detailed removal status. + */ +static void +print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) +{ + char copied_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; + time_t start, end; + nvlist_t *config, *nvroot; + nvlist_t **child; + uint_t children; + char *vdev_name; + + if (prs == NULL || prs->prs_state == DSS_NONE) + return; + + /* + * Determine name of vdev. + */ + config = zpool_get_config(zhp, NULL); + nvroot = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE); + verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0); + assert(prs->prs_removing_vdev < children); + vdev_name = zpool_vdev_name(g_zfs, zhp, + child[prs->prs_removing_vdev], B_TRUE); + + (void) printf(gettext("remove: ")); + + start = prs->prs_start_time; + end = prs->prs_end_time; + zfs_nicenum(prs->prs_copied, copied_buf, sizeof (copied_buf)); + + /* + * Removal is finished or canceled. + */ + if (prs->prs_state == DSS_FINISHED) { + uint64_t minutes_taken = (end - start) / 60; + + (void) printf(gettext("Removal of vdev %llu copied %s " + "in %lluh%um, completed on %s"), + (longlong_t)prs->prs_removing_vdev, + copied_buf, + (u_longlong_t)(minutes_taken / 60), + (uint_t)(minutes_taken % 60), + ctime((time_t *)&end)); + } else if (prs->prs_state == DSS_CANCELED) { + (void) printf(gettext("Removal of %s canceled on %s"), + vdev_name, ctime(&end)); + } else { + uint64_t copied, total, elapsed, mins_left, hours_left; + double fraction_done; + uint_t rate; + + assert(prs->prs_state == DSS_SCANNING); + + /* + * Removal is in progress. + */ + (void) printf(gettext( + "Evacuation of %s in progress since %s"), + vdev_name, ctime(&start)); + + copied = prs->prs_copied > 0 ? prs->prs_copied : 1; + total = prs->prs_to_copy; + fraction_done = (double)copied / total; + + /* elapsed time for this pass */ + elapsed = time(NULL) - prs->prs_start_time; + elapsed = elapsed > 0 ? elapsed : 1; + rate = copied / elapsed; + rate = rate > 0 ? 
rate : 1; + mins_left = ((total - copied) / rate) / 60; + hours_left = mins_left / 60; + + zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); + zfs_nicenum(total, total_buf, sizeof (total_buf)); + zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); + + /* + * do not print estimated time if hours_left is more than + * 30 days + */ + (void) printf(gettext(" %s copied out of %s at %s/s, " + "%.2f%% done"), + examined_buf, total_buf, rate_buf, 100 * fraction_done); + if (hours_left < (30 * 24)) { + (void) printf(gettext(", %lluh%um to go\n"), + (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); + } else { + (void) printf(gettext( + ", (copy is slow, no estimated time)\n")); + } + } + + if (prs->prs_mapping_memory > 0) { + char mem_buf[7]; + zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf)); + (void) printf(gettext(" %s memory used for " + "removed device mappings\n"), + mem_buf); + } +} + static void print_error_log(zpool_handle_t *zhp) { @@ -4289,8 +4464,7 @@ status_callback(zpool_handle_t *zhp, void *data) else (void) printf("\n"); - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); + nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); health = zpool_state_to_name(vs->vs_state, vs->vs_aux); @@ -4507,11 +4681,16 @@ status_callback(zpool_handle_t *zhp, void *data) nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; pool_scan_stat_t *ps = NULL; + pool_removal_stat_t *prs = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); print_scan_status(ps); + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); + print_removal_status(zhp, prs); + namewidth = max_width(zhp, nvroot, 0, 0); if (namewidth < 10) namewidth = 10; diff --git a/cddl/contrib/opensolaris/cmd/ztest/ztest.c b/cddl/contrib/opensolaris/cmd/ztest/ztest.c index 79209f5e8ed9..ecc3ce6e16df 100644 --- a/cddl/contrib/opensolaris/cmd/ztest/ztest.c +++ b/cddl/contrib/opensolaris/cmd/ztest/ztest.c @@ -343,6 +343,8 @@ ztest_func_t ztest_vdev_aux_add_remove; ztest_func_t ztest_split_pool; ztest_func_t ztest_reguid; ztest_func_t ztest_spa_upgrade; +ztest_func_t ztest_device_removal; +ztest_func_t ztest_remap_blocks; uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ @@ -384,6 +386,8 @@ ztest_info_t ztest_info[] = { &ztest_opts.zo_vdevtime }, { ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime }, + { ztest_device_removal, 1, &zopt_sometimes }, + { ztest_remap_blocks, 1, &zopt_sometimes } }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) @@ -786,10 +790,10 @@ ztest_kill(ztest_shared_t *zs) /* * Before we kill off ztest, make sure that the config is updated. - * See comment above spa_config_sync(). + * See comment above spa_write_cachefile(). 
*/ mutex_enter(&spa_namespace_lock); - spa_config_sync(ztest_spa, B_FALSE, B_FALSE); + spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE); mutex_exit(&spa_namespace_lock); zfs_dbgmsg_print(FTAG); @@ -1016,7 +1020,7 @@ ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) do { top = ztest_random(rvd->vdev_children); tvd = rvd->vdev_child[top]; - } while (tvd->vdev_ishole || (tvd->vdev_islog && !log_ok) || + } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); return (top); @@ -2785,7 +2789,19 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) VERIFY(mutex_lock(&ztest_vdev_lock) == 0); leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + /* + * If a vdev is in the process of being removed, its removal may + * finish while we are in progress, leading to an unexpected error + * value. Don't bother trying to attach while we are in the middle + * of removal. + */ + if (spa->spa_vdev_removal != NULL) { + spa_config_exit(spa, SCL_ALL, FTAG); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); + return; + } /* * Decide whether to do an attach or a replace. @@ -2838,7 +2854,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) * If oldvd has siblings, then half of the time, detach it. */ if (oldvd_has_siblings && ztest_random(2) == 0) { - spa_config_exit(spa, SCL_VDEV, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); if (error != 0 && error != ENODEV && error != EBUSY && error != ENOTSUP) @@ -2865,6 +2881,10 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) } if (newvd) { + /* + * Reopen to ensure the vdev's asize field isn't stale. + */ + vdev_reopen(newvd); newsize = vdev_get_min_asize(newvd); } else { /* @@ -2902,7 +2922,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) else expected_error = 0; - spa_config_exit(spa, SCL_VDEV, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); /* * Build the nvlist describing newpath. @@ -2940,6 +2960,26 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); } +/* ARGSUSED */ +void +ztest_device_removal(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa = ztest_spa; + vdev_t *vd; + uint64_t guid; + + VERIFY(mutex_lock(&ztest_vdev_lock) == 0); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); + guid = vd->vdev_guid; + spa_config_exit(spa, SCL_VDEV, FTAG); + + (void) spa_vdev_remove(spa, guid, B_FALSE); + + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); +} + /* * Callback function which expands the physical size of the vdev. */ @@ -3068,6 +3108,18 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) VERIFY(mutex_lock(&ztest_vdev_lock) == 0); spa_config_enter(spa, SCL_STATE, spa, RW_READER); + /* + * If there is a vdev removal in progress, it could complete while + * we are running, in which case we would not be able to verify + * that the metaslab_class space increased (because it decreases + * when the device removal completes). + */ + if (spa->spa_vdev_removal != NULL) { + spa_config_exit(spa, SCL_STATE, FTAG); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); + return; + } + top = ztest_random_vdev_top(spa, B_TRUE); tvd = spa->spa_root_vdev->vdev_child[top]; @@ -3159,16 +3211,18 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) /* * Make sure we were able to grow the vdev. 
*/ - if (new_ms_count <= old_ms_count) - fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n", + if (new_ms_count <= old_ms_count) { + fatal(0, "LUN expansion failed: ms_count %llu < %llu\n", old_ms_count, new_ms_count); + } /* * Make sure we were able to grow the pool. */ - if (new_class_space <= old_class_space) - fatal(0, "LUN expansion failed: class_space %llu <= %llu\n", + if (new_class_space <= old_class_space) { + fatal(0, "LUN expansion failed: class_space %llu < %llu\n", old_class_space, new_class_space); + } if (ztest_opts.zo_verbose >= 5) { char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; @@ -4635,6 +4689,20 @@ ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) (void) rw_unlock(&ztest_name_lock); } +/* ARGSUSED */ +void +ztest_remap_blocks(ztest_ds_t *zd, uint64_t id) +{ + (void) rw_rdlock(&ztest_name_lock); + + int error = dmu_objset_remap_indirects(zd->zd_name); + if (error == ENOSPC) + error = 0; + ASSERT0(error); + + (void) rw_unlock(&ztest_name_lock); +} + /* ARGSUSED */ void ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) @@ -4886,6 +4954,9 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) */ vdev_file_t *vf = vd0->vdev_tsd; + zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", + (long long)vd0->vdev_id, (int)maxfaults); + if (vf != NULL && ztest_random(3) == 0) { (void) close(vf->vf_vnode->v_fd); vf->vf_vnode->v_fd = -1; diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h index 3e15dd1c814d..e382c0e3ac50 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 Martin Matuska . All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. 
@@ -131,6 +131,7 @@ typedef enum zfs_error { EZFS_DIFFDATA, /* bad zfs diff data */ EZFS_POOLREADONLY, /* pool is in read-only mode */ EZFS_SCRUB_PAUSED, /* scrub currently paused */ + EZFS_NO_PENDING, /* cannot cancel, no operation is pending */ EZFS_UNKNOWN } zfs_error_t; @@ -267,6 +268,8 @@ extern int zpool_vdev_attach(zpool_handle_t *, const char *, const char *, nvlist_t *, int); extern int zpool_vdev_detach(zpool_handle_t *, const char *); extern int zpool_vdev_remove(zpool_handle_t *, const char *); +extern int zpool_vdev_remove_cancel(zpool_handle_t *); +extern int zpool_vdev_indirect_size(zpool_handle_t *, const char *, uint64_t *); extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *, splitflags_t); @@ -825,6 +828,7 @@ extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *); extern int zmount(const char *, const char *, int, char *, char *, int, char *, int); #endif +extern int zfs_remap_indirects(libzfs_handle_t *hdl, const char *); #ifdef __cplusplus } diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c index 521393582fb1..905557376b0f 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c @@ -3829,6 +3829,24 @@ zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) return (rv); } +int +zfs_remap_indirects(libzfs_handle_t *hdl, const char *fs) +{ + int err; + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot remap filesystem '%s' "), fs); + + err = lzc_remap(fs); + + if (err != 0) { + (void) zfs_standard_error(hdl, err, errbuf); + } + + return (err); +} + /* * Creates snapshots. The keys in the snaps nvlist are the snapshots to be * created. diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c index 612f37ebd4b8..2b428efc7dce 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. 
* Copyright 2016 Igor Kozhukhov @@ -1334,6 +1334,13 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) (void) zfs_error(hdl, EZFS_BADDEV, msg); break; + case EINVAL: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid config; a pool with removing/removed " + "vdevs does not support adding raidz vdevs")); + (void) zfs_error(hdl, EZFS_BADDEV, msg); + break; + case EOVERFLOW: /* * This occurrs when one of the devices is below @@ -2664,7 +2671,7 @@ zpool_vdev_attach(zpool_handle_t *zhp, (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache, - &islog)) == 0) + &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); if (avail_spare) @@ -2773,7 +2780,8 @@ zpool_vdev_attach(zpool_handle_t *zhp, break; case EBUSY: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy"), + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, " + "or pool has removing/removed vdevs"), new_disk); (void) zfs_error(hdl, EZFS_BADDEV, msg); break; @@ -2827,7 +2835,7 @@ zpool_vdev_detach(zpool_handle_t *zhp, const char *path) (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, - NULL)) == 0) + NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); if (avail_spare) @@ -3116,8 +3124,7 @@ zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, } /* - * Remove the given device. Currently, this is supported only for hot spares - * and level 2 cache devices. + * Remove the given device. */ int zpool_vdev_remove(zpool_handle_t *zhp, const char *path) @@ -3134,26 +3141,61 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, - &islog)) == 0) + &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); - /* - * XXX - this should just go away. 
- */ - if (!avail_spare && !l2cache && !islog) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "only inactive hot spares, cache, top-level, " - "or log devices can be removed")); - return (zfs_error(hdl, EZFS_NODEVICE, msg)); - } version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (islog && version < SPA_VERSION_HOLES) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool must be upgrade to support log removal")); + "pool must be upgraded to support log removal")); return (zfs_error(hdl, EZFS_BADVERSION, msg)); } - verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); + if (!islog && !avail_spare && !l2cache && zpool_is_bootable(zhp)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "root pool can not have removed devices, " + "because GRUB does not understand them")); + return (zfs_error(hdl, EINVAL, msg)); + } + + zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); + + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) + return (0); + + switch (errno) { + + case EINVAL: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid config; all top-level vdevs must " + "have the same sector size and not be raidz.")); + (void) zfs_error(hdl, EZFS_INVALCONFIG, msg); + break; + + case EBUSY: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Pool busy; removal may already be in progress")); + (void) zfs_error(hdl, EZFS_BUSY, msg); + break; + + default: + (void) zpool_standard_error(hdl, errno, msg); + } + return (-1); +} + +int +zpool_vdev_remove_cancel(zpool_handle_t *zhp) +{ + zfs_cmd_t zc = { 0 }; + char msg[1024]; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot cancel removal")); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_cookie = 1; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) return (0); @@ -3161,6 +3203,36 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) return (zpool_standard_error(hdl, errno, msg)); } +int +zpool_vdev_indirect_size(zpool_handle_t *zhp, const char *path, + uint64_t *sizep) +{ + char msg[1024]; + nvlist_t *tgt; + boolean_t avail_spare, l2cache, islog; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot determine indirect size of %s"), + path); + + if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, + &islog)) == NULL) + return (zfs_error(hdl, EZFS_NODEVICE, msg)); + + if (avail_spare || l2cache || islog) { + *sizep = 0; + return (0); + } + + if (nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_INDIRECT_SIZE, sizep) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "indirect size not available")); + return (zfs_error(hdl, EINVAL, msg)); + } + return (0); +} + /* * Clear the errors for the pool, or the particular device if specified. 
*/ @@ -3188,7 +3260,7 @@ zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl) (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (path) { if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, - &l2cache, NULL)) == 0) + &l2cache, NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); /* diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c index 5f5335d0f8c4..6adab0bd1788 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c @@ -240,6 +240,9 @@ libzfs_error_description(libzfs_handle_t *hdl) return (dgettext(TEXT_DOMAIN, "invalid diff data")); case EZFS_POOLREADONLY: return (dgettext(TEXT_DOMAIN, "pool is read-only")); + case EZFS_NO_PENDING: + return (dgettext(TEXT_DOMAIN, "operation is not " + "in progress")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: @@ -487,6 +490,10 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case EROFS: zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); break; + /* There is no pending operation to cancel */ + case ESRCH: + zfs_verror(hdl, EZFS_NO_PENDING, fmt, ap); + break; default: zfs_error_aux(hdl, strerror(error)); diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c index b1863bead51b..a7c973f0f634 100644 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c @@ -286,6 +286,16 @@ lzc_promote(const char *fsname, char *snapnamebuf, int snapnamelen) return (0); } +int +lzc_remap(const char *fsname) +{ + int error; + nvlist_t *args = fnvlist_alloc(); + error = lzc_ioctl(ZFS_IOC_REMAP, fsname, args, NULL); + nvlist_free(args); + return (error); +} + /* * Creates snapshots. 
* diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h index c61ad52ef953..5202fd135db8 100644 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h @@ -47,6 +47,7 @@ enum lzc_dataset_type { LZC_DATSET_TYPE_ZVOL }; +int lzc_remap(const char *fsname); int lzc_snapshot(nvlist_t *, nvlist_t *, nvlist_t **); int lzc_create(const char *, enum lzc_dataset_type, nvlist_t *); int lzc_clone(const char *, const char *, nvlist_t *); diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c index 8736424f2cc5..53ee2ab5842b 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c @@ -247,4 +247,20 @@ zpool_feature_init(void) "Edon-R hash algorithm.", ZFEATURE_FLAG_PER_DATASET, NULL); #endif + + zfeature_register(SPA_FEATURE_DEVICE_REMOVAL, + "com.delphix:device_removal", "device_removal", + "Top-level vdevs can be removed, reducing logical pool size.", + ZFEATURE_FLAG_MOS, NULL); + + static const spa_feature_t obsolete_counts_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_DEVICE_REMOVAL, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_OBSOLETE_COUNTS, + "com.delphix:obsolete_counts", "obsolete_counts", + "Reduce memory used by removed devices when their blocks are " + "freed or remapped.", + ZFEATURE_FLAG_READONLY_COMPAT, obsolete_counts_deps); } diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h index fa7e93612e59..9ada7f5464ce 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h @@ -57,6 +57,8 @@ typedef enum spa_feature { #ifdef illumos SPA_FEATURE_EDONR, #endif + SPA_FEATURE_DEVICE_REMOVAL, + SPA_FEATURE_OBSOLETE_COUNTS, SPA_FEATURES } spa_feature_t; diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c index b66fac804fb3..a3383f4ccf2d 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2010 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov */ @@ -53,6 +53,7 @@ zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = { {ZFS_DELEG_PERM_MOUNT}, {ZFS_DELEG_PERM_PROMOTE}, {ZFS_DELEG_PERM_RECEIVE}, + {ZFS_DELEG_PERM_REMAP}, {ZFS_DELEG_PERM_RENAME}, {ZFS_DELEG_PERM_ROLLBACK}, {ZFS_DELEG_PERM_SNAPSHOT}, diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h index 16133c59f33f..06d2df9bb80d 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2010 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. 
*/ #ifndef _ZFS_DELEG_H @@ -67,6 +67,7 @@ typedef enum { ZFS_DELEG_NOTE_RELEASE, ZFS_DELEG_NOTE_DIFF, ZFS_DELEG_NOTE_BOOKMARK, + ZFS_DELEG_NOTE_REMAP, ZFS_DELEG_NOTE_NONE } zfs_deleg_note_t; diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c index dc70c143e857..5f7bcaba5450 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c @@ -436,6 +436,8 @@ zfs_prop_init(void) /* hidden properties */ zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "CREATETXG"); + zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER, + PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG"); zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES"); zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING, diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files index 01675fb3fba9..c142115969d6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files +++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files @@ -118,11 +118,15 @@ ZFS_COMMON_OBJS += \ vdev.o \ vdev_cache.o \ vdev_file.o \ + vdev_indirect.o \ + vdev_indirect_births.o \ + vdev_indirect_mapping.o \ vdev_label.o \ vdev_mirror.o \ vdev_missing.o \ vdev_queue.o \ vdev_raidz.o \ + vdev_removal.o \ vdev_root.o \ zap.o \ zap_leaf.o \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 5d3a7b3220d7..479228309ba5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -5415,7 +5415,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; /* - * Lock out device removal. + * Lock out L2ARC device removal. */ if (vdev_is_dead(vd) || !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c index 0bcfc0031311..bbdd765214fc 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c @@ -176,6 +176,12 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) return (0); } +boolean_t +bpobj_is_open(const bpobj_t *bpo) +{ + return (bpo->bpo_object != 0); +} + void bpobj_close(bpobj_t *bpo) { @@ -194,11 +200,11 @@ bpobj_close(bpobj_t *bpo) mutex_destroy(&bpo->bpo_lock); } -static boolean_t -bpobj_hasentries(bpobj_t *bpo) +boolean_t +bpobj_is_empty(bpobj_t *bpo) { - return (bpo->bpo_phys->bpo_num_blkptrs != 0 || - (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs != 0)); + return (bpo->bpo_phys->bpo_num_blkptrs == 0 && + (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0)); } static int @@ -211,11 +217,9 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, int err = 0; dmu_buf_t *dbuf = NULL; + ASSERT(bpobj_is_open(bpo)); mutex_enter(&bpo->bpo_lock); - if (!bpobj_hasentries(bpo)) - goto out; - if (free) dmu_buf_will_dirty(bpo->bpo_dbuf, tx); @@ -345,7 +349,7 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, out: /* If there are no entries, there should be no bytes. 
*/ - if (!bpobj_hasentries(bpo)) { + if (bpobj_is_empty(bpo)) { ASSERT0(bpo->bpo_phys->bpo_bytes); ASSERT0(bpo->bpo_phys->bpo_comp); ASSERT0(bpo->bpo_phys->bpo_uncomp); @@ -380,6 +384,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) bpobj_t subbpo; uint64_t used, comp, uncomp, subsubobjs; + ASSERT(bpobj_is_open(bpo)); + ASSERT(subobj != 0); ASSERT(bpo->bpo_havesubobj); ASSERT(bpo->bpo_havecomp); ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); @@ -392,7 +398,7 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); - if (!bpobj_hasentries(&subbpo)) { + if (bpobj_is_empty(&subbpo)) { /* No point in having an empty subobj. */ bpobj_close(&subbpo); bpobj_free(bpo->bpo_os, subobj, tx); @@ -466,6 +472,7 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) int blkoff; blkptr_t *bparray; + ASSERT(bpobj_is_open(bpo)); ASSERT(!BP_IS_HOLE(bp)); ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); @@ -551,6 +558,7 @@ space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) int bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { + ASSERT(bpobj_is_open(bpo)); mutex_enter(&bpo->bpo_lock); *usedp = bpo->bpo_phys->bpo_bytes; @@ -577,6 +585,8 @@ bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, struct space_range_arg sra = { 0 }; int err; + ASSERT(bpobj_is_open(bpo)); + /* * As an optimization, if they want the whole txg range, just * get bpo_bytes rather than iterating over the bps. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 3bc084d3e88a..6df0922750a2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -47,6 +47,7 @@ #include #include #include +#include uint_t zfs_dbuf_evict_key; @@ -3007,6 +3008,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) db->db_data_pending = dr; mutex_exit(&db->db_mtx); + dbuf_write(dr, db->db_buf, tx); zio = dr->dr_zio; @@ -3482,6 +3484,141 @@ dbuf_write_override_done(zio_t *zio) abd_put(zio->io_abd); } +typedef struct dbuf_remap_impl_callback_arg { + objset_t *drica_os; + uint64_t drica_blk_birth; + dmu_tx_t *drica_tx; +} dbuf_remap_impl_callback_arg_t; + +static void +dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size, + void *arg) +{ + dbuf_remap_impl_callback_arg_t *drica = arg; + objset_t *os = drica->drica_os; + spa_t *spa = dmu_objset_spa(os); + dmu_tx_t *tx = drica->drica_tx; + + ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); + + if (os == spa_meta_objset(spa)) { + spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); + } else { + dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset, + size, drica->drica_blk_birth, tx); + } +} + +static void +dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx) +{ + blkptr_t bp_copy = *bp; + spa_t *spa = dmu_objset_spa(dn->dn_objset); + dbuf_remap_impl_callback_arg_t drica; + + ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); + + drica.drica_os = dn->dn_objset; + drica.drica_blk_birth = bp->blk_birth; + drica.drica_tx = tx; + if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, + &drica)) { + /* + * The struct_rwlock prevents dbuf_read_impl() from + * dereferencing the BP while we are changing it. 
To + * avoid lock contention, only grab it when we are actually + * changing the BP. + */ + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + *bp = bp_copy; + rw_exit(&dn->dn_struct_rwlock); + } +} + +/* + * Returns true if a dbuf_remap would modify the dbuf. We do this by attempting + * to remap a copy of every bp in the dbuf. + */ +boolean_t +dbuf_can_remap(const dmu_buf_impl_t *db) +{ + spa_t *spa = dmu_objset_spa(db->db_objset); + blkptr_t *bp = db->db.db_data; + boolean_t ret = B_FALSE; + + ASSERT3U(db->db_level, >, 0); + ASSERT3S(db->db_state, ==, DB_CACHED); + + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { + blkptr_t bp_copy = bp[i]; + if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) { + ret = B_TRUE; + break; + } + } + spa_config_exit(spa, SCL_VDEV, FTAG); + + return (ret); +} + +boolean_t +dnode_needs_remap(const dnode_t *dn) +{ + spa_t *spa = dmu_objset_spa(dn->dn_objset); + boolean_t ret = B_FALSE; + + if (dn->dn_phys->dn_nlevels == 0) { + return (B_FALSE); + } + + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + for (int j = 0; j < dn->dn_phys->dn_nblkptr; j++) { + blkptr_t bp_copy = dn->dn_phys->dn_blkptr[j]; + if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) { + ret = B_TRUE; + break; + } + } + spa_config_exit(spa, SCL_VDEV, FTAG); + + return (ret); +} + +/* + * Remap any existing BP's to concrete vdevs, if possible. + */ +static void +dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + spa_t *spa = dmu_objset_spa(db->db_objset); + ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); + + if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)) + return; + + if (db->db_level > 0) { + blkptr_t *bp = db->db.db_data; + for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { + dbuf_remap_impl(dn, &bp[i], tx); + } + } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { + dnode_phys_t *dnp = db->db.db_data; + ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==, + DMU_OT_DNODE); + for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i++) { + for (int j = 0; j < dnp[i].dn_nblkptr; j++) { + dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx); + } + } + } +} + + /* Issue I/O to commit a dirty buffer to disk. 
*/ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) @@ -3515,6 +3652,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) } else { dbuf_release_bp(db); } + dbuf_remap(dn, db, tx); } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c index dc4fb548af21..cb00dfa9a08a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c @@ -717,15 +717,15 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) for (type = 0; type < DDT_TYPES; type++) { for (class = 0; class < DDT_CLASSES; class++) { error = ddt_object_lookup(ddt, type, class, dde); - if (error != ENOENT) + if (error != ENOENT) { + ASSERT0(error); break; + } } if (error != ENOENT) break; } - ASSERT(error == 0 || error == ENOENT); - ddt_enter(ddt); ASSERT(dde->dde_loaded == B_FALSE); @@ -1114,7 +1114,7 @@ ddt_sync(spa_t *spa, uint64_t txg) { dmu_tx_t *tx; zio_t *rio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL); ASSERT(spa_syncing_txg(spa) == txg); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index cf22b3544798..11ba14468568 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -71,6 +71,13 @@ uint32_t zfs_per_txg_dirty_frees_percent = 30; SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN, &zfs_per_txg_dirty_frees_percent, 0, "Percentage of dirtied blocks from frees in one txg"); +/* + * This can be used for testing, to ensure that certain actions happen + * while in the middle of a remap (which might otherwise complete too + * quickly). + */ +int zfs_object_remap_one_indirect_delay_ticks = 0; + const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { DMU_BSWAP_UINT8, TRUE, "unallocated" }, { DMU_BSWAP_ZAP, TRUE, "object directory" }, @@ -1033,6 +1040,123 @@ dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); } +static int +dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn, + uint64_t last_removal_txg, uint64_t offset) +{ + uint64_t l1blkid = dbuf_whichblock(dn, 1, offset); + int err = 0; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG); + ASSERT3P(dbuf, !=, NULL); + + /* + * If the block hasn't been written yet, this default will ensure + * we don't try to remap it. + */ + uint64_t birth = UINT64_MAX; + ASSERT3U(last_removal_txg, !=, UINT64_MAX); + if (dbuf->db_blkptr != NULL) + birth = dbuf->db_blkptr->blk_birth; + rw_exit(&dn->dn_struct_rwlock); + + /* + * If this L1 was already written after the last removal, then we've + * already tried to remap it. + */ + if (birth <= last_removal_txg && + dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 && + dbuf_can_remap(dbuf)) { + dmu_tx_t *tx = dmu_tx_create(os); + dmu_tx_hold_remap_l1indirect(tx, dn->dn_object); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err == 0) { + (void) dbuf_dirty(dbuf, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + } + + dbuf_rele(dbuf, FTAG); + + delay(zfs_object_remap_one_indirect_delay_ticks); + + return (err); +} + +/* + * Remap all blockpointers in the object, if possible, so that they reference + * only concrete vdevs. 
+ * + * To do this, iterate over the L0 blockpointers and remap any that reference + * an indirect vdev. Note that we only examine L0 blockpointers; since we + * cannot guarantee that we can remap all blockpointers anyway (due to split + * blocks), we do not want to make the code unnecessarily complicated to + * catch the unlikely case that there is an L1 block on an indirect vdev that + * contains no indirect blockpointers. + */ +int +dmu_object_remap_indirects(objset_t *os, uint64_t object, + uint64_t last_removal_txg) +{ + uint64_t offset, l1span; + int err; + dnode_t *dn; + + err = dnode_hold(os, object, FTAG, &dn); + if (err != 0) { + return (err); + } + + if (dn->dn_nlevels <= 1) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { + err = SET_ERROR(EINTR); + } + + /* + * If the dnode has no indirect blocks, we cannot dirty them. + * We still want to remap the blkptr(s) in the dnode if + * appropriate, so mark it as dirty. + */ + if (err == 0 && dnode_needs_remap(dn)) { + dmu_tx_t *tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, dn->dn_object); + if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) { + dnode_setdirty(dn, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + } + + dnode_rele(dn, FTAG); + return (err); + } + + offset = 0; + l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT + + dn->dn_datablkshift); + /* + * Find the next L1 indirect that is not a hole. + */ + while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { + err = SET_ERROR(EINTR); + break; + } + if ((err = dmu_object_remap_one_indirect(os, dn, + last_removal_txg, offset)) != 0) { + break; + } + offset += l1span; + } + + dnode_rele(dn, FTAG); + return (err); +} + void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c index 6bf61854b7e9..71d616415c8f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -53,6 +53,7 @@ #include #include #include +#include /* * Needed to close a window in dnode_move() that allows the objset to be freed @@ -348,6 +349,17 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); + /* + * The $ORIGIN dataset (if it exists) doesn't have an associated + * objset, so there's no reason to open it. The $ORIGIN dataset + * will not exist on pools older than SPA_VERSION_ORIGIN. + */ + if (ds != NULL && spa_get_dsl(spa) != NULL && + spa_get_dsl(spa)->dp_origin_snap != NULL) { + ASSERT3P(ds->ds_dir, !=, + spa_get_dsl(spa)->dp_origin_snap->ds_dir); + } + os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); os->os_dsl_dataset = ds; os->os_spa = spa; @@ -1052,6 +1064,101 @@ dmu_objset_clone(const char *clone, const char *origin) 5, ZFS_SPACE_CHECK_NORMAL)); } +static int +dmu_objset_remap_indirects_impl(objset_t *os, uint64_t last_removed_txg) +{ + int error = 0; + uint64_t object = 0; + while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { + error = dmu_object_remap_indirects(os, object, + last_removed_txg); + /* + * If the ZPL removed the object before we managed to dnode_hold + * it, we would get an ENOENT. If the ZPL declares its intent + * to remove the object (dnode_free) before we manage to + * dnode_hold it, we would get an EEXIST.
In either case, we + * want to continue remapping the other objects in the objset; + * in all other cases, we want to break early. + */ + if (error != 0 && error != ENOENT && error != EEXIST) { + break; + } + } + if (error == ESRCH) { + error = 0; + } + return (error); +} + +int +dmu_objset_remap_indirects(const char *fsname) +{ + int error = 0; + objset_t *os = NULL; + uint64_t last_removed_txg; + uint64_t remap_start_txg; + dsl_dir_t *dd; + + error = dmu_objset_hold(fsname, FTAG, &os); + if (error != 0) { + return (error); + } + dd = dmu_objset_ds(os)->ds_dir; + + if (!spa_feature_is_enabled(dmu_objset_spa(os), + SPA_FEATURE_OBSOLETE_COUNTS)) { + dmu_objset_rele(os, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + if (dsl_dataset_is_snapshot(dmu_objset_ds(os))) { + dmu_objset_rele(os, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * If there has not been a removal, we're done. + */ + last_removed_txg = spa_get_last_removal_txg(dmu_objset_spa(os)); + if (last_removed_txg == -1ULL) { + dmu_objset_rele(os, FTAG); + return (0); + } + + /* + * If we have remapped since the last removal, we're done. + */ + if (dsl_dir_is_zapified(dd)) { + uint64_t last_remap_txg; + if (zap_lookup(spa_meta_objset(dmu_objset_spa(os)), + dd->dd_object, DD_FIELD_LAST_REMAP_TXG, + sizeof (last_remap_txg), 1, &last_remap_txg) == 0 && + last_remap_txg > last_removed_txg) { + dmu_objset_rele(os, FTAG); + return (0); + } + } + + dsl_dataset_long_hold(dmu_objset_ds(os), FTAG); + dsl_pool_rele(dmu_objset_pool(os), FTAG); + + remap_start_txg = spa_last_synced_txg(dmu_objset_spa(os)); + error = dmu_objset_remap_indirects_impl(os, last_removed_txg); + if (error == 0) { + /* + * We update the last_remap_txg to be the start txg so that + * we can guarantee that every block older than last_remap_txg + * that can be remapped has been remapped. 
+ */ + error = dsl_dir_update_last_remap_txg(dd, remap_start_txg); + } + + dsl_dataset_long_rele(dmu_objset_ds(os), FTAG); + dsl_dataset_rele(dmu_objset_ds(os), FTAG); + + return (error); +} + int dmu_objset_snapshot_one(const char *fsname, const char *snapname) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c index 37024880c271..ad02fa5918aa 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -300,6 +300,23 @@ dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) } } +void +dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg == 0); + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_WRITE, 0, 0); + if (txh == NULL) + return; + + dnode_t *dn = txh->txh_dnode; + (void) refcount_add_many(&txh->txh_space_towrite, + 1ULL << dn->dn_indblkshift, FTAG); + dmu_tx_count_dnode(txh); +} + void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c index 093740cd591a..174a3120eb5e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c @@ -227,10 +227,21 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data) int64_t pf_ahead_blks, max_blks; int epbs, max_dist_blks, pf_nblks, ipf_nblks; uint64_t end_of_access_blkid = blkid + nblks; + spa_t *spa = zf->zf_dnode->dn_objset->os_spa; if (zfs_prefetch_disable) return; + /* + * If we haven't yet loaded the indirect vdevs' mappings, we + * can only read from blocks that we carefully ensure are on + * concrete vdevs (or previously-loaded indirect vdevs). So we + * can't allow the predictive prefetcher to attempt reads of other + * blocks (e.g. of the MOS's dnode object). + */ + if (!spa_indirect_vdevs_loaded(spa)) + return; + /* * As a fast path for small (single-block) files, ignore access * to the first block.
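The bookkeeping above (DD_FIELD_LAST_REMAP_TXG versus the last removal txg) reduces to a simple comparison. A standalone sketch of that decision follows; the names here are illustrative assumptions, and the real logic is inline in dmu_objset_remap_indirects().

#include <stdint.h>

/*
 * Sketch: does this dataset still need a remap pass?
 * last_removed_txg == UINT64_MAX (i.e. -1ULL) means no device has been
 * removed; a recorded last_remap_txg newer than the last removal means
 * every remappable block has already been visited.
 */
static int
remap_pass_needed(uint64_t last_removed_txg, int have_last_remap_txg,
    uint64_t last_remap_txg)
{
	if (last_removed_txg == UINT64_MAX)
		return (0);	/* nothing was ever removed */
	if (have_last_remap_txg && last_remap_txg > last_removed_txg)
		return (0);	/* already remapped since the removal */
	return (1);
}

Recording the remap's start txg (rather than its end) is what makes the "last_remap_txg > last_removed_txg" test safe: every block born before the start txg is guaranteed to have been examined.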
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c index 3ec78c3d299e..27eebe222a83 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -1701,8 +1701,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) mutex_enter(&dn->dn_mtx); int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] == NULL) { - dn->dn_free_ranges[txgoff] = - range_tree_create(NULL, NULL, &dn->dn_mtx); + dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL); } range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index 5a282ebf2549..f0a854a03968 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -88,6 +89,11 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN, extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds); +static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, + uint64_t obj, dmu_tx_t *tx); +static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, + dmu_tx_t *tx); + extern int spa_asize_inflation; static zil_header_t zero_zil; @@ -161,6 +167,47 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) DD_USED_REFRSRV, DD_USED_HEAD, NULL); } +/* + * Called when the specified segment has been remapped, and is thus no + * longer referenced in the head dataset. The vdev must be indirect. + * + * If the segment is referenced by a snapshot, put it on the remap deadlist. + * Otherwise, add this segment to the obsolete spacemap. 
+ */ +void +dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset, + uint64_t size, uint64_t birth, dmu_tx_t *tx) +{ + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(birth <= tx->tx_txg); + ASSERT(!ds->ds_is_snapshot); + + if (birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { + spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); + } else { + blkptr_t fakebp; + dva_t *dva = &fakebp.blk_dva[0]; + + ASSERT(ds != NULL); + + mutex_enter(&ds->ds_remap_deadlist_lock); + if (!dsl_dataset_remap_deadlist_exists(ds)) { + dsl_dataset_create_remap_deadlist(ds, tx); + } + mutex_exit(&ds->ds_remap_deadlist_lock); + + BP_ZERO(&fakebp); + fakebp.blk_birth = birth; + DVA_SET_VDEV(dva, vdev); + DVA_SET_OFFSET(dva, offset); + DVA_SET_ASIZE(dva, size); + + dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, tx); + } +} + int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, boolean_t async) @@ -279,8 +326,10 @@ dsl_dataset_evict_async(void *dbu) } bplist_destroy(&ds->ds_pending_deadlist); - if (ds->ds_deadlist.dl_os != NULL) + if (dsl_deadlist_is_open(&ds->ds_deadlist)) dsl_deadlist_close(&ds->ds_deadlist); + if (dsl_deadlist_is_open(&ds->ds_remap_deadlist)) + dsl_deadlist_close(&ds->ds_remap_deadlist); if (ds->ds_dir) dsl_dir_async_rele(ds->ds_dir, ds); @@ -294,6 +343,7 @@ dsl_dataset_evict_async(void *dbu) mutex_exit(&ds->ds_opening_lock); mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_sendstream_lock); + mutex_destroy(&ds->ds_remap_deadlist_lock); refcount_destroy(&ds->ds_longholds); rrw_destroy(&ds->ds_bp_rwlock); @@ -418,15 +468,23 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds->ds_object = dsobj; ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0; + err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj, + NULL, ds, &ds->ds_dir); + if (err != 0) { + kmem_free(ds, sizeof (dsl_dataset_t)); + dmu_buf_rele(dbuf, tag); + return (err); + } + mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_remap_deadlist_lock, + NULL, MUTEX_DEFAULT, NULL); rrw_init(&ds->ds_bp_rwlock, B_FALSE); refcount_create(&ds->ds_longholds); bplist_create(&ds->ds_pending_deadlist); - dsl_deadlist_open(&ds->ds_deadlist, - mos, dsl_dataset_phys(ds)->ds_deadlist_obj); list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t), offsetof(dmu_sendarg_t, dsa_link)); @@ -450,20 +508,6 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, } } - err = dsl_dir_hold_obj(dp, - dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir); - if (err != 0) { - mutex_destroy(&ds->ds_lock); - mutex_destroy(&ds->ds_opening_lock); - mutex_destroy(&ds->ds_sendstream_lock); - refcount_destroy(&ds->ds_longholds); - bplist_destroy(&ds->ds_pending_deadlist); - dsl_deadlist_close(&ds->ds_deadlist); - kmem_free(ds, sizeof (dsl_dataset_t)); - dmu_buf_rele(dbuf, tag); - return (err); - } - if (!ds->ds_is_snapshot) { ds->ds_snapname[0] = '\0'; if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { @@ -504,6 +548,15 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds->ds_reserved = ds->ds_quota = 0; } + dsl_deadlist_open(&ds->ds_deadlist, + mos, dsl_dataset_phys(ds)->ds_deadlist_obj); + uint64_t remap_deadlist_obj = + dsl_dataset_get_remap_deadlist_object(ds); + if (remap_deadlist_obj != 0) { + dsl_deadlist_open(&ds->ds_remap_deadlist, mos, + 
remap_deadlist_obj); + } + dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync, dsl_dataset_evict_async, &ds->ds_dbuf); if (err == 0) @@ -512,6 +565,8 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, if (err != 0 || winner != NULL) { bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); + if (dsl_deadlist_is_open(&ds->ds_remap_deadlist)) + dsl_deadlist_close(&ds->ds_remap_deadlist); if (ds->ds_prev) dsl_dataset_rele(ds->ds_prev, ds); dsl_dir_rele(ds->ds_dir, ds); @@ -1448,10 +1503,27 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dsl_deadlist_add_key(&ds->ds_deadlist, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); + if (dsl_dataset_remap_deadlist_exists(ds)) { + uint64_t remap_deadlist_obj = + dsl_dataset_get_remap_deadlist_object(ds); + /* + * Move the remap_deadlist to the snapshot. The head + * will create a new remap deadlist on demand, from + * dsl_dataset_block_remapped(). + */ + dsl_dataset_unset_remap_deadlist_object(ds, tx); + dsl_deadlist_close(&ds->ds_remap_deadlist); + + dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_REMAP_DEADLIST, + sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj, tx)); + } + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg); dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj; dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg; dsl_dataset_phys(ds)->ds_unique_bytes = 0; + if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; @@ -3379,6 +3451,41 @@ dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, return (0); } +static void +dsl_dataset_swap_remap_deadlists(dsl_dataset_t *clone, + dsl_dataset_t *origin, dmu_tx_t *tx) +{ + uint64_t clone_remap_dl_obj, origin_remap_dl_obj; + dsl_pool_t *dp = dmu_tx_pool(tx); + + ASSERT(dsl_pool_sync_context(dp)); + + clone_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(clone); + origin_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(origin); + + if (clone_remap_dl_obj != 0) { + dsl_deadlist_close(&clone->ds_remap_deadlist); + dsl_dataset_unset_remap_deadlist_object(clone, tx); + } + if (origin_remap_dl_obj != 0) { + dsl_deadlist_close(&origin->ds_remap_deadlist); + dsl_dataset_unset_remap_deadlist_object(origin, tx); + } + + if (clone_remap_dl_obj != 0) { + dsl_dataset_set_remap_deadlist_object(origin, + clone_remap_dl_obj, tx); + dsl_deadlist_open(&origin->ds_remap_deadlist, + dp->dp_meta_objset, clone_remap_dl_obj); + } + if (origin_remap_dl_obj != 0) { + dsl_dataset_set_remap_deadlist_object(clone, + origin_remap_dl_obj, tx); + dsl_deadlist_open(&clone->ds_remap_deadlist, + dp->dp_meta_objset, origin_remap_dl_obj); + } +} + void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_dataset_t *origin_head, dmu_tx_t *tx) @@ -3548,6 +3655,7 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_dataset_phys(clone)->ds_deadlist_obj); dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset, dsl_dataset_phys(origin_head)->ds_deadlist_obj); + dsl_dataset_swap_remap_deadlists(clone, origin_head, tx); dsl_scan_ds_clone_swapped(origin_head, clone, tx); @@ -4055,3 +4163,90 @@ dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds) zap_contains(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0); } + +uint64_t +dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds) +{ + uint64_t remap_deadlist_obj; + int err; + + if (!dsl_dataset_is_zapified(ds)) + return (0); + + err = 
zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, + DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1, + &remap_deadlist_obj); + + if (err != 0) { + VERIFY3S(err, ==, ENOENT); + return (0); + } + + ASSERT(remap_deadlist_obj != 0); + return (remap_deadlist_obj); +} + +boolean_t +dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds) +{ + EQUIV(dsl_deadlist_is_open(&ds->ds_remap_deadlist), + dsl_dataset_get_remap_deadlist_object(ds) != 0); + return (dsl_deadlist_is_open(&ds->ds_remap_deadlist)); +} + +static void +dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t obj, + dmu_tx_t *tx) +{ + ASSERT(obj != 0); + dsl_dataset_zapify(ds, tx); + VERIFY0(zap_add(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, + DS_FIELD_REMAP_DEADLIST, sizeof (obj), 1, &obj, tx)); +} + +static void +dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + VERIFY0(zap_remove(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_REMAP_DEADLIST, tx)); +} + +void +dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t remap_deadlist_object; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dsl_dataset_remap_deadlist_exists(ds)); + + remap_deadlist_object = ds->ds_remap_deadlist.dl_object; + dsl_deadlist_close(&ds->ds_remap_deadlist); + dsl_deadlist_free(spa_meta_objset(spa), remap_deadlist_object, tx); + dsl_dataset_unset_remap_deadlist_object(ds, tx); + spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); +} + +void +dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t remap_deadlist_obj; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(MUTEX_HELD(&ds->ds_remap_deadlist_lock)); + /* + * Currently we only create remap deadlists when there are indirect + * vdevs with referenced mappings. + */ + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); + + remap_deadlist_obj = dsl_deadlist_clone( + &ds->ds_deadlist, UINT64_MAX, + dsl_dataset_phys(ds)->ds_prev_snap_obj, tx); + dsl_dataset_set_remap_deadlist_object(ds, + remap_deadlist_obj, tx); + dsl_deadlist_open(&ds->ds_remap_deadlist, spa_meta_objset(spa), + remap_deadlist_obj); + spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c index ff06c9e93cc6..356e5b51c3f4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] */ @@ -99,6 +99,8 @@ dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) { dmu_object_info_t doi; + ASSERT(!dsl_deadlist_is_open(dl)); + mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); dl->dl_os = os; dl->dl_object = object; @@ -117,17 +119,25 @@ dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) dl->dl_havetree = B_FALSE; } +boolean_t +dsl_deadlist_is_open(dsl_deadlist_t *dl) +{ + return (dl->dl_os != NULL); +} + void dsl_deadlist_close(dsl_deadlist_t *dl) { void *cookie = NULL; dsl_deadlist_entry_t *dle; - dl->dl_os = NULL; + ASSERT(dsl_deadlist_is_open(dl)); if (dl->dl_oldfmt) { dl->dl_oldfmt = B_FALSE; bpobj_close(&dl->dl_bpobj); + dl->dl_os = NULL; + dl->dl_object = 0; return; } @@ -143,6 +153,8 @@ dsl_deadlist_close(dsl_deadlist_t *dl) mutex_destroy(&dl->dl_lock); dl->dl_dbuf = NULL; dl->dl_phys = NULL; + dl->dl_os = NULL; + dl->dl_object = 0; } uint64_t @@ -309,7 +321,7 @@ static void dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, uint64_t mrs_obj, dmu_tx_t *tx) { - dsl_deadlist_t dl; + dsl_deadlist_t dl = { 0 }; dsl_pool_t *dp = dmu_objset_pool(os); dsl_deadlist_open(&dl, os, dlobj); @@ -365,6 +377,7 @@ void dsl_deadlist_space(dsl_deadlist_t *dl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { + ASSERT(dsl_deadlist_is_open(dl)); if (dl->dl_oldfmt) { VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj, usedp, compp, uncompp)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c index c8cf60dd1b3e..e548ff29ee62 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2013 by Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -205,6 +205,10 @@ dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) if (clone->ds_dir->dd_origin_txg > mintxg) { dsl_deadlist_remove_key(&clone->ds_deadlist, mintxg, tx); + if (dsl_dataset_remap_deadlist_exists(clone)) { + dsl_deadlist_remove_key( + &clone->ds_remap_deadlist, mintxg, tx); + } dsl_dataset_remove_clones_key(clone, mintxg, tx); } dsl_dataset_rele(clone, FTAG); @@ -212,6 +216,39 @@ dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) zap_cursor_fini(&zc); } +static void +dsl_destroy_snapshot_handle_remaps(dsl_dataset_t *ds, dsl_dataset_t *ds_next, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + /* Move blocks to be obsoleted to pool's obsolete list. */ + if (dsl_dataset_remap_deadlist_exists(ds_next)) { + if (!bpobj_is_open(&dp->dp_obsolete_bpobj)) + dsl_pool_create_obsolete_bpobj(dp, tx); + + dsl_deadlist_move_bpobj(&ds_next->ds_remap_deadlist, + &dp->dp_obsolete_bpobj, + dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); + } + + /* Merge our deadlist into next's and free it. 
*/ + if (dsl_dataset_remap_deadlist_exists(ds)) { + uint64_t remap_deadlist_object = + dsl_dataset_get_remap_deadlist_object(ds); + ASSERT(remap_deadlist_object != 0); + + mutex_enter(&ds_next->ds_remap_deadlist_lock); + if (!dsl_dataset_remap_deadlist_exists(ds_next)) + dsl_dataset_create_remap_deadlist(ds_next, tx); + mutex_exit(&ds_next->ds_remap_deadlist_lock); + + dsl_deadlist_merge(&ds_next->ds_remap_deadlist, + remap_deadlist_object, tx); + dsl_dataset_destroy_remap_deadlist(ds, tx); + } +} + void dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) { @@ -327,11 +364,14 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) dsl_deadlist_merge(&ds_next->ds_deadlist, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); } + dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_deadlist_obj = 0; + dsl_destroy_snapshot_handle_remaps(ds, ds_next, tx); + /* Collapse range in clone heads */ dsl_dataset_remove_clones_key(ds, dsl_dataset_phys(ds)->ds_creation_txg, tx); @@ -365,6 +405,10 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds)); dsl_deadlist_remove_key(&hds->ds_deadlist, dsl_dataset_phys(ds)->ds_creation_txg, tx); + if (dsl_dataset_remap_deadlist_exists(hds)) { + dsl_deadlist_remove_key(&hds->ds_remap_deadlist, + dsl_dataset_phys(ds)->ds_creation_txg, tx); + } dsl_dataset_rele(hds, FTAG); } else { @@ -813,14 +857,18 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) /* * Destroy the deadlist. Unless it's a clone, the - * deadlist should be empty. (If it's a clone, it's - * safe to ignore the deadlist contents.) + * deadlist should be empty since the dataset has no snapshots. + * (If it's a clone, it's safe to ignore the deadlist contents + * since they are still referenced by the origin snapshot.) 
*/ dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_deadlist_obj = 0; + if (dsl_dataset_remap_deadlist_exists(ds)) + dsl_dataset_destroy_remap_deadlist(ds, tx); + objset_t *os; VERIFY0(dmu_objset_from_ds(ds, &os)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c index 6142de70ade5..4a3283cb6a76 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c @@ -132,6 +132,11 @@ extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); +typedef struct ddulrt_arg { + dsl_dir_t *ddulrta_dd; + uint64_t ddlrta_txg; +} ddulrt_arg_t; + static void dsl_dir_evict_async(void *dbu) { @@ -738,6 +743,35 @@ dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr) return (enforce); } +static void +dsl_dir_update_last_remap_txg_sync(void *varg, dmu_tx_t *tx) +{ + ddulrt_arg_t *arg = varg; + uint64_t last_remap_txg; + dsl_dir_t *dd = arg->ddulrta_dd; + objset_t *mos = dd->dd_pool->dp_meta_objset; + + dsl_dir_zapify(dd, tx); + if (zap_lookup(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, + sizeof (last_remap_txg), 1, &last_remap_txg) != 0 || + last_remap_txg < arg->ddlrta_txg) { + VERIFY0(zap_update(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, + sizeof (arg->ddlrta_txg), 1, &arg->ddlrta_txg, tx)); + } +} + +int +dsl_dir_update_last_remap_txg(dsl_dir_t *dd, uint64_t txg) +{ + ddulrt_arg_t arg; + arg.ddulrta_dd = dd; + arg.ddlrta_txg = txg; + + return (dsl_sync_task(spa_name(dd->dd_pool->dp_spa), + NULL, dsl_dir_update_last_remap_txg_sync, &arg, + 1, ZFS_SPACE_CHECK_RESERVED)); +} + /* * Check if adding additional child filesystem(s) would exceed any filesystem * limits or adding additional snapshot(s) would exceed any snapshot limits. @@ -1029,6 +1063,18 @@ dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count) } } +int +dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count) +{ + if (dsl_dir_is_zapified(dd)) { + objset_t *os = dd->dd_pool->dp_meta_objset; + return (zap_lookup(os, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, + sizeof (*count), 1, count)); + } else { + return (ENOENT); + } +} + void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) { @@ -1060,6 +1106,10 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT, count); } + if (dsl_dir_get_remaptxg(dd, &count) == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REMAPTXG, + count); + } if (dsl_dir_is_clone(dd)) { char buf[ZFS_MAX_DATASET_NAME_LEN]; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c index f45a1f1d8440..ca0fe7a3b68c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c @@ -388,9 +388,25 @@ dsl_pool_open(dsl_pool_t *dp) dp->dp_meta_objset, obj)); } + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj); + if (err == 0) { + VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, + dp->dp_meta_objset, obj)); + } else if (err == ENOENT) { + /* + * We might not have created the remap bpobj yet. 
+ */ + err = 0; + } else { + goto out; + } + } + /* - * Note: errors ignored, because the leak dir will not exist if we - * have not encountered a leak yet. + * Note: errors ignored, because these special dirs, used for + * space accounting, are only created on demand. */ (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, &dp->dp_leak_dir); @@ -436,21 +452,22 @@ dsl_pool_close(dsl_pool_t *dp) * includes pool-opening context), it actually only got a "ref" * and not a hold, so just drop that here. */ - if (dp->dp_origin_snap) + if (dp->dp_origin_snap != NULL) dsl_dataset_rele(dp->dp_origin_snap, dp); - if (dp->dp_mos_dir) + if (dp->dp_mos_dir != NULL) dsl_dir_rele(dp->dp_mos_dir, dp); - if (dp->dp_free_dir) + if (dp->dp_free_dir != NULL) dsl_dir_rele(dp->dp_free_dir, dp); - if (dp->dp_leak_dir) + if (dp->dp_leak_dir != NULL) dsl_dir_rele(dp->dp_leak_dir, dp); - if (dp->dp_root_dir) + if (dp->dp_root_dir != NULL) dsl_dir_rele(dp->dp_root_dir, dp); bpobj_close(&dp->dp_free_bpobj); + bpobj_close(&dp->dp_obsolete_bpobj); /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ - if (dp->dp_meta_objset) + if (dp->dp_meta_objset != NULL) dmu_objset_evict(dp->dp_meta_objset); txg_list_destroy(&dp->dp_dirty_datasets); @@ -476,11 +493,40 @@ dsl_pool_close(dsl_pool_t *dp) rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); taskq_destroy(dp->dp_vnrele_taskq); - if (dp->dp_blkstats) + if (dp->dp_blkstats != NULL) kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); kmem_free(dp, sizeof (dsl_pool_t)); } +void +dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) +{ + uint64_t obj; + /* + * Currently, we only create the obsolete_bpobj where there are + * indirect vdevs with referenced mappings. + */ + ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL)); + /* create and open the obsolete_bpobj */ + obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj)); + VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); + spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); +} + +void +dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) +{ + spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + VERIFY0(zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ, tx)); + bpobj_free(dp->dp_meta_objset, + dp->dp_obsolete_bpobj.bpo_object, tx); + bpobj_close(&dp->dp_obsolete_bpobj); +} + dsl_pool_t * dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c index e84e9e1d322f..61c0b2696352 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2016 Gary Mills - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. * Copyright (c) 2017 Datto Inc.
*/ @@ -69,6 +69,7 @@ unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */ unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ +unsigned int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */ unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ @@ -96,9 +97,9 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN, enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; /* max number of blocks to free in a single TXG */ -uint64_t zfs_free_max_blocks = UINT64_MAX; +uint64_t zfs_async_block_max_blocks = UINT64_MAX; SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN, - &zfs_free_max_blocks, 0, "Maximum number of blocks to free in one TXG"); + &zfs_async_block_max_blocks, 0, "Maximum number of blocks to free in one TXG"); #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ @@ -1146,7 +1147,6 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; dsl_dataset_t *ds; - objset_t *os; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); @@ -1190,18 +1190,23 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) goto out; } - if (dmu_objset_from_ds(ds, &os)) - goto out; - /* - * Only the ZIL in the head (non-snapshot) is valid. Even though + * Only the ZIL in the head (non-snapshot) is valid. Even though * snapshots can have ZIL block pointers (which may be the same - * BP as in the head), they must be ignored. So we traverse the - * ZIL here, rather than in scan_recurse(), because the regular - * snapshot block-sharing rules don't apply to it. + * BP as in the head), they must be ignored. In addition, $ORIGIN + * doesn't have an objset (i.e. its ds_bp is a hole) so we don't + * need to look for a ZIL in it either. So we traverse the ZIL here, + * rather than in scan_recurse(), because the regular snapshot + * block-sharing rules don't apply to it. */ - if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot) + if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds) && + ds->ds_dir != dp->dp_origin_snap->ds_dir) { + objset_t *os; + if (dmu_objset_from_ds(ds, &os) != 0) { + goto out; + } dsl_scan_zil(dp, &os->os_zil_header); + } /* * Iterate over the bps in this ds.
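The scan-side hunks that follow generalize the free-callback throttle into a shared async-block pattern: a callback processes entries until it exhausts a block quota or a per-txg time budget, then returns ERESTART so the iteration resumes in a later txg. A self-contained sketch of that pattern; all names, and the ERESTART stand-in value, are illustrative assumptions.

#include <stdint.h>

#define	ERESTART	(-1)	/* illustrative stand-in for the kernel value */

typedef struct budget {
	uint64_t b_visited;	/* entries handled this txg */
	uint64_t b_max_blocks;	/* cf. zfs_async_block_max_blocks */
	uint64_t b_start_ns;	/* sync start time */
	uint64_t b_limit_ns;	/* cf. scn_async_block_min_time_ms */
} budget_t;

/* Assumed monotonic-clock helper, analogous to gethrtime(). */
extern uint64_t now_ns(void);

static int
async_block_cb(budget_t *b)
{
	if (b->b_visited >= b->b_max_blocks ||
	    now_ns() - b->b_start_ns > b->b_limit_ns)
		return (ERESTART);	/* suspend; resume next txg */
	/* ... process one block pointer here ... */
	b->b_visited++;
	return (0);
}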
@@ -1510,19 +1515,19 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) } static boolean_t -dsl_scan_free_should_suspend(dsl_scan_t *scn) +dsl_scan_async_block_should_pause(dsl_scan_t *scn) { uint64_t elapsed_nanosecs; if (zfs_recover) return (B_FALSE); - if (scn->scn_visited_this_txg >= zfs_free_max_blocks) + if (scn->scn_visited_this_txg >= zfs_async_block_max_blocks) return (B_TRUE); elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || - (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms && + (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms && txg_sync_waiting(scn->scn_dp)) || spa_shutting_down(scn->scn_dp->dp_spa)); } @@ -1534,7 +1539,7 @@ dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) if (!scn->scn_is_bptree || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { - if (dsl_scan_free_should_suspend(scn)) + if (dsl_scan_async_block_should_pause(scn)) return (SET_ERROR(ERESTART)); } @@ -1547,6 +1552,22 @@ dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) return (0); } +static int +dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg; + const dva_t *dva = &bp->blk_dva[0]; + + if (dsl_scan_async_block_should_pause(scn)) + return (SET_ERROR(ERESTART)); + + spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa, + DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), + DVA_GET_ASIZE(dva), tx); + scn->scn_visited_this_txg++; + return (0); +} + boolean_t dsl_scan_active(dsl_scan_t *scn) { @@ -1627,6 +1648,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (zfs_free_bpobj_enabled && spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { scn->scn_is_bptree = B_FALSE; + scn->scn_async_block_min_time_ms = zfs_free_min_time_ms; scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, @@ -1724,6 +1746,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); } + if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) { /* finished; verify that space accounting went to zero */ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); @@ -1731,6 +1754,24 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); } + EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj), + 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ)); + if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) { + ASSERT(spa_feature_is_active(dp->dp_spa, + SPA_FEATURE_OBSOLETE_COUNTS)); + + scn->scn_is_bptree = B_FALSE; + scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms; + err = bpobj_iterate(&dp->dp_obsolete_bpobj, + dsl_scan_obsolete_block_cb, scn, tx); + if (err != 0 && err != ERESTART) + zfs_panic_recover("error %u from bpobj_iterate()", err); + + if (bpobj_is_empty(&dp->dp_obsolete_bpobj)) + dsl_pool_destroy_obsolete_bpobj(dp, tx); + } + if (scn->scn_phys.scn_state != DSS_SCANNING) return; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c index c10ca655072c..c8b18565896d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c @@ -34,6 +34,7 @@ #include #include #include +#include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS 
metaslab"); @@ -225,6 +226,11 @@ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN, &metaslab_bias_enabled, 0, "Enable metaslab group biasing"); +/* + * Enable/disable remapping of indirect DVAs to their concrete vdevs. + */ +boolean_t zfs_remap_blkptr_enable = B_TRUE; + /* * Enable/disable segment-based metaslab selection. */ @@ -255,6 +261,8 @@ uint64_t metaslab_trace_max_entries = 5000; static uint64_t metaslab_weight(metaslab_t *); static void metaslab_set_fragmentation(metaslab_t *); +static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, uint64_t); +static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); kmem_cache_t *metaslab_alloc_trace_cache; @@ -401,7 +409,7 @@ metaslab_class_histogram_verify(metaslab_class_t *mc) * Skip any holes, uninitialized top-levels, or * vdevs that are not in this metalab class. */ - if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || + if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } @@ -436,10 +444,10 @@ metaslab_class_fragmentation(metaslab_class_t *mc) metaslab_group_t *mg = tvd->vdev_mg; /* - * Skip any holes, uninitialized top-levels, or - * vdevs that are not in this metalab class. + * Skip any holes, uninitialized top-levels, + * or vdevs that are not in this metalab class. */ - if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || + if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } @@ -485,7 +493,7 @@ metaslab_class_expandable_space(metaslab_class_t *mc) vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; - if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || + if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } @@ -597,6 +605,8 @@ metaslab_group_alloc_update(metaslab_group_t *mg) boolean_t was_initialized; ASSERT(vd == vd->vdev_top); + ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, + SCL_ALLOC); mutex_enter(&mg->mg_lock); was_allocatable = mg->mg_allocatable; @@ -707,7 +717,7 @@ metaslab_group_activate(metaslab_group_t *mg) metaslab_class_t *mc = mg->mg_class; metaslab_group_t *mgprev, *mgnext; - ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); + ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); ASSERT(mc->mc_rotor != mg); ASSERT(mg->mg_prev == NULL); @@ -734,13 +744,22 @@ metaslab_group_activate(metaslab_group_t *mg) metaslab_class_minblocksize_update(mc); } +/* + * Passivate a metaslab group and remove it from the allocation rotor. + * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating + * a metaslab group. This function will momentarily drop spa_config_locks + * that are lower than the SCL_ALLOC lock (see comment below). 
+ */ void metaslab_group_passivate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; + spa_t *spa = mc->mc_spa; metaslab_group_t *mgprev, *mgnext; + int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); - ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); + ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, + (SCL_ALLOC | SCL_ZIO)); if (--mg->mg_activation_count != 0) { ASSERT(mc->mc_rotor != mg); @@ -750,7 +769,23 @@ metaslab_group_passivate(metaslab_group_t *mg) return; } + /* + * The spa_config_lock is an array of rwlocks, ordered as + * follows (from highest to lowest): + * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > + * SCL_ZIO > SCL_FREE > SCL_VDEV + * (For more information about the spa_config_lock see spa_misc.c) + * The higher the lock, the broader its coverage. When we passivate + * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO + * config locks. However, the metaslab group's taskq might be trying + * to preload metaslabs so we must drop the SCL_ZIO lock and any + * lower locks to allow the I/O to complete. At a minimum, + * we continue to hold the SCL_ALLOC lock, which prevents any future + * allocations from taking place and any changes to the vdev tree. + */ + spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); taskq_wait(mg->mg_taskq); + spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); mgprev = mg->mg_prev; @@ -1430,6 +1465,12 @@ metaslab_load(metaslab_t *msp) ASSERT(!msp->ms_loading); msp->ms_loading = B_TRUE; + /* + * Nobody else can manipulate a loading metaslab, so it's now safe + * to drop the lock. This way we don't have to hold the lock while + * reading the spacemap from disk. + */ + mutex_exit(&msp->ms_lock); /* * If the space map has not been allocated yet, then treat @@ -1442,6 +1483,8 @@ metaslab_load(metaslab_t *msp) range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size); success = (error == 0); + + mutex_enter(&msp->ms_lock); msp->ms_loading = B_FALSE; if (success) { @@ -1479,6 +1522,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; @@ -1490,7 +1534,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, */ if (object != 0) { error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, - ms->ms_size, vd->vdev_ashift, &ms->ms_lock); + ms->ms_size, vd->vdev_ashift); if (error != 0) { kmem_free(ms, sizeof (metaslab_t)); @@ -1507,7 +1551,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, * addition of new space; and for debugging, it ensures that we'd * data fault on any attempt to use this metaslab before it's ready. 
*/ - ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock); + ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms); @@ -1576,6 +1620,7 @@ metaslab_fini(metaslab_t *msp) mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); mutex_destroy(&msp->ms_lock); + mutex_destroy(&msp->ms_sync_lock); kmem_free(msp, sizeof (metaslab_t)); } @@ -1941,14 +1986,11 @@ metaslab_weight(metaslab_t *msp) ASSERT(MUTEX_HELD(&msp->ms_lock)); /* - * This vdev is in the process of being removed so there is nothing + * If this vdev is in the process of being removed, there is nothing * for us to do here. */ - if (vd->vdev_removing) { - ASSERT0(space_map_allocated(msp->ms_sm)); - ASSERT0(vd->vdev_ms_shift); + if (vd->vdev_removing) return (0); - } metaslab_set_fragmentation(msp); @@ -2080,10 +2122,13 @@ metaslab_group_preload(metaslab_group_t *mg) } mutex_enter(&mg->mg_lock); + /* * Load the next potential metaslabs */ for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { + ASSERT3P(msp->ms_group, ==, mg); + /* * We preload only the maximum number of metaslabs specified * by metaslab_preload_limit. If a metaslab is being forced @@ -2110,7 +2155,7 @@ metaslab_group_preload(metaslab_group_t *mg) * * 2. The minimal on-disk space map representation is zfs_condense_pct/100 * times the size than the free space range tree representation - * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1.MB). + * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). * * 3. The on-disk size of the space map should actually decrease. * @@ -2207,7 +2252,7 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) * a relatively inexpensive operation since we expect these trees to * have a small number of nodes. */ - condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock); + condense_tree = range_tree_create(NULL, NULL); range_tree_add(condense_tree, msp->ms_start, msp->ms_size); /* @@ -2240,7 +2285,6 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) mutex_exit(&msp->ms_lock); space_map_truncate(sm, tx); - mutex_enter(&msp->ms_lock); /* * While we would ideally like to create a space map representation @@ -2257,6 +2301,7 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) range_tree_destroy(condense_tree); space_map_write(sm, msp->ms_tree, SM_FREE, tx); + mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; } @@ -2306,10 +2351,14 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * The only state that can actually be changing concurrently with * metaslab_sync() is the metaslab's ms_tree. No other thread can * be modifying this txg's alloctree, freeingtree, freedtree, or - * space_map_phys_t. Therefore, we only hold ms_lock to satify - * space map ASSERTs. We drop it whenever we call into the DMU, - * because the DMU can call down to us (e.g. via zio_free()) at - * any time. + * space_map_phys_t. We drop ms_lock whenever we could call + * into the DMU, because the DMU can call down to us + * (e.g. via zio_free()) at any time. + * + * The spa_vdev_remove_thread() can be reading metaslab state + * concurrently, and it is locked out by the ms_sync_lock. Note + * that the ms_lock is insufficient for this, because it is dropped + * by space_map_write(). 
*/ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); @@ -2321,11 +2370,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, - msp->ms_start, msp->ms_size, vd->vdev_ashift, - &msp->ms_lock)); + msp->ms_start, msp->ms_size, vd->vdev_ashift)); ASSERT(msp->ms_sm != NULL); } + mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); /* @@ -2341,13 +2390,15 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) metaslab_should_condense(msp)) { metaslab_condense(msp, txg, tx); } else { + mutex_exit(&msp->ms_lock); space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx); + mutex_enter(&msp->ms_lock); } if (msp->ms_loaded) { /* - * When the space map is loaded, we have an accruate + * When the space map is loaded, we have an accurate * histogram in the range tree. This gives us an opportunity * to bring the space map's histogram up-to-date so we clear * it first before updating it. @@ -2415,6 +2466,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * msp->ms_id, sizeof (uint64_t), &object, tx); } + mutex_exit(&msp->ms_sync_lock); dmu_tx_commit(tx); } @@ -2444,23 +2496,19 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) for (int t = 0; t < TXG_SIZE; t++) { ASSERT(msp->ms_alloctree[t] == NULL); - msp->ms_alloctree[t] = range_tree_create(NULL, msp, - &msp->ms_lock); + msp->ms_alloctree[t] = range_tree_create(NULL, NULL); } ASSERT3P(msp->ms_freeingtree, ==, NULL); - msp->ms_freeingtree = range_tree_create(NULL, msp, - &msp->ms_lock); + msp->ms_freeingtree = range_tree_create(NULL, NULL); ASSERT3P(msp->ms_freedtree, ==, NULL); - msp->ms_freedtree = range_tree_create(NULL, msp, - &msp->ms_lock); + msp->ms_freedtree = range_tree_create(NULL, NULL); for (int t = 0; t < TXG_DEFER_SIZE; t++) { ASSERT(msp->ms_defertree[t] == NULL); - msp->ms_defertree[t] = range_tree_create(NULL, msp, - &msp->ms_lock); + msp->ms_defertree[t] = range_tree_create(NULL, NULL); } vdev_space_update(vd, 0, 0, msp->ms_size); @@ -2470,7 +2518,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - metaslab_class_get_alloc(spa_normal_class(spa)); - if (free_space <= spa_get_slop_space(spa)) { + if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { defer_allowed = B_FALSE; } @@ -2540,19 +2588,33 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) metaslab_unload(msp); } + ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_freeingtree)); + ASSERT0(range_tree_space(msp->ms_freedtree)); + mutex_exit(&msp->ms_lock); } void metaslab_sync_reassess(metaslab_group_t *mg) { + spa_t *spa = mg->mg_class->mc_spa; + + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); metaslab_group_alloc_update(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); /* - * Preload the next potential metaslabs + * Preload the next potential metaslabs but only on active + * metaslab groups. We can get into a state where the metaslab + * is no longer active since we dirty metaslabs as we remove a + * device, thus potentially making the metaslab group eligible + * for preloading.
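The ms_sync_lock protocol described in the comment above amounts to two locks with different scopes: the syncing thread holds the coarse lock for the entire sync pass and drops only the fine-grained lock around DMU calls, so a reader that must not observe partial sync state (spa_vdev_remove_thread() here) takes the coarse lock instead. A hedged sketch of that shape, with hypothetical names, using the zfs_context primitives:

#include <sys/zfs_context.h>

/*
 * Hypothetical two-lock object mirroring ms_sync_lock/ms_lock.  The
 * coarse s_sync_lock is held across an entire sync pass; only the
 * fine-grained s_lock is dropped around I/O.
 */
typedef struct slab {
	kmutex_t	s_sync_lock;	/* excludes whole-pass readers */
	kmutex_t	s_lock;		/* protects in-core trees */
} slab_t;

static void
write_to_disk(slab_t *s)	/* stand-in for space_map_write() */
{
	(void) s;
}

void
slab_sync(slab_t *s)
{
	mutex_enter(&s->s_sync_lock);	/* same order as metaslab_sync() */
	mutex_enter(&s->s_lock);
	/* ... update in-core state ... */
	mutex_exit(&s->s_lock);		/* the DMU may re-enter us here */
	write_to_disk(s);
	mutex_enter(&s->s_lock);
	/* ... histogram updates, etc. ... */
	mutex_exit(&s->s_lock);
	mutex_exit(&s->s_sync_lock);
}

void
slab_read_stable(slab_t *s)
{
	/*
	 * A reader that must not observe half-synced state (the analogue
	 * of spa_vdev_remove_thread()) takes the coarse lock; s_lock
	 * alone would not help, since slab_sync() drops it.
	 */
	mutex_enter(&s->s_sync_lock);
	/* ... read a consistent snapshot ... */
	mutex_exit(&s->s_sync_lock);
}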
*/ - metaslab_group_preload(mg); + if (mg->mg_activation_count > 0) { + metaslab_group_preload(mg); + } + spa_config_exit(spa, SCL_ALLOC, FTAG); } static uint64_t @@ -3004,7 +3066,7 @@ int ditto_same_vdev_distance_shift = 3; /* * Allocate a block for the specified i/o. */ -static int +int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, zio_alloc_list_t *zal) @@ -3050,10 +3112,11 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, /* * It's possible the vdev we're using as the hint no - * longer exists (i.e. removed). Consult the rotor when + * longer exists or its mg has been closed (e.g. by + * device removal). Consult the rotor when * all else fails. */ - if (vd != NULL) { + if (vd != NULL && vd->vdev_mg != NULL) { mg = vd->vdev_mg; if (flags & METASLAB_HINTBP_AVOID && @@ -3215,20 +3278,228 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, return (SET_ERROR(ENOSPC)); } -/* - * Free the block represented by DVA in the context of the specified - * transaction group. - */ -static void -metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) +void +metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, + uint64_t txg) { + metaslab_t *msp; + spa_t *spa = vd->vdev_spa; + + ASSERT3U(txg, ==, spa->spa_syncing_txg); + ASSERT(vdev_is_concrete(vd)); + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); + ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); + + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + VERIFY(!msp->ms_condensing); + VERIFY3U(offset, >=, msp->ms_start); + VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); + VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); + + metaslab_check_free_impl(vd, offset, asize); + mutex_enter(&msp->ms_lock); + if (range_tree_space(msp->ms_freeingtree) == 0) { + vdev_dirty(vd, VDD_METASLAB, msp, txg); + } + range_tree_add(msp->ms_freeingtree, offset, asize); + mutex_exit(&msp->ms_lock); +} + +/* ARGSUSED */ +void +metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + uint64_t *txgp = arg; + + if (vd->vdev_ops->vdev_op_remap != NULL) + vdev_indirect_mark_obsolete(vd, offset, size, *txgp); + else + metaslab_free_impl(vd, offset, size, *txgp); +} + +static void +metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, + uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); + + if (txg > spa_freeze_txg(spa)) + return; + + if (spa->spa_vdev_removal != NULL && + spa->spa_vdev_removal->svr_vdev == vd && + vdev_is_concrete(vd)) { + /* + * Note: we check if the vdev is concrete because when + * we complete the removal, we first change the vdev to be + * an indirect vdev (in open context), and then (in syncing + * context) clear spa_vdev_removal. 
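The vdev_op_remap contract that metaslab_free_impl() leans on above takes an <offset, size> range on an indirect vdev and invokes the callback once per mapped segment, recursing naturally when the destination is itself indirect. The following is a toy, table-driven illustration of that shape only; the types, names, and data layout are invented for the example and are not the real mapping code:

#include <stdint.h>
#include <stddef.h>

struct toy_vdev;
typedef void toy_remap_cb_t(uint64_t split_offset, struct toy_vdev *vd,
    uint64_t offset, uint64_t size, void *arg);

/* One mapping entry: [me_src, me_src + me_len) now lives at me_dst. */
typedef struct toy_map_entry {
	uint64_t	me_src;
	uint64_t	me_len;
	struct toy_vdev	*me_dst_vd;
	uint64_t	me_dst;
} toy_map_entry_t;

typedef struct toy_vdev {
	toy_map_entry_t	*v_map;		/* NULL for a concrete vdev */
	size_t		v_map_len;	/* entries sorted by me_src, tiling */
} toy_vdev_t;

/*
 * Invoke 'cb' once per mapped segment of [offset, offset + size).  A
 * range that crosses entry boundaries produces several callbacks -- a
 * "split block" -- which is why the first callback sees split_offset 0
 * and later ones do not.
 */
void
toy_remap(toy_vdev_t *vd, uint64_t offset, uint64_t size,
    toy_remap_cb_t *cb, void *arg)
{
	uint64_t split_offset = 0;

	for (size_t i = 0; i < vd->v_map_len && size != 0; i++) {
		toy_map_entry_t *me = &vd->v_map[i];
		uint64_t end = me->me_src + me->me_len;

		if (offset < me->me_src || offset >= end)
			continue;

		uint64_t run = end - offset < size ? end - offset : size;

		cb(split_offset, me->me_dst_vd,
		    me->me_dst + (offset - me->me_src), run, arg);
		split_offset += run;
		offset += run;
		size -= run;
	}
}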
+ */ + free_from_removing_vdev(vd, offset, size, txg); + } else if (vd->vdev_ops->vdev_op_remap != NULL) { + vdev_indirect_mark_obsolete(vd, offset, size, txg); + vd->vdev_ops->vdev_op_remap(vd, offset, size, + metaslab_free_impl_cb, &txg); + } else { + metaslab_free_concrete(vd, offset, size, txg); + } +} + +typedef struct remap_blkptr_cb_arg { + blkptr_t *rbca_bp; + spa_remap_cb_t rbca_cb; + vdev_t *rbca_remap_vd; + uint64_t rbca_remap_offset; + void *rbca_cb_arg; +} remap_blkptr_cb_arg_t; + +void +remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + remap_blkptr_cb_arg_t *rbca = arg; + blkptr_t *bp = rbca->rbca_bp; + + /* We can not remap split blocks. */ + if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) + return; + ASSERT0(inner_offset); + + if (rbca->rbca_cb != NULL) { + /* + * At this point we know that we are not handling split + * blocks and we invoke the callback on the previous + * vdev which must be indirect. + */ + ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); + + rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, + rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); + + /* set up remap_blkptr_cb_arg for the next call */ + rbca->rbca_remap_vd = vd; + rbca->rbca_remap_offset = offset; + } + + /* + * The phys birth time is that of dva[0]. This ensures that we know + * when each dva was written, so that resilver can determine which + * blocks need to be scrubbed (i.e. those written during the time + * the vdev was offline). It also ensures that the key used in + * the ARC hash table is unique (i.e. dva[0] + phys_birth). If + * we didn't change the phys_birth, a lookup in the ARC for a + * remapped BP could find the data that was previously stored at + * this vdev + offset. + */ + vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, + DVA_GET_VDEV(&bp->blk_dva[0])); + vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; + bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, + DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); + + DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); + DVA_SET_OFFSET(&bp->blk_dva[0], offset); +} + +/* + * If the block pointer contains any indirect DVAs, modify them to refer to + * concrete DVAs. Note that this will sometimes not be possible, leaving + * the indirect DVA in place. This happens if the indirect DVA spans multiple + * segments in the mapping (i.e. it is a "split block"). + * + * If the BP was remapped, calls the callback on the original dva (note the + * callback can be called multiple times if the original indirect DVA refers + * to another indirect DVA, etc). + * + * Returns TRUE if the BP was remapped. + */ +boolean_t +spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) +{ + remap_blkptr_cb_arg_t rbca; + + if (!zfs_remap_blkptr_enable) + return (B_FALSE); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) + return (B_FALSE); + + /* + * Dedup BP's can not be remapped, because ddt_phys_select() depends + * on DVA[0] being the same in the BP as in the DDT (dedup table). + */ + if (BP_GET_DEDUP(bp)) + return (B_FALSE); + + /* + * Gang blocks can not be remapped, because + * zio_checksum_gang_verifier() depends on the DVA[0] that's in + * the BP used to read the gang block header (GBH) being the same + * as the DVA[0] that we allocated for the GBH. + */ + if (BP_IS_GANG(bp)) + return (B_FALSE); + + /* + * Embedded BP's have no DVA to remap. + */ + if (BP_GET_NDVAS(bp) < 1) + return (B_FALSE); + + /* + * Note: we only remap dva[0]. 
If we remapped other dvas, we + * would no longer know what their phys birth txg is. + */ + dva_t *dva = &bp->blk_dva[0]; + + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t size = DVA_GET_ASIZE(dva); + vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + + if (vd->vdev_ops->vdev_op_remap == NULL) + return (B_FALSE); + + rbca.rbca_bp = bp; + rbca.rbca_cb = callback; + rbca.rbca_remap_vd = vd; + rbca.rbca_remap_offset = offset; + rbca.rbca_cb_arg = arg; + + /* + * remap_blkptr_cb() will be called in order for each level of + * indirection, until a concrete vdev is reached or a split block is + * encountered. old_vd and old_offset are updated within the callback + * as we go from the one indirect vdev to the next one (either concrete + * or indirect again) in that order. + */ + vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); + + /* Check if the DVA wasn't remapped because it is a split block */ + if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Undo the allocation of a DVA which happened in the given transaction group. + */ +void +metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) +{ + metaslab_t *msp; + vdev_t *vd; uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); - vdev_t *vd; - metaslab_t *msp; ASSERT(DVA_IS_VALID(dva)); + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (txg > spa_freeze_txg(spa)) return; @@ -3241,91 +3512,51 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) return; } - msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(!vd->vdev_removing); + ASSERT(vdev_is_concrete(vd)); + ASSERT0(vd->vdev_indirect_config.vic_mapping_object); + ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); if (DVA_GET_GANG(dva)) size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + mutex_enter(&msp->ms_lock); + range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], + offset, size); - if (now) { - range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], - offset, size); - - VERIFY(!msp->ms_condensing); - VERIFY3U(offset, >=, msp->ms_start); - VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); - VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, - msp->ms_size); - VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); - VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - range_tree_add(msp->ms_tree, offset, size); - msp->ms_max_size = metaslab_block_maxsize(msp); - } else { - VERIFY3U(txg, ==, spa->spa_syncing_txg); - if (range_tree_space(msp->ms_freeingtree) == 0) - vdev_dirty(vd, VDD_METASLAB, msp, txg); - range_tree_add(msp->ms_freeingtree, offset, size); - } - + VERIFY(!msp->ms_condensing); + VERIFY3U(offset, >=, msp->ms_start); + VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); + VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, + msp->ms_size); + VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); + range_tree_add(msp->ms_tree, offset, size); mutex_exit(&msp->ms_lock); } /* - * Intent log support: upon opening the pool after a crash, notify the SPA - * of blocks that the intent log has allocated for immediate write, but - * which are still considered free by the SPA because the last transaction - * group didn't commit yet. + * Free the block represented by DVA in the context of the specified + * transaction group. 
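A caller-side sketch of spa_remap_blkptr() as specified above: the callback sees the original <vdev, offset, size> once per level of indirection, and the boolean result says whether DVA[0] now refers to a concrete vdev. The callback body here is illustrative only:

/*
 * spa_remap_cb_t callback: invoked once per level of indirection with
 * the original <vdev, offset, size> before that level was remapped.
 */
static void
note_old_location(uint64_t vdev_id, uint64_t offset, uint64_t size,
    void *arg)
{
	uint64_t *nlevels = arg;

	(void) vdev_id;
	(void) offset;
	(void) size;
	(*nlevels)++;		/* e.g. log or record the old location */
}

static void
try_remap(spa_t *spa, blkptr_t *bp)
{
	uint64_t nlevels = 0;

	if (spa_remap_blkptr(spa, bp, note_old_location, &nlevels)) {
		/*
		 * DVA[0] and blk_phys_birth now describe the concrete
		 * location; nlevels indirect vdevs were traversed.
		 */
	} else {
		/*
		 * bp is unchanged: feature disabled, dedup/gang/embedded
		 * BP, vdev not indirect, or a split block.
		 */
	}
}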
*/ -static int -metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) +void +metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); - vdev_t *vd; - metaslab_t *msp; - int error = 0; + vdev_t *vd = vdev_lookup_top(spa, vdev); ASSERT(DVA_IS_VALID(dva)); + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); - if ((vd = vdev_lookup_top(spa, vdev)) == NULL || - (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) - return (SET_ERROR(ENXIO)); - - msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - - if (DVA_GET_GANG(dva)) + if (DVA_GET_GANG(dva)) { size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - - mutex_enter(&msp->ms_lock); - - if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) - error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); - - if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) - error = SET_ERROR(ENOENT); - - if (error || txg == 0) { /* txg == 0 indicates dry run */ - mutex_exit(&msp->ms_lock); - return (error); } - VERIFY(!msp->ms_condensing); - VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); - VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); - range_tree_remove(msp->ms_tree, offset, size); - - if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ - if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) - vdev_dirty(vd, VDD_METASLAB, msp, txg); - range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); - } - - mutex_exit(&msp->ms_lock); - - return (0); + metaslab_free_impl(vd, offset, size, txg); } /* @@ -3376,6 +3607,122 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio) mutex_exit(&mc->mc_lock); } +static int +metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, + uint64_t txg) +{ + metaslab_t *msp; + spa_t *spa = vd->vdev_spa; + int error = 0; + + if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) + return (ENXIO); + + ASSERT3P(vd->vdev_ms, !=, NULL); + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + mutex_enter(&msp->ms_lock); + + if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) + error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); + + if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) + error = SET_ERROR(ENOENT); + + if (error || txg == 0) { /* txg == 0 indicates dry run */ + mutex_exit(&msp->ms_lock); + return (error); + } + + VERIFY(!msp->ms_condensing); + VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); + VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); + range_tree_remove(msp->ms_tree, offset, size); + + if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ + if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) + vdev_dirty(vd, VDD_METASLAB, msp, txg); + range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); + } + + mutex_exit(&msp->ms_lock); + + return (0); +} + +typedef struct metaslab_claim_cb_arg_t { + uint64_t mcca_txg; + int mcca_error; +} metaslab_claim_cb_arg_t; + +/* ARGSUSED */ +static void +metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + metaslab_claim_cb_arg_t *mcca_arg = arg; + + if (mcca_arg->mcca_error == 0) { + mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, + size, mcca_arg->mcca_txg); + } +} + +int +metaslab_claim_impl(vdev_t *vd, uint64_t offset, 
uint64_t size, uint64_t txg) +{ + if (vd->vdev_ops->vdev_op_remap != NULL) { + metaslab_claim_cb_arg_t arg; + + /* + * Only zdb(1M) can claim on indirect vdevs. This is used + * to detect leaks of mapped space (that are not accounted + * for in the obsolete counts, spacemap, or bpobj). + */ + ASSERT(!spa_writeable(vd->vdev_spa)); + arg.mcca_error = 0; + arg.mcca_txg = txg; + + vd->vdev_ops->vdev_op_remap(vd, offset, size, + metaslab_claim_impl_cb, &arg); + + if (arg.mcca_error == 0) { + arg.mcca_error = metaslab_claim_concrete(vd, + offset, size, txg); + } + return (arg.mcca_error); + } else { + return (metaslab_claim_concrete(vd, offset, size, txg)); + } +} + +/* + * Intent log support: upon opening the pool after a crash, notify the SPA + * of blocks that the intent log has allocated for immediate write, but + * which are still considered free by the SPA because the last transaction + * group didn't commit yet. + */ +static int +metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) +{ + uint64_t vdev = DVA_GET_VDEV(dva); + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t size = DVA_GET_ASIZE(dva); + vdev_t *vd; + + if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { + return (SET_ERROR(ENXIO)); + } + + ASSERT(DVA_IS_VALID(dva)); + + if (DVA_GET_GANG(dva)) + size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + + return (metaslab_claim_impl(vd, offset, size, txg)); +} + int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, @@ -3405,7 +3752,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, txg, flags, zal); if (error != 0) { for (d--; d >= 0; d--) { - metaslab_free_dva(spa, &dva[d], txg, B_TRUE); + metaslab_unalloc_dva(spa, &dva[d], txg); metaslab_group_alloc_decrement(spa, DVA_GET_VDEV(&dva[d]), zio, flags); bzero(&dva[d], sizeof (dva_t)); @@ -3443,8 +3790,13 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); - for (int d = 0; d < ndvas; d++) - metaslab_free_dva(spa, &dva[d], txg, now); + for (int d = 0; d < ndvas; d++) { + if (now) { + metaslab_unalloc_dva(spa, &dva[d], txg); + } else { + metaslab_free_dva(spa, &dva[d], txg); + } + } spa_config_exit(spa, SCL_FREE, FTAG); } @@ -3480,6 +3832,49 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) return (error); } +/* ARGSUSED */ +static void +metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + if (vd->vdev_ops == &vdev_indirect_ops) + return; + + metaslab_check_free_impl(vd, offset, size); +} + +static void +metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) +{ + metaslab_t *msp; + spa_t *spa = vd->vdev_spa; + + if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) + return; + + if (vd->vdev_ops->vdev_op_remap != NULL) { + vd->vdev_ops->vdev_op_remap(vd, offset, size, + metaslab_check_free_impl_cb, NULL); + return; + } + + ASSERT(vdev_is_concrete(vd)); + ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); + + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + mutex_enter(&msp->ms_lock); + if (msp->ms_loaded) + range_tree_verify(msp->ms_tree, offset, size); + + range_tree_verify(msp->ms_freeingtree, offset, size); + range_tree_verify(msp->ms_freedtree, offset, size); + for (int j = 0; j < TXG_DEFER_SIZE; j++) + range_tree_verify(msp->ms_defertree[j], offset, size); + mutex_exit(&msp->ms_lock); +} + void 
metaslab_check_free(spa_t *spa, const blkptr_t *bp) { @@ -3492,15 +3887,13 @@ metaslab_check_free(spa_t *spa, const blkptr_t *bp) vdev_t *vd = vdev_lookup_top(spa, vdev); uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); - metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - if (msp->ms_loaded) - range_tree_verify(msp->ms_tree, offset, size); + if (DVA_GET_GANG(&bp->blk_dva[i])) + size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - range_tree_verify(msp->ms_freeingtree, offset, size); - range_tree_verify(msp->ms_freedtree, offset, size); - for (int j = 0; j < TXG_DEFER_SIZE; j++) - range_tree_verify(msp->ms_defertree[j], offset, size); + ASSERT3P(vd, !=, NULL); + + metaslab_check_free_impl(vd, offset, size); } spa_config_exit(spa, SCL_VDEV, FTAG); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c index 6422fd1c1fa6..3a8cb6373d94 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ #include @@ -85,7 +85,6 @@ range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) ASSERT3U(idx, <, sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); - ASSERT(MUTEX_HELD(rt->rt_lock)); rt->rt_histogram[idx]++; ASSERT3U(rt->rt_histogram[idx], !=, 0); } @@ -100,7 +99,6 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) ASSERT3U(idx, <, sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); - ASSERT(MUTEX_HELD(rt->rt_lock)); ASSERT3U(rt->rt_histogram[idx], !=, 0); rt->rt_histogram[idx]--; } @@ -128,7 +126,7 @@ range_tree_seg_compare(const void *x1, const void *x2) } range_tree_t * -range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp) +range_tree_create(range_tree_ops_t *ops, void *arg) { range_tree_t *rt; @@ -137,7 +135,6 @@ range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp) avl_create(&rt->rt_root, range_tree_seg_compare, sizeof (range_seg_t), offsetof(range_seg_t, rs_node)); - rt->rt_lock = lp; rt->rt_ops = ops; rt->rt_arg = arg; @@ -168,7 +165,6 @@ range_tree_add(void *arg, uint64_t start, uint64_t size) uint64_t end = start + size; boolean_t merge_before, merge_after; - ASSERT(MUTEX_HELD(rt->rt_lock)); VERIFY(size != 0); rsearch.rs_start = start; @@ -243,7 +239,6 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size) uint64_t end = start + size; boolean_t left_over, right_over; - ASSERT(MUTEX_HELD(rt->rt_lock)); VERIFY3U(size, !=, 0); VERIFY3U(size, <=, rt->rt_space); @@ -307,7 +302,6 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) range_seg_t rsearch; uint64_t end = start + size; - ASSERT(MUTEX_HELD(rt->rt_lock)); VERIFY(size != 0); rsearch.rs_start = start; @@ -329,11 +323,9 @@ range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size) { range_seg_t *rs; - mutex_enter(rt->rt_lock); rs = range_tree_find(rt, off, size); if (rs != NULL) panic("freeing free block; rs=%p", (void *)rs); - mutex_exit(rt->rt_lock); } boolean_t @@ -351,6 +343,9 @@ range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size) { range_seg_t *rs; + if (size == 0) + return; + while ((rs = range_tree_find_impl(rt, start, size)) != NULL) { uint64_t free_start = MAX(rs->rs_start, start); uint64_t free_end = MIN(rs->rs_end, start + size); @@ 
-363,7 +358,6 @@ range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) { range_tree_t *rt; - ASSERT(MUTEX_HELD((*rtsrc)->rt_lock)); ASSERT0(range_tree_space(*rtdst)); ASSERT0(avl_numnodes(&(*rtdst)->rt_root)); @@ -378,7 +372,6 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) range_seg_t *rs; void *cookie = NULL; - ASSERT(MUTEX_HELD(rt->rt_lock)); if (rt->rt_ops != NULL) rt->rt_ops->rtop_vacate(rt, rt->rt_arg); @@ -398,7 +391,6 @@ range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) { range_seg_t *rs; - ASSERT(MUTEX_HELD(rt->rt_lock)); for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) func(arg, rs->rs_start, rs->rs_end - rs->rs_start); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index 1f29cb2e0676..7e5c531274c3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Martin Matuska . All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -51,11 +51,15 @@ #include #include #include +#include +#include +#include #include #include #include #include #include +#include #include #include #include @@ -94,7 +98,7 @@ static int check_hostid = 1; * The interval, in seconds, at which failed configuration cache file writes * should be retried. */ -static int zfs_ccw_retry_interval = 300; +int zfs_ccw_retry_interval = 300; SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0, @@ -155,14 +159,11 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ }; -static sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, - const char *name); -static void spa_event_post(sysevent_t *ev); static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, - spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, + spa_load_state_t state, spa_import_type_t type, boolean_t trust_config, char **ereport); static void spa_vdev_resilver_done(spa_t *spa); @@ -801,7 +802,7 @@ spa_change_guid(spa_t *spa) spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); if (error == 0) { - spa_config_sync(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } @@ -1133,6 +1134,9 @@ spa_activate(spa_t *spa, int mode) */ trim_thread_create(spa); + for (size_t i = 0; i < TXG_SIZE; i++) + spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0); + list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); list_create(&spa->spa_evicting_os_list, sizeof (objset_t), @@ -1183,6 +1187,12 @@ spa_deactivate(spa_t *spa) } } + for (size_t i = 0; i < TXG_SIZE; i++) { + ASSERT3P(spa->spa_txg_zio[i], !=, NULL); + VERIFY0(zio_wait(spa->spa_txg_zio[i])); + spa->spa_txg_zio[i] = NULL; + } + metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; @@ -1326,6 +1336,13 @@ 
spa_unload(spa_t *spa) spa->spa_async_zio_root = NULL; } + if (spa->spa_vdev_removal != NULL) { + spa_vdev_removal_destroy(spa->spa_vdev_removal); + spa->spa_vdev_removal = NULL; + } + + spa_condense_fini(spa); + bpobj_close(&spa->spa_deferred_bpobj); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); @@ -1383,6 +1400,8 @@ spa_unload(spa_t *spa) spa->spa_async_suspended = 0; + spa->spa_indirect_vdevs_loaded = B_FALSE; + if (spa->spa_comment != NULL) { spa_strfree(spa->spa_comment); spa->spa_comment = NULL; @@ -1397,7 +1416,7 @@ spa_unload(spa_t *spa) * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. */ -static void +void spa_load_spares(spa_t *spa) { nvlist_t **spares; @@ -1514,7 +1533,7 @@ spa_load_spares(spa_t *spa) * Devices which are already active have their details maintained, and are * not re-opened. */ -static void +void spa_load_l2cache(spa_t *spa) { nvlist_t **l2cache; @@ -1674,7 +1693,7 @@ spa_check_removed(vdev_t *vd) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && - !vd->vdev_ishole) { + vdev_is_concrete(vd)) { zfs_post_autoreplace(vd->vdev_spa, vd); spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); } @@ -1757,27 +1776,26 @@ spa_config_valid(spa_t *spa, nvlist_t *config) /* * Resolve any "missing" vdevs in the current configuration. + * Also trust the MOS config about any "indirect" vdevs. * If we find that the MOS config has more accurate information * about the top-level vdev then use that vdev instead. */ - if (tvd->vdev_ops == &vdev_missing_ops && - mtvd->vdev_ops != &vdev_missing_ops) { - - if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) - continue; + if ((tvd->vdev_ops == &vdev_missing_ops && + mtvd->vdev_ops != &vdev_missing_ops) || + (mtvd->vdev_ops == &vdev_indirect_ops && + tvd->vdev_ops != &vdev_indirect_ops)) { /* * Device specific actions. */ if (mtvd->vdev_islog) { + if (!(spa->spa_import_flags & + ZFS_IMPORT_MISSING_LOG)) { + continue; + } + spa_set_log_state(spa, SPA_LOG_CLEAR); - } else { - /* - * XXX - once we have 'readonly' pool - * support we should be able to handle - * missing data devices by transitioning - * the pool to readonly. - */ + } else if (mtvd->vdev_ops != &vdev_indirect_ops) { continue; } @@ -1791,10 +1809,6 @@ spa_config_valid(spa_t *spa, nvlist_t *config) vdev_add_child(rvd, mtvd); vdev_add_child(mrvd, tvd); - spa_config_exit(spa, SCL_ALL, FTAG); - vdev_load(mtvd); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - vdev_reopen(rvd); } else { if (mtvd->vdev_islog) { @@ -1813,6 +1827,14 @@ spa_config_valid(spa_t *spa, nvlist_t *config) */ spa_config_valid_zaps(tvd, mtvd); } + + /* + * Never trust this info from userland; always use what's + * in the MOS. This prevents it from getting out of sync + * with the rest of the info in the MOS. 
+ */ + tvd->vdev_removing = mtvd->vdev_removing; + tvd->vdev_indirect_config = mtvd->vdev_indirect_config; } vdev_free(mrvd); @@ -1887,11 +1909,11 @@ spa_activate_log(spa_t *spa) } int -spa_offline_log(spa_t *spa) +spa_reset_logs(spa_t *spa) { int error; - error = dmu_objset_find(spa_name(spa), zil_vdev_offline, + error = dmu_objset_find(spa_name(spa), zil_reset, NULL, DS_FIND_CHILDREN); if (error == 0) { /* @@ -2108,7 +2130,7 @@ static int spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) { vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); - return (err); + return (SET_ERROR(err)); } /* @@ -2297,7 +2319,7 @@ vdev_count_verify_zaps(vdev_t *vd) */ static int spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, - spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, + spa_load_state_t state, spa_import_type_t type, boolean_t trust_config, char **ereport) { int error = 0; @@ -2315,7 +2337,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * If this is an untrusted config, access the pool in read-only mode. * This prevents things like resilvering recently removed devices. */ - if (!mosconfig) + if (!trust_config) spa->spa_mode = FREAD; ASSERT(MUTEX_HELD(&spa_namespace_lock)); @@ -2383,7 +2405,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, */ if (type != SPA_IMPORT_ASSEMBLE) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_validate(rvd, mosconfig); + error = vdev_validate(rvd, trust_config); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) @@ -2477,7 +2499,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * can handle missing vdevs. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, - &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && + &children) != 0 && trust_config && type != SPA_IMPORT_ASSEMBLE && rvd->vdev_guid_sum != ub->ub_guid_sum) return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); @@ -2501,6 +2523,13 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; + /* + * Everything that we read before we do spa_remove_init() must + * have been rewritten after the last device removal was initiated. + * Otherwise we could be reading from indirect vdevs before + * we have loaded their mappings. + */ + error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); @@ -2509,6 +2538,41 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + /* + * Validate the config, using the MOS config to fill in any + * information which might be missing. If we fail to validate + * the config then declare the pool unfit for use. If we're + * assembling a pool from a split, the log is not transferred + * over. + */ + if (type != SPA_IMPORT_ASSEMBLE) { + nvlist_t *mos_config; + if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + if (!spa_config_valid(spa, mos_config)) { + nvlist_free(mos_config); + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, + ENXIO)); + } + nvlist_free(mos_config); + + /* + * Now that we've validated the config, check the state of the + * root vdev. If it can't be opened, it indicates one or + * more toplevel vdevs are faulted. 
+ */ + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) + return (SET_ERROR(ENXIO)); + } + + /* + * Everything that we read before spa_remove_init() must be stored + * on concreted vdevs. Therefore we do this as early as possible. + */ + if (spa_remove_init(spa) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (spa_version(spa) >= SPA_VERSION_FEATURES) { boolean_t missing_feat_read = B_FALSE; nvlist_t *unsup_feat, *enabled_feat; @@ -2616,19 +2680,20 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - if (!mosconfig) { + if (!trust_config) { uint64_t hostid; - nvlist_t *policy = NULL, *nvconfig; + nvlist_t *policy = NULL; + nvlist_t *mos_config; - if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) + if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, + if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { char *hostname; unsigned long myhostid = 0; - VERIFY(nvlist_lookup_string(nvconfig, + VERIFY(nvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); #ifdef _KERNEL @@ -2642,7 +2707,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, #endif /* _KERNEL */ if (check_hostid && hostid != 0 && myhostid != 0 && hostid != myhostid) { - nvlist_free(nvconfig); + nvlist_free(mos_config); cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%lx). " @@ -2654,10 +2719,10 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, } if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_REWIND_POLICY, &policy) == 0) - VERIFY(nvlist_add_nvlist(nvconfig, + VERIFY(nvlist_add_nvlist(mos_config, ZPOOL_REWIND_POLICY, policy) == 0); - spa_config_set(spa, nvconfig); + spa_config_set(spa, mos_config); spa_unload(spa); spa_deactivate(spa); spa_activate(spa, orig_mode); @@ -2845,7 +2910,15 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, /* * Load the vdev state for all toplevel vdevs. */ - vdev_load(rvd); + error = vdev_load(rvd); + if (error != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } + + error = spa_condense_init(spa); + if (error != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } /* * Propagate the leaf DTLs we just loaded all the way up the tree. @@ -2863,38 +2936,10 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa_update_dspace(spa); - /* - * Validate the config, using the MOS config to fill in any - * information which might be missing. If we fail to validate - * the config then declare the pool unfit for use. If we're - * assembling a pool from a split, the log is not transferred - * over. - */ - if (type != SPA_IMPORT_ASSEMBLE) { - nvlist_t *nvconfig; - - if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - if (!spa_config_valid(spa, nvconfig)) { - nvlist_free(nvconfig); - return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, - ENXIO)); - } - nvlist_free(nvconfig); - - /* - * Now that we've validated the config, check the state of the - * root vdev. If it can't be opened, it indicates one or - * more toplevel vdevs are faulted. 
- */ - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) - return (SET_ERROR(ENXIO)); - - if (spa_writeable(spa) && spa_check_logs(spa)) { - *ereport = FM_EREPORT_ZFS_LOG_REPLAY; - return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); - } + if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa) && + spa_check_logs(spa)) { + *ereport = FM_EREPORT_ZFS_LOG_REPLAY; + return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); } if (missing_feat_write) { @@ -2924,6 +2969,18 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, int need_update = B_FALSE; dsl_pool_t *dp = spa_get_dsl(spa); + /* + * We must check this before we start the sync thread, because + * we only want to start a condense thread for condense + * operations that were in progress when the pool was + * imported. Once we start syncing, spa_sync() could + * initiate a condense (and start a thread for it). In + * that case it would be wrong to start a second + * condense thread. + */ + boolean_t condense_in_progress = + (spa->spa_condensing_indirect != NULL); + ASSERT(state != SPA_LOAD_TRYIMPORT); /* @@ -3002,6 +3059,16 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * Clean up any stale temporary dataset userrefs. */ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); + + /* + * Note: unlike condensing, we don't need an analogous + * "removal_in_progress" dance because no other thread + * can start a removal while we hold the spa_namespace_lock. + */ + spa_restart_removal(spa); + + if (condense_in_progress) + spa_condense_indirect_restart(spa); } return (0); @@ -3187,7 +3254,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, */ spa_unload(spa); spa_deactivate(spa); - spa_config_sync(spa, B_TRUE, B_TRUE); + spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); @@ -3742,6 +3809,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; spa->spa_load_state = SPA_LOAD_CREATE; + spa->spa_removing_phys.sr_state = DSS_NONE; + spa->spa_removing_phys.sr_removing_vdev = -1; + spa->spa_removing_phys.sr_prev_indirect_vdev = -1; /* * Create "The Godfather" zio to hold all async IOs @@ -3916,7 +3986,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, */ txg_wait_synced(spa->spa_dsl_pool, txg); - spa_config_sync(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); spa_history_log_version(spa, "create"); @@ -4388,7 +4458,7 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) if (props != NULL) spa_configfile_set(spa, props, B_FALSE); - spa_config_sync(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); mutex_exit(&spa_namespace_lock); @@ -4728,7 +4798,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) - spa_config_sync(spa, B_TRUE, B_TRUE); + spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); } mutex_exit(&spa_namespace_lock); @@ -4820,8 +4890,41 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) return (spa_vdev_exit(spa, vd, txg, error)); /* - * Transfer each new top-level vdev from vd to rvd. + * If we are in the middle of a device removal, we can only add + * devices which match the existing devices in the pool. 
+ * If we are in the middle of a removal, or have some indirect + * vdevs, we can not add raidz toplevels. */ + if (spa->spa_vdev_removal != NULL || + spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { + for (int c = 0; c < vd->vdev_children; c++) { + tvd = vd->vdev_child[c]; + if (spa->spa_vdev_removal != NULL && + tvd->vdev_ashift != + spa->spa_vdev_removal->svr_vdev->vdev_ashift) { + return (spa_vdev_exit(spa, vd, txg, EINVAL)); + } + /* Fail if top level vdev is raidz */ + if (tvd->vdev_ops == &vdev_raidz_ops) { + return (spa_vdev_exit(spa, vd, txg, EINVAL)); + } + /* + * Need the top level mirror to be + * a mirror of leaf vdevs only + */ + if (tvd->vdev_ops == &vdev_mirror_ops) { + for (uint64_t cid = 0; + cid < tvd->vdev_children; cid++) { + vdev_t *cvd = tvd->vdev_child[cid]; + if (!cvd->vdev_ops->vdev_op_leaf) { + return (spa_vdev_exit(spa, vd, + txg, EINVAL)); + } + } + } + } + } + for (int c = 0; c < vd->vdev_children; c++) { /* @@ -4907,6 +5010,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); + if (spa->spa_vdev_removal != NULL || + spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + } + if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); @@ -5352,7 +5460,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, /* clear the log and flush everything up to now */ activate_slog = spa_passivate_log(spa); (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); - error = spa_offline_log(spa); + error = spa_reset_logs(spa); txg = spa_vdev_config_enter(spa); if (activate_slog) @@ -5380,7 +5488,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ - if (vd->vdev_islog || vd->vdev_ishole) { + if (vd->vdev_islog || !vdev_is_concrete(vd)) { if (lastlog == 0) lastlog = c; continue; @@ -5433,7 +5541,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, /* make sure there's nothing stopping the split */ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || vml[c]->vdev_islog || - vml[c]->vdev_ishole || + !vdev_is_concrete(vml[c]) || vml[c]->vdev_isspare || vml[c]->vdev_isl2cache || !vdev_writeable(vml[c]) || @@ -5630,257 +5738,6 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, return (error); } -static nvlist_t * -spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) -{ - for (int i = 0; i < count; i++) { - uint64_t guid; - - VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, - &guid) == 0); - - if (guid == target_guid) - return (nvpp[i]); - } - - return (NULL); -} - -static void -spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, - nvlist_t *dev_to_remove) -{ - nvlist_t **newdev = NULL; - - if (count > 1) - newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); - - for (int i = 0, j = 0; i < count; i++) { - if (dev[i] == dev_to_remove) - continue; - VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); - } - - VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); - VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); - - for (int i = 0; i < count - 1; i++) - nvlist_free(newdev[i]); - - if (count > 1) - kmem_free(newdev, (count - 1) * sizeof (void *)); -} - -/* - * Evacuate the device. 
- */ -static int -spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) -{ - uint64_t txg; - int error = 0; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); - ASSERT(vd == vd->vdev_top); - - /* - * Evacuate the device. We don't hold the config lock as writer - * since we need to do I/O but we do keep the - * spa_namespace_lock held. Once this completes the device - * should no longer have any blocks allocated on it. - */ - if (vd->vdev_islog) { - if (vd->vdev_stat.vs_alloc != 0) - error = spa_offline_log(spa); - } else { - error = SET_ERROR(ENOTSUP); - } - - if (error) - return (error); - - /* - * The evacuation succeeded. Remove any remaining MOS metadata - * associated with this vdev, and wait for these changes to sync. - */ - ASSERT0(vd->vdev_stat.vs_alloc); - txg = spa_vdev_config_enter(spa); - vd->vdev_removing = B_TRUE; - vdev_dirty_leaves(vd, VDD_DTL, txg); - vdev_config_dirty(vd); - spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); - - return (0); -} - -/* - * Complete the removal by cleaning up the namespace. - */ -static void -spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) -{ - vdev_t *rvd = spa->spa_root_vdev; - uint64_t id = vd->vdev_id; - boolean_t last_vdev = (id == (rvd->vdev_children - 1)); - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - ASSERT(vd == vd->vdev_top); - - /* - * Only remove any devices which are empty. - */ - if (vd->vdev_stat.vs_alloc != 0) - return; - - (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); - - if (list_link_active(&vd->vdev_state_dirty_node)) - vdev_state_clean(vd); - if (list_link_active(&vd->vdev_config_dirty_node)) - vdev_config_clean(vd); - - vdev_free(vd); - - if (last_vdev) { - vdev_compact_children(rvd); - } else { - vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); - vdev_add_child(rvd, vd); - } - vdev_config_dirty(rvd); - - /* - * Reassess the health of our root vdev. - */ - vdev_reopen(rvd); -} - -/* - * Remove a device from the pool - - * - * Removing a device from the vdev namespace requires several steps - * and can take a significant amount of time. As a result we use - * the spa_vdev_config_[enter/exit] functions which allow us to - * grab and release the spa_config_lock while still holding the namespace - * lock. During each step the configuration is synced out. - * - * Currently, this supports removing only hot spares, slogs, and level 2 ARC - * devices. - */ -int -spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) -{ - vdev_t *vd; - sysevent_t *ev = NULL; - metaslab_group_t *mg; - nvlist_t **spares, **l2cache, *nv; - uint64_t txg = 0; - uint_t nspares, nl2cache; - int error = 0; - boolean_t locked = MUTEX_HELD(&spa_namespace_lock); - - ASSERT(spa_writeable(spa)); - - if (!locked) - txg = spa_vdev_enter(spa); - - vd = spa_lookup_by_guid(spa, guid, B_FALSE); - - if (spa->spa_spares.sav_vdevs != NULL && - nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && - (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { - /* - * Only remove the hot spare if it's not currently in use - * in this pool. 
- */ - if (vd == NULL || unspare) { - if (vd == NULL) - vd = spa_lookup_by_guid(spa, guid, B_TRUE); - ev = spa_event_create(spa, vd, NULL, - ESC_ZFS_VDEV_REMOVE_AUX); - spa_vdev_remove_aux(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares, nv); - spa_load_spares(spa); - spa->spa_spares.sav_sync = B_TRUE; - } else { - error = SET_ERROR(EBUSY); - } - } else if (spa->spa_l2cache.sav_vdevs != NULL && - nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && - (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { - /* - * Cache devices can always be removed. - */ - vd = spa_lookup_by_guid(spa, guid, B_TRUE); - ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); - spa_vdev_remove_aux(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); - spa_load_l2cache(spa); - spa->spa_l2cache.sav_sync = B_TRUE; - } else if (vd != NULL && vd->vdev_islog) { - ASSERT(!locked); - ASSERT(vd == vd->vdev_top); - - mg = vd->vdev_mg; - - /* - * Stop allocating from this vdev. - */ - metaslab_group_passivate(mg); - - /* - * Wait for the youngest allocations and frees to sync, - * and then wait for the deferral of those frees to finish. - */ - spa_vdev_config_exit(spa, NULL, - txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); - - /* - * Attempt to evacuate the vdev. - */ - error = spa_vdev_remove_evacuate(spa, vd); - - txg = spa_vdev_config_enter(spa); - - /* - * If we couldn't evacuate the vdev, unwind. - */ - if (error) { - metaslab_group_activate(mg); - return (spa_vdev_exit(spa, NULL, txg, error)); - } - - /* - * Clean up the vdev namespace. - */ - ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); - spa_vdev_remove_from_namespace(spa, vd); - - } else if (vd != NULL) { - /* - * Normal vdevs cannot be removed (yet). - */ - error = SET_ERROR(ENOTSUP); - } else { - /* - * There is no vdev of any kind with the specified guid. - */ - error = SET_ERROR(ENOENT); - } - - if (!locked) - error = spa_vdev_exit(spa, NULL, txg, error); - - if (ev) - spa_event_post(ev); - - return (error); -} - /* * Find any device that's done replacing, or a vdev marked 'unspare' that's * currently spared, so we can detach it. 
@@ -6288,10 +6145,13 @@ spa_async_suspend(spa_t *spa) { mutex_enter(&spa->spa_async_lock); spa->spa_async_suspended++; - while (spa->spa_async_thread != NULL && - spa->spa_async_thread_vd != NULL) + while (spa->spa_async_thread != NULL || + spa->spa_async_thread_vd != NULL || + spa->spa_condense_thread != NULL) cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); mutex_exit(&spa->spa_async_lock); + + spa_vdev_remove_suspend(spa); } void @@ -6301,6 +6161,7 @@ spa_async_resume(spa_t *spa) ASSERT(spa->spa_async_suspended != 0); spa->spa_async_suspended--; mutex_exit(&spa->spa_async_lock); + spa_restart_removal(spa); } static boolean_t @@ -6859,6 +6720,39 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) rrw_exit(&dp->dp_config_rwlock, FTAG); } +static void +vdev_indirect_state_sync_verify(vdev_t *vd) +{ + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + vdev_indirect_births_t *vib = vd->vdev_indirect_births; + + if (vd->vdev_ops == &vdev_indirect_ops) { + ASSERT(vim != NULL); + ASSERT(vib != NULL); + } + + if (vdev_obsolete_sm_object(vd) != 0) { + ASSERT(vd->vdev_obsolete_sm != NULL); + ASSERT(vd->vdev_removing || + vd->vdev_ops == &vdev_indirect_ops); + ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); + ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); + + ASSERT3U(vdev_obsolete_sm_object(vd), ==, + space_map_object(vd->vdev_obsolete_sm)); + ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, + space_map_allocated(vd->vdev_obsolete_sm)); + } + ASSERT(vd->vdev_obsolete_segments != NULL); + + /* + * Since frees / remaps to an indirect vdev can only + * happen in syncing context, the obsolete segments + * tree must be empty when we start syncing. + */ + ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); +} + /* * Sync the specified transaction group. New blocks may be dirtied as * part of the process, so we iterate until it converges. @@ -6878,6 +6772,13 @@ spa_sync(spa_t *spa, uint64_t txg) VERIFY(spa_writeable(spa)); + /* + * Wait for i/os issued in open context that need to complete + * before this txg syncs. + */ + VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK])); + spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0); + /* * Lock out configuration changes. */ @@ -6980,6 +6881,16 @@ spa_sync(spa_t *spa, uint64_t txg) ASSERT3U(mc->mc_alloc_max_slots, <=, max_queue_depth * rvd->vdev_children); + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + vdev_indirect_state_sync_verify(vd); + + if (vdev_indirect_should_condense(vd)) { + spa_condense_indirect_start_sync(vd, tx); + break; + } + } + /* * Iterate to convergence. */ @@ -7009,7 +6920,11 @@ spa_sync(spa_t *spa, uint64_t txg) ddt_sync(spa, txg); dsl_scan_sync(dp, tx); - while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) + if (spa->spa_vdev_removal != NULL) + svr_sync(spa, tx); + + while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) + != NULL) vdev_sync(vd, txg); if (pass == 1) { @@ -7061,6 +6976,10 @@ spa_sync(spa_t *spa, uint64_t txg) all_vdev_zap_entry_count); } + if (spa->spa_vdev_removal != NULL) { + ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); + } + /* * Rewrite the vdev configuration (which includes the uberblock) * to commit the transaction group. 
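The spa_txg_zio changes in this patch (a root zio per txg, created in spa_activate(), then waited on and replaced at the top of spa_sync()) implement a per-txg rendezvous: open-context I/Os attach to the txg's root, and sync cannot proceed until they all complete. A stripped-down sketch of the same rendezvous using a counter and condition variable, with hypothetical names (ZFS's TXG_SIZE of 4 is assumed; mutex/CV initialization is elided):

#include <pthread.h>
#include <stdint.h>

#define	TXG_SIZE	4
#define	TXG_MASK	(TXG_SIZE - 1)

/* Hypothetical per-txg barrier standing in for spa->spa_txg_zio[]. */
typedef struct txg_barrier {
	pthread_mutex_t	tb_lock;
	pthread_cond_t	tb_cv;
	int		tb_inflight;
} txg_barrier_t;

static txg_barrier_t barriers[TXG_SIZE];

void
open_context_io_start(uint64_t txg)	/* like adding a child zio */
{
	txg_barrier_t *tb = &barriers[txg & TXG_MASK];

	pthread_mutex_lock(&tb->tb_lock);
	tb->tb_inflight++;
	pthread_mutex_unlock(&tb->tb_lock);
}

void
open_context_io_done(uint64_t txg)
{
	txg_barrier_t *tb = &barriers[txg & TXG_MASK];

	pthread_mutex_lock(&tb->tb_lock);
	if (--tb->tb_inflight == 0)
		pthread_cond_broadcast(&tb->tb_cv);
	pthread_mutex_unlock(&tb->tb_lock);
}

void
sync_wait_open_context(uint64_t txg)	/* top of spa_sync() */
{
	txg_barrier_t *tb = &barriers[txg & TXG_MASK];

	pthread_mutex_lock(&tb->tb_lock);
	while (tb->tb_inflight != 0)	/* like zio_wait(spa_txg_zio[...]) */
		pthread_cond_wait(&tb->tb_cv, &tb->tb_lock);
	pthread_mutex_unlock(&tb->tb_lock);
}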
@@ -7085,7 +7004,8 @@ spa_sync(spa_t *spa, uint64_t txg) for (int c = 0; c < children; c++) { vd = rvd->vdev_child[(c0 + c) % children]; - if (vd->vdev_ms_array == 0 || vd->vdev_islog) + if (vd->vdev_ms_array == 0 || vd->vdev_islog || + !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; if (svdcount == SPA_DVAS_PER_BP) @@ -7328,7 +7248,7 @@ spa_has_active_shared_spare(spa_t *spa) return (B_FALSE); } -static sysevent_t * +sysevent_t * spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { sysevent_t *ev = NULL; @@ -7382,7 +7302,7 @@ spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) return (ev); } -static void +void spa_event_post(sysevent_t *ev) { #ifdef _KERNEL @@ -7393,6 +7313,14 @@ spa_event_post(sysevent_t *ev) #endif } +void +spa_event_discard(sysevent_t *ev) +{ +#ifdef _KERNEL + sysevent_free(ev); +#endif +} + /* * Post a sysevent corresponding to the given event. The 'name' must be one of * the event definitions in sys/sysevent/eventdefs.h. The payload will be diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c index f143766315ec..82c435e21570 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c @@ -56,7 +56,7 @@ * configuration information. When the module loads, we read this information * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is * maintained independently in spa.c. Whenever the namespace is modified, or - * the configuration of a pool is changed, we call spa_config_sync(), which + * the configuration of a pool is changed, we call spa_write_cachefile(), which * walks through all the active pools and writes the configuration to disk. */ @@ -216,11 +216,11 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) * the configuration has been synced to the MOS. This exposes a window where * the MOS config will have been updated but the cache file has not. If * the system were to crash at that instant then the cached config may not - * contain the correct information to open the pool and an explicity import + * contain the correct information to open the pool and an explicit import * would be required. */ void -spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) +spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) { spa_config_dirent_t *dp, *tdp; nvlist_t *nvl; @@ -556,7 +556,7 @@ spa_config_update(spa_t *spa, int what) /* * Update the global config cache to reflect the new mosconfig. */ - spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); + spa_write_cachefile(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); if (what == SPA_CONFIG_UPDATE_POOL) spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c index b6b0e2189508..3c58597d33fd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -248,8 +248,11 @@ kmem_cache_t *spa_buffer_pool; int spa_mode_global; #ifdef ZFS_DEBUG -/* Everything except dprintf and spa is on by default in debug builds */ -int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA); +/* + * Everything except dprintf, spa, and indirect_remap is on by default + * in debug builds. 
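spa_event_create(), spa_event_post(), and the new spa_event_discard() above give callers a build-then-commit-or-abort pattern for sysevents: construct the event while the vdev is still known to be valid, then post it only if the operation succeeds. A hedged caller-side sketch; perform_operation() is hypothetical:

static int perform_operation(spa_t *spa, vdev_t *vd);	/* hypothetical */

static int
guarded_op(spa_t *spa, vdev_t *vd)
{
	sysevent_t *ev;
	int error;

	/* Build the event while vd is still known to be valid. */
	ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV);

	error = perform_operation(spa, vd);

	if (error == 0)
		spa_event_post(ev);	/* commit: deliver the sysevent */
	else
		spa_event_discard(ev);	/* abort: free it unposted */

	return (error);
}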
+ */ +int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA | ZFS_DEBUG_INDIRECT_REMAP); #else int zfs_flags = 0; #endif @@ -520,7 +523,7 @@ spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) (void) refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } - ASSERT(wlocks_held <= locks); + ASSERT3U(wlocks_held, <=, locks); } void @@ -1271,7 +1274,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) * If the config changed, update the config cache. */ if (config_changed) - spa_config_sync(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE); } /* @@ -1355,7 +1358,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) */ if (config_changed) { mutex_enter(&spa_namespace_lock); - spa_config_sync(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE); mutex_exit(&spa_namespace_lock); } @@ -1433,7 +1436,7 @@ spa_rename(const char *name, const char *newname) /* * Sync the updated config cache. */ - spa_config_sync(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_close(spa, FTAG); @@ -1649,6 +1652,12 @@ spa_is_initializing(spa_t *spa) return (spa->spa_is_initializing); } +boolean_t +spa_indirect_vdevs_loaded(spa_t *spa) +{ + return (spa->spa_indirect_vdevs_loaded); +} + blkptr_t * spa_get_rootblkptr(spa_t *spa) { @@ -1799,6 +1808,24 @@ spa_update_dspace(spa_t *spa) { spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + ddt_get_dedup_dspace(spa); + if (spa->spa_vdev_removal != NULL) { + /* + * We can't allocate from the removing device, so + * subtract its size. This prevents the DMU/DSL from + * filling up the (now smaller) pool while we are in the + * middle of removing the device. + * + * Note that the DMU/DSL doesn't actually know or care + * how much space is allocated (it does its own tracking + * of how much space has been logically used). So it + * doesn't matter that the data we are moving may be + * allocated twice (on the old device and the new + * device). + */ + vdev_t *vd = spa->spa_vdev_removal->svr_vdev; + spa->spa_dspace -= spa_deflate(spa) ? + vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + } } /* @@ -2201,3 +2228,45 @@ spa_maxblocksize(spa_t *spa) else return (SPA_OLD_MAXBLOCKSIZE); } + +/* + * Returns the txg that the last device removal completed. No indirect mappings + * have been added since this txg. + */ +uint64_t +spa_get_last_removal_txg(spa_t *spa) +{ + uint64_t vdevid; + uint64_t ret = -1ULL; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + /* + * sr_prev_indirect_vdev is only modified while holding all the + * config locks, so it is sufficient to hold SCL_VDEV as reader when + * examining it. + */ + vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev; + + while (vdevid != -1ULL) { + vdev_t *vd = vdev_lookup_top(spa, vdevid); + vdev_indirect_births_t *vib = vd->vdev_indirect_births; + + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + + /* + * If the removal did not remap any data, we don't care. 
+ */ + if (vdev_indirect_births_count(vib) != 0) { + ret = vdev_indirect_births_last_entry_txg(vib); + break; + } + + vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev; + } + spa_config_exit(spa, SCL_VDEV, FTAG); + + IMPLY(ret != -1ULL, + spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); + + return (ret); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c index dc610d51b110..a5b0038bf304 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c @@ -50,42 +50,27 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_blksz, CTLFLAG_RDTUN, &space_map_blksz, "Maximum block size for space map. Must be power of 2 and greater than 4096."); /* - * Load the space map disk into the specified range tree. Segments of maptype - * are added to the range tree, other segment types are removed. - * - * Note: space_map_load() will drop sm_lock across dmu_read() calls. - * The caller must be OK with this. + * Iterate through the space map, invoking the callback on each (non-debug) + * space map entry. */ int -space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) +space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg) { uint64_t *entry, *entry_map, *entry_map_end; - uint64_t bufsize, size, offset, end, space; + uint64_t bufsize, size, offset, end; int error = 0; - ASSERT(MUTEX_HELD(sm->sm_lock)); - end = space_map_length(sm); - space = space_map_allocated(sm); - - VERIFY0(range_tree_space(rt)); - - if (maptype == SM_FREE) { - range_tree_add(rt, sm->sm_start, sm->sm_size); - space = sm->sm_size - space; - } bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); entry_map = zio_buf_alloc(bufsize); - mutex_exit(sm->sm_lock); if (end > bufsize) { dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize, end - bufsize, ZIO_PRIORITY_SYNC_READ); } - mutex_enter(sm->sm_lock); - for (offset = 0; offset < end; offset += bufsize) { + for (offset = 0; offset < end && error == 0; offset += bufsize) { size = MIN(end - offset, bufsize); VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); VERIFY(size != 0); @@ -94,19 +79,18 @@ space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) dprintf("object=%llu offset=%llx size=%llx\n", space_map_object(sm), offset, size); - mutex_exit(sm->sm_lock); error = dmu_read(sm->sm_os, space_map_object(sm), offset, size, entry_map, DMU_READ_PREFETCH); - mutex_enter(sm->sm_lock); if (error != 0) break; entry_map_end = entry_map + (size / sizeof (uint64_t)); - for (entry = entry_map; entry < entry_map_end; entry++) { + for (entry = entry_map; entry < entry_map_end && error == 0; + entry++) { uint64_t e = *entry; uint64_t offset, size; - if (SM_DEBUG_DECODE(e)) /* Skip debug entries */ + if (SM_DEBUG_DECODE(e)) /* Skip debug entries */ continue; offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) + @@ -117,25 +101,69 @@ space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift)); VERIFY3U(offset, >=, sm->sm_start); VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size); - if (SM_TYPE_DECODE(e) == maptype) { - VERIFY3U(range_tree_space(rt) + size, <=, - sm->sm_size); - range_tree_add(rt, offset, size); - } else { - range_tree_remove(rt, offset, size); - } + error = callback(SM_TYPE_DECODE(e), offset, size, arg); } } - if (error == 0) - VERIFY3U(range_tree_space(rt), ==, space); - else - range_tree_vacate(rt, NULL, NULL); - zio_buf_free(entry_map, bufsize); 
return (error); } +typedef struct space_map_load_arg { + space_map_t *smla_sm; + range_tree_t *smla_rt; + maptype_t smla_type; +} space_map_load_arg_t; + +static int +space_map_load_callback(maptype_t type, uint64_t offset, uint64_t size, + void *arg) +{ + space_map_load_arg_t *smla = arg; + if (type == smla->smla_type) { + VERIFY3U(range_tree_space(smla->smla_rt) + size, <=, + smla->smla_sm->sm_size); + range_tree_add(smla->smla_rt, offset, size); + } else { + range_tree_remove(smla->smla_rt, offset, size); + } + + return (0); +} + +/* + * Load the space map disk into the specified range tree. Segments of maptype + * are added to the range tree, other segment types are removed. + */ +int +space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) +{ + uint64_t space; + int err; + space_map_load_arg_t smla; + + VERIFY0(range_tree_space(rt)); + space = space_map_allocated(sm); + + if (maptype == SM_FREE) { + range_tree_add(rt, sm->sm_start, sm->sm_size); + space = sm->sm_size - space; + } + + smla.smla_rt = rt; + smla.smla_sm = sm; + smla.smla_type = maptype; + err = space_map_iterate(sm, space_map_load_callback, &smla); + + if (err == 0) { + VERIFY3U(range_tree_space(rt), ==, space); + } else { + range_tree_vacate(rt, NULL, NULL); + } + + return (err); +} + void space_map_histogram_clear(space_map_t *sm) { @@ -164,7 +192,6 @@ space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx) { int idx = 0; - ASSERT(MUTEX_HELD(rt->rt_lock)); ASSERT(dmu_tx_is_syncing(tx)); VERIFY3U(space_map_object(sm), !=, 0); @@ -233,9 +260,6 @@ space_map_entries(space_map_t *sm, range_tree_t *rt) return (entries); } -/* - * Note: space_map_write() will drop sm_lock across dmu_write() calls. - */ void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, dmu_tx_t *tx) @@ -248,7 +272,6 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, uint64_t *entry, *entry_map, *entry_map_end; uint64_t expected_entries, actual_entries = 1; - ASSERT(MUTEX_HELD(rt->rt_lock)); ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); VERIFY3U(space_map_object(sm), !=, 0); dmu_buf_will_dirty(sm->sm_dbuf, tx); @@ -298,11 +321,9 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, run_len = MIN(size, SM_RUN_MAX); if (entry == entry_map_end) { - mutex_exit(rt->rt_lock); dmu_write(os, space_map_object(sm), sm->sm_phys->smp_objsize, sm->sm_blksz, entry_map, tx); - mutex_enter(rt->rt_lock); sm->sm_phys->smp_objsize += sm->sm_blksz; entry = entry_map; } @@ -319,10 +340,8 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, if (entry != entry_map) { size = (entry - entry_map) * sizeof (uint64_t); - mutex_exit(rt->rt_lock); dmu_write(os, space_map_object(sm), sm->sm_phys->smp_objsize, size, entry_map, tx); - mutex_enter(rt->rt_lock); sm->sm_phys->smp_objsize += size; } ASSERT3U(expected_entries, ==, actual_entries); @@ -355,7 +374,7 @@ space_map_open_impl(space_map_t *sm) int space_map_open(space_map_t **smp, objset_t *os, uint64_t object, - uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp) + uint64_t start, uint64_t size, uint8_t shift) { space_map_t *sm; int error; @@ -369,7 +388,6 @@ space_map_open(space_map_t **smp, objset_t *os, uint64_t object, sm->sm_start = start; sm->sm_size = size; sm->sm_shift = shift; - sm->sm_lock = lp; sm->sm_os = os; sm->sm_object = object; @@ -458,8 +476,6 @@ space_map_update(space_map_t *sm) if (sm == NULL) return; - ASSERT(MUTEX_HELD(sm->sm_lock)); - sm->sm_alloc = sm->sm_phys->smp_alloc; sm->sm_length 
= sm->sm_phys->smp_objsize; } @@ -487,27 +503,29 @@ space_map_alloc(objset_t *os, dmu_tx_t *tx) } void -space_map_free(space_map_t *sm, dmu_tx_t *tx) +space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx) { - spa_t *spa; - - if (sm == NULL) - return; - - spa = dmu_objset_spa(sm->sm_os); + spa_t *spa = dmu_objset_spa(os); if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { dmu_object_info_t doi; - dmu_object_info_from_db(sm->sm_dbuf, &doi); + VERIFY0(dmu_object_info(os, smobj, &doi)); if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) { - VERIFY(spa_feature_is_active(spa, - SPA_FEATURE_SPACEMAP_HISTOGRAM)); spa_feature_decr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx); } } - VERIFY3U(dmu_object_free(sm->sm_os, space_map_object(sm), tx), ==, 0); + VERIFY0(dmu_object_free(os, smobj, tx)); +} + +void +space_map_free(space_map_t *sm, dmu_tx_t *tx) +{ + if (sm == NULL) + return; + + space_map_free_obj(sm->sm_os, space_map_object(sm), tx); sm->sm_object = 0; } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c index 3d990596f766..a866e65d54f7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c @@ -117,8 +117,6 @@ space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt) { range_seg_t *rs; - ASSERT(MUTEX_HELD(rt->rt_lock)); - for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt); } @@ -134,8 +132,6 @@ space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, int64_t minref) int64_t refcnt = 0; space_ref_t *sr; - ASSERT(MUTEX_HELD(rt->rt_lock)); - range_tree_vacate(rt, NULL, NULL); for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h index 2a365199ce44..d425e239f6a6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
*/ #ifndef _SYS_BPOBJ_H @@ -74,6 +74,7 @@ void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx); int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object); void bpobj_close(bpobj_t *bpo); +boolean_t bpobj_is_open(const bpobj_t *bpo); int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx); int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *); @@ -85,6 +86,7 @@ int bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +boolean_t bpobj_is_empty(bpobj_t *bpo); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h index 90a0f221c69b..69617b3dca9c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h @@ -318,6 +318,8 @@ void dbuf_unoverride(dbuf_dirty_record_t *dr); void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx); void dbuf_release_bp(dmu_buf_impl_t *db); +boolean_t dbuf_can_remap(const dmu_buf_impl_t *buf); + void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, struct dmu_tx *); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h index e61318ad2271..b0c809221170 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -293,6 +293,7 @@ int dmu_objset_find(char *name, int func(const char *, void *), void *arg, void dmu_objset_byteswap(void *buf, size_t size); int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive); +int dmu_objset_remap_indirects(const char *fsname); typedef struct dmu_buf { uint64_t db_object; /* object that this buffer is part of */ @@ -329,6 +330,9 @@ typedef struct dmu_buf { #define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" #define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt" #define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map" +#define DMU_POOL_REMOVING "com.delphix:removing" +#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" +#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" /* * Allocate an object from this objset. The range of object numbers @@ -412,6 +416,8 @@ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); +int dmu_object_remap_indirects(objset_t *os, uint64_t object, uint64_t txg); + void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, @@ -430,8 +436,8 @@ void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, * The bonus data is accessed more or less like a regular buffer. * You must dmu_bonus_hold() to get the buffer, which will give you a * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus - * data. As with any normal buffer, you must call dmu_buf_read() to - * read db_data, dmu_buf_will_dirty() before modifying it, and the + * data. As with any normal buffer, you must call dmu_buf_will_dirty() + * before modifying it, and the * object must be held in an assigned transaction before calling * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus * buffer as well. 
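The bonus-buffer contract described here reduces to a hold/dirty/modify/release sequence. A minimal sketch, where example_update_bonus is a hypothetical helper and the object is assumed to already be held in the assigned transaction:

/*
 * Illustrative only: typical bonus buffer access pattern.
 */
static int
example_update_bonus(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	int error = dmu_bonus_hold(os, object, FTAG, &db);

	if (error != 0)
		return (error);
	dmu_buf_will_dirty(db, tx);
	/* ... modify db->db_data here ... */
	dmu_buf_rele(db, FTAG);
	return (0);
}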
You must release your hold with dmu_buf_rele(). @@ -678,6 +684,7 @@ void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len); +void dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h index 92f50a01774c..5566c70add13 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h @@ -313,6 +313,7 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off, int minlvl, uint64_t blkfill, uint64_t txg); void dnode_evict_dbufs(dnode_t *dn); void dnode_evict_bonus(dnode_t *dn); +boolean_t dnode_needs_remap(const dnode_t *dn); #define DNODE_IS_CACHEABLE(_dn) \ ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h index f01c33aea855..03dca17bee6d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h @@ -100,6 +100,11 @@ struct dsl_pool; #define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok" #define DS_FIELD_RESUME_COMPRESSOK "com.delphix:resume_compressok" +/* + * This field is set to the object number of the remap deadlist if one exists. + */ +#define DS_FIELD_REMAP_DEADLIST "com.delphix:remap_deadlist" + /* * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose * name lookups should be performed case-insensitively. @@ -161,6 +166,24 @@ typedef struct dsl_dataset { dsl_deadlist_t ds_deadlist; bplist_t ds_pending_deadlist; + /* + * The remap deadlist contains blocks (DVA's, really) that are + * referenced by the previous snapshot and point to indirect vdevs, + * but in this dataset they have been remapped to point to concrete + * (or at least, less-indirect) vdevs. In other words, the + * physical DVA is referenced by the previous snapshot but not by + * this dataset. Logically, the DVA continues to be referenced, + * but we are using a different (less indirect) physical DVA. + * This deadlist is used to determine when physical DVAs that + * point to indirect vdevs are no longer referenced anywhere, + * and thus should be marked obsolete. + * + * This is only used if SPA_FEATURE_OBSOLETE_COUNTS is enabled. 
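To make the rule above concrete: when the remap path rewrites a DVA away from an indirect vdev, it must decide whether the old location is still referenced by the previous snapshot. A simplified sketch of that decision, using dsl_dataset_block_remapped() as declared further below; example_note_remap is a hypothetical helper, and the in-tree code performs additional checks:

/*
 * Illustrative only: if the block was born before the previous
 * snapshot, that snapshot still references the old (indirect) DVA,
 * so record it in the remap deadlist rather than freeing it.
 */
static void
example_note_remap(dsl_dataset_t *ds, const blkptr_t *bp,
    uint64_t old_vdev, uint64_t old_offset, uint64_t size, dmu_tx_t *tx)
{
	if (BP_PHYSICAL_BIRTH(bp) <= dsl_dataset_phys(ds)->ds_prev_snap_txg) {
		dsl_dataset_block_remapped(ds, old_vdev, old_offset,
		    size, BP_PHYSICAL_BIRTH(bp), tx);
	}
}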
+ */ + dsl_deadlist_t ds_remap_deadlist; + /* protects creation of the ds_remap_deadlist */ + kmutex_t ds_remap_deadlist_lock; + /* protected by lock on pool's dp_dirty_datasets list */ txg_node_t ds_dirty_link; list_node_t ds_synced_link; @@ -310,6 +333,8 @@ void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx); int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, boolean_t async); +void dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, + uint64_t offset, uint64_t size, uint64_t birth, dmu_tx_t *tx); void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); @@ -397,6 +422,11 @@ void dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx); int dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner, nvlist_t *result); +uint64_t dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds); +void dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx); +boolean_t dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds); +void dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx); + void dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h index d2c16d72c17e..08f38233d7ab 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 by Delphix. All rights reserved. */ #ifndef _SYS_DSL_DEADLIST_H @@ -79,6 +80,7 @@ void dsl_deadlist_space_range(dsl_deadlist_t *dl, void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx); void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dmu_tx_t *tx); +boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h index 59e8e055551a..6fb6a121ade6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. 
*/ #ifndef _SYS_DSL_DELEG_H @@ -57,6 +57,7 @@ extern "C" { #define ZFS_DELEG_PERM_RELEASE "release" #define ZFS_DELEG_PERM_DIFF "diff" #define ZFS_DELEG_PERM_BOOKMARK "bookmark" +#define ZFS_DELEG_PERM_REMAP "remap" /* * Note: the names of properties that are marked delegatable are also diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h index c5ba6e2f6e43..17747eb8ca82 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h @@ -47,6 +47,7 @@ struct dsl_dataset; #define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count" #define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count" +#define DD_FIELD_LAST_REMAP_TXG "com.delphix:last_remap_txg" typedef enum dd_used { DD_USED_HEAD, @@ -144,6 +145,7 @@ uint64_t dsl_dir_get_usedchild(dsl_dir_t *dd); void dsl_dir_get_origin(dsl_dir_t *dd, char *buf); int dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count); int dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count); +int dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count); void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv); uint64_t dsl_dir_space_available(dsl_dir_t *dd, @@ -166,6 +168,7 @@ int dsl_dir_activate_fs_ss_limit(const char *); int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *, cred_t *); void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *); +int dsl_dir_update_last_remap_txg(dsl_dir_t *, uint64_t); int dsl_dir_rename(const char *oldname, const char *newname); int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *); @@ -182,7 +185,6 @@ boolean_t dsl_dir_is_zapified(dsl_dir_t *dd); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" #define ORIGIN_DIR_NAME "$ORIGIN" -#define XLATION_DIR_NAME "$XLATION" #define FREE_DIR_NAME "$FREE" #define LEAK_DIR_NAME "$LEAK" diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h index b6c51cb3cd69..095d33a47e15 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h @@ -96,6 +96,7 @@ typedef struct dsl_pool { bpobj_t dp_free_bpobj; uint64_t dp_bptree_obj; uint64_t dp_empty_bpobj; + bpobj_t dp_obsolete_bpobj; struct dsl_scan *dp_scan; @@ -144,7 +145,6 @@ void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg); int dsl_pool_sync_context(dsl_pool_t *dp); uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree); -uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree); void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); @@ -173,6 +173,9 @@ int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **); int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp); void dsl_pool_rele(dsl_pool_t *dp, void *tag); +void dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx); + #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h index fd950cc01476..a738ebe3ee08 100644 
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2017 Datto Inc. */ @@ -117,6 +117,9 @@ typedef struct dsl_scan { boolean_t scn_is_bptree; boolean_t scn_async_destroying; boolean_t scn_async_stalled; + uint64_t scn_async_block_min_time_ms; + + /* for debugging / information */ uint64_t scn_visited_this_txg; dsl_scan_phys_t scn_phys; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h index 84c17c1c14da..1601bf8fe339 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h @@ -66,8 +66,15 @@ uint64_t metaslab_block_maxsize(metaslab_t *); int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *); +int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t, + dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); +void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, uint64_t); +void metaslab_free_dva(spa_t *, const dva_t *, uint64_t); +void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *); +void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); +int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t); void metaslab_check_free(spa_t *, const blkptr_t *); void metaslab_alloc_trace_init(void); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h index 783cd371272f..5161f98d0a9e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h @@ -258,14 +258,13 @@ struct metaslab_group { * Each metaslab maintains a set of in-core trees to track metaslab * operations. The in-core free tree (ms_tree) contains the list of * free segments which are eligible for allocation. As blocks are - * allocated, the allocated segments are removed from the ms_tree and - * added to a per txg allocation tree (ms_alloctree). This allows us to - * process all allocations in syncing context where it is safe to update - * the on-disk space maps. Frees are also processed in syncing context. - * Most frees are generated from syncing context, and those that are not - * are held in the spa_free_bplist for processing in syncing context. - * An additional set of in-core trees is maintained to track deferred - * frees (ms_defertree). Once a block is freed it will move from the + * allocated, the allocated segments are removed from the ms_tree and + * added to a per txg allocation tree (ms_alloctree). As blocks are + * freed, they are added to the free tree (ms_freeingtree). These trees + * allow us to process all allocations and frees in syncing context + * where it is safe to update the on-disk space maps. An additional set + * of in-core trees is maintained to track deferred frees + * (ms_defertree). Once a block is freed it will move from the + * ms_freedtree to the ms_defertree.
A deferred free means that a block * has been freed but cannot be used by the pool until TXG_DEFER_SIZE * transactions groups later. For example, a block that is freed in txg @@ -311,6 +310,7 @@ struct metaslab_group { */ struct metaslab { kmutex_t ms_lock; + kmutex_t ms_sync_lock; kcondvar_t ms_load_cv; space_map_t *ms_sm; uint64_t ms_id; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h index 9f3ead537165..043b1337fa83 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ #ifndef _SYS_RANGE_TREE_H @@ -41,6 +41,10 @@ extern "C" { typedef struct range_tree_ops range_tree_ops_t; +/* + * Note: the range_tree may not be accessed concurrently; consumers + * must provide external locking if required. + */ typedef struct range_tree { avl_tree_t rt_root; /* offset-ordered segment AVL tree */ uint64_t rt_space; /* sum of all segments in the map */ @@ -53,7 +57,6 @@ typedef struct range_tree { * 2^i <= size of range in bytes < 2^(i+1) */ uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE]; - kmutex_t *rt_lock; /* pointer to lock that protects map */ } range_tree_t; typedef struct range_seg { @@ -75,7 +78,7 @@ typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size); void range_tree_init(void); void range_tree_fini(void); -range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp); +range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg); void range_tree_destroy(range_tree_t *rt); boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); uint64_t range_tree_space(range_tree_t *rt); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h index a49a1c284ba1..5f39becc3092 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -233,7 +234,10 @@ typedef struct zio_cksum_salt { * E blkptr_t contains embedded data (see below) * lvl level of indirection * type DMU object type - * phys birth txg of block allocation; zero if same as logical birth txg + * phys birth txg when dva[0] was written; zero if same as logical birth txg + * note that typically all the dva's would be written in this + * txg, but they could be different if they were moved by + * device removal. * log. 
birth transaction group in which the block was logically born * fill count number of non-zero blocks under this bp * checksum[4] 256-bit checksum of the data this bp describes @@ -713,7 +717,7 @@ extern kmutex_t spa_namespace_lock; #define SPA_CONFIG_UPDATE_POOL 0 #define SPA_CONFIG_UPDATE_VDEVS 1 -extern void spa_config_sync(spa_t *, boolean_t, boolean_t); +extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t); extern void spa_config_load(void); extern nvlist_t *spa_all_configs(uint64_t *); extern void spa_config_set(spa_t *spa, nvlist_t *config); @@ -776,7 +780,7 @@ typedef enum spa_log_state { extern spa_log_state_t spa_get_log_state(spa_t *spa); extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); -extern int spa_offline_log(spa_t *spa); +extern int spa_reset_logs(spa_t *spa); /* Log claim callback */ extern void spa_claim_notify(zio_t *zio); @@ -785,6 +789,7 @@ extern void spa_claim_notify(zio_t *zio); extern boolean_t spa_shutting_down(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); extern boolean_t spa_is_initializing(spa_t *spa); +extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa); extern blkptr_t *spa_get_rootblkptr(spa_t *spa); extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); extern void spa_altroot(spa_t *, char *, size_t); @@ -849,6 +854,11 @@ extern boolean_t spa_writeable(spa_t *spa); extern boolean_t spa_has_pending_synctask(spa_t *spa); extern int spa_maxblocksize(spa_t *spa); extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp); +typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size, + void *arg); +extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, + spa_remap_cb_t callback, void *arg); +extern uint64_t spa_get_last_removal_txg(spa_t *spa); extern int spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); @@ -901,6 +911,10 @@ extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); /* asynchronous event notification */ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl, const char *name); +extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, + const char *name); +extern void spa_event_post(sysevent_t *ev); +extern void spa_event_discard(sysevent_t *ev); #ifdef ZFS_DEBUG #define dprintf_bp(bp, fmt, ...) do { \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h index 3baf2e35e95e..cbd5908cd7ac 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2013 Martin Matuska . All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -33,6 +33,7 @@ #include #include +#include #include #include #include @@ -63,6 +64,62 @@ typedef struct spa_history_phys { uint64_t sh_records_lost; /* num of records overwritten */ } spa_history_phys_t; +/* + * All members must be uint64_t, for byteswap purposes. + */ +typedef struct spa_removing_phys { + uint64_t sr_state; /* dsl_scan_state_t */ + + /* + * The vdev ID that we most recently attempted to remove, + * or -1 if no removal has been attempted. 
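The uint64_t-only constraint noted at the top of this struct is what allows a generic byteswap: an on-disk copy of the struct can be swapped word by word with no per-field description. A hedged sketch; example_byteswap_removing_phys is hypothetical, and in-tree byteswapping goes through the usual DMU machinery:

/*
 * Illustrative only: swap every 64-bit word in place.
 */
static void
example_byteswap_removing_phys(spa_removing_phys_t *srp)
{
	uint64_t *word = (uint64_t *)srp;

	for (size_t i = 0; i < sizeof (*srp) / sizeof (uint64_t); i++)
		word[i] = BSWAP_64(word[i]);
}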
+ */ + uint64_t sr_removing_vdev; + + /* + * The vdev ID that we most recently successfully removed, + * or -1 if no devices have been removed. + */ + uint64_t sr_prev_indirect_vdev; + + uint64_t sr_start_time; + uint64_t sr_end_time; + + /* + * Note that we can not use the space map's or indirect mapping's + * accounting as a substitute for these values, because we need to + * count frees of not-yet-copied data as though it did the copy. + * Otherwise, we could get into a situation where copied > to_copy, + * or we complete before copied == to_copy. + */ + uint64_t sr_to_copy; /* bytes that need to be copied */ + uint64_t sr_copied; /* bytes that have been copied or freed */ +} spa_removing_phys_t; + +/* + * This struct is stored as an entry in the DMU_POOL_DIRECTORY_OBJECT + * (with key DMU_POOL_CONDENSING_INDIRECT). It is present if a condense + * of an indirect vdev's mapping object is in progress. + */ +typedef struct spa_condensing_indirect_phys { + /* + * The vdev ID of the indirect vdev whose indirect mapping is + * being condensed. + */ + uint64_t scip_vdev; + + /* + * The vdev's old obsolete spacemap. This spacemap's contents are + * being integrated into the new mapping. + */ + uint64_t scip_prev_obsolete_sm_object; + + /* + * The new mapping object that is being created. + */ + uint64_t scip_next_mapping_object; +} spa_condensing_indirect_phys_t; + struct spa_aux_vdev { uint64_t sav_object; /* MOS object for device list */ nvlist_t *sav_config; /* cached device config */ @@ -142,6 +199,7 @@ struct spa { int spa_inject_ref; /* injection references */ uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ + boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */ uint64_t spa_import_flags; /* import specific flags */ spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; @@ -203,6 +261,14 @@ struct spa { int spa_async_suspended; /* async tasks suspended */ kcondvar_t spa_async_cv; /* wait for thread_exit() */ uint16_t spa_async_tasks; /* async task mask */ + + spa_removing_phys_t spa_removing_phys; + spa_vdev_removal_t *spa_vdev_removal; + + spa_condensing_indirect_phys_t spa_condensing_indirect_phys; + spa_condensing_indirect_t *spa_condensing_indirect; + kthread_t *spa_condense_thread; /* thread doing condense. 
*/ + char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ int spa_last_open_failed; /* error if last open failed */ @@ -232,6 +298,7 @@ struct spa { /* per-CPU array of root of async I/O: */ zio_t **spa_async_zio_root; zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ + zio_t *spa_txg_zio[TXG_SIZE]; /* spa_sync() waits for this */ kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ kcondvar_t spa_suspend_cv; /* notification of resume */ uint8_t spa_suspended; /* pool is suspended */ @@ -312,6 +379,8 @@ extern const char *spa_config_path; extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent); +extern void spa_load_spares(spa_t *spa); +extern void spa_load_l2cache(spa_t *spa); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h index a59e6d37d43a..457300d05328 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #ifndef _SYS_SPACE_MAP_H @@ -73,6 +73,9 @@ typedef struct space_map_phys { * The space map object defines a region of space, its size, how much is * allocated, and the on-disk object that stores this information. * Consumers of space maps may only access the members of this structure. + * + * Note: the space_map may not be accessed concurrently; consumers + * must provide external locking if required. */ typedef struct space_map { uint64_t sm_start; /* start of map */ @@ -85,7 +88,6 @@ typedef struct space_map { uint32_t sm_blksz; /* block size for space map */ dmu_buf_t *sm_dbuf; /* space_map_phys_t dbuf */ space_map_phys_t *sm_phys; /* on-disk space map */ - kmutex_t *sm_lock; /* pointer to lock that protects map */ } space_map_t; /* @@ -133,7 +135,11 @@ typedef enum { SM_FREE } maptype_t; +typedef int (*sm_cb_t)(maptype_t type, uint64_t offset, uint64_t size, + void *arg); + int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype); +int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg); void space_map_histogram_clear(space_map_t *sm); void space_map_histogram_add(space_map_t *sm, range_tree_t *rt, @@ -150,9 +156,10 @@ void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, void space_map_truncate(space_map_t *sm, dmu_tx_t *tx); uint64_t space_map_alloc(objset_t *os, dmu_tx_t *tx); void space_map_free(space_map_t *sm, dmu_tx_t *tx); +void space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx); int space_map_open(space_map_t **smp, objset_t *os, uint64_t object, - uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp); + uint64_t start, uint64_t size, uint8_t shift); void space_map_close(space_map_t *sm); int64_t space_map_alloc_delta(space_map_t *sm); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h index d07b928a310a..4e303bae8774 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. 
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved. */ #ifndef _SYS_VDEV_H @@ -57,7 +57,7 @@ extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); extern void vdev_reopen(vdev_t *); extern int vdev_validate_aux(vdev_t *vd); extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); - +extern boolean_t vdev_is_concrete(vdev_t *vd); extern boolean_t vdev_is_bootable(vdev_t *vd); extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); @@ -76,6 +76,11 @@ extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx); extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx); extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx); +extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx); +extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, + uint64_t size, uint64_t txg); +extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev, + uint64_t offset, uint64_t size, dmu_tx_t *tx); extern void vdev_hold(vdev_t *); extern void vdev_rele(vdev_t *); @@ -88,7 +93,6 @@ extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); extern void vdev_deadman(vdev_t *vd); - extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_clear_stats(vdev_t *vd); extern void vdev_stat_update(zio_t *zio, uint64_t psize); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h index 28d1c674f933..15fbe12a859d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -27,6 +27,7 @@ #define _SYS_VDEV_IMPL_H #include +#include #include #include #include @@ -34,6 +35,9 @@ #include #include #include +#include +#include +#include #ifdef __cplusplus extern "C" { @@ -70,6 +74,11 @@ typedef void vdev_state_change_func_t(vdev_t *vd, int, int); typedef void vdev_hold_func_t(vdev_t *vd); typedef void vdev_rele_func_t(vdev_t *vd); +typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd, + uint64_t offset, uint64_t size, void *arg); +typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, + vdev_remap_cb_t callback, void *arg); + typedef struct vdev_ops { vdev_open_func_t *vdev_op_open; vdev_close_func_t *vdev_op_close; @@ -79,6 +88,7 @@ typedef struct vdev_ops { vdev_state_change_func_t *vdev_op_state_change; vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; + vdev_remap_func_t *vdev_op_remap; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; @@ -125,6 +135,45 @@ struct vdev_queue { uint64_t vq_lastoffset; }; +/* + * On-disk indirect vdev state. + * + * An indirect vdev is described exclusively in the MOS config of a pool. + * The config for an indirect vdev includes several fields, which are + * accessed in memory by a vdev_indirect_config_t. + */ +typedef struct vdev_indirect_config { + /* + * Object (in MOS) which contains the indirect mapping. This object + * contains an array of vdev_indirect_mapping_entry_phys_t ordered by + * vimep_src. The bonus buffer for this object is a + * vdev_indirect_mapping_phys_t. This object is allocated when a vdev + * removal is initiated. + * + * Note that this object can be empty if none of the data on the vdev + * has been copied yet. + */ + uint64_t vic_mapping_object; + + /* + * Object (in MOS) which contains the birth times for the mapping + * entries. 
This object contains an array of + * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus + * buffer for this object is a vdev_indirect_birth_phys_t. This object + * is allocated when a vdev removal is initiated. + * + * Note that this object can be empty if none of the vdev has yet been + * copied. + */ + uint64_t vic_births_object; + + /* + * This is the vdev ID which was removed previous to this vdev, or + * UINT64_MAX if there are no previously removed vdevs. + */ + uint64_t vic_prev_indirect_vdev; +} vdev_indirect_config_t; + /* * Virtual device descriptor */ @@ -198,6 +247,40 @@ struct vdev { kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */ uint64_t vdev_top_zap; + /* + * Values stored in the config for an indirect or removing vdev. + */ + vdev_indirect_config_t vdev_indirect_config; + + /* + * The vdev_indirect_rwlock protects the vdev_indirect_mapping + * pointer from changing on indirect vdevs (when it is condensed). + * Note that removing (not yet indirect) vdevs have different + * access patterns (the mapping is not accessed from open context, + * e.g. from zio_read) and locking strategy (e.g. svr_lock). + */ + krwlock_t vdev_indirect_rwlock; + vdev_indirect_mapping_t *vdev_indirect_mapping; + vdev_indirect_births_t *vdev_indirect_births; + + /* + * In memory data structures used to manage the obsolete sm, for + * indirect or removing vdevs. + * + * The vdev_obsolete_segments is the in-core record of the segments + * that are no longer referenced anywhere in the pool (due to + * being freed or remapped and not referenced by any snapshots). + * During a sync, segments are added to vdev_obsolete_segments + * via vdev_indirect_mark_obsolete(); at the end of each sync + * pass, this is appended to vdev_obsolete_sm via + * vdev_indirect_sync_obsolete(). The vdev_obsolete_lock + * protects against concurrent modifications of vdev_obsolete_segments + * from multiple zio threads. + */ + kmutex_t vdev_obsolete_lock; + range_tree_t *vdev_obsolete_segments; + space_map_t *vdev_obsolete_sm; + /* * The queue depth parameters determine how many async writes are * still pending (i.e. allocated by net yet issued to disk) per @@ -348,7 +431,7 @@ extern void vdev_remove_parent(vdev_t *cvd); */ extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd); extern boolean_t vdev_log_state_valid(vdev_t *vd); -extern void vdev_load(vdev_t *vd); +extern int vdev_load(vdev_t *vd); extern int vdev_dtl_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); @@ -371,6 +454,7 @@ extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; +extern vdev_ops_t vdev_indirect_ops; /* * Common size functions @@ -386,6 +470,15 @@ extern void vdev_set_min_asize(vdev_t *vd); extern int zfs_vdev_cache_size; extern uint_t zfs_geom_probe_vdev_key; +/* + * Functions from vdev_indirect.c + */ +extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx); +extern boolean_t vdev_indirect_should_condense(vdev_t *vd); +extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx); +extern int vdev_obsolete_sm_object(vdev_t *vd); +extern boolean_t vdev_obsolete_counts_are_precise(vdev_t *vd); + #ifdef illumos /* * The vdev_buf_t is used to translate between zio_t and buf_t, and back again. 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h new file mode 100644 index 000000000000..987b14485d2b --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h @@ -0,0 +1,80 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2015 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VDEV_INDIRECT_BIRTHS_H +#define _SYS_VDEV_INDIRECT_BIRTHS_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vdev_indirect_birth_entry_phys { + uint64_t vibe_offset; + uint64_t vibe_phys_birth_txg; +} vdev_indirect_birth_entry_phys_t; + +typedef struct vdev_indirect_birth_phys { + uint64_t vib_count; /* count of v_i_b_entry_phys_t's */ +} vdev_indirect_birth_phys_t; + +typedef struct vdev_indirect_births { + uint64_t vib_object; + + /* + * Each entry indicates that everything up to but not including + * vibe_offset was copied in vibe_phys_birth_txg. Entries are sorted + * by increasing phys_birth, and also by increasing offset. See + * vdev_indirect_births_physbirth for usage. + */ + vdev_indirect_birth_entry_phys_t *vib_entries; + + objset_t *vib_objset; + + dmu_buf_t *vib_dbuf; + vdev_indirect_birth_phys_t *vib_phys; +} vdev_indirect_births_t; + +extern vdev_indirect_births_t *vdev_indirect_births_open(objset_t *os, + uint64_t object); +extern void vdev_indirect_births_close(vdev_indirect_births_t *vib); +extern boolean_t vdev_indirect_births_is_open(vdev_indirect_births_t *vib); +extern uint64_t vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx); +extern void vdev_indirect_births_free(objset_t *os, uint64_t object, + dmu_tx_t *tx); + +extern uint64_t vdev_indirect_births_count(vdev_indirect_births_t *vib); +extern uint64_t vdev_indirect_births_object(vdev_indirect_births_t *vib); + +extern void vdev_indirect_births_add_entry(vdev_indirect_births_t *vib, + uint64_t offset, uint64_t txg, dmu_tx_t *tx); + +extern uint64_t vdev_indirect_births_physbirth(vdev_indirect_births_t *vib, + uint64_t offset, uint64_t asize); + +extern uint64_t vdev_indirect_births_last_entry_txg( + vdev_indirect_births_t *vib); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_INDIRECT_BIRTHS_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h new file mode 100644 index 000000000000..7e42c1019504 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h @@ -0,0 +1,141 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2015 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VDEV_INDIRECT_MAPPING_H +#define _SYS_VDEV_INDIRECT_MAPPING_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vdev_indirect_mapping_entry_phys { + /* + * Decode with DVA_MAPPING_* macros. + * Contains: + * the source offset (low 63 bits) + * the one-bit "mark", used for garbage collection (by zdb) + */ + uint64_t vimep_src; + + /* + * Note: the DVA's asize is 24 bits, and can thus store ranges + * up to 8GB. + */ + dva_t vimep_dst; +} vdev_indirect_mapping_entry_phys_t; + +#define DVA_MAPPING_GET_SRC_OFFSET(vimep) \ + BF64_GET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0) +#define DVA_MAPPING_SET_SRC_OFFSET(vimep, x) \ + BF64_SET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0, x) + +typedef struct vdev_indirect_mapping_entry { + vdev_indirect_mapping_entry_phys_t vime_mapping; + uint32_t vime_obsolete_count; + list_node_t vime_node; +} vdev_indirect_mapping_entry_t; + +/* + * This is stored in the bonus buffer of the mapping object, see comment of + * vdev_indirect_config for more details. + */ +typedef struct vdev_indirect_mapping_phys { + uint64_t vimp_max_offset; + uint64_t vimp_bytes_mapped; + uint64_t vimp_num_entries; /* number of v_i_m_entry_phys_t's */ + + /* + * For each entry in the mapping object, this object contains an + * entry representing the number of bytes of that mapping entry + * that were no longer in use by the pool at the time this indirect + * vdev was last condensed. + */ + uint64_t vimp_counts_object; +} vdev_indirect_mapping_phys_t; + +#define VDEV_INDIRECT_MAPPING_SIZE_V0 (3 * sizeof (uint64_t)) + +typedef struct vdev_indirect_mapping { + uint64_t vim_object; + boolean_t vim_havecounts; + + /* + * An ordered array of all mapping entries, sorted by source offset. + * Note that vim_entries is needed during a removal (and contains + * mappings that have been synced to disk so far) to handle frees + * from the removing device. + */ + vdev_indirect_mapping_entry_phys_t *vim_entries; + + objset_t *vim_objset; + + dmu_buf_t *vim_dbuf; + vdev_indirect_mapping_phys_t *vim_phys; +} vdev_indirect_mapping_t; + +extern vdev_indirect_mapping_t *vdev_indirect_mapping_open(objset_t *os, + uint64_t object); +extern void vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx); +extern void vdev_indirect_mapping_free(objset_t *os, uint64_t obj, + dmu_tx_t *tx); + +extern uint64_t vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_bytes_mapped( + vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim); + +/* + * Writes the given list of vdev_indirect_mapping_entry_t to the mapping + * then updates internal state. 
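The 8GB bound in the note above is simple arithmetic: the asize field of a dva_t is 24 bits of 2^SPA_MINBLOCKSHIFT-byte (512-byte) units, so the maximum is 2^24 * 2^9 = 2^33 bytes. As an illustrative compile-time check (not part of this change):

/*
 * Illustrative only: one mapping entry can describe at most 8GB.
 */
CTASSERT(((1ULL << 24) << SPA_MINBLOCKSHIFT) == (8ULL << 30));

A consequence is that copying a contiguous region larger than 8GB necessarily produces multiple mapping entries.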
+ */ +extern void vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, + list_t *vime_list, dmu_tx_t *tx); + +extern vdev_indirect_mapping_entry_phys_t * + vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, + uint64_t offset); + +extern vdev_indirect_mapping_entry_phys_t * + vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim, + uint64_t offset); + +extern uint32_t *vdev_indirect_mapping_load_obsolete_counts( + vdev_indirect_mapping_t *vim); +extern void vdev_indirect_mapping_load_obsolete_spacemap( + vdev_indirect_mapping_t *vim, + uint32_t *counts, space_map_t *obsolete_space_sm); +extern void vdev_indirect_mapping_increment_obsolete_count( + vdev_indirect_mapping_t *vim, + uint64_t offset, uint64_t asize, uint32_t *counts); +extern void vdev_indirect_mapping_free_obsolete_counts( + vdev_indirect_mapping_t *vim, uint32_t *counts); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_INDIRECT_MAPPING_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h new file mode 100644 index 000000000000..5b1e3056be14 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h @@ -0,0 +1,93 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2014, 2015 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VDEV_REMOVAL_H +#define _SYS_VDEV_REMOVAL_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct spa_vdev_removal { + vdev_t *svr_vdev; + uint64_t svr_max_offset_to_sync[TXG_SIZE]; + /* Thread performing a vdev removal. */ + kthread_t *svr_thread; + /* Segments left to copy from the current metaslab. */ + range_tree_t *svr_allocd_segs; + kmutex_t svr_lock; + kcondvar_t svr_cv; + boolean_t svr_thread_exit; + + /* + * New mappings to write out each txg. + */ + list_t svr_new_segments[TXG_SIZE]; + + /* + * Ranges that were freed while a mapping was in flight. This is + * a subset of the ranges covered by svr_new_segments. + */ + range_tree_t *svr_frees[TXG_SIZE]; + + /* + * Number of bytes which we have finished our work for + * in each txg. This could be data copied (which will be part of + * the mappings in svr_new_segments), or data freed before + * we got around to copying it. + */ + uint64_t svr_bytes_done[TXG_SIZE]; + + /* List of leaf zap objects to be unlinked */ + nvlist_t *svr_zaplist; +} spa_vdev_removal_t; + +typedef struct spa_condensing_indirect { + /* + * New mappings to write out each txg.
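Both spa_vdev_removal_t above and spa_condensing_indirect_t here keep TXG_SIZE-element arrays: only TXG_SIZE transaction groups can be in flight at once, so the slot for a given txg is chosen by masking its low bits. A minimal sketch, where example_segments_for_txg is a hypothetical accessor:

/*
 * Illustrative only: per-txg state is indexed by txg & TXG_MASK.
 */
static list_t *
example_segments_for_txg(spa_vdev_removal_t *svr, uint64_t txg)
{
	return (&svr->svr_new_segments[txg & TXG_MASK]);
}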
+ */ + list_t sci_new_mapping_entries[TXG_SIZE]; + + vdev_indirect_mapping_t *sci_new_mapping; +} spa_condensing_indirect_t; + +extern int spa_remove_init(spa_t *); +extern void spa_restart_removal(spa_t *); +extern int spa_condense_init(spa_t *); +extern void spa_condense_fini(spa_t *); +extern void spa_condense_indirect_restart(spa_t *); +extern void spa_vdev_condense_suspend(spa_t *); +extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t); +extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t, uint64_t); +extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *); +extern void svr_sync(spa_t *spa, dmu_tx_t *tx); +extern void spa_vdev_remove_suspend(spa_t *); +extern int spa_vdev_remove_cancel(spa_t *); +extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_REMOVAL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h index 433576201f77..b04b24f17f8b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_DEBUG_H @@ -61,6 +61,7 @@ extern boolean_t zfs_free_leak_on_eio; #define ZFS_DEBUG_ZIO_FREE (1 << 6) #define ZFS_DEBUG_HISTOGRAM_VERIFY (1 << 7) #define ZFS_DEBUG_METASLAB_VERIFY (1 << 8) +#define ZFS_DEBUG_INDIRECT_REMAP (1 << 9) #ifdef ZFS_DEBUG extern void __dprintf(const char *file, const char *func, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h index 2771a2b46ad4..cff7177e7f35 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h @@ -420,7 +420,7 @@ extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid); extern void zil_commit(zilog_t *zilog, uint64_t oid); extern void zil_commit_impl(zilog_t *zilog, uint64_t oid); -extern int zil_vdev_offline(const char *osname, void *txarg); +extern int zil_reset(const char *osname, void *txarg); extern int zil_claim(struct dsl_pool *dp, struct dsl_dataset *ds, void *txarg); extern int zil_check_log_chain(struct dsl_pool *dp, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index 0259e708c536..08a5d56a8c9b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -199,7 +199,7 @@ enum zio_flag { #define ZIO_VDEV_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \ - ZIO_FLAG_CANFAIL) + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL) #define ZIO_CHILD_BIT(x) (1 << (x)) #define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x))) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h index 32e90e2fbc28..4db05ac77598 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h @@ -29,6 +29,7 @@ typedef enum zio_priority { ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ ZIO_PRIORITY_TRIM, 
/* free requests used for TRIM */ + ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */ ZIO_PRIORITY_NUM_QUEUEABLE, ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c index 884bd9f649f9..64b9c0cb3510 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c @@ -828,6 +828,8 @@ txg_list_remove(txg_list_t *tl, uint64_t txg) txg_verify(tl->tl_spa, txg); mutex_enter(&tl->tl_lock); if ((tn = tl->tl_head[t]) != NULL) { + ASSERT(tn->tn_member[t]); + ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]); p = (char *)tn - tl->tl_offset; tl->tl_head[t] = tn->tn_next[t]; tn->tn_next[t] = NULL; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c index c63a9495f91f..1af7af71ae37 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright 2013 Martin Matuska . All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -33,8 +33,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -155,6 +157,7 @@ static vdev_ops_t *vdev_ops_table[] = { &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, + &vdev_indirect_ops, NULL }; @@ -409,8 +412,10 @@ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; + vdev_indirect_config_t *vic; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); + vic = &vd->vdev_indirect_config; if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); @@ -441,14 +446,18 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); + vic->vic_prev_indirect_vdev = UINT64_MAX; + + rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); + vd->vdev_obsolete_segments = range_tree_create(NULL, NULL); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { - vd->vdev_dtl[t] = range_tree_create(NULL, NULL, - &vd->vdev_dtl_lock); + vd->vdev_dtl[t] = range_tree_create(NULL, NULL); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); @@ -474,6 +483,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, char *type; uint64_t guid = 0, islog, nparity; vdev_t *vd; + vdev_indirect_config_t *vic; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); @@ -561,6 +571,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, ASSERT(nparity != -1ULL); vd = vdev_alloc_common(spa, id, guid, ops); + vic = &vd->vdev_indirect_config; vd->vdev_islog = islog; vd->vdev_nparity = nparity; @@ -583,6 +594,16 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = 
-1ULL; + ASSERT0(vic->vic_mapping_object); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, + &vic->vic_mapping_object); + ASSERT0(vic->vic_births_object); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, + &vic->vic_births_object); + ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, + &vic->vic_prev_indirect_vdev); + /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. @@ -775,6 +796,23 @@ vdev_free(vdev_t *vd) } mutex_exit(&vd->vdev_dtl_lock); + EQUIV(vd->vdev_indirect_births != NULL, + vd->vdev_indirect_mapping != NULL); + if (vd->vdev_indirect_births != NULL) { + vdev_indirect_mapping_close(vd->vdev_indirect_mapping); + vdev_indirect_births_close(vd->vdev_indirect_births); + } + + if (vd->vdev_obsolete_sm != NULL) { + ASSERT(vd->vdev_removing || + vd->vdev_ops == &vdev_indirect_ops); + space_map_close(vd->vdev_obsolete_sm); + vd->vdev_obsolete_sm = NULL; + } + range_tree_destroy(vd->vdev_obsolete_segments); + rw_destroy(&vd->vdev_indirect_rwlock); + mutex_destroy(&vd->vdev_obsolete_lock); + mutex_destroy(&vd->vdev_queue_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); @@ -883,6 +921,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; + mvd->vdev_psize = cvd->vdev_psize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; @@ -967,15 +1006,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) ASSERT(!vd->vdev_ishole); - /* - * Compute the raidz-deflation ratio. Note, we hard-code - * in 128k (1 << 17) because it is the "typical" blocksize. - * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, - * otherwise it would inconsistently account for existing bp's. - */ - vd->vdev_deflate_ratio = (1 << 17) / - (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); - ASSERT(oldc <= newc); mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); @@ -991,7 +1021,12 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) for (m = oldc; m < newc; m++) { uint64_t object = 0; - if (txg == 0) { + /* + * vdev_ms_array may be 0 if we are creating the "fake" + * metaslabs for an indirect vdev for zdb's leak detection. + * See zdb_leak_init(). 
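The three nvlist lookups above are what re-hydrate an indirect vdev's persistent state from the pool config. As a rough sketch of the producer side (the ZPOOL_CONFIG_* names are the real ones used above; the object numbers and vdev id are invented for illustration):

	nvlist_t *nv;
	VERIFY0(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP));
	VERIFY0(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_INDIRECT));
	VERIFY0(nvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, 137));	/* made up */
	VERIFY0(nvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, 138));	/* made up */
	VERIFY0(nvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, 2));	/* made up */
	/* ... hand nv to the config machinery, then nvlist_free(nv) ... */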
+ */ + if (txg == 0 && vd->vdev_ms_array != 0) { + error = dmu_read(mos, vd->vdev_ms_array, + m * sizeof (uint64_t), sizeof (uint64_t), &object, + DMU_READ_PREFETCH); @@ -1025,12 +1060,11 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) void vdev_metaslab_fini(vdev_t *vd) { - uint64_t m; - uint64_t count = vd->vdev_ms_count; - if (vd->vdev_ms != NULL) { + uint64_t count = vd->vdev_ms_count; + metaslab_group_passivate(vd->vdev_mg); - for (m = 0; m < count; m++) { + for (uint64_t m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) @@ -1038,7 +1072,10 @@ vdev_metaslab_fini(vdev_t *vd) } kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; + + vd->vdev_ms_count = 0; } + ASSERT0(vd->vdev_ms_count); } typedef struct vdev_probe_stats { @@ -1082,6 +1119,8 @@ vdev_probe_done(zio_t *zio) zio->io_error = 0; } else { ASSERT(zio->io_error != 0); + zfs_dbgmsg("failed probe on vdev %llu", + (longlong_t)vd->vdev_id); zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, 0, 0); zio->io_error = SET_ERROR(ENXIO); @@ -1248,6 +1287,21 @@ vdev_open_children(vdev_t *vd) taskq_destroy(tq); } +/* + * Compute the raidz-deflation ratio. Note, we hard-code + * in 128k (1 << 17) because it is the "typical" blocksize. + * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, + * otherwise it would inconsistently account for existing bp's. + */ +static void +vdev_set_deflate_ratio(vdev_t *vd) +{ + if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { + vd->vdev_deflate_ratio = (1 << 17) / + (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); + } +} + /* * Prepare a virtual device for access. */ @@ -1450,6 +1504,14 @@ vdev_open(vdev_t *vd) return (error); } + if (vd->vdev_top == vd && vd->vdev_ashift != 0 && + !vd->vdev_isl2cache && !vd->vdev_islog) { + if (vd->vdev_ashift > spa->spa_max_ashift) + spa->spa_max_ashift = vd->vdev_ashift; + if (vd->vdev_ashift < spa->spa_min_ashift) + spa->spa_min_ashift = vd->vdev_ashift; + } + /* * Track the min and max ashift values for normal data devices. */ @@ -1777,7 +1839,8 @@ void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); - ASSERT(!vd->vdev_ishole); + /* indirect vdevs don't have metaslabs or dtls */ + ASSERT(vdev_is_concrete(vd) || flags == 0); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); @@ -1847,10 +1910,10 @@ vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); - mutex_enter(rt->rt_lock); + mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_contains(rt, txg, size)) range_tree_add(rt, txg, size); - mutex_exit(rt->rt_lock); + mutex_exit(&vd->vdev_dtl_lock); } boolean_t @@ -1862,10 +1925,21 @@ vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); - mutex_enter(rt->rt_lock); + /* + * While we are loading the pool, the DTLs have not been loaded yet. + * Ignore the DTLs and try all devices. This avoids a recursive + * mutex enter on the vdev_dtl_lock, and also makes us try hard + * when loading the pool (relying on the checksum to ensure that + * we get the right data -- note that while loading, we are + * only reading the MOS, which is always checksummed).
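For concreteness, a worked instance of vdev_set_deflate_ratio() above, assuming a plain (non-raidz) top-level vdev where vdev_psize_to_asize() is the identity and SPA_MINBLOCKSHIFT == 9:

	/*
	 * deflate_ratio = (1 << 17) / ((1 << 17) >> SPA_MINBLOCKSHIFT)
	 *               = 131072 / 256
	 *               = 512
	 *
	 * Accounting in the style of bp_get_dsize() then recovers the
	 * deflated size from an allocated size:
	 *
	 *     dsize = (asize >> SPA_MINBLOCKSHIFT) * deflate_ratio;
	 *
	 * i.e. (131072 >> 9) * 512 == 131072 for a 128K block. On raidz,
	 * vdev_psize_to_asize() inflates asize by parity, so the ratio
	 * comes out smaller and dsize discounts the parity overhead.
	 */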
+ */ + if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE) + return (B_FALSE); + + mutex_enter(&vd->vdev_dtl_lock); if (range_tree_space(rt) != 0) dirty = range_tree_contains(rt, txg, size); - mutex_exit(rt->rt_lock); + mutex_exit(&vd->vdev_dtl_lock); return (dirty); } @@ -1876,9 +1950,9 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; - mutex_enter(rt->rt_lock); + mutex_enter(&vd->vdev_dtl_lock); empty = (range_tree_space(rt) == 0); - mutex_exit(rt->rt_lock); + mutex_exit(&vd->vdev_dtl_lock); return (empty); } @@ -1971,7 +2045,7 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) vdev_dtl_reassess(vd->vdev_child[c], txg, scrub_txg, scrub_done); - if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux) + if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { @@ -2080,10 +2154,10 @@ vdev_dtl_load(vdev_t *vd) int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { - ASSERT(!vd->vdev_ishole); + ASSERT(vdev_is_concrete(vd)); error = space_map_open(&vd->vdev_dtl_sm, mos, - vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock); + vd->vdev_dtl_object, 0, -1ULL, 0); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); @@ -2162,11 +2236,10 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; range_tree_t *rtsync; - kmutex_t rtlock; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); - ASSERT(!vd->vdev_ishole); + ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); @@ -2200,16 +2273,11 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, - 0, -1ULL, 0, &vd->vdev_dtl_lock)); + 0, -1ULL, 0)); ASSERT(vd->vdev_dtl_sm != NULL); } - bzero(&rtlock, sizeof(rtlock)); - mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL); - - rtsync = range_tree_create(NULL, NULL, &rtlock); - - mutex_enter(&rtlock); + rtsync = range_tree_create(NULL, NULL); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); @@ -2221,9 +2289,6 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) range_tree_destroy(rtsync); - mutex_exit(&rtlock); - mutex_destroy(&rtlock); - /* * If the object for the space map has changed then dirty * the top level so that we update the config. @@ -2316,30 +2381,62 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) return (needed); } -void +int vdev_load(vdev_t *vd) { + int error = 0; /* * Recursively load all children. */ - for (int c = 0; c < vd->vdev_children; c++) - vdev_load(vd->vdev_child[c]); + for (int c = 0; c < vd->vdev_children; c++) { + error = vdev_load(vd->vdev_child[c]); + if (error != 0) { + return (error); + } + } + + vdev_set_deflate_ratio(vd); /* * If this is a top-level vdev, initialize its metaslabs. 
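The lock-argument removals above follow from a range_tree/space_map API change in this patch: range trees no longer embed a caller-supplied mutex, so space_map_open() drops its kmutex_t parameter and callers serialize explicitly. A minimal sketch (mos and smobj are placeholders):

	space_map_t *sm = NULL;
	/* old: space_map_open(&sm, mos, smobj, 0, -1ULL, 0, &vd->vdev_dtl_lock) */
	int error = space_map_open(&sm, mos, smobj, 0, -1ULL, 0);
	if (error == 0) {
		mutex_enter(&vd->vdev_dtl_lock);	/* caller-managed now */
		/* ... read or write the space map ... */
		mutex_exit(&vd->vdev_dtl_lock);
	}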
*/ - if (vd == vd->vdev_top && !vd->vdev_ishole && - (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || - vdev_metaslab_init(vd, 0) != 0)) - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); + if (vd == vd->vdev_top && vdev_is_concrete(vd)) { + if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + return (SET_ERROR(ENXIO)); + } else if ((error = vdev_metaslab_init(vd, 0)) != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + return (error); + } + } /* * If this is a leaf vdev, load its DTL. */ - if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) + if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); + return (error); + } + + uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd); + if (obsolete_sm_object != 0) { + objset_t *mos = vd->vdev_spa->spa_meta_objset; + ASSERT(vd->vdev_asize != 0); + ASSERT(vd->vdev_obsolete_sm == NULL); + + if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, + obsolete_sm_object, 0, vd->vdev_asize, 0))) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + return (error); + } + space_map_update(vd->vdev_obsolete_sm); + } + + return (0); } /* @@ -2384,14 +2481,42 @@ vdev_validate_aux(vdev_t *vd) return (0); } +/* + * Free the objects used to store this vdev's spacemaps, and the array + * that points to them. + */ void -vdev_remove(vdev_t *vd, uint64_t txg) +vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) +{ + if (vd->vdev_ms_array == 0) + return; + + objset_t *mos = vd->vdev_spa->spa_meta_objset; + uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; + size_t array_bytes = array_count * sizeof (uint64_t); + uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); + VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, + array_bytes, smobj_array, 0)); + + for (uint64_t i = 0; i < array_count; i++) { + uint64_t smobj = smobj_array[i]; + if (smobj == 0) + continue; + + space_map_free_obj(mos, smobj, tx); + } + + kmem_free(smobj_array, array_bytes); + VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); + vd->vdev_ms_array = 0; +} + +static void +vdev_remove_empty(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; dmu_tx_t *tx; - tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); @@ -2418,7 +2543,6 @@ vdev_remove(vdev_t *vd, uint64_t txg) metaslab_group_histogram_remove(mg, msp); VERIFY0(space_map_allocated(msp->ms_sm)); - space_map_free(msp->ms_sm, tx); space_map_close(msp->ms_sm); msp->ms_sm = NULL; mutex_exit(&msp->ms_lock); @@ -2428,13 +2552,10 @@ vdev_remove(vdev_t *vd, uint64_t txg) metaslab_class_histogram_verify(mg->mg_class); for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) ASSERT0(mg->mg_histogram[i]); - } - if (vd->vdev_ms_array) { - (void) dmu_object_free(mos, vd->vdev_ms_array, tx); - vd->vdev_ms_array = 0; - } + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + vdev_destroy_spacemaps(vd, tx); if (vd->vdev_islog && vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); @@ -2449,7 +2570,7 @@ vdev_sync_done(vdev_t *vd, uint64_t txg) metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); - ASSERT(!vd->vdev_ishole); + ASSERT(vdev_is_concrete(vd)); while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) 
metaslab_sync_done(msp, txg); @@ -2466,10 +2587,33 @@ vdev_sync(vdev_t *vd, uint64_t txg) metaslab_t *msp; dmu_tx_t *tx; - ASSERT(!vd->vdev_ishole); + if (range_tree_space(vd->vdev_obsolete_segments) > 0) { + dmu_tx_t *tx; - if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { + ASSERT(vd->vdev_removing || + vd->vdev_ops == &vdev_indirect_ops); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + vdev_indirect_sync_obsolete(vd, tx); + dmu_tx_commit(tx); + + /* + * If the vdev is indirect, it can't have dirty + * metaslabs or DTLs. + */ + if (vd->vdev_ops == &vdev_indirect_ops) { + ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); + ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); + return; + } + } + + ASSERT(vdev_is_concrete(vd)); + + if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && + !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); + ASSERT0(vd->vdev_indirect_config.vic_mapping_object); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); @@ -2478,12 +2622,6 @@ vdev_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } - /* - * Remove the metadata associated with this vdev once it's empty. - */ - if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) - vdev_remove(vd, txg); - while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); @@ -2492,6 +2630,16 @@ vdev_sync(vdev_t *vd, uint64_t txg) while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); + /* + * Remove the metadata associated with this vdev once it's empty. + * Note that this is typically used for log/cache device removal; + * we don't empty toplevel vdevs when removing them. But if + * a toplevel happens to be emptied, this is not harmful. + */ + if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) { + vdev_remove_empty(vd, txg); + } + (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); } @@ -2705,7 +2853,7 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); - error = spa_offline_log(spa); + error = spa_reset_logs(spa); spa_vdev_state_enter(spa, SCL_ALLOC); @@ -2794,6 +2942,12 @@ vdev_clear(spa_t *spa, vdev_t *vd) vdev_clear(spa, spa->spa_spares.sav_vdevs[c]); } + /* + * It makes no sense to "clear" an indirect vdev. + */ + if (!vdev_is_concrete(vd)) + return; + /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We @@ -2848,7 +3002,8 @@ vdev_is_dead(vdev_t *vd) * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ - return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || + return (vd->vdev_state < VDEV_STATE_DEGRADED || + vd->vdev_ops == &vdev_hole_ops || vd->vdev_ops == &vdev_missing_ops); } @@ -2861,7 +3016,8 @@ vdev_readable(vdev_t *vd) boolean_t vdev_writeable(vdev_t *vd) { - return (!vdev_is_dead(vd) && !vd->vdev_cant_write); + return (!vdev_is_dead(vd) && !vd->vdev_cant_write && + vdev_is_concrete(vd)); } boolean_t @@ -2878,7 +3034,7 @@ vdev_allocatable(vdev_t *vd) * we're asking two separate questions about it. 
*/ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && - !vd->vdev_cant_write && !vd->vdev_ishole && + !vd->vdev_cant_write && vdev_is_concrete(vd) && vd->vdev_mg->mg_initialized); } @@ -2931,7 +3087,8 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; vs->vs_physical_ashift = vd->vdev_physical_ashift; - if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) { + if (vd->vdev_aux == NULL && vd == vd->vdev_top && + vdev_is_concrete(vd)) { vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; } @@ -3075,7 +3232,8 @@ vdev_stat_update(zio_t *zio, uint64_t psize) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_WRITE && txg != 0 && + if (spa->spa_load_state == SPA_LOAD_NONE && + type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { @@ -3240,8 +3398,9 @@ vdev_config_dirty(vdev_t *vd) ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && - !vd->vdev_ishole) + vdev_is_concrete(vd)) { list_insert_head(&spa->spa_config_dirty_list, vd); + } } } @@ -3282,7 +3441,8 @@ vdev_state_dirty(vdev_t *vd) (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); - if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) + if (!list_link_active(&vd->vdev_state_dirty_node) && + vdev_is_concrete(vd)) list_insert_head(&spa->spa_state_dirty_list, vd); } @@ -3316,9 +3476,10 @@ vdev_propagate_state(vdev_t *vd) child = vd->vdev_child[c]; /* - * Don't factor holes into the decision. + * Don't factor holes or indirect vdevs into the + * decision. */ - if (child->vdev_ishole) + if (!vdev_is_concrete(child)) continue; if (!vdev_readable(child) || @@ -3507,7 +3668,8 @@ vdev_is_bootable(vdev_t *vd) if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && vd->vdev_children > 1) { return (B_FALSE); - } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { + } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 || + strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) { return (B_FALSE); } } @@ -3520,6 +3682,18 @@ vdev_is_bootable(vdev_t *vd) return (B_TRUE); } +boolean_t +vdev_is_concrete(vdev_t *vd) +{ + vdev_ops_t *ops = vd->vdev_ops; + if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || + ops == &vdev_missing_ops || ops == &vdev_root_ops) { + return (B_FALSE); + } else { + return (B_TRUE); + } +} + /* * Load the state from the original vdev tree (ovd) which * we've retrieved from the MOS config object. 
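vdev_is_concrete() above centralizes what used to be scattered !vd->vdev_ishole tests and widens them to also exclude indirect, missing, and root vdevs. A sketch of the intended call-site shape (the loop body is hypothetical):

	vdev_t *rvd = spa->spa_root_vdev;
	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		if (!vdev_is_concrete(tvd))
			continue;	/* no metaslabs or DTLs to visit */
		/* ... operate on tvd->vdev_ms[0 .. tvd->vdev_ms_count) ... */
	}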
If the original @@ -3577,7 +3751,10 @@ vdev_expand(vdev_t *vd, uint64_t txg) ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); - if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { + vdev_set_deflate_ratio(vd); + + if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && + vdev_is_concrete(vd)) { VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c index d8c334059d54..0140c38a09c9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c @@ -834,6 +834,7 @@ vdev_ops_t vdev_disk_ops = { NULL, vdev_disk_hold, vdev_disk_rele, + NULL, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c index af6b406b3e5a..d599c95a58c3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c @@ -269,6 +269,7 @@ vdev_ops_t vdev_file_ops = { NULL, vdev_file_hold, vdev_file_rele, + NULL, VDEV_TYPE_FILE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -287,6 +288,7 @@ vdev_ops_t vdev_disk_ops = { NULL, vdev_file_hold, vdev_file_rele, + NULL, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c index 2b1c9e475a68..62c6cf9d1429 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -1147,6 +1147,7 @@ vdev_ops_t vdev_geom_ops = { NULL, vdev_geom_hold, vdev_geom_rele, + NULL, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c new file mode 100644 index 000000000000..4adb22832abc --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c @@ -0,0 +1,1037 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2014, 2015 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * An indirect vdev corresponds to a vdev that has been removed. Since + * we cannot rewrite block pointers of snapshots, etc., we keep a + * mapping from old location on the removed device to the new location + * on another device in the pool and use this mapping whenever we need + * to access the DVA. Unfortunately, this mapping did not respect + * logical block boundaries when it was first created, and so a DVA on + * this indirect vdev may be "split" into multiple sections that each + * map to a different location. 
As a consequence, not all DVAs can be + * translated to an equivalent new DVA. Instead we must provide a + * "vdev_remap" operation that executes a callback on each contiguous + * segment of the new location. This function is used in multiple ways: + * + * - reads and repair writes to this device use the callback to create + * a child io for each mapped segment. + * + * - frees and claims to this device use the callback to free or claim + * each mapped segment. (Note that we don't actually need to claim + * log blocks on indirect vdevs, because we don't allocate to + * removing vdevs. However, zdb uses zio_claim() for its leak + * detection.) + */ + +/* + * "Big theory statement" for how we mark blocks obsolete. + * + * When a block on an indirect vdev is freed or remapped, a section of + * that vdev's mapping may no longer be referenced (aka "obsolete"). We + * keep track of how much of each mapping entry is obsolete. When + * an entry becomes completely obsolete, we can remove it, thus reducing + * the memory used by the mapping. The complete picture of obsolescence + * is given by the following data structures, described below: + * - the entry-specific obsolete count + * - the vdev-specific obsolete spacemap + * - the pool-specific obsolete bpobj + * + * == On disk data structures used == + * + * We track the obsolete space for the pool using several objects. Each + * of these objects is created on demand and freed when no longer + * needed, and is assumed to be empty if it does not exist. + * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects. + * + * - Each vic_mapping_object (associated with an indirect vdev) can + * have a vimp_counts_object. This is an array of uint32_t's + * with the same number of entries as the vic_mapping_object. When + * the mapping is condensed, entries from the vic_obsolete_sm_object + * (see below) are folded into the counts. Therefore, each + * obsolete_counts entry tells us the number of bytes in the + * corresponding mapping entry that were not referenced when the + * mapping was last condensed. + * + * - Each indirect or removing vdev can have a vic_obsolete_sm_object. + * This is a space map containing an alloc entry for every DVA that + * has been obsoleted since the last time this indirect vdev was + * condensed. We use this object in order to improve performance + * when marking a DVA as obsolete. Instead of modifying an arbitrary + * offset of the vimp_counts_object, we only need to append an entry + * to the end of this object. When a DVA becomes obsolete, it is + * added to the obsolete space map. This happens when the DVA is + * freed, remapped and not referenced by a snapshot, or the last + * snapshot referencing it is destroyed. + * + * - Each dataset can have a ds_remap_deadlist object. This is a + * deadlist object containing all blocks that were remapped in this + * dataset but referenced in a previous snapshot. Blocks can *only* + * appear on this list if they were remapped (dsl_dataset_block_remapped); + * blocks that were killed in a head dataset are put on the normal + * ds_deadlist and marked obsolete when they are freed. + * + * - The pool can have a dp_obsolete_bpobj. This is a list of blocks + * in the pool that need to be marked obsolete. When a snapshot is + * destroyed, we move some of the ds_remap_deadlist to the obsolete + * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then + * asynchronously process the obsolete bpobj, moving its entries to + * the specific vdevs' obsolete space maps. 
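To make the bookkeeping above concrete: a mapping entry becomes removable exactly when its obsolete count reaches the entry's mapped length. A hypothetical helper expressing that invariant (the function name is invented; the types and macros are the ones this patch uses):

	static boolean_t
	entry_is_fully_obsolete(vdev_indirect_mapping_entry_phys_t *vimep,
	    uint32_t obsolete_count)
	{
		/* Condensing may drop the entry once nothing references it. */
		return (obsolete_count == DVA_GET_ASIZE(&vimep->vimep_dst));
	}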
+ * + * == Summary of how we mark blocks as obsolete == + * + * - When freeing a block: if any DVA is on an indirect vdev, append to + * vic_obsolete_sm_object. + * - When remapping a block, add dva to ds_remap_deadlist (if prev snap + * references; otherwise append to vic_obsolete_sm_object). + * - When freeing a snapshot: move parts of ds_remap_deadlist to + * dp_obsolete_bpobj (same algorithm as ds_deadlist). + * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to + * individual vdev's vic_obsolete_sm_object. + */ + +/* + * "Big theory statement" for how we condense indirect vdevs. + * + * Condensing an indirect vdev's mapping is the process of determining + * the precise counts of obsolete space for each mapping entry (by + * integrating the obsolete spacemap into the obsolete counts) and + * writing out a new mapping that contains only referenced entries. + * + * We condense a vdev when we expect the mapping to shrink (see + * vdev_indirect_should_condense()), but only perform one condense at a + * time to limit the memory usage. In addition, we use a separate + * open-context thread (spa_condense_indirect_thread) to incrementally + * create the new mapping object in a way that minimizes the impact on + * the rest of the system. + * + * == Generating a new mapping == + * + * To generate a new mapping, we follow these steps: + * + * 1. Save the old obsolete space map and create a new mapping object + * (see spa_condense_indirect_start_sync()). This initializes the + * spa_condensing_indirect_phys with the "previous obsolete space map", + * which is now read only. Newly obsolete DVAs will be added to a + * new (initially empty) obsolete space map, and will not be + * considered as part of this condense operation. + * + * 2. Construct in memory the precise counts of obsolete space for each + * mapping entry, by incorporating the obsolete space map into the + * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().) + * + * 3. Iterate through each mapping entry, writing to the new mapping any + * entries that are not completely obsolete (i.e. which don't have + * obsolete count == mapping length). (See + * spa_condense_indirect_generate_new_mapping().) + * + * 4. Destroy the old mapping object and switch over to the new one + * (spa_condense_indirect_complete_sync). + * + * == Restarting from failure == + * + * To restart the condense when we import/open the pool, we must start + * at the 2nd step above: reconstruct the precise counts in memory, + * based on the space map + counts. Then in the 3rd step, we start + * iterating where we left off: at vimp_max_offset of the new mapping + * object. + */ + +boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE; + +/* + * Condense if at least this percent of the bytes in the mapping is + * obsolete. With the default of 25%, the amount of space mapped + * will be reduced to 1% of its original size after at most 16 + * condenses. Higher values will condense less often (causing less + * i/o); lower values will reduce the mapping size more quickly. + */ +int zfs_indirect_condense_obsolete_pct = 25; + +/* + * Condense if the obsolete space map takes up more than this amount of + * space on disk (logically). This limits the amount of disk space + * consumed by the obsolete space map; the default of 1GB is small enough + * that we typically don't mind "wasting" it. + */ +uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; + +/* + * Don't bother condensing if the mapping uses less than this amount of + * memory. 
The default of 128KB is considered a "trivial" amount of + * memory and not worth reducing. + */ +uint64_t zfs_condense_min_mapping_bytes = 128 * 1024; + +/* + * This is used by the test suite so that it can ensure that certain + * actions happen while in the middle of a condense (which might otherwise + * complete too quickly). If used to reduce the performance impact of + * condensing in production, a maximum value of 1 should be sufficient. + */ +int zfs_condense_indirect_commit_entry_delay_ticks = 0; + +/* + * Mark the given offset and size as being obsolete in the given txg. + */ +void +vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size, + uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + ASSERT3U(spa_syncing_txg(spa), ==, txg); + ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); + ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); + ASSERT(size > 0); + VERIFY(vdev_indirect_mapping_entry_for_offset( + vd->vdev_indirect_mapping, offset) != NULL); + + if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + mutex_enter(&vd->vdev_obsolete_lock); + range_tree_add(vd->vdev_obsolete_segments, offset, size); + mutex_exit(&vd->vdev_obsolete_lock); + vdev_dirty(vd, 0, NULL, txg); + } +} + +/* + * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This + * wrapper is provided because the DMU does not know about vdev_t's and + * cannot directly call vdev_indirect_mark_obsolete. + */ +void +spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset, + uint64_t size, dmu_tx_t *tx) +{ + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + ASSERT(dmu_tx_is_syncing(tx)); + + /* The DMU can only remap indirect vdevs. */ + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + vdev_indirect_mark_obsolete(vd, offset, size, dmu_tx_get_txg(tx)); +} + +static spa_condensing_indirect_t * +spa_condensing_indirect_create(spa_t *spa) +{ + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP); + objset_t *mos = spa->spa_meta_objset; + + for (int i = 0; i < TXG_SIZE; i++) { + list_create(&sci->sci_new_mapping_entries[i], + sizeof (vdev_indirect_mapping_entry_t), + offsetof(vdev_indirect_mapping_entry_t, vime_node)); + } + + sci->sci_new_mapping = + vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object); + + return (sci); +} + +static void +spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci) +{ + for (int i = 0; i < TXG_SIZE; i++) + list_destroy(&sci->sci_new_mapping_entries[i]); + + if (sci->sci_new_mapping != NULL) + vdev_indirect_mapping_close(sci->sci_new_mapping); + + kmem_free(sci, sizeof (*sci)); +} + +boolean_t +vdev_indirect_should_condense(vdev_t *vd) +{ + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + spa_t *spa = vd->vdev_spa; + + ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool)); + + if (!zfs_condense_indirect_vdevs_enable) + return (B_FALSE); + + /* + * We can only condense one indirect vdev at a time. + */ + if (spa->spa_condensing_indirect != NULL) + return (B_FALSE); + + if (spa_shutting_down(spa)) + return (B_FALSE); + + /* + * The mapping object size must not change while we are + * condensing, so we can only condense indirect vdevs + * (not vdevs that are still in the middle of being removed). + */ + if (vd->vdev_ops != &vdev_indirect_ops) + return (B_FALSE); + + /* + * If nothing new has been marked obsolete, there is no + * point in condensing. 
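Plugging assumed numbers into the checks that follow: with 1 GiB mapped and 300 MiB obsolete, the percentage trigger fires (29% >= 25%), provided the in-core mapping is larger than zfs_condense_min_mapping_bytes:

	uint64_t bytes_mapped = 1ULL << 30;		/* assumed: 1 GiB mapped */
	uint64_t bytes_obsolete = 300ULL << 20;		/* assumed: 300 MiB obsolete */
	/* 300 * 100 / 1024 == 29 by integer division, so 29 >= 25 holds. */
	boolean_t over_pct = (bytes_obsolete * 100 / bytes_mapped >=
	    zfs_indirect_condense_obsolete_pct);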
+ */ + if (vd->vdev_obsolete_sm == NULL) { + ASSERT0(vdev_obsolete_sm_object(vd)); + return (B_FALSE); + } + + ASSERT(vd->vdev_obsolete_sm != NULL); + + ASSERT3U(vdev_obsolete_sm_object(vd), ==, + space_map_object(vd->vdev_obsolete_sm)); + + uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim); + uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm); + uint64_t mapping_size = vdev_indirect_mapping_size(vim); + uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm); + + ASSERT3U(bytes_obsolete, <=, bytes_mapped); + + /* + * If a high percentage of the bytes that are mapped have become + * obsolete, condense (unless the mapping is already small enough). + * This has a good chance of reducing the amount of memory used + * by the mapping. + */ + if (bytes_obsolete * 100 / bytes_mapped >= + zfs_indirect_condense_obsolete_pct && + mapping_size > zfs_condense_min_mapping_bytes) { + zfs_dbgmsg("should condense vdev %llu because obsolete " + "spacemap covers %d%% of %lluMB mapping", + (u_longlong_t)vd->vdev_id, + (int)(bytes_obsolete * 100 / bytes_mapped), + (u_longlong_t)bytes_mapped / 1024 / 1024); + return (B_TRUE); + } + + /* + * If the obsolete space map takes up too much space on disk, + * condense in order to free up this disk space. + */ + if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) { + zfs_dbgmsg("should condense vdev %llu because obsolete sm " + "length %lluMB >= max size %lluMB", + (u_longlong_t)vd->vdev_id, + (u_longlong_t)obsolete_sm_size / 1024 / 1024, + (u_longlong_t)zfs_condense_max_obsolete_bytes / + 1024 / 1024); + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * This sync task completes (finishes) a condense, deleting the old + * mapping and replacing it with the new one. + */ +static void +spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_condensing_indirect_t *sci = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev); + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + objset_t *mos = spa->spa_meta_objset; + vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; + uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping); + uint64_t new_count = + vdev_indirect_mapping_num_entries(sci->sci_new_mapping); + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + ASSERT3P(sci, ==, spa->spa_condensing_indirect); + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); + } + ASSERT(vic->vic_mapping_object != 0); + ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); + ASSERT(scip->scip_next_mapping_object != 0); + ASSERT(scip->scip_prev_obsolete_sm_object != 0); + + /* + * Reset vdev_indirect_mapping to refer to the new object. 
+ */ + rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER); + vdev_indirect_mapping_close(vd->vdev_indirect_mapping); + vd->vdev_indirect_mapping = sci->sci_new_mapping; + rw_exit(&vd->vdev_indirect_rwlock); + + sci->sci_new_mapping = NULL; + vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); + vic->vic_mapping_object = scip->scip_next_mapping_object; + scip->scip_next_mapping_object = 0; + + space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx); + spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + scip->scip_prev_obsolete_sm_object = 0; + + scip->scip_vdev = 0; + + VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CONDENSING_INDIRECT, tx)); + spa_condensing_indirect_destroy(spa->spa_condensing_indirect); + spa->spa_condensing_indirect = NULL; + + zfs_dbgmsg("finished condense of vdev %llu in txg %llu: " + "new mapping object %llu has %llu entries " + "(was %llu entries)", + vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object, + new_count, old_count); + + vdev_config_dirty(spa->spa_root_vdev); +} + +/* + * This sync task appends entries to the new mapping object. + */ +static void +spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx) +{ + spa_condensing_indirect_t *sci = arg; + uint64_t txg = dmu_tx_get_txg(tx); + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3P(sci, ==, spa->spa_condensing_indirect); + + vdev_indirect_mapping_add_entries(sci->sci_new_mapping, + &sci->sci_new_mapping_entries[txg & TXG_MASK], tx); + ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK])); +} + +/* + * Open-context function to add one entry to the new mapping. The new + * entry will be remembered and written from syncing context. + */ +static void +spa_condense_indirect_commit_entry(spa_t *spa, + vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count) +{ + spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; + + ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst)); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count)); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + /* + * If we are the first entry committed this txg, kick off the sync + * task to write to the MOS on our behalf. 
+ */ + if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) { + dsl_sync_task_nowait(dmu_tx_pool(tx), + spa_condense_indirect_commit_sync, sci, + 0, ZFS_SPACE_CHECK_NONE, tx); + } + + vdev_indirect_mapping_entry_t *vime = + kmem_alloc(sizeof (*vime), KM_SLEEP); + vime->vime_mapping = *vimep; + vime->vime_obsolete_count = count; + list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime); + + dmu_tx_commit(tx); +} + +static void +spa_condense_indirect_generate_new_mapping(vdev_t *vd, + uint32_t *obsolete_counts, uint64_t start_index) +{ + spa_t *spa = vd->vdev_spa; + uint64_t mapi = start_index; + vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; + uint64_t old_num_entries = + vdev_indirect_mapping_num_entries(old_mapping); + + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev); + + zfs_dbgmsg("starting condense of vdev %llu from index %llu", + (u_longlong_t)vd->vdev_id, + (u_longlong_t)mapi); + + while (mapi < old_num_entries && !spa_shutting_down(spa)) { + vdev_indirect_mapping_entry_phys_t *entry = + &old_mapping->vim_entries[mapi]; + uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst); + ASSERT3U(obsolete_counts[mapi], <=, entry_size); + if (obsolete_counts[mapi] < entry_size) { + spa_condense_indirect_commit_entry(spa, entry, + obsolete_counts[mapi]); + + /* + * This delay may be requested for testing, debugging, + * or performance reasons. + */ + delay(zfs_condense_indirect_commit_entry_delay_ticks); + } + + mapi++; + } + if (spa_shutting_down(spa)) { + zfs_dbgmsg("pausing condense of vdev %llu at index %llu", + (u_longlong_t)vd->vdev_id, + (u_longlong_t)mapi); + } +} + +static void +spa_condense_indirect_thread(void *arg) +{ + vdev_t *vd = arg; + spa_t *spa = vd->vdev_spa; + spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + uint32_t *counts; + uint64_t start_index; + vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; + space_map_t *prev_obsolete_sm = NULL; + + ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); + ASSERT(scip->scip_next_mapping_object != 0); + ASSERT(scip->scip_prev_obsolete_sm_object != 0); + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + + for (int i = 0; i < TXG_SIZE; i++) { + /* + * The list must start out empty in order for the + * _commit_sync() sync task to be properly registered + * on the first call to _commit_entry(); so it's wise + * to double check and ensure we actually are starting + * with empty lists. + */ + ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); + } + + VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, + scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); + space_map_update(prev_obsolete_sm); + counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping); + if (prev_obsolete_sm != NULL) { + vdev_indirect_mapping_load_obsolete_spacemap(old_mapping, + counts, prev_obsolete_sm); + } + space_map_close(prev_obsolete_sm); + + /* + * Generate new mapping. Determine what index to continue from + * based on the max offset that we've already written in the + * new mapping. + */ + uint64_t max_offset = + vdev_indirect_mapping_max_offset(sci->sci_new_mapping); + if (max_offset == 0) { + /* We haven't written anything to the new mapping yet. */ + start_index = 0; + } else { + /* + * Pick up from where we left off. _entry_for_offset() + * returns a pointer into the vim_entries array. 
If + * max_offset is greater than any of the mappings + * contained in the table NULL will be returned and + * that indicates we've exhausted our iteration of the + * old_mapping. + */ + + vdev_indirect_mapping_entry_phys_t *entry = + vdev_indirect_mapping_entry_for_offset_or_next(old_mapping, + max_offset); + + if (entry == NULL) { + /* + * We've already written the whole new mapping. + * This special value will cause us to skip the + * generate_new_mapping step and just do the sync + * task to complete the condense. + */ + start_index = UINT64_MAX; + } else { + start_index = entry - old_mapping->vim_entries; + ASSERT3U(start_index, <, + vdev_indirect_mapping_num_entries(old_mapping)); + } + } + + spa_condense_indirect_generate_new_mapping(vd, counts, start_index); + + vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts); + + /* + * We may have bailed early from generate_new_mapping(), if + * the spa is shutting down. In this case, do not complete + * the condense. + */ + if (!spa_shutting_down(spa)) { + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + spa_condense_indirect_complete_sync, sci, 0, + ZFS_SPACE_CHECK_NONE)); + } + + mutex_enter(&spa->spa_async_lock); + spa->spa_condense_thread = NULL; + cv_broadcast(&spa->spa_async_cv); + mutex_exit(&spa->spa_async_lock); + thread_exit(); +} + +/* + * Sync task to begin the condensing process. + */ +void +spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + + ASSERT0(scip->scip_next_mapping_object); + ASSERT0(scip->scip_prev_obsolete_sm_object); + ASSERT0(scip->scip_vdev); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS)); + ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping)); + + uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd); + ASSERT(obsolete_sm_obj != 0); + + scip->scip_vdev = vd->vdev_id; + scip->scip_next_mapping_object = + vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx); + + scip->scip_prev_obsolete_sm_object = obsolete_sm_obj; + + /* + * We don't need to allocate a new space map object, since + * vdev_indirect_sync_obsolete will allocate one when needed. + */ + space_map_close(vd->vdev_obsolete_sm); + vd->vdev_obsolete_sm = NULL; + VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); + + VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), + sizeof (*scip) / sizeof (uint64_t), scip, tx)); + + ASSERT3P(spa->spa_condensing_indirect, ==, NULL); + spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); + + zfs_dbgmsg("starting condense of vdev %llu in txg %llu: " + "posm=%llu nm=%llu", + vd->vdev_id, dmu_tx_get_txg(tx), + (u_longlong_t)scip->scip_prev_obsolete_sm_object, + (u_longlong_t)scip->scip_next_mapping_object); + + ASSERT3P(spa->spa_condense_thread, ==, NULL); + spa->spa_condense_thread = thread_create(NULL, 0, + spa_condense_indirect_thread, vd, 0, &p0, TS_RUN, minclsyspri); +} + +/* + * Sync to the given vdev's obsolete space map any segments that are no longer + * referenced as of the given txg. + * + * If the obsolete space map doesn't exist yet, create and open it. 
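Usage sketch for the function below, mirroring its caller vdev_sync() earlier in this patch: pending obsolete segments are flushed at most once per txg under an assigned tx:

	if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
		dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
		vdev_indirect_sync_obsolete(vd, tx);
		dmu_tx_commit(tx);
	}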
+ */ +void +vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + ASSERT3U(vic->vic_mapping_object, !=, 0); + ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0); + ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); + ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)); + + if (vdev_obsolete_sm_object(vd) == 0) { + uint64_t obsolete_sm_object = + space_map_alloc(spa->spa_meta_objset, tx); + + ASSERT(vd->vdev_top_zap != 0); + VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, + sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx)); + ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0); + + spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + VERIFY0(space_map_open(&vd->vdev_obsolete_sm, + spa->spa_meta_objset, obsolete_sm_object, + 0, vd->vdev_asize, 0)); + space_map_update(vd->vdev_obsolete_sm); + } + + ASSERT(vd->vdev_obsolete_sm != NULL); + ASSERT3U(vdev_obsolete_sm_object(vd), ==, + space_map_object(vd->vdev_obsolete_sm)); + + space_map_write(vd->vdev_obsolete_sm, + vd->vdev_obsolete_segments, SM_ALLOC, tx); + space_map_update(vd->vdev_obsolete_sm); + range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); +} + +int +spa_condense_init(spa_t *spa) +{ + int error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), + sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t), + &spa->spa_condensing_indirect_phys); + if (error == 0) { + if (spa_writeable(spa)) { + spa->spa_condensing_indirect = + spa_condensing_indirect_create(spa); + } + return (0); + } else if (error == ENOENT) { + return (0); + } else { + return (error); + } +} + +void +spa_condense_fini(spa_t *spa) +{ + if (spa->spa_condensing_indirect != NULL) { + spa_condensing_indirect_destroy(spa->spa_condensing_indirect); + spa->spa_condensing_indirect = NULL; + } +} + +/* + * Restart the condense - called when the pool is opened. + */ +void +spa_condense_indirect_restart(spa_t *spa) +{ + vdev_t *vd; + ASSERT(spa->spa_condensing_indirect != NULL); + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + vd = vdev_lookup_top(spa, + spa->spa_condensing_indirect_phys.scip_vdev); + ASSERT(vd != NULL); + spa_config_exit(spa, SCL_VDEV, FTAG); + + ASSERT3P(spa->spa_condense_thread, ==, NULL); + spa->spa_condense_thread = thread_create(NULL, 0, + spa_condense_indirect_thread, vd, 0, &p0, TS_RUN, + minclsyspri); +} + +/* + * Gets the obsolete spacemap object from the vdev's ZAP. + * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't + * exist yet. 
+ */ +uint64_t +vdev_obsolete_sm_object(vdev_t *vd) +{ + ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); + if (vd->vdev_top_zap == 0) { + return (0); + } + + uint64_t sm_obj = 0; + int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj); + + ASSERT(err == 0 || err == ENOENT); + + return (sm_obj); +} + +boolean_t +vdev_obsolete_counts_are_precise(vdev_t *vd) +{ + ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); + if (vd->vdev_top_zap == 0) { + return (B_FALSE); + } + + uint64_t val = 0; + int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val); + + ASSERT(err == 0 || err == ENOENT); + + return (val != 0); +} + +/* ARGSUSED */ +static void +vdev_indirect_close(vdev_t *vd) +{ +} + +/* ARGSUSED */ +static void +vdev_indirect_io_done(zio_t *zio) +{ +} + +/* ARGSUSED */ +static int +vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + *psize = *max_psize = vd->vdev_asize + + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; + *logical_ashift = vd->vdev_ashift; + *physical_ashift = vd->vdev_physical_ashift; + return (0); +} + +typedef struct remap_segment { + vdev_t *rs_vd; + uint64_t rs_offset; + uint64_t rs_asize; + uint64_t rs_split_offset; + list_node_t rs_node; +} remap_segment_t; + +remap_segment_t * +rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset) +{ + remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP); + rs->rs_vd = vd; + rs->rs_offset = offset; + rs->rs_asize = asize; + rs->rs_split_offset = split_offset; + return (rs); +} + +/* + * Goes through the relevant indirect mappings until it hits a concrete vdev + * and issues the callback. On the way to the concrete vdev, if any other + * indirect vdevs are encountered, then the callback will also be called on + * each of those indirect vdevs. For example, if the segment is mapped to + * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is + * mapped to segment B on concrete vdev 2, then the callback will be called on + * both vdev 1 and vdev 2. + * + * While the callback passed to vdev_indirect_remap() is called on every vdev + * the function encounters, certain callbacks only care about concrete vdevs. + * These types of callbacks should return immediately and explicitly when they + * are called on an indirect vdev. + * + * Because there is a possibility that a DVA section in the indirect device + * has been split into multiple sections in our mapping, we keep track + * of the relevant contiguous segments of the new location (remap_segment_t) + * in a stack. This way we can call the callback for each of the new sections + * created by a single section of the indirect device. Note though, that in + * this scenario the callbacks in each split block won't occur in-order in + * terms of offset, so callers should not make any assumptions about that. + * + * For callbacks that don't handle split blocks and immediately return when + * they encounter them (as is the case for remap_blkptr_cb), the caller can + * assume that its callback will be applied from the first indirect vdev + * encountered to the last one and then the concrete vdev, in that order.
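As an illustration of the callback contract described above, a hypothetical callback that totals only the concrete bytes backing a DVA (intermediate indirect hops are visited too, so the callback filters them out itself):

	static void
	count_concrete_bytes_cb(uint64_t split_offset, vdev_t *vd,
	    uint64_t offset, uint64_t size, void *arg)
	{
		uint64_t *total = arg;

		if (vd->vdev_ops == &vdev_indirect_ops)
			return;		/* intermediate hop, not concrete */
		*total += size;
	}

	/*
	 * Usage: uint64_t n = 0;
	 * vdev_indirect_remap(vd, offset, asize, count_concrete_bytes_cb, &n);
	 */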
+ */ +static void +vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, + void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg) +{ + list_t stack; + spa_t *spa = vd->vdev_spa; + + list_create(&stack, sizeof (remap_segment_t), + offsetof(remap_segment_t, rs_node)); + + for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0); + rs != NULL; rs = list_remove_head(&stack)) { + vdev_t *v = rs->rs_vd; + + /* + * Note: this can be called from open context + * (eg. zio_read()), so we need the rwlock to prevent + * the mapping from being changed by condensing. + */ + rw_enter(&v->vdev_indirect_rwlock, RW_READER); + vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping; + ASSERT3P(vim, !=, NULL); + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + ASSERT(rs->rs_asize > 0); + + vdev_indirect_mapping_entry_phys_t *mapping = + vdev_indirect_mapping_entry_for_offset(vim, rs->rs_offset); + ASSERT3P(mapping, !=, NULL); + + while (rs->rs_asize > 0) { + /* + * Note: the vdev_indirect_mapping can not change + * while we are running. It only changes while the + * removal is in progress, and then only from syncing + * context. While a removal is in progress, this + * function is only called for frees, which also only + * happen from syncing context. + */ + + uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst); + uint64_t dst_offset = + DVA_GET_OFFSET(&mapping->vimep_dst); + uint64_t dst_vdev = DVA_GET_VDEV(&mapping->vimep_dst); + + ASSERT3U(rs->rs_offset, >=, + DVA_MAPPING_GET_SRC_OFFSET(mapping)); + ASSERT3U(rs->rs_offset, <, + DVA_MAPPING_GET_SRC_OFFSET(mapping) + size); + ASSERT3U(dst_vdev, !=, v->vdev_id); + + uint64_t inner_offset = rs->rs_offset - + DVA_MAPPING_GET_SRC_OFFSET(mapping); + uint64_t inner_size = + MIN(rs->rs_asize, size - inner_offset); + + vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev); + ASSERT3P(dst_v, !=, NULL); + + if (dst_v->vdev_ops == &vdev_indirect_ops) { + list_insert_head(&stack, + rs_alloc(dst_v, dst_offset + inner_offset, + inner_size, rs->rs_split_offset)); + + } + + if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) && + IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) { + /* + * Note: This clause exists solely for + * testing purposes. We use it to ensure that + * split blocks work and that the callbacks + * using them yield the same result if issued + * in reverse order.
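The two callbacks in the debug clause above partition the segment exactly: together they cover [split_offset, split_offset + inner_size), high half first, so a test can verify that callbacks are order-independent:

	/*
	 * inner_half = inner_size / 2
	 * call 1 covers [split_offset + inner_half, split_offset + inner_size)
	 * call 2 covers [split_offset,              split_offset + inner_half)
	 * which together equal the single callback in the else branch.
	 */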
+ */ + uint64_t inner_half = inner_size / 2; + + func(rs->rs_split_offset + inner_half, dst_v, + dst_offset + inner_offset + inner_half, + inner_half, arg); + + func(rs->rs_split_offset, dst_v, + dst_offset + inner_offset, + inner_half, arg); + } else { + func(rs->rs_split_offset, dst_v, + dst_offset + inner_offset, + inner_size, arg); + } + + rs->rs_offset += inner_size; + rs->rs_asize -= inner_size; + rs->rs_split_offset += inner_size; + mapping++; + } + + rw_exit(&v->vdev_indirect_rwlock); + kmem_free(rs, sizeof (remap_segment_t)); + } + list_destroy(&stack); +} + +static void +vdev_indirect_child_io_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + mutex_enter(&pio->io_lock); + pio->io_error = zio_worst_error(pio->io_error, zio->io_error); + mutex_exit(&pio->io_lock); + + abd_put(zio->io_abd); +} + +static void +vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + zio_t *zio = arg; + + ASSERT3P(vd, !=, NULL); + + if (vd->vdev_ops == &vdev_indirect_ops) + return; + + zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, + abd_get_offset(zio->io_abd, split_offset), + size, zio->io_type, zio->io_priority, + 0, vdev_indirect_child_io_done, zio)); +} + +static void +vdev_indirect_io_start(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + if (zio->io_type != ZIO_TYPE_READ) { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT((zio->io_flags & + (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0); + } + + vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, + vdev_indirect_io_start_cb, zio); + + zio_execute(zio); +} + +vdev_ops_t vdev_indirect_ops = { + vdev_indirect_open, + vdev_indirect_close, + vdev_default_asize, + vdev_indirect_io_start, + vdev_indirect_io_done, + NULL, + NULL, + NULL, + vdev_indirect_remap, + VDEV_TYPE_INDIRECT, /* name of this vdev type */ + B_FALSE /* leaf vdev */ +}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c new file mode 100644 index 000000000000..fbecbe830929 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c @@ -0,0 +1,212 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2015 by Delphix. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include + +static boolean_t +vdev_indirect_births_verify(vdev_indirect_births_t *vib) +{ + ASSERT(vib != NULL); + + ASSERT(vib->vib_object != 0); + ASSERT(vib->vib_objset != NULL); + ASSERT(vib->vib_phys != NULL); + ASSERT(vib->vib_dbuf != NULL); + + EQUIV(vib->vib_phys->vib_count > 0, vib->vib_entries != NULL); + + return (B_TRUE); +} + +uint64_t +vdev_indirect_births_count(vdev_indirect_births_t *vib) +{ + ASSERT(vdev_indirect_births_verify(vib)); + + return (vib->vib_phys->vib_count); +} + +uint64_t +vdev_indirect_births_object(vdev_indirect_births_t *vib) +{ + ASSERT(vdev_indirect_births_verify(vib)); + + return (vib->vib_object); +} + +static uint64_t +vdev_indirect_births_size_impl(vdev_indirect_births_t *vib) +{ + return (vib->vib_phys->vib_count * sizeof (*vib->vib_entries)); +} + +void +vdev_indirect_births_close(vdev_indirect_births_t *vib) +{ + ASSERT(vdev_indirect_births_verify(vib)); + + if (vib->vib_phys->vib_count > 0) { + uint64_t births_size = vdev_indirect_births_size_impl(vib); + + kmem_free(vib->vib_entries, births_size); + vib->vib_entries = NULL; + } + + dmu_buf_rele(vib->vib_dbuf, vib); + + vib->vib_objset = NULL; + vib->vib_object = 0; + vib->vib_dbuf = NULL; + vib->vib_phys = NULL; + + kmem_free(vib, sizeof (*vib)); +} + +uint64_t +vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + + return (dmu_object_alloc(os, + DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OTN_UINT64_METADATA, sizeof (vdev_indirect_birth_phys_t), + tx)); +} + +vdev_indirect_births_t * +vdev_indirect_births_open(objset_t *os, uint64_t births_object) +{ + vdev_indirect_births_t *vib = kmem_zalloc(sizeof (*vib), KM_SLEEP); + + vib->vib_objset = os; + vib->vib_object = births_object; + + VERIFY0(dmu_bonus_hold(os, vib->vib_object, vib, &vib->vib_dbuf)); + vib->vib_phys = vib->vib_dbuf->db_data; + + if (vib->vib_phys->vib_count > 0) { + uint64_t births_size = vdev_indirect_births_size_impl(vib); + vib->vib_entries = kmem_alloc(births_size, KM_SLEEP); + VERIFY0(dmu_read(vib->vib_objset, vib->vib_object, 0, + births_size, vib->vib_entries, DMU_READ_PREFETCH)); + } + + ASSERT(vdev_indirect_births_verify(vib)); + + return (vib); +} + +void +vdev_indirect_births_free(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + VERIFY0(dmu_object_free(os, object, tx)); +} + +void +vdev_indirect_births_add_entry(vdev_indirect_births_t *vib, + uint64_t max_offset, uint64_t txg, dmu_tx_t *tx) +{ + vdev_indirect_birth_entry_phys_t vibe; + uint64_t old_size; + uint64_t new_size; + vdev_indirect_birth_entry_phys_t *new_entries; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx))); + ASSERT(vdev_indirect_births_verify(vib)); + + dmu_buf_will_dirty(vib->vib_dbuf, tx); + + vibe.vibe_offset = max_offset; + vibe.vibe_phys_birth_txg = txg; + + old_size = vdev_indirect_births_size_impl(vib); + dmu_write(vib->vib_objset, vib->vib_object, old_size, sizeof (vibe), + &vibe, tx); + vib->vib_phys->vib_count++; + new_size = vdev_indirect_births_size_impl(vib); + + new_entries = kmem_alloc(new_size, KM_SLEEP); + if (old_size > 0) { + bcopy(vib->vib_entries, new_entries, old_size); + kmem_free(vib->vib_entries, old_size); + } + new_entries[vib->vib_phys->vib_count - 1] = vibe; + vib->vib_entries = new_entries; +} + +uint64_t +vdev_indirect_births_last_entry_txg(vdev_indirect_births_t *vib) +{ + ASSERT(vdev_indirect_births_verify(vib)); + ASSERT(vib->vib_phys->vib_count > 0); + + vdev_indirect_birth_entry_phys_t 
*last = + &vib->vib_entries[vib->vib_phys->vib_count - 1]; + return (last->vibe_phys_birth_txg); +} + +/* + * Return the txg in which the given range was copied (i.e. its physical + * birth txg). The specified offset+asize must be contiguously mapped + * (i.e. not a split block). + * + * The entries are sorted by increasing phys_birth, and also by increasing + * offset. We find the specified offset by binary search. Note that we + * can not use bsearch() because looking at each entry independently is + * insufficient to find the correct entry. Each entry implicitly relies + * on the previous entry: an entry indicates that the offsets from the + * end of the previous entry to the end of this entry were written in the + * specified txg. + */ +uint64_t +vdev_indirect_births_physbirth(vdev_indirect_births_t *vib, uint64_t offset, + uint64_t asize) +{ + vdev_indirect_birth_entry_phys_t *base; + vdev_indirect_birth_entry_phys_t *last; + + ASSERT(vdev_indirect_births_verify(vib)); + ASSERT(vib->vib_phys->vib_count > 0); + + base = vib->vib_entries; + last = base + vib->vib_phys->vib_count - 1; + + ASSERT3U(offset, <, last->vibe_offset); + + while (last >= base) { + vdev_indirect_birth_entry_phys_t *p = + base + ((last - base) / 2); + if (offset >= p->vibe_offset) { + base = p + 1; + } else if (p == vib->vib_entries || + offset >= (p - 1)->vibe_offset) { + ASSERT3U(offset + asize, <=, p->vibe_offset); + return (p->vibe_phys_birth_txg); + } else { + last = p - 1; + } + } + ASSERT(!"offset not found"); + return (-1); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c new file mode 100644 index 000000000000..ea80fbc4733f --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c @@ -0,0 +1,594 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2015 by Delphix. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +static boolean_t +vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim) +{ + ASSERT(vim != NULL); + + ASSERT(vim->vim_object != 0); + ASSERT(vim->vim_objset != NULL); + ASSERT(vim->vim_phys != NULL); + ASSERT(vim->vim_dbuf != NULL); + + EQUIV(vim->vim_phys->vimp_num_entries > 0, + vim->vim_entries != NULL); + if (vim->vim_phys->vimp_num_entries > 0) { + vdev_indirect_mapping_entry_phys_t *last_entry = + &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1]; + uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(last_entry); + uint64_t size = DVA_GET_ASIZE(&last_entry->vimep_dst); + + ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size); + } + if (vim->vim_havecounts) { + ASSERT(vim->vim_phys->vimp_counts_object != 0); + } + + return (B_TRUE); +} + +uint64_t +vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim->vim_phys->vimp_num_entries); +} + +uint64_t +vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim->vim_phys->vimp_max_offset); +} + +uint64_t +vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim->vim_object); +} + +uint64_t +vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim->vim_phys->vimp_bytes_mapped); +} + +/* + * The length (in bytes) of the mapping object array in memory and + * (logically) on disk. + * + * Note that unlike most of our accessor functions, + * we don't assert that the struct is consistent; therefore it can be + * called while there may be concurrent changes, if we don't care about + * the value being immediately stale (e.g. from spa_removal_get_stats()). + */ +uint64_t +vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim) +{ + return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries)); +} + +/* + * Compare an offset with an indirect mapping entry; there are three + * possible scenarios: + * + * 1. The offset is "less than" the mapping entry; meaning the + * offset is less than the source offset of the mapping entry. In + * this case, there is no overlap between the offset and the + * mapping entry and -1 will be returned. + * + * 2. The offset is "greater than" the mapping entry; meaning the + * offset is greater than the mapping entry's source offset plus + * the entry's size. In this case, there is no overlap between + * the offset and the mapping entry and 1 will be returned. + * + * NOTE: If the offset is actually equal to the entry's offset + * plus size, this is considered to be "greater" than the entry, + * and this case applies (i.e. 1 will be returned). Thus, the + * entry's "range" can be considered to be inclusive at its + * start, but exclusive at its end: e.g. [src, src + size). + * + * 3. The last case to consider is if the offset actually falls + * within the mapping entry's range. If this is the case, the + * offset is considered to be "equal to" the mapping entry and + * 0 will be returned. + * + * NOTE: If the offset is equal to the entry's source offset, + * this case applies and 0 will be returned. If the offset is + * equal to the entry's source plus its size, this case does + * *not* apply (see "NOTE" above for scenario 2), and 1 will be + * returned. 
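+ *
+ * For example (illustrative values): for an entry whose source offset
+ * is 0x1000 and whose asize is 0x200 (i.e. the range [0x1000, 0x1200)):
+ *
+ *   offset 0x0fff  ->  -1  (scenario 1: before the entry)
+ *   offset 0x1000  ->   0  (scenario 3: start is inclusive)
+ *   offset 0x11ff  ->   0  (scenario 3: inside the range)
+ *   offset 0x1200  ->   1  (scenario 2: end is exclusive)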
+ */
+static int
+dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
+{
+	const uint64_t *key = v_key;
+	const vdev_indirect_mapping_entry_phys_t *array_elem =
+	    v_array_elem;
+	uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);
+
+	if (*key < src_offset) {
+		return (-1);
+	} else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
+		return (0);
+	} else {
+		return (1);
+	}
+}
+
+/*
+ * Returns the mapping entry for the given offset.
+ *
+ * It's possible that the given offset will not be in the mapping table
+ * (i.e. no mapping entries contain this offset), in which case the
+ * return value depends on the "next_if_missing" parameter.
+ *
+ * If the offset is not found in the table and "next_if_missing" is
+ * B_FALSE, then NULL will always be returned. The behavior is intended
+ * to allow consumers to get the entry corresponding to the offset
+ * parameter, iff the offset overlaps with an entry in the table.
+ *
+ * If the offset is not found in the table and "next_if_missing" is
+ * B_TRUE, then the entry nearest to the given offset will be returned,
+ * such that the entry's source offset is greater than the offset
+ * passed in (i.e. the "next" mapping entry in the table is returned, if
+ * the offset is missing from the table). If there are no entries whose
+ * source offset is greater than the passed-in offset, NULL is returned.
+ */
+static vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim,
+    uint64_t offset, boolean_t next_if_missing)
+{
+	ASSERT(vdev_indirect_mapping_verify(vim));
+	ASSERT(vim->vim_phys->vimp_num_entries > 0);
+
+	vdev_indirect_mapping_entry_phys_t *entry = NULL;
+
+	uint64_t last = vim->vim_phys->vimp_num_entries - 1;
+	uint64_t base = 0;
+
+	/*
+	 * We don't define these inside of the while loop because we use
+	 * their value in the case that offset isn't in the mapping.
+	 */
+	uint64_t mid;
+	int result;
+
+	while (last >= base) {
+		mid = base + ((last - base) >> 1);
+
+		result = dva_mapping_overlap_compare(&offset,
+		    &vim->vim_entries[mid]);
+
+		if (result == 0) {
+			entry = &vim->vim_entries[mid];
+			break;
+		} else if (result < 0) {
+			last = mid - 1;
+		} else {
+			base = mid + 1;
+		}
+	}
+
+	if (entry == NULL && next_if_missing) {
+		ASSERT3U(base, ==, last + 1);
+		ASSERT(mid == base || mid == last);
+		ASSERT3S(result, !=, 0);
+
+		/*
+		 * The offset we're looking for isn't actually contained
+		 * in the mapping table, thus we need to return the
+		 * closest mapping entry that is greater than the
+		 * offset. We reuse the result of the last comparison,
+		 * comparing the mapping entry at index "mid" and the
+		 * offset. The offset is guaranteed to lie between
+		 * indices one less than "mid", and one greater than
+		 * "mid"; we just need to determine if offset is greater
+		 * than, or less than the mapping entry contained at
+		 * index "mid".
+		 */
+
+		uint64_t index;
+		if (result < 0)
+			index = mid;
+		else
+			index = mid + 1;
+
+		ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries);
+
+		if (index == vim->vim_phys->vimp_num_entries) {
+			/*
+			 * If "index" is past the end of the entries
+			 * array, then not only is the offset not in the
+			 * mapping table, but it's actually greater than
+			 * all entries in the table. In this case, we
+			 * can't return a mapping entry greater than the
+			 * offset (since none exist), so we return NULL.
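+			 *
+			 * E.g. (illustrative): if the table maps
+			 * [0, 0x1000) and [0x2000, 0x3000), then a
+			 * lookup of 0x3000 (or anything larger)
+			 * reaches this case and yields NULL, while a
+			 * lookup of 0x1800 falls through to the else
+			 * clause below and returns the
+			 * [0x2000, 0x3000) entry.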
+ */ + + ASSERT3S(dva_mapping_overlap_compare(&offset, + &vim->vim_entries[index - 1]), >, 0); + + return (NULL); + } else { + /* + * Just to be safe, we verify the offset falls + * in between the mapping entries at index and + * one less than index. Since we know the offset + * doesn't overlap an entry, and we're supposed + * to return the entry just greater than the + * offset, both of the following tests must be + * true. + */ + ASSERT3S(dva_mapping_overlap_compare(&offset, + &vim->vim_entries[index]), <, 0); + IMPLY(index >= 1, dva_mapping_overlap_compare(&offset, + &vim->vim_entries[index - 1]) > 0); + + return (&vim->vim_entries[index]); + } + } else { + return (entry); + } +} + +vdev_indirect_mapping_entry_phys_t * +vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, + uint64_t offset) +{ + return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, + B_FALSE)); +} + +vdev_indirect_mapping_entry_phys_t * +vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim, + uint64_t offset) +{ + return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, + B_TRUE)); +} + + +void +vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + if (vim->vim_phys->vimp_num_entries > 0) { + uint64_t map_size = vdev_indirect_mapping_size(vim); + kmem_free(vim->vim_entries, map_size); + vim->vim_entries = NULL; + } + + dmu_buf_rele(vim->vim_dbuf, vim); + + vim->vim_objset = NULL; + vim->vim_object = 0; + vim->vim_dbuf = NULL; + vim->vim_phys = NULL; + + kmem_free(vim, sizeof (*vim)); +} + +uint64_t +vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx) +{ + uint64_t object; + ASSERT(dmu_tx_is_syncing(tx)); + uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0; + + if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + bonus_size = sizeof (vdev_indirect_mapping_phys_t); + } + + object = dmu_object_alloc(os, + DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OTN_UINT64_METADATA, bonus_size, + tx); + + if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + dmu_buf_t *dbuf; + vdev_indirect_mapping_phys_t *vimp; + + VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf)); + dmu_buf_will_dirty(dbuf, tx); + vimp = dbuf->db_data; + vimp->vimp_counts_object = dmu_object_alloc(os, + DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OT_NONE, 0, tx); + spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + dmu_buf_rele(dbuf, FTAG); + } + + return (object); +} + + +vdev_indirect_mapping_t * +vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object) +{ + vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP); + dmu_object_info_t doi; + VERIFY0(dmu_object_info(os, mapping_object, &doi)); + + vim->vim_objset = os; + vim->vim_object = mapping_object; + + VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim, + &vim->vim_dbuf)); + vim->vim_phys = vim->vim_dbuf->db_data; + + vim->vim_havecounts = + (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0); + + if (vim->vim_phys->vimp_num_entries > 0) { + uint64_t map_size = vdev_indirect_mapping_size(vim); + vim->vim_entries = kmem_alloc(map_size, KM_SLEEP); + VERIFY0(dmu_read(os, vim->vim_object, 0, map_size, + vim->vim_entries, DMU_READ_PREFETCH)); + } + + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim); +} + +void +vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object); + if (vim->vim_havecounts) { + 
VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object, + tx)); + spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + } + vdev_indirect_mapping_close(vim); + + VERIFY0(dmu_object_free(os, object, tx)); +} + +/* + * Append the list of vdev_indirect_mapping_entry_t's to the on-disk + * mapping object. Also remove the entries from the list and free them. + * This also implicitly extends the max_offset of the mapping (to the end + * of the last entry). + */ +void +vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, + list_t *list, dmu_tx_t *tx) +{ + vdev_indirect_mapping_entry_phys_t *mapbuf; + uint64_t old_size; + uint32_t *countbuf = NULL; + vdev_indirect_mapping_entry_phys_t *old_entries; + uint64_t old_count; + uint64_t entries_written = 0; + + ASSERT(vdev_indirect_mapping_verify(vim)); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx))); + ASSERT(!list_is_empty(list)); + + old_size = vdev_indirect_mapping_size(vim); + old_entries = vim->vim_entries; + old_count = vim->vim_phys->vimp_num_entries; + + dmu_buf_will_dirty(vim->vim_dbuf, tx); + + mapbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); + if (vim->vim_havecounts) { + countbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); + ASSERT(spa_feature_is_active(vim->vim_objset->os_spa, + SPA_FEATURE_OBSOLETE_COUNTS)); + } + while (!list_is_empty(list)) { + uint64_t i; + /* + * Write entries from the list to the + * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE. + */ + for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) { + vdev_indirect_mapping_entry_t *entry = + list_remove_head(list); + if (entry == NULL) + break; + + uint64_t size = + DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst); + uint64_t src_offset = + DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping); + + /* + * We shouldn't be adding an entry which is fully + * obsolete. + */ + ASSERT3U(entry->vime_obsolete_count, <, size); + IMPLY(entry->vime_obsolete_count != 0, + vim->vim_havecounts); + + mapbuf[i] = entry->vime_mapping; + if (vim->vim_havecounts) + countbuf[i] = entry->vime_obsolete_count; + + vim->vim_phys->vimp_bytes_mapped += size; + ASSERT3U(src_offset, >=, + vim->vim_phys->vimp_max_offset); + vim->vim_phys->vimp_max_offset = src_offset + size; + + entries_written++; + + kmem_free(entry, sizeof (*entry)); + } + dmu_write(vim->vim_objset, vim->vim_object, + vim->vim_phys->vimp_num_entries * sizeof (*mapbuf), + i * sizeof (*mapbuf), + mapbuf, tx); + if (vim->vim_havecounts) { + dmu_write(vim->vim_objset, + vim->vim_phys->vimp_counts_object, + vim->vim_phys->vimp_num_entries * + sizeof (*countbuf), + i * sizeof (*countbuf), countbuf, tx); + } + vim->vim_phys->vimp_num_entries += i; + } + zio_buf_free(mapbuf, SPA_OLD_MAXBLOCKSIZE); + if (vim->vim_havecounts) + zio_buf_free(countbuf, SPA_OLD_MAXBLOCKSIZE); + + /* + * Update the entry array to reflect the new entries. First, copy + * over any old entries then read back the new entries we just wrote. 
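+	 *
+	 * Sketch of the resulting layout (illustrative counts): with
+	 * old_count = 3 and entries_written = 2, the new in-core array
+	 * is built as
+	 *
+	 *   vim_entries[0 .. 2]  bcopy()'d from the old allocation
+	 *   vim_entries[3 .. 4]  dmu_read() back from the object
+	 *
+	 * Reading back what we just dmu_write()'d keeps the in-core
+	 * array identical to the on-disk object.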
+ */ + uint64_t new_size = vdev_indirect_mapping_size(vim); + ASSERT3U(new_size, >, old_size); + ASSERT3U(new_size - old_size, ==, + entries_written * sizeof (vdev_indirect_mapping_entry_phys_t)); + vim->vim_entries = kmem_alloc(new_size, KM_SLEEP); + if (old_size > 0) { + bcopy(old_entries, vim->vim_entries, old_size); + kmem_free(old_entries, old_size); + } + VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size, + new_size - old_size, &vim->vim_entries[old_count], + DMU_READ_PREFETCH)); + + zfs_dbgmsg("txg %llu: wrote %llu entries to " + "indirect mapping obj %llu; max offset=0x%llx", + (u_longlong_t)dmu_tx_get_txg(tx), + (u_longlong_t)entries_written, + (u_longlong_t)vim->vim_object, + (u_longlong_t)vim->vim_phys->vimp_max_offset); +} + +/* + * Increment the relevant counts for the specified offset and length. + * The counts array must be obtained from + * vdev_indirect_mapping_load_obsolete_counts(). + */ +void +vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim, + uint64_t offset, uint64_t length, uint32_t *counts) +{ + vdev_indirect_mapping_entry_phys_t *mapping; + uint64_t index; + + mapping = vdev_indirect_mapping_entry_for_offset(vim, offset); + + ASSERT(length > 0); + ASSERT3P(mapping, !=, NULL); + + index = mapping - vim->vim_entries; + + while (length > 0) { + ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim)); + + uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst); + uint64_t inner_offset = offset - + DVA_MAPPING_GET_SRC_OFFSET(mapping); + VERIFY3U(inner_offset, <, size); + uint64_t inner_size = MIN(length, size - inner_offset); + + VERIFY3U(counts[index] + inner_size, <=, size); + counts[index] += inner_size; + + offset += inner_size; + length -= inner_size; + mapping++; + index++; + } +} + +typedef struct load_obsolete_space_map_arg { + vdev_indirect_mapping_t *losma_vim; + uint32_t *losma_counts; +} load_obsolete_space_map_arg_t; + +static int +load_obsolete_sm_callback(maptype_t type, uint64_t offset, uint64_t size, + void *arg) +{ + load_obsolete_space_map_arg_t *losma = arg; + ASSERT3S(type, ==, SM_ALLOC); + + vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim, + offset, size, losma->losma_counts); + + return (0); +} + +/* + * Modify the counts (increment them) based on the spacemap. + */ +void +vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim, + uint32_t *counts, space_map_t *obsolete_space_sm) +{ + load_obsolete_space_map_arg_t losma; + losma.losma_counts = counts; + losma.losma_vim = vim; + VERIFY0(space_map_iterate(obsolete_space_sm, + load_obsolete_sm_callback, &losma)); +} + +/* + * Read the obsolete counts from disk, returning them in an array. 
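+ *
+ * A typical caller follows this shape (sketch):
+ *
+ *   uint32_t *counts =
+ *       vdev_indirect_mapping_load_obsolete_counts(vim);
+ *   ...consume counts[i] for each mapping entry i...
+ *   vdev_indirect_mapping_free_obsolete_counts(vim, counts);
+ *
+ * If the pool does not have precise counts (!vim_havecounts), the
+ * array is simply returned zero-filled.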
+ */ +uint32_t * +vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + uint64_t counts_size = + vim->vim_phys->vimp_num_entries * sizeof (uint32_t); + uint32_t *counts = kmem_alloc(counts_size, KM_SLEEP); + if (vim->vim_havecounts) { + VERIFY0(dmu_read(vim->vim_objset, + vim->vim_phys->vimp_counts_object, + 0, counts_size, + counts, DMU_READ_PREFETCH)); + } else { + bzero(counts, counts_size); + } + return (counts); +} + +extern void +vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim, + uint32_t *counts) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + kmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c index 47b31ad180b1..9b14dbf105b8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c @@ -143,6 +143,7 @@ #include #include #include +#include #include #include #include @@ -222,6 +223,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_flag_t flags) { nvlist_t *nv = NULL; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; nv = fnvlist_alloc(); @@ -285,9 +287,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); - if (vd->vdev_removing) + if (vd->vdev_removing) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, vd->vdev_removing); + } } if (vd->vdev_dtl_sm != NULL) { @@ -295,6 +298,21 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, space_map_object(vd->vdev_dtl_sm)); } + if (vic->vic_mapping_object != 0) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, + vic->vic_mapping_object); + } + + if (vic->vic_births_object != 0) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, + vic->vic_births_object); + } + + if (vic->vic_prev_indirect_vdev != UINT64_MAX) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, + vic->vic_prev_indirect_vdev); + } + if (vd->vdev_crtxg) fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); @@ -314,18 +332,73 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (getstats) { vdev_stat_t vs; - pool_scan_stat_t ps; vdev_get_stats(vd, &vs); fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)); /* provide either current or previous scan information */ + pool_scan_stat_t ps; if (spa_scan_get_stats(spa, &ps) == 0) { fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, sizeof (pool_scan_stat_t) / sizeof (uint64_t)); } + + pool_removal_stat_t prs; + if (spa_removal_get_stats(spa, &prs) == 0) { + fnvlist_add_uint64_array(nv, + ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs, + sizeof (prs) / sizeof (uint64_t)); + } + + /* + * Note: this can be called from open context + * (spa_get_stats()), so we need the rwlock to prevent + * the mapping from being changed by condensing. 
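+		 *
+		 * Sketch of the locking protocol (the intent here, as
+		 * in vdev_indirect_remap()): any open-context reader
+		 * of vd->vdev_indirect_mapping takes
+		 * vdev_indirect_rwlock as RW_READER around its use of
+		 * the mapping; only condensing, which replaces the
+		 * mapping, takes it as RW_WRITER.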
+ */ + rw_enter(&vd->vdev_indirect_rwlock, RW_READER); + if (vd->vdev_indirect_mapping != NULL) { + ASSERT(vd->vdev_indirect_births != NULL); + vdev_indirect_mapping_t *vim = + vd->vdev_indirect_mapping; + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, + vdev_indirect_mapping_size(vim)); + } + rw_exit(&vd->vdev_indirect_rwlock); + if (vd->vdev_mg != NULL && + vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) { + /* + * Compute approximately how much memory would be used + * for the indirect mapping if this device were to + * be removed. + * + * Note: If the frag metric is invalid, then not + * enough metaslabs have been converted to have + * histograms. + */ + uint64_t seg_count = 0; + + /* + * There are the same number of allocated segments + * as free segments, so we will have at least one + * entry per free segment. + */ + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + seg_count += vd->vdev_mg->mg_histogram[i]; + } + + /* + * The maximum length of a mapping is SPA_MAXBLOCKSIZE, + * so we need at least one entry per SPA_MAXBLOCKSIZE + * of allocated data. + */ + seg_count += vd->vdev_stat.vs_alloc / SPA_MAXBLOCKSIZE; + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, + seg_count * + sizeof (vdev_indirect_mapping_entry_phys_t)); + } } if (!vd->vdev_ops->vdev_op_leaf) { @@ -422,8 +495,9 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config) for (c = 0, idx = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - if (tvd->vdev_ishole) + if (tvd->vdev_ishole) { array[idx++] = c; + } } if (idx) { @@ -1109,8 +1183,11 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) */ zio = zio_root(spa, NULL, NULL, flags); - for (int v = 0; v < svdcount; v++) - zio_flush(zio, svd[v]); + for (int v = 0; v < svdcount; v++) { + if (vdev_writeable(svd[v])) { + zio_flush(zio, svd[v]); + } + } (void) zio_wait(zio); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c index b7254261cfb3..8bbff272f907 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c @@ -677,6 +677,7 @@ vdev_ops_t vdev_mirror_ops = { vdev_mirror_state_change, NULL, NULL, + NULL, VDEV_TYPE_MIRROR, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -690,6 +691,7 @@ vdev_ops_t vdev_replacing_ops = { vdev_mirror_state_change, NULL, NULL, + NULL, VDEV_TYPE_REPLACING, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -703,6 +705,7 @@ vdev_ops_t vdev_spare_ops = { vdev_mirror_state_change, NULL, NULL, + NULL, VDEV_TYPE_SPARE, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c index 24a44a697769..bc719a48df96 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c @@ -89,6 +89,7 @@ vdev_ops_t vdev_missing_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_MISSING, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -102,6 +103,7 @@ vdev_ops_t vdev_hole_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_HOLE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c index ecc97348308a..32aaf952e37f 100644 --- 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c @@ -155,6 +155,8 @@ uint32_t zfs_vdev_trim_min_active = 1; * that a typical SSD can process the queued IOs in a single request. */ uint32_t zfs_vdev_trim_max_active = 64; +uint32_t zfs_vdev_removal_min_active = 1; +uint32_t zfs_vdev_removal_max_active = 2; /* @@ -530,6 +532,8 @@ vdev_queue_class_min_active(zio_priority_t p) return (zfs_vdev_scrub_min_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_min_active); + case ZIO_PRIORITY_REMOVAL: + return (zfs_vdev_removal_min_active); default: panic("invalid priority %u", p); return (0); @@ -591,6 +595,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) return (zfs_vdev_scrub_max_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_max_active); + case ZIO_PRIORITY_REMOVAL: + return (zfs_vdev_removal_max_active); default: panic("invalid priority %u", p); return (0); @@ -688,7 +694,8 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) while (t != NULL && (dio = AVL_PREV(t, first)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit && - IO_GAP(dio, first) <= maxgap) { + IO_GAP(dio, first) <= maxgap && + dio->io_type == zio->io_type) { first = dio; if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL)) mandatory = first; @@ -712,7 +719,8 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && (IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit || (dio->io_flags & ZIO_FLAG_OPTIONAL)) && - IO_GAP(last, dio) <= maxgap) { + IO_GAP(last, dio) <= maxgap && + dio->io_type == zio->io_type) { last = dio; if (!(last->io_flags & ZIO_FLAG_OPTIONAL)) mandatory = last; @@ -872,11 +880,13 @@ vdev_queue_io(zio_t *zio) if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && zio->io_priority != ZIO_PRIORITY_ASYNC_READ && - zio->io_priority != ZIO_PRIORITY_SCRUB) + zio->io_priority != ZIO_PRIORITY_SCRUB && + zio->io_priority != ZIO_PRIORITY_REMOVAL) zio->io_priority = ZIO_PRIORITY_ASYNC_READ; } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && - zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE) + zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && + zio->io_priority != ZIO_PRIORITY_REMOVAL) zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; } else { ASSERT(zio->io_type == ZIO_TYPE_FREE); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c index 9395f3c5b9ef..29be8c95f325 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c @@ -2593,6 +2593,7 @@ vdev_ops_t vdev_raidz_ops = { vdev_raidz_state_change, NULL, NULL, + NULL, VDEV_TYPE_RAIDZ, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c new file mode 100644 index 000000000000..3d125e3c9fae --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c @@ -0,0 +1,1919 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This file contains the necessary logic to remove vdevs from a + * storage pool. Currently, the only devices that can be removed + * are log, cache, and spare devices; and top level vdevs from a pool + * w/o raidz. (Note that members of a mirror can also be removed + * by the detach operation.) + * + * Log vdevs are removed by evacuating them and then turning the vdev + * into a hole vdev while holding spa config locks. + * + * Top level vdevs are removed and converted into an indirect vdev via + * a multi-step process: + * + * - Disable allocations from this device (spa_vdev_remove_top). + * + * - From a new thread (spa_vdev_remove_thread), copy data from + * the removing vdev to a different vdev. The copy happens in open + * context (spa_vdev_copy_impl) and issues a sync task + * (vdev_mapping_sync) so the sync thread can update the partial + * indirect mappings in core and on disk. + * + * - If a free happens during a removal, it is freed from the + * removing vdev, and if it has already been copied, from the new + * location as well (free_from_removing_vdev). + * + * - After the removal is completed, the copy thread converts the vdev + * into an indirect vdev (vdev_remove_complete) before instructing + * the sync thread to destroy the space maps and finish the removal + * (spa_finish_removal). + */ + +typedef struct vdev_copy_arg { + metaslab_t *vca_msp; + uint64_t vca_outstanding_bytes; + kcondvar_t vca_cv; + kmutex_t vca_lock; +} vdev_copy_arg_t; + +typedef struct vdev_copy_seg_arg { + vdev_copy_arg_t *vcsa_copy_arg; + uint64_t vcsa_txg; + dva_t *vcsa_dest_dva; + blkptr_t *vcsa_dest_bp; +} vdev_copy_seg_arg_t; + +/* + * The maximum amount of allowed data we're allowed to copy from a device + * at a time when removing it. + */ +int zfs_remove_max_copy_bytes = 8 * 1024 * 1024; + +/* + * The largest contiguous segment that we will attempt to allocate when + * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If + * there is a performance problem with attempting to allocate large blocks, + * consider decreasing this. + * + * Note: we will issue I/Os of up to this size. The mpt driver does not + * respond well to I/Os larger than 1MB, so we set this to 1MB. (When + * mpt processes an I/O larger than 1MB, it needs to do an allocation of + * 2 physically contiguous pages; if this allocation fails, mpt will drop + * the I/O and hang the device.) 
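+ *
+ * To sketch how these two tunables interact: with the default values,
+ * the removal thread keeps at most 8MB of copy I/O in flight
+ * (zfs_remove_max_copy_bytes) and issues it in segments of at most
+ * 1MB (zfs_remove_max_segment), so on the order of eight copy
+ * segments can be outstanding at once.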
+ */ +int zfs_remove_max_segment = 1024 * 1024; + +#define VDEV_REMOVAL_ZAP_OBJS "lzap" + +static void spa_vdev_remove_thread(void *arg); + +static void +spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx) +{ + VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_REMOVING, sizeof (uint64_t), + sizeof (spa->spa_removing_phys) / sizeof (uint64_t), + &spa->spa_removing_phys, tx)); +} + +static nvlist_t * +spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) +{ + for (int i = 0; i < count; i++) { + uint64_t guid = + fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID); + + if (guid == target_guid) + return (nvpp[i]); + } + + return (NULL); +} + +static void +spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, + nvlist_t *dev_to_remove) +{ + nvlist_t **newdev = NULL; + + if (count > 1) + newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); + + for (int i = 0, j = 0; i < count; i++) { + if (dev[i] == dev_to_remove) + continue; + VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); + } + + VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); + VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); + + for (int i = 0; i < count - 1; i++) + nvlist_free(newdev[i]); + + if (count > 1) + kmem_free(newdev, (count - 1) * sizeof (void *)); +} + +static spa_vdev_removal_t * +spa_vdev_removal_create(vdev_t *vd) +{ + spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); + mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); + svr->svr_allocd_segs = range_tree_create(NULL, NULL); + svr->svr_vdev = vd; + + for (int i = 0; i < TXG_SIZE; i++) { + svr->svr_frees[i] = range_tree_create(NULL, NULL); + list_create(&svr->svr_new_segments[i], + sizeof (vdev_indirect_mapping_entry_t), + offsetof(vdev_indirect_mapping_entry_t, vime_node)); + } + + return (svr); +} + +void +spa_vdev_removal_destroy(spa_vdev_removal_t *svr) +{ + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(svr->svr_bytes_done[i]); + ASSERT0(svr->svr_max_offset_to_sync[i]); + range_tree_destroy(svr->svr_frees[i]); + list_destroy(&svr->svr_new_segments[i]); + } + + range_tree_destroy(svr->svr_allocd_segs); + mutex_destroy(&svr->svr_lock); + cv_destroy(&svr->svr_cv); + kmem_free(svr, sizeof (*svr)); +} + +/* + * This is called as a synctask in the txg in which we will mark this vdev + * as removing (in the config stored in the MOS). 
+ * + * It begins the evacuation of a toplevel vdev by: + * - initializing the spa_removing_phys which tracks this removal + * - computing the amount of space to remove for accounting purposes + * - dirtying all dbufs in the spa_config_object + * - creating the spa_vdev_removal + * - starting the spa_vdev_remove_thread + */ +static void +vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) +{ + vdev_t *vd = arg; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_dsl_pool->dp_meta_objset; + spa_vdev_removal_t *svr = NULL; + uint64_t txg = dmu_tx_get_txg(tx); + + ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); + svr = spa_vdev_removal_create(vd); + + ASSERT(vd->vdev_removing); + ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); + + spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); + if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + /* + * By activating the OBSOLETE_COUNTS feature, we prevent + * the pool from being downgraded and ensure that the + * refcounts are precise. + */ + spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + uint64_t one = 1; + VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1, + &one, tx)); + ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0); + } + + vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx); + vd->vdev_indirect_mapping = + vdev_indirect_mapping_open(mos, vic->vic_mapping_object); + vic->vic_births_object = vdev_indirect_births_alloc(mos, tx); + vd->vdev_indirect_births = + vdev_indirect_births_open(mos, vic->vic_births_object); + spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id; + spa->spa_removing_phys.sr_start_time = gethrestime_sec(); + spa->spa_removing_phys.sr_end_time = 0; + spa->spa_removing_phys.sr_state = DSS_SCANNING; + spa->spa_removing_phys.sr_to_copy = 0; + spa->spa_removing_phys.sr_copied = 0; + + /* + * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because + * there may be space in the defer tree, which is free, but still + * counted in vs_alloc. + */ + for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { + metaslab_t *ms = vd->vdev_ms[i]; + if (ms->ms_sm == NULL) + continue; + + /* + * Sync tasks happen before metaslab_sync(), therefore + * smp_alloc and sm_alloc must be the same. + */ + ASSERT3U(space_map_allocated(ms->ms_sm), ==, + ms->ms_sm->sm_phys->smp_alloc); + + spa->spa_removing_phys.sr_to_copy += + space_map_allocated(ms->ms_sm); + + /* + * Space which we are freeing this txg does not need to + * be copied. + */ + spa->spa_removing_phys.sr_to_copy -= + range_tree_space(ms->ms_freeingtree); + + ASSERT0(range_tree_space(ms->ms_freedtree)); + for (int t = 0; t < TXG_SIZE; t++) + ASSERT0(range_tree_space(ms->ms_alloctree[t])); + } + + /* + * Sync tasks are called before metaslab_sync(), so there should + * be no already-synced metaslabs in the TXG_CLEAN list. + */ + ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL); + + spa_sync_removing_state(spa, tx); + + /* + * All blocks that we need to read the most recent mapping must be + * stored on concrete vdevs. Therefore, we must dirty anything that + * is read before spa_remove_init(). Specifically, the + * spa_config_object. (Note that although we already modified the + * spa_config_object in spa_sync_removing_state, that may not have + * modified all blocks of the object.) 
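+	 *
+	 * The loop below does this for the pool directory object
+	 * (DMU_POOL_DIRECTORY_OBJECT): it holds each dbuf in turn and
+	 * calls dmu_buf_will_dirty(), so every block of the object is
+	 * rewritten onto concrete vdevs in this txg.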
+ */ + dmu_object_info_t doi; + VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi)); + for (uint64_t offset = 0; offset < doi.doi_max_offset; ) { + dmu_buf_t *dbuf; + VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT, + offset, FTAG, &dbuf, 0)); + dmu_buf_will_dirty(dbuf, tx); + offset += dbuf->db_size; + dmu_buf_rele(dbuf, FTAG); + } + + /* + * Now that we've allocated the im_object, dirty the vdev to ensure + * that the object gets written to the config on disk. + */ + vdev_config_dirty(vd); + + zfs_dbgmsg("starting removal thread for vdev %llu (%p) in txg %llu " + "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx), + vic->vic_mapping_object); + + spa_history_log_internal(spa, "vdev remove started", tx, + "%s vdev %llu %s", spa_name(spa), vd->vdev_id, + (vd->vdev_path != NULL) ? vd->vdev_path : "-"); + /* + * Setting spa_vdev_removal causes subsequent frees to call + * free_from_removing_vdev(). Note that we don't need any locking + * because we are the sync thread, and metaslab_free_impl() is only + * called from syncing context (potentially from a zio taskq thread, + * but in any case only when there are outstanding free i/os, which + * there are not). + */ + ASSERT3P(spa->spa_vdev_removal, ==, NULL); + spa->spa_vdev_removal = svr; + svr->svr_thread = thread_create(NULL, 0, + spa_vdev_remove_thread, vd, 0, &p0, TS_RUN, minclsyspri); +} + +/* + * When we are opening a pool, we must read the mapping for each + * indirect vdev in order from most recently removed to least + * recently removed. We do this because the blocks for the mapping + * of older indirect vdevs may be stored on more recently removed vdevs. + * In order to read each indirect mapping object, we must have + * initialized all more recently removed vdevs. + */ +int +spa_remove_init(spa_t *spa) +{ + int error; + + error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_REMOVING, sizeof (uint64_t), + sizeof (spa->spa_removing_phys) / sizeof (uint64_t), + &spa->spa_removing_phys); + + if (error == ENOENT) { + spa->spa_removing_phys.sr_state = DSS_NONE; + spa->spa_removing_phys.sr_removing_vdev = -1; + spa->spa_removing_phys.sr_prev_indirect_vdev = -1; + return (0); + } else if (error != 0) { + return (error); + } + + if (spa->spa_removing_phys.sr_state == DSS_SCANNING) { + /* + * We are currently removing a vdev. Create and + * initialize a spa_vdev_removal_t from the bonus + * buffer of the removing vdevs vdev_im_object, and + * initialize its partial mapping. 
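+		 *
+		 * (The partial mapping loaded here covers only what
+		 * had been copied and synced before the pool last went
+		 * down; the restarted removal thread picks up where
+		 * the synced mapping ends, i.e. at
+		 * vdev_indirect_mapping_max_offset().)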
+ */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_t *vd = vdev_lookup_top(spa, + spa->spa_removing_phys.sr_removing_vdev); + spa_config_exit(spa, SCL_STATE, FTAG); + + if (vd == NULL) + return (EINVAL); + + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + ASSERT(vdev_is_concrete(vd)); + spa_vdev_removal_t *svr = spa_vdev_removal_create(vd); + ASSERT(svr->svr_vdev->vdev_removing); + + vd->vdev_indirect_mapping = vdev_indirect_mapping_open( + spa->spa_meta_objset, vic->vic_mapping_object); + vd->vdev_indirect_births = vdev_indirect_births_open( + spa->spa_meta_objset, vic->vic_births_object); + + spa->spa_vdev_removal = svr; + } + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + uint64_t indirect_vdev_id = + spa->spa_removing_phys.sr_prev_indirect_vdev; + while (indirect_vdev_id != UINT64_MAX) { + vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id); + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + vd->vdev_indirect_mapping = vdev_indirect_mapping_open( + spa->spa_meta_objset, vic->vic_mapping_object); + vd->vdev_indirect_births = vdev_indirect_births_open( + spa->spa_meta_objset, vic->vic_births_object); + + indirect_vdev_id = vic->vic_prev_indirect_vdev; + } + spa_config_exit(spa, SCL_STATE, FTAG); + + /* + * Now that we've loaded all the indirect mappings, we can allow + * reads from other blocks (e.g. via predictive prefetch). + */ + spa->spa_indirect_vdevs_loaded = B_TRUE; + return (0); +} + +void +spa_restart_removal(spa_t *spa) +{ + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + + if (svr == NULL) + return; + + /* + * In general when this function is called there is no + * removal thread running. The only scenario where this + * is not true is during spa_import() where this function + * is called twice [once from spa_import_impl() and + * spa_async_resume()]. Thus, in the scenario where we + * import a pool that has an ongoing removal we don't + * want to spawn a second thread. + */ + if (svr->svr_thread != NULL) + return; + + if (!spa_writeable(spa)) + return; + + vdev_t *vd = svr->svr_vdev; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + ASSERT3P(vd, !=, NULL); + ASSERT(vd->vdev_removing); + + zfs_dbgmsg("restarting removal of %llu at count=%llu", + vd->vdev_id, vdev_indirect_mapping_num_entries(vim)); + svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, vd, + 0, &p0, TS_RUN, minclsyspri); +} + +/* + * Process freeing from a device which is in the middle of being removed. + * We must handle this carefully so that we attempt to copy freed data, + * and we correctly free already-copied data. + */ +void +free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size, + uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t max_offset_yet = 0; + + ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); + ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, + vdev_indirect_mapping_object(vim)); + ASSERT3P(vd, ==, svr->svr_vdev); + ASSERT3U(spa_syncing_txg(spa), ==, txg); + + mutex_enter(&svr->svr_lock); + + /* + * Remove the segment from the removing vdev's spacemap. This + * ensures that we will not attempt to copy this space (if the + * removal thread has not yet visited it), and also ensures + * that we know what is actually allocated on the new vdevs + * (needed if we cancel the removal). 
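+	 *
+	 * To make the cases below concrete (illustrative offsets): if
+	 * the mapping has been synced up through offset 0x10000, a
+	 * free of [0xe000, 0x12000) is split into 0x2000 bytes handled
+	 * as already-synced (freed from the new location below), with
+	 * the remainder treated as in-flight and/or not-yet-visited
+	 * space.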
+ * + * Note: we must do the metaslab_free_concrete() with the svr_lock + * held, so that the remove_thread can not load this metaslab and then + * visit this offset between the time that we metaslab_free_concrete() + * and when we check to see if it has been visited. + */ + metaslab_free_concrete(vd, offset, size, txg); + + uint64_t synced_size = 0; + uint64_t synced_offset = 0; + uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim); + if (offset < max_offset_synced) { + /* + * The mapping for this offset is already on disk. + * Free from the new location. + * + * Note that we use svr_max_synced_offset because it is + * updated atomically with respect to the in-core mapping. + * By contrast, vim_max_offset is not. + * + * This block may be split between a synced entry and an + * in-flight or unvisited entry. Only process the synced + * portion of it here. + */ + synced_size = MIN(size, max_offset_synced - offset); + synced_offset = offset; + + ASSERT3U(max_offset_yet, <=, max_offset_synced); + max_offset_yet = max_offset_synced; + + DTRACE_PROBE3(remove__free__synced, + spa_t *, spa, + uint64_t, offset, + uint64_t, synced_size); + + size -= synced_size; + offset += synced_size; + } + + /* + * Look at all in-flight txgs starting from the currently syncing one + * and see if a section of this free is being copied. By starting from + * this txg and iterating forward, we might find that this region + * was copied in two different txgs and handle it appropriately. + */ + for (int i = 0; i < TXG_CONCURRENT_STATES; i++) { + int txgoff = (txg + i) & TXG_MASK; + if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) { + /* + * The mapping for this offset is in flight, and + * will be synced in txg+i. + */ + uint64_t inflight_size = MIN(size, + svr->svr_max_offset_to_sync[txgoff] - offset); + + DTRACE_PROBE4(remove__free__inflight, + spa_t *, spa, + uint64_t, offset, + uint64_t, inflight_size, + uint64_t, txg + i); + + /* + * We copy data in order of increasing offset. + * Therefore the max_offset_to_sync[] must increase + * (or be zero, indicating that nothing is being + * copied in that txg). + */ + if (svr->svr_max_offset_to_sync[txgoff] != 0) { + ASSERT3U(svr->svr_max_offset_to_sync[txgoff], + >=, max_offset_yet); + max_offset_yet = + svr->svr_max_offset_to_sync[txgoff]; + } + + /* + * We've already committed to copying this segment: + * we have allocated space elsewhere in the pool for + * it and have an IO outstanding to copy the data. We + * cannot free the space before the copy has + * completed, or else the copy IO might overwrite any + * new data. To free that space, we record the + * segment in the appropriate svr_frees tree and free + * the mapped space later, in the txg where we have + * completed the copy and synced the mapping (see + * vdev_mapping_sync). + */ + range_tree_add(svr->svr_frees[txgoff], + offset, inflight_size); + size -= inflight_size; + offset += inflight_size; + + /* + * This space is already accounted for as being + * done, because it is being copied in txg+i. + * However, if i!=0, then it is being copied in + * a future txg. If we crash after this txg + * syncs but before txg+i syncs, then the space + * will be free. Therefore we must account + * for the space being done in *this* txg + * (when it is freed) rather than the future txg + * (when it will be copied). 
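+			 *
+			 * Concretely (illustrative sizes): if 0x1000
+			 * bytes are being copied in txg+1 but are
+			 * freed in this txg, the two statements below
+			 * move that 0x1000 from
+			 * svr_bytes_done[(txg+1) & TXG_MASK] to
+			 * svr_bytes_done[txg & TXG_MASK].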
+ */ + ASSERT3U(svr->svr_bytes_done[txgoff], >=, + inflight_size); + svr->svr_bytes_done[txgoff] -= inflight_size; + svr->svr_bytes_done[txg & TXG_MASK] += inflight_size; + } + } + ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]); + + if (size > 0) { + /* + * The copy thread has not yet visited this offset. Ensure + * that it doesn't. + */ + + DTRACE_PROBE3(remove__free__unvisited, + spa_t *, spa, + uint64_t, offset, + uint64_t, size); + + if (svr->svr_allocd_segs != NULL) + range_tree_clear(svr->svr_allocd_segs, offset, size); + + /* + * Since we now do not need to copy this data, for + * accounting purposes we have done our job and can count + * it as completed. + */ + svr->svr_bytes_done[txg & TXG_MASK] += size; + } + mutex_exit(&svr->svr_lock); + + /* + * Now that we have dropped svr_lock, process the synced portion + * of this free. + */ + if (synced_size > 0) { + vdev_indirect_mark_obsolete(vd, synced_offset, synced_size, + txg); + /* + * Note: this can only be called from syncing context, + * and the vdev_indirect_mapping is only changed from the + * sync thread, so we don't need svr_lock while doing + * metaslab_free_impl_cb. + */ + vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size, + metaslab_free_impl_cb, &txg); + } +} + +/* + * Stop an active removal and update the spa_removing phys. + */ +static void +spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx) +{ + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa)); + + /* Ensure the removal thread has completed before we free the svr. */ + spa_vdev_remove_suspend(spa); + + ASSERT(state == DSS_FINISHED || state == DSS_CANCELED); + + if (state == DSS_FINISHED) { + spa_removing_phys_t *srp = &spa->spa_removing_phys; + vdev_t *vd = svr->svr_vdev; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + if (srp->sr_prev_indirect_vdev != UINT64_MAX) { + vdev_t *pvd = vdev_lookup_top(spa, + srp->sr_prev_indirect_vdev); + ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops); + } + + vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev; + srp->sr_prev_indirect_vdev = vd->vdev_id; + } + spa->spa_removing_phys.sr_state = state; + spa->spa_removing_phys.sr_end_time = gethrestime_sec(); + + spa->spa_vdev_removal = NULL; + spa_vdev_removal_destroy(svr); + + spa_sync_removing_state(spa, tx); + + vdev_config_dirty(spa->spa_root_vdev); +} + +static void +free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size) +{ + vdev_t *vd = arg; + vdev_indirect_mark_obsolete(vd, offset, size, + vd->vdev_spa->spa_syncing_txg); + vdev_indirect_ops.vdev_op_remap(vd, offset, size, + metaslab_free_impl_cb, &vd->vdev_spa->spa_syncing_txg); +} + +/* + * On behalf of the removal thread, syncs an incremental bit more of + * the indirect mapping to disk and updates the in-memory mapping. + * Called as a sync task in every txg that the removal thread makes progress. 
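+ *
+ * Sketch of the per-txg handoff: the open-context copy thread appends
+ * new entries to svr_new_segments[txg & TXG_MASK] and records the
+ * highest copied offset in svr_max_offset_to_sync[txg & TXG_MASK];
+ * this synctask then persists those entries via
+ * vdev_indirect_mapping_add_entries(), records the birth txg, and
+ * vacates the matching svr_frees tree.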
+ */ +static void +vdev_mapping_sync(void *arg, dmu_tx_t *tx) +{ + spa_vdev_removal_t *svr = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = svr->svr_vdev; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + uint64_t txg = dmu_tx_get_txg(tx); + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + ASSERT(vic->vic_mapping_object != 0); + ASSERT3U(txg, ==, spa_syncing_txg(spa)); + + vdev_indirect_mapping_add_entries(vim, + &svr->svr_new_segments[txg & TXG_MASK], tx); + vdev_indirect_births_add_entry(vd->vdev_indirect_births, + vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx); + + /* + * Free the copied data for anything that was freed while the + * mapping entries were in flight. + */ + mutex_enter(&svr->svr_lock); + range_tree_vacate(svr->svr_frees[txg & TXG_MASK], + free_mapped_segment_cb, vd); + ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=, + vdev_indirect_mapping_max_offset(vim)); + svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0; + mutex_exit(&svr->svr_lock); + + spa_sync_removing_state(spa, tx); +} + +static void +spa_vdev_copy_segment_write_done(zio_t *zio) +{ + vdev_copy_seg_arg_t *vcsa = zio->io_private; + vdev_copy_arg_t *vca = vcsa->vcsa_copy_arg; + spa_config_exit(zio->io_spa, SCL_STATE, FTAG); + abd_free(zio->io_abd); + + mutex_enter(&vca->vca_lock); + vca->vca_outstanding_bytes -= zio->io_size; + cv_signal(&vca->vca_cv); + mutex_exit(&vca->vca_lock); + + ASSERT0(zio->io_error); + kmem_free(vcsa->vcsa_dest_bp, sizeof (blkptr_t)); + kmem_free(vcsa, sizeof (vdev_copy_seg_arg_t)); +} + +static void +spa_vdev_copy_segment_read_done(zio_t *zio) +{ + vdev_copy_seg_arg_t *vcsa = zio->io_private; + dva_t *dest_dva = vcsa->vcsa_dest_dva; + uint64_t txg = vcsa->vcsa_txg; + spa_t *spa = zio->io_spa; + vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dest_dva)); + blkptr_t *bp = NULL; + dva_t *dva = NULL; + uint64_t size = zio->io_size; + + ASSERT3P(dest_vd, !=, NULL); + ASSERT0(zio->io_error); + + vcsa->vcsa_dest_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); + bp = vcsa->vcsa_dest_bp; + dva = bp->blk_dva; + + BP_ZERO(bp); + + /* initialize with dest_dva */ + bcopy(dest_dva, dva, sizeof (dva_t)); + BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); + + BP_SET_LSIZE(bp, size); + BP_SET_PSIZE(bp, size); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + + zio_nowait(zio_rewrite(spa->spa_txg_zio[txg & TXG_MASK], spa, + txg, bp, zio->io_abd, size, + spa_vdev_copy_segment_write_done, vcsa, + ZIO_PRIORITY_REMOVAL, 0, NULL)); +} + +static int +spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg, + vdev_copy_arg_t *vca, zio_alloc_list_t *zal) +{ + metaslab_group_t *mg = vd->vdev_mg; + spa_t *spa = vd->vdev_spa; + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + vdev_indirect_mapping_entry_t *entry; + vdev_copy_seg_arg_t *private; + dva_t dst = { 0 }; + blkptr_t blk, *bp = &blk; + dva_t *dva = bp->blk_dva; + + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + + int error = metaslab_alloc_dva(spa, mg->mg_class, size, + &dst, 0, NULL, txg, 0, zal); + if (error != 0) + return (error); + + /* + * We can't have any padding of the allocated size, otherwise we will + * misunderstand what's allocated, and the size of the mapping. + * The caller ensures this will be true by passing in a size that is + * aligned to the worst (highest) ashift in the pool. 
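+	 *
+	 * For example (illustrative): if the largest ashift in the
+	 * pool is 12, the caller only passes sizes that are multiples
+	 * of 4KB, so the DVA allocated on any target vdev needs no
+	 * padding and DVA_GET_ASIZE(&dst) == size holds.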
+ */ + ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); + + mutex_enter(&vca->vca_lock); + vca->vca_outstanding_bytes += size; + mutex_exit(&vca->vca_lock); + + entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); + DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); + entry->vime_mapping.vimep_dst = dst; + + private = kmem_alloc(sizeof (vdev_copy_seg_arg_t), KM_SLEEP); + private->vcsa_dest_dva = &entry->vime_mapping.vimep_dst; + private->vcsa_txg = txg; + private->vcsa_copy_arg = vca; + + /* + * This lock is eventually released by the donefunc for the + * zio_write_phys that finishes copying the data. + */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + + /* + * Do logical I/O, letting the redundancy vdevs (like mirror) + * handle their own I/O instead of duplicating that code here. + */ + BP_ZERO(bp); + + DVA_SET_VDEV(&dva[0], vd->vdev_id); + DVA_SET_OFFSET(&dva[0], start); + DVA_SET_GANG(&dva[0], 0); + DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, size)); + + BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); + + BP_SET_LSIZE(bp, size); + BP_SET_PSIZE(bp, size); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + + zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, + bp, abd_alloc_for_io(size, B_FALSE), size, + spa_vdev_copy_segment_read_done, private, + ZIO_PRIORITY_REMOVAL, 0, NULL)); + + list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry); + ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift); + vdev_dirty(vd, 0, NULL, txg); + + return (0); +} + +/* + * Complete the removal of a toplevel vdev. This is called as a + * synctask in the same txg that we will sync out the new config (to the + * MOS object) which indicates that this vdev is indirect. 
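+ *
+ * Once the vdev is indirect, reads of <old vdev, offset> are served by
+ * translating the offset through this mapping. An illustrative,
+ * simplified lookup (hypothetical ex_* types; the real mapping is
+ * likewise binary-searched, sorted by source offset):
+ *
+ *	typedef struct ex_entry {
+ *		uint64_t ee_src_offset;	// start of mapped source range
+ *		uint64_t ee_size;
+ *		uint64_t ee_dst_vdev;
+ *		uint64_t ee_dst_offset;
+ *	} ex_entry_t;
+ *
+ *	static const ex_entry_t *
+ *	ex_lookup(const ex_entry_t *ents, uint64_t count, uint64_t off)
+ *	{
+ *		uint64_t lo = 0, hi = count;
+ *
+ *		if (count == 0)
+ *			return (NULL);
+ *		while (lo + 1 < hi) {	// ents sorted by ee_src_offset
+ *			uint64_t mid = lo + (hi - lo) / 2;
+ *			if (ents[mid].ee_src_offset <= off)
+ *				lo = mid;
+ *			else
+ *				hi = mid;
+ *		}
+ *		if (off >= ents[lo].ee_src_offset &&
+ *		    off < ents[lo].ee_src_offset + ents[lo].ee_size)
+ *			return (&ents[lo]);
+ *		return (NULL);		// hole: offset was never mapped
+ *	}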
+ */ +static void +vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_vdev_removal_t *svr = arg; + vdev_t *vd = svr->svr_vdev; + spa_t *spa = vd->vdev_spa; + + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(svr->svr_bytes_done[i]); + } + + ASSERT3U(spa->spa_removing_phys.sr_copied, ==, + spa->spa_removing_phys.sr_to_copy); + + vdev_destroy_spacemaps(vd, tx); + + /* destroy leaf zaps, if any */ + ASSERT3P(svr->svr_zaplist, !=, NULL); + for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL); + pair != NULL; + pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) { + vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx); + } + fnvlist_free(svr->svr_zaplist); + + spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx); + /* vd->vdev_path is not available here */ + spa_history_log_internal(spa, "vdev remove completed", tx, + "%s vdev %llu", spa_name(spa), vd->vdev_id); +} + +static void +vdev_indirect_state_transfer(vdev_t *ivd, vdev_t *vd) +{ + ivd->vdev_indirect_config = vd->vdev_indirect_config; + + ASSERT3P(ivd->vdev_indirect_mapping, ==, NULL); + ASSERT(vd->vdev_indirect_mapping != NULL); + ivd->vdev_indirect_mapping = vd->vdev_indirect_mapping; + vd->vdev_indirect_mapping = NULL; + + ASSERT3P(ivd->vdev_indirect_births, ==, NULL); + ASSERT(vd->vdev_indirect_births != NULL); + ivd->vdev_indirect_births = vd->vdev_indirect_births; + vd->vdev_indirect_births = NULL; + + ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); + ASSERT0(range_tree_space(ivd->vdev_obsolete_segments)); + + if (vd->vdev_obsolete_sm != NULL) { + ASSERT3U(ivd->vdev_asize, ==, vd->vdev_asize); + + /* + * We cannot use space_map_{open,close} because we hold all + * the config locks as writer. + */ + ASSERT3P(ivd->vdev_obsolete_sm, ==, NULL); + ivd->vdev_obsolete_sm = vd->vdev_obsolete_sm; + vd->vdev_obsolete_sm = NULL; + } +} + +static void +vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) +{ + ASSERT3P(zlist, !=, NULL); + ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); + + if (vd->vdev_leaf_zap != 0) { + char zkey[32]; + (void) snprintf(zkey, sizeof (zkey), "%s-%ju", + VDEV_REMOVAL_ZAP_OBJS, (uintmax_t)vd->vdev_leaf_zap); + fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap); + } + + for (uint64_t id = 0; id < vd->vdev_children; id++) { + vdev_remove_enlist_zaps(vd->vdev_child[id], zlist); + } +} + +static void +vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) +{ + vdev_t *ivd; + dmu_tx_t *tx; + spa_t *spa = vd->vdev_spa; + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + + /* + * First, build a list of leaf zaps to be destroyed. + * This is passed to the sync context thread, + * which does the actual unlinking. + */ + svr->svr_zaplist = fnvlist_alloc(); + vdev_remove_enlist_zaps(vd, svr->svr_zaplist); + + ivd = vdev_add_parent(vd, &vdev_indirect_ops); + + vd->vdev_leaf_zap = 0; + + vdev_remove_child(ivd, vd); + vdev_compact_children(ivd); + + vdev_indirect_state_transfer(ivd, vd); + + svr->svr_vdev = ivd; + + ASSERT(!ivd->vdev_removing); + ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr, + 0, ZFS_SPACE_CHECK_NONE, tx); + dmu_tx_commit(tx); + + /* + * Indicate that this thread has exited. + * After this, we can not use svr. 
+ */ + mutex_enter(&svr->svr_lock); + svr->svr_thread = NULL; + cv_broadcast(&svr->svr_cv); + mutex_exit(&svr->svr_lock); +} + +/* + * Complete the removal of a toplevel vdev. This is called in open + * context by the removal thread after we have copied all vdev's data. + */ +static void +vdev_remove_complete(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + uint64_t txg; + + /* + * Wait for any deferred frees to be synced before we call + * vdev_metaslab_fini() + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + + txg = spa_vdev_enter(spa); + zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu", + vd->vdev_id, txg); + + /* + * Discard allocation state. + */ + if (vd->vdev_mg != NULL) { + vdev_metaslab_fini(vd); + metaslab_group_destroy(vd->vdev_mg); + vd->vdev_mg = NULL; + } + ASSERT0(vd->vdev_stat.vs_space); + ASSERT0(vd->vdev_stat.vs_dspace); + + vdev_remove_replace_with_indirect(vd, txg); + + /* + * We now release the locks, allowing spa_sync to run and finish the + * removal via vdev_remove_complete_sync in syncing context. + */ + (void) spa_vdev_exit(spa, NULL, txg, 0); + + /* + * Top ZAP should have been transferred to the indirect vdev in + * vdev_remove_replace_with_indirect. + */ + ASSERT0(vd->vdev_top_zap); + + /* + * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect. + */ + ASSERT0(vd->vdev_leaf_zap); + + txg = spa_vdev_enter(spa); + (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); + /* + * Request to update the config and the config cachefile. + */ + vdev_config_dirty(spa->spa_root_vdev); + (void) spa_vdev_exit(spa, vd, txg, 0); +} + +/* + * Evacuates a segment of size at most max_alloc from the vdev + * via repeated calls to spa_vdev_copy_segment. If an allocation + * fails, the pool is probably too fragmented to handle such a + * large size, so decrease max_alloc so that the caller will not try + * this size again this txg. + */ +static void +spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, + uint64_t *max_alloc, dmu_tx_t *tx) +{ + uint64_t txg = dmu_tx_get_txg(tx); + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + mutex_enter(&svr->svr_lock); + + range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root); + if (rs == NULL) { + mutex_exit(&svr->svr_lock); + return; + } + uint64_t offset = rs->rs_start; + uint64_t length = MIN(rs->rs_end - rs->rs_start, *max_alloc); + + range_tree_remove(svr->svr_allocd_segs, offset, length); + + if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) { + dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync, + svr, 0, ZFS_SPACE_CHECK_NONE, tx); + } + + svr->svr_max_offset_to_sync[txg & TXG_MASK] = offset + length; + + /* + * Note: this is the amount of *allocated* space + * that we are taking care of each txg. + */ + svr->svr_bytes_done[txg & TXG_MASK] += length; + + mutex_exit(&svr->svr_lock); + + zio_alloc_list_t zal; + metaslab_trace_init(&zal); + uint64_t thismax = *max_alloc; + while (length > 0) { + uint64_t mylen = MIN(length, thismax); + + int error = spa_vdev_copy_segment(svr->svr_vdev, + offset, mylen, txg, vca, &zal); + + if (error == ENOSPC) { + /* + * Cut our segment in half, and don't try this + * segment size again this txg. Note that the + * allocation size must be aligned to the highest + * ashift in the pool, so that the allocation will + * not be padded out to a multiple of the ashift, + * which could cause us to think that this mapping + * is larger than we intended. 
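+ *
+ * As an illustration (hypothetical numbers): a failed 1 MB attempt with
+ * a worst-case ashift of 12 retries at 512 KB, then 256 KB, and so on;
+ * every candidate stays a 4 KB multiple, so a successful retry is never
+ * padded:
+ *
+ *	static uint64_t
+ *	ex_next_try(uint64_t mylen, uint64_t ashift)
+ *	{
+ *		// halve, but keep the allocation granule alignment
+ *		return (P2ROUNDUP(mylen / 2, 1ULL << ashift));
+ *	}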
+ */ + ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT); + ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift); + thismax = P2ROUNDUP(mylen / 2, + 1 << spa->spa_max_ashift); + ASSERT3U(thismax, <, mylen); + /* + * The minimum-size allocation can not fail. + */ + ASSERT3U(mylen, >, 1 << spa->spa_max_ashift); + *max_alloc = mylen - (1 << spa->spa_max_ashift); + } else { + ASSERT0(error); + length -= mylen; + offset += mylen; + + /* + * We've performed an allocation, so reset the + * alloc trace list. + */ + metaslab_trace_fini(&zal); + metaslab_trace_init(&zal); + } + } + metaslab_trace_fini(&zal); +} + +/* + * The removal thread operates in open context. It iterates over all + * allocated space in the vdev, by loading each metaslab's spacemap. + * For each contiguous segment of allocated space (capping the segment + * size at SPA_MAXBLOCKSIZE), we: + * - Allocate space for it on another vdev. + * - Create a new mapping from the old location to the new location + * (as a record in svr_new_segments). + * - Initiate a logical read zio to get the data off the removing disk. + * - In the read zio's done callback, initiate a logical write zio to + * write it to the new vdev. + * Note that all of this will take effect when a particular TXG syncs. + * The sync thread ensures that all the phys reads and writes for the syncing + * TXG have completed (see spa_txg_zio) and writes the new mappings to disk + * (see vdev_mapping_sync()). + */ +static void +spa_vdev_remove_thread(void *arg) +{ + vdev_t *vd = arg; + spa_t *spa = vd->vdev_spa; + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + vdev_copy_arg_t vca; + uint64_t max_alloc = zfs_remove_max_segment; + uint64_t last_txg = 0; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t start_offset = vdev_indirect_mapping_max_offset(vim); + + ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); + ASSERT(vdev_is_concrete(vd)); + ASSERT(vd->vdev_removing); + ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); + ASSERT3P(svr->svr_vdev, ==, vd); + ASSERT(vim != NULL); + + mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL); + vca.vca_outstanding_bytes = 0; + + mutex_enter(&svr->svr_lock); + + /* + * Start from vim_max_offset so we pick up where we left off + * if we are restarting the removal after opening the pool. + */ + uint64_t msi; + for (msi = start_offset >> vd->vdev_ms_shift; + msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) { + metaslab_t *msp = vd->vdev_ms[msi]; + ASSERT3U(msi, <=, vd->vdev_ms_count); + + ASSERT0(range_tree_space(svr->svr_allocd_segs)); + + mutex_enter(&msp->ms_sync_lock); + mutex_enter(&msp->ms_lock); + + /* + * Assert nothing in flight -- ms_*tree is empty. + */ + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(range_tree_space(msp->ms_alloctree[i])); + } + + /* + * If the metaslab has ever been allocated from (ms_sm!=NULL), + * read the allocated segments from the space map object + * into svr_allocd_segs. Since we do this while holding + * svr_lock and ms_sync_lock, concurrent frees (which + * would have modified the space map) will wait for us + * to finish loading the spacemap, and then take the + * appropriate action (see free_from_removing_vdev()). + */ + if (msp->ms_sm != NULL) { + space_map_t *sm = NULL; + + /* + * We have to open a new space map here, because + * ms_sm's sm_length and sm_alloc may not reflect + * what's in the object contents, if we are in between + * metaslab_sync() and metaslab_sync_done(). 
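+ *
+ * Relatedly, resuming after an import is plain integer math on the
+ * metaslab shift (hypothetical numbers):
+ *
+ *	static uint64_t
+ *	ex_resume_metaslab(uint64_t start_offset, uint64_t ms_shift)
+ *	{
+ *		// e.g. start_offset 0x840000000, ms_shift 34 -> metaslab 2;
+ *		// offsets below start_offset are then cleared from the
+ *		// loaded allocated-segment tree
+ *		return (start_offset >> ms_shift);
+ *	}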
+ */ + VERIFY0(space_map_open(&sm, + spa->spa_dsl_pool->dp_meta_objset, + msp->ms_sm->sm_object, msp->ms_sm->sm_start, + msp->ms_sm->sm_size, msp->ms_sm->sm_shift)); + space_map_update(sm); + VERIFY0(space_map_load(sm, svr->svr_allocd_segs, + SM_ALLOC)); + space_map_close(sm); + + range_tree_walk(msp->ms_freeingtree, + range_tree_remove, svr->svr_allocd_segs); + + /* + * When we are resuming from a paused removal (i.e. + * when importing a pool with a removal in progress), + * discard any state that we have already processed. + */ + range_tree_clear(svr->svr_allocd_segs, 0, start_offset); + } + mutex_exit(&msp->ms_lock); + mutex_exit(&msp->ms_sync_lock); + + vca.vca_msp = msp; + zfs_dbgmsg("copying %llu segments for metaslab %llu", + avl_numnodes(&svr->svr_allocd_segs->rt_root), + msp->ms_id); + + while (!svr->svr_thread_exit && + range_tree_space(svr->svr_allocd_segs) != 0) { + + mutex_exit(&svr->svr_lock); + + mutex_enter(&vca.vca_lock); + while (vca.vca_outstanding_bytes > + zfs_remove_max_copy_bytes) { + cv_wait(&vca.vca_cv, &vca.vca_lock); + } + mutex_exit(&vca.vca_lock); + + dmu_tx_t *tx = + dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + if (txg != last_txg) + max_alloc = zfs_remove_max_segment; + last_txg = txg; + + spa_vdev_copy_impl(svr, &vca, &max_alloc, tx); + + dmu_tx_commit(tx); + mutex_enter(&svr->svr_lock); + } + } + + mutex_exit(&svr->svr_lock); + /* + * Wait for all copies to finish before cleaning up the vca. + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + ASSERT0(vca.vca_outstanding_bytes); + + mutex_destroy(&vca.vca_lock); + cv_destroy(&vca.vca_cv); + + if (svr->svr_thread_exit) { + mutex_enter(&svr->svr_lock); + range_tree_vacate(svr->svr_allocd_segs, NULL, NULL); + svr->svr_thread = NULL; + cv_broadcast(&svr->svr_cv); + mutex_exit(&svr->svr_lock); + } else { + ASSERT0(range_tree_space(svr->svr_allocd_segs)); + vdev_remove_complete(vd); + } + thread_exit(); +} + +void +spa_vdev_remove_suspend(spa_t *spa) +{ + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + + if (svr == NULL) + return; + + mutex_enter(&svr->svr_lock); + svr->svr_thread_exit = B_TRUE; + while (svr->svr_thread != NULL) + cv_wait(&svr->svr_cv, &svr->svr_lock); + svr->svr_thread_exit = B_FALSE; + mutex_exit(&svr->svr_lock); +} + +/* ARGSUSED */ +static int +spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (spa->spa_vdev_removal == NULL) + return (ESRCH); + return (0); +} + +/* + * Cancel a removal by freeing all entries from the partial mapping + * and marking the vdev as no longer being removing. 
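+ *
+ * Cancellation first stops the copy thread using the same handshake as
+ * spa_vdev_remove_suspend() above. In outline (hypothetical ex_* state,
+ * same mutex/cv primitives):
+ *
+ *	typedef struct ex_state {
+ *		kmutex_t	ex_lock;
+ *		kcondvar_t	ex_cv;
+ *		kthread_t	*ex_thread;
+ *		boolean_t	ex_exit_requested;
+ *	} ex_state_t;
+ *
+ *	static void
+ *	ex_thread_stop(ex_state_t *s)
+ *	{
+ *		mutex_enter(&s->ex_lock);
+ *		s->ex_exit_requested = B_TRUE;
+ *		// the thread NULLs ex_thread and broadcasts on its way out
+ *		while (s->ex_thread != NULL)
+ *			cv_wait(&s->ex_cv, &s->ex_lock);
+ *		s->ex_exit_requested = B_FALSE;
+ *		mutex_exit(&s->ex_lock);
+ *	}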
+ */ +/* ARGSUSED */ +static void +spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + vdev_t *vd = svr->svr_vdev; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + objset_t *mos = spa->spa_meta_objset; + + ASSERT3P(svr->svr_thread, ==, NULL); + + spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); + if (vdev_obsolete_counts_are_precise(vd)) { + spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx)); + } + + if (vdev_obsolete_sm_object(vd) != 0) { + ASSERT(vd->vdev_obsolete_sm != NULL); + ASSERT3U(vdev_obsolete_sm_object(vd), ==, + space_map_object(vd->vdev_obsolete_sm)); + + space_map_free(vd->vdev_obsolete_sm, tx); + VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); + space_map_close(vd->vdev_obsolete_sm); + vd->vdev_obsolete_sm = NULL; + spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + } + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT(list_is_empty(&svr->svr_new_segments[i])); + ASSERT3U(svr->svr_max_offset_to_sync[i], <=, + vdev_indirect_mapping_max_offset(vim)); + } + + for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { + metaslab_t *msp = vd->vdev_ms[msi]; + + if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) + break; + + ASSERT0(range_tree_space(svr->svr_allocd_segs)); + + mutex_enter(&msp->ms_lock); + + /* + * Assert nothing in flight -- ms_*tree is empty. + */ + for (int i = 0; i < TXG_SIZE; i++) + ASSERT0(range_tree_space(msp->ms_alloctree[i])); + for (int i = 0; i < TXG_DEFER_SIZE; i++) + ASSERT0(range_tree_space(msp->ms_defertree[i])); + ASSERT0(range_tree_space(msp->ms_freedtree)); + + if (msp->ms_sm != NULL) { + /* + * Assert that the in-core spacemap has the same + * length as the on-disk one, so we can use the + * existing in-core spacemap to load it from disk. + */ + ASSERT3U(msp->ms_sm->sm_alloc, ==, + msp->ms_sm->sm_phys->smp_alloc); + ASSERT3U(msp->ms_sm->sm_length, ==, + msp->ms_sm->sm_phys->smp_objsize); + + mutex_enter(&svr->svr_lock); + VERIFY0(space_map_load(msp->ms_sm, + svr->svr_allocd_segs, SM_ALLOC)); + range_tree_walk(msp->ms_freeingtree, + range_tree_remove, svr->svr_allocd_segs); + + /* + * Clear everything past what has been synced, + * because we have not allocated mappings for it yet. + */ + uint64_t syncd = vdev_indirect_mapping_max_offset(vim); + range_tree_clear(svr->svr_allocd_segs, syncd, + msp->ms_sm->sm_start + msp->ms_sm->sm_size - syncd); + + mutex_exit(&svr->svr_lock); + } + mutex_exit(&msp->ms_lock); + + mutex_enter(&svr->svr_lock); + range_tree_vacate(svr->svr_allocd_segs, + free_mapped_segment_cb, vd); + mutex_exit(&svr->svr_lock); + } + + /* + * Note: this must happen after we invoke free_mapped_segment_cb, + * because it adds to the obsolete_segments. 
+ */ + range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); + + ASSERT3U(vic->vic_mapping_object, ==, + vdev_indirect_mapping_object(vd->vdev_indirect_mapping)); + vdev_indirect_mapping_close(vd->vdev_indirect_mapping); + vd->vdev_indirect_mapping = NULL; + vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); + vic->vic_mapping_object = 0; + + ASSERT3U(vic->vic_births_object, ==, + vdev_indirect_births_object(vd->vdev_indirect_births)); + vdev_indirect_births_close(vd->vdev_indirect_births); + vd->vdev_indirect_births = NULL; + vdev_indirect_births_free(mos, vic->vic_births_object, tx); + vic->vic_births_object = 0; + + /* + * We may have processed some frees from the removing vdev in this + * txg, thus increasing svr_bytes_done; discard that here to + * satisfy the assertions in spa_vdev_removal_destroy(). + * Note that future txg's can not have any bytes_done, because + * future TXG's are only modified from open context, and we have + * already shut down the copying thread. + */ + svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0; + spa_finish_removal(spa, DSS_CANCELED, tx); + + vd->vdev_removing = B_FALSE; + vdev_config_dirty(vd); + + zfs_dbgmsg("canceled device removal for vdev %llu in %llu", + vd->vdev_id, dmu_tx_get_txg(tx)); + spa_history_log_internal(spa, "vdev remove canceled", tx, + "%s vdev %llu %s", spa_name(spa), + vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); +} + +int +spa_vdev_remove_cancel(spa_t *spa) +{ + spa_vdev_remove_suspend(spa); + + if (spa->spa_vdev_removal == NULL) + return (ESRCH); + + uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id; + + int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, + spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE); + + if (error == 0) { + spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); + vdev_t *vd = vdev_lookup_top(spa, vdid); + metaslab_group_activate(vd->vdev_mg); + spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); + } + + return (error); +} + +/* + * Called every sync pass of every txg if there's a svr. + */ +void +svr_sync(spa_t *spa, dmu_tx_t *tx) +{ + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + /* + * This check is necessary so that we do not dirty the + * DIRECTORY_OBJECT via spa_sync_removing_state() when there + * is nothing to do. Dirtying it every time would prevent us + * from syncing-to-convergence. + */ + if (svr->svr_bytes_done[txgoff] == 0) + return; + + /* + * Update progress accounting. + */ + spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff]; + svr->svr_bytes_done[txgoff] = 0; + + spa_sync_removing_state(spa, tx); +} + +static void +vdev_remove_make_hole_and_free(vdev_t *vd) +{ + uint64_t id = vd->vdev_id; + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; + boolean_t last_vdev = (id == (rvd->vdev_children - 1)); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + vdev_free(vd); + + if (last_vdev) { + vdev_compact_children(rvd); + } else { + vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); + vdev_add_child(rvd, vd); + } + vdev_config_dirty(rvd); + + /* + * Reassess the health of our root vdev. + */ + vdev_reopen(rvd); +} + +/* + * Remove a log device. The config lock is held for the specified TXG. 
+ */ +static int +spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) +{ + metaslab_group_t *mg = vd->vdev_mg; + spa_t *spa = vd->vdev_spa; + int error = 0; + + ASSERT(vd->vdev_islog); + ASSERT(vd == vd->vdev_top); + + /* + * Stop allocating from this vdev. + */ + metaslab_group_passivate(mg); + + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + + /* + * Evacuate the device. We don't hold the config lock as writer + * since we need to do I/O but we do keep the + * spa_namespace_lock held. Once this completes the device + * should no longer have any blocks allocated on it. + */ + if (vd->vdev_islog) { + if (vd->vdev_stat.vs_alloc != 0) + error = spa_reset_logs(spa); + } + + *txg = spa_vdev_config_enter(spa); + + if (error != 0) { + metaslab_group_activate(mg); + return (error); + } + ASSERT0(vd->vdev_stat.vs_alloc); + + /* + * The evacuation succeeded. Remove any remaining MOS metadata + * associated with this vdev, and wait for these changes to sync. + */ + vd->vdev_removing = B_TRUE; + + vdev_dirty_leaves(vd, VDD_DTL, *txg); + vdev_config_dirty(vd); + + spa_history_log_internal(spa, "vdev remove", NULL, + "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id, + (vd->vdev_path != NULL) ? vd->vdev_path : "-"); + + /* Make sure these changes are sync'ed */ + spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); + + *txg = spa_vdev_config_enter(spa); + + sysevent_t *ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_DEV); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + /* The top ZAP should have been destroyed by vdev_remove_empty. */ + ASSERT0(vd->vdev_top_zap); + /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */ + ASSERT0(vd->vdev_leaf_zap); + + (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); + + if (list_link_active(&vd->vdev_state_dirty_node)) + vdev_state_clean(vd); + if (list_link_active(&vd->vdev_config_dirty_node)) + vdev_config_clean(vd); + + /* + * Clean up the vdev namespace. + */ + vdev_remove_make_hole_and_free(vd); + + if (ev != NULL) + spa_event_post(ev); + + return (0); +} + +static int +spa_vdev_remove_top_check(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + if (vd != vd->vdev_top) + return (SET_ERROR(ENOTSUP)); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) + return (SET_ERROR(ENOTSUP)); + + /* + * There has to be enough free space to remove the + * device and leave double the "slop" space (i.e. we + * must leave at least 3% of the pool free, in addition to + * the normal slop space). + */ + if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir, + NULL, 0, B_TRUE) < + vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { + return (SET_ERROR(ENOSPC)); + } + + /* + * There can not be a removal in progress. + */ + if (spa->spa_removing_phys.sr_state == DSS_SCANNING) + return (SET_ERROR(EBUSY)); + + /* + * The device must have all its data. + */ + if (!vdev_dtl_empty(vd, DTL_MISSING) || + !vdev_dtl_empty(vd, DTL_OUTAGE)) + return (SET_ERROR(EBUSY)); + + /* + * The device must be healthy. + */ + if (!vdev_readable(vd)) + return (SET_ERROR(EIO)); + + /* + * All vdevs in normal class must have the same ashift. + */ + if (spa->spa_max_ashift != spa->spa_min_ashift) { + return (SET_ERROR(EINVAL)); + } + + /* + * All vdevs in normal class must have the same ashift + * and not be raidz. 
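+ *
+ * An aside on the free-space check earlier in this function, with
+ * hypothetical numbers: on a 1 TB pool with the default 1/32 slop
+ * (32 GB), removing a vdev holding 200 GB needs at least 232 GB free:
+ *
+ *	static boolean_t
+ *	ex_can_remove(uint64_t avail, uint64_t dspace, uint64_t slop)
+ *	{
+ *		// avail 250 GB, dspace 200 GB, slop 32 GB -> B_TRUE
+ *		return (avail >= dspace + slop);
+ *	}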
+ */ + vdev_t *rvd = spa->spa_root_vdev; + int num_indirect = 0; + for (uint64_t id = 0; id < rvd->vdev_children; id++) { + vdev_t *cvd = rvd->vdev_child[id]; + if (cvd->vdev_ashift != 0 && !cvd->vdev_islog) + ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift); + if (cvd->vdev_ops == &vdev_indirect_ops) + num_indirect++; + if (!vdev_is_concrete(cvd)) + continue; + if (cvd->vdev_ops == &vdev_raidz_ops) + return (SET_ERROR(EINVAL)); + /* + * Need the mirror to be mirror of leaf vdevs only + */ + if (cvd->vdev_ops == &vdev_mirror_ops) { + for (uint64_t cid = 0; + cid < cvd->vdev_children; cid++) { + vdev_t *tmp = cvd->vdev_child[cid]; + if (!tmp->vdev_ops->vdev_op_leaf) + return (SET_ERROR(EINVAL)); + } + } + } + + return (0); +} + +/* + * Initiate removal of a top-level vdev, reducing the total space in the pool. + * The config lock is held for the specified TXG. Once initiated, + * evacuation of all allocated space (copying it to other vdevs) happens + * in the background (see spa_vdev_remove_thread()), and can be canceled + * (see spa_vdev_remove_cancel()). If successful, the vdev will + * be transformed to an indirect vdev (see spa_vdev_remove_complete()). + */ +static int +spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) +{ + spa_t *spa = vd->vdev_spa; + int error; + + /* + * Check for errors up-front, so that we don't waste time + * passivating the metaslab group and clearing the ZIL if there + * are errors. + */ + error = spa_vdev_remove_top_check(vd); + if (error != 0) + return (error); + + /* + * Stop allocating from this vdev. Note that we must check + * that this is not the only device in the pool before + * passivating, otherwise we will not be able to make + * progress because we can't allocate from any vdevs. + * The above check for sufficient free space serves this + * purpose. + */ + metaslab_group_t *mg = vd->vdev_mg; + metaslab_group_passivate(mg); + + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + + /* + * We must ensure that no "stubby" log blocks are allocated + * on the device to be removed. These blocks could be + * written at any time, including while we are in the middle + * of copying them. + */ + error = spa_reset_logs(spa); + + *txg = spa_vdev_config_enter(spa); + + /* + * Things might have changed while the config lock was dropped + * (e.g. space usage). Check for errors again. + */ + if (error == 0) + error = spa_vdev_remove_top_check(vd); + + if (error != 0) { + metaslab_group_activate(mg); + return (error); + } + + vd->vdev_removing = B_TRUE; + + vdev_dirty_leaves(vd, VDD_DTL, *txg); + vdev_config_dirty(vd); + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, + vdev_remove_initiate_sync, + vd, 0, ZFS_SPACE_CHECK_NONE, tx); + dmu_tx_commit(tx); + + return (0); +} + +/* + * Remove a device from the pool. + * + * Removing a device from the vdev namespace requires several steps + * and can take a significant amount of time. As a result we use + * the spa_vdev_config_[enter/exit] functions which allow us to + * grab and release the spa_config_lock while still holding the namespace + * lock. During each step the configuration is synced out. 
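+ *
+ * The check / drop locks / slow work / retake locks / re-check pattern
+ * used by spa_vdev_remove_log() and spa_vdev_remove_top() above, in
+ * outline (ex_check() and ex_slow_work() are hypothetical):
+ *
+ *	static int ex_check(vdev_t *);		// cheap, under the lock
+ *	static int ex_slow_work(spa_t *);	// e.g. spa_reset_logs()
+ *
+ *	static int
+ *	ex_remove_step(spa_t *spa, vdev_t *vd, uint64_t *txg)
+ *	{
+ *		int error = ex_check(vd);
+ *		if (error != 0)
+ *			return (error);
+ *		spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
+ *		error = ex_slow_work(spa);
+ *		*txg = spa_vdev_config_enter(spa);
+ *		if (error == 0)
+ *			error = ex_check(vd);	// state may have changed
+ *		return (error);
+ *	}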
+ */ +int +spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) +{ + vdev_t *vd; + nvlist_t **spares, **l2cache, *nv; + uint64_t txg = 0; + uint_t nspares, nl2cache; + int error = 0; + boolean_t locked = MUTEX_HELD(&spa_namespace_lock); + sysevent_t *ev = NULL; + + ASSERT(spa_writeable(spa)); + + if (!locked) + txg = spa_vdev_enter(spa); + + vd = spa_lookup_by_guid(spa, guid, B_FALSE); + + if (spa->spa_spares.sav_vdevs != NULL && + nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && + (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { + /* + * Only remove the hot spare if it's not currently in use + * in this pool. + */ + if (vd == NULL || unspare) { + char *nvstr = fnvlist_lookup_string(nv, + ZPOOL_CONFIG_PATH); + spa_history_log_internal(spa, "vdev remove", NULL, + "%s vdev (%s) %s", spa_name(spa), + VDEV_TYPE_SPARE, nvstr); + if (vd == NULL) + vd = spa_lookup_by_guid(spa, guid, B_TRUE); + ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_AUX); + spa_vdev_remove_aux(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares, nv); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + } else { + error = SET_ERROR(EBUSY); + } + } else if (spa->spa_l2cache.sav_vdevs != NULL && + nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && + (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { + char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); + spa_history_log_internal(spa, "vdev remove", NULL, + "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_L2CACHE, nvstr); + /* + * Cache devices can always be removed. + */ + vd = spa_lookup_by_guid(spa, guid, B_TRUE); + ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); + spa_vdev_remove_aux(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); + spa_load_l2cache(spa); + spa->spa_l2cache.sav_sync = B_TRUE; + } else if (vd != NULL && vd->vdev_islog) { + ASSERT(!locked); + error = spa_vdev_remove_log(vd, &txg); + } else if (vd != NULL) { + ASSERT(!locked); + error = spa_vdev_remove_top(vd, &txg); + } else { + /* + * There is no vdev of any kind with the specified guid. 
+ */ + error = SET_ERROR(ENOENT); + } + + if (!locked) + error = spa_vdev_exit(spa, NULL, txg, error); + + if (ev != NULL) { + if (error != 0) { + spa_event_discard(ev); + } else { + spa_event_post(ev); + } + } + + return (error); +} + +int +spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs) +{ + prs->prs_state = spa->spa_removing_phys.sr_state; + + if (prs->prs_state == DSS_NONE) + return (SET_ERROR(ENOENT)); + + prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev; + prs->prs_start_time = spa->spa_removing_phys.sr_start_time; + prs->prs_end_time = spa->spa_removing_phys.sr_end_time; + prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy; + prs->prs_copied = spa->spa_removing_phys.sr_copied; + + if (spa->spa_vdev_removal != NULL) { + for (int i = 0; i < TXG_SIZE; i++) { + prs->prs_copied += + spa->spa_vdev_removal->svr_bytes_done[i]; + } + } + + prs->prs_mapping_memory = 0; + uint64_t indirect_vdev_id = + spa->spa_removing_phys.sr_prev_indirect_vdev; + while (indirect_vdev_id != -1) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id]; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + prs->prs_mapping_memory += vdev_indirect_mapping_size(vim); + indirect_vdev_id = vic->vic_prev_indirect_vdev; + } + + return (0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c index 03068880f76c..a540e29a1a36 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 
*/ #include @@ -118,6 +118,7 @@ vdev_ops_t vdev_root_ops = { vdev_root_state_change, NULL, NULL, + NULL, VDEV_TYPE_ROOT, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c index 6afdaaf1c492..fd85e27a63f5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c @@ -420,6 +420,9 @@ get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, error = dsl_dir_get_snapshot_count(ds->ds_dir, &numval); (void) strcpy(setpoint, ""); break; + case ZFS_PROP_REMAPTXG: + error = dsl_dir_get_remaptxg(ds->ds_dir, &numval); + break; case ZFS_PROP_NUMCLONES: numval = dsl_get_numclones(ds); break; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index 8bcfc335b5fe..1a3ab50386b7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -194,6 +194,7 @@ #include #include #include +#include #include "zfs_namecheck.h" #include "zfs_prop.h" @@ -1072,6 +1073,14 @@ zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) return (error); } +/* ARGSUSED */ +static int +zfs_secpolicy_remap(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_REMAP, cr)); +} + /* ARGSUSED */ static int zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) @@ -1994,8 +2003,8 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) /* * inputs: * zc_name name of the pool - * zc_nvlist_conf nvlist of devices to remove - * zc_cookie to stop the remove? 
+ * zc_guid		guid of vdev to remove
+ * zc_cookie		cancel removal
 */
static int
zfs_ioc_vdev_remove(zfs_cmd_t *zc)
@@ -2006,7 +2015,11 @@ zfs_ioc_vdev_remove(zfs_cmd_t *zc)
 	error = spa_open(zc->zc_name, &spa, FTAG);
 	if (error != 0)
 		return (error);
-	error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
+	if (zc->zc_cookie != 0) {
+		error = spa_vdev_remove_cancel(spa);
+	} else {
+		error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
+	}
 	spa_close(spa, FTAG);
 	return (error);
 }
@@ -2959,7 +2972,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc)
 	mutex_enter(&spa_namespace_lock);
 	if ((spa = spa_lookup(zc->zc_name)) != NULL) {
 		spa_configfile_set(spa, props, B_FALSE);
-		spa_config_sync(spa, B_FALSE, B_TRUE);
+		spa_write_cachefile(spa, B_FALSE, B_TRUE);
 	}
 	mutex_exit(&spa_namespace_lock);
 	if (spa != NULL) {
@@ -3405,6 +3418,17 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
 	return (error);
 }

+/* ARGSUSED */
+static int
+zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	if (strchr(fsname, '@') ||
+	    strchr(fsname, '%'))
+		return (SET_ERROR(EINVAL));
+
+	return (dmu_objset_remap_indirects(fsname));
+}
+
 /*
  * innvl: {
  *     "snaps" -> { snapshot1, snapshot2 }
@@ -6036,6 +6060,10 @@ zfs_ioctl_init(void)
 	    zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);

+	zfs_ioctl_register("remap", ZFS_IOC_REMAP,
+	    zfs_ioc_remap, zfs_secpolicy_remap, DATASET_NAME,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE);
+
 	zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS,
 	    zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
index 532c149771d7..9cd31ee8b1d6 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
@@ -478,7 +478,7 @@ zfs_register_callbacks(vfs_t *vfsp)
 	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
 	 * dsl_prop_get_integer() can not be used, because it has to acquire
 	 * spa_namespace_lock and we can not do that because we already hold
-	 * z_teardown_lock. The problem is that spa_config_sync() is called
+	 * z_teardown_lock. The problem is that spa_write_cachefile() is called
 	 * with spa_namespace_lock held and the function calls ZFS vnode
 	 * operations to write the cache file and thus z_teardown_lock is
 	 * acquired after spa_namespace_lock.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
index 502868a66fc8..b964a2ed5922 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -1383,6 +1383,18 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 			if (error == EALREADY) {
 				lr->lr_common.lrc_txtype = TX_WRITE2;
+				/*
+				 * TX_WRITE2 relies on the data previously
+				 * written by the TX_WRITE that caused
+				 * EALREADY. We zero out the BP because
+				 * it is the old, currently-on-disk BP,
+				 * so there's no need to zio_flush() its
+				 * vdevs (flushing would needlessly hurt
+				 * performance, and doesn't work on
+				 * indirect vdevs).
+ */ + zgd->zgd_bp = NULL; + BP_ZERO(bp); error = 0; } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c index 707f4e04bb8f..a74f6948070e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c @@ -3238,7 +3238,7 @@ zil_replaying(zilog_t *zilog, dmu_tx_t *tx) /* ARGSUSED */ int -zil_vdev_offline(const char *osname, void *arg) +zil_reset(const char *osname, void *arg) { int error; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index ecdb7171ad9f..19f7915dd15a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -908,6 +908,8 @@ void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { + zfs_blkptr_verify(spa, bp); + /* * The check for EMBEDDED is a performance optimization. We * process the free here (by ignoring it) rather than @@ -976,7 +978,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, { zio_t *zio; - dprintf_bp(bp, "claiming in txg %llu", txg); + zfs_blkptr_verify(spa, bp); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); @@ -1096,8 +1098,26 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; - ASSERT(vd->vdev_parent == - (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); + /* + * vdev child I/Os do not propagate their error to the parent. + * Therefore, for correct operation the caller *must* check for + * and handle the error in the child i/o's done callback. + * The only exceptions are i/os that we don't care about + * (OPTIONAL or REPAIR). + */ + ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || + done != NULL); + + /* + * In the common case, where the parent zio was to a normal vdev, + * the child zio must be to a child vdev of that vdev. Otherwise, + * the child zio must be to a top-level vdev. 
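+ *
+ * Because the error is not propagated, a minimal done callback that a
+ * caller might supply looks like this (hypothetical ex_* type):
+ *
+ *	typedef struct ex_copy_arg {
+ *		uint64_t eca_errors;
+ *	} ex_copy_arg_t;
+ *
+ *	static void
+ *	ex_child_done(zio_t *zio)
+ *	{
+ *		ex_copy_arg_t *eca = zio->io_private;
+ *
+ *		if (zio->io_error != 0)	// would otherwise go unnoticed
+ *			atomic_inc_64(&eca->eca_errors);
+ *	}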
+ */
+	if (pio->io_vd != NULL && pio->io_vd->vdev_ops != &vdev_indirect_ops) {
+		ASSERT3P(vd->vdev_parent, ==, pio->io_vd);
+	} else {
+		ASSERT3P(vd, ==, vd->vdev_top);
+	}

 	if (type == ZIO_TYPE_READ && bp != NULL) {
 		/*
@@ -1114,10 +1134,12 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 	if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
 		pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;

-	if (vd->vdev_children == 0)
+	if (vd->vdev_ops->vdev_op_leaf) {
+		ASSERT0(vd->vdev_children);
 		offset += VDEV_LABEL_START_SIZE;
+	}

-	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
+	flags |= ZIO_VDEV_CHILD_FLAGS(pio);

 	/*
 	 * If we've decided to do a repair, the write is not speculative --
@@ -1227,6 +1249,8 @@ zio_read_bp_init(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;

+	ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
+
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
 	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    !(zio->io_flags & ZIO_FLAG_RAW)) {
@@ -1245,6 +1269,7 @@ zio_read_bp_init(zio_t *zio)
 		abd_return_buf_copy(zio->io_abd, data, psize);
 	} else {
 		ASSERT(!BP_IS_EMBEDDED(bp));
+		ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
 	}

 	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
@@ -1501,6 +1526,8 @@ zio_free_bp_init(zio_t *zio)
 		zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
 	}

+	ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
+
 	return (ZIO_PIPELINE_CONTINUE);
 }

@@ -3121,6 +3148,11 @@ zio_vdev_io_start(zio_t *zio)
 	}

 	ASSERT3P(zio->io_logical, !=, zio);
+	if (zio->io_type == ZIO_TYPE_WRITE && zio->io_vd->vdev_removing) {
+		ASSERT(zio->io_flags &
+		    (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
+		    ZIO_FLAG_INDUCE_DAMAGE));
+	}

 	/*
 	 * We keep track of time-sensitive I/Os so that the scan thread
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
index 1f65f398be2e..bc184fe54cb4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
@@ -165,6 +165,7 @@ typedef enum {
 	ZFS_PROP_REDUNDANT_METADATA,
 	ZFS_PROP_PREV_SNAP,
 	ZFS_PROP_RECEIVE_RESUME_TOKEN,
+	ZFS_PROP_REMAPTXG,		/* not exposed to the user */
 	ZFS_NUM_PROPS
 } zfs_prop_t;

@@ -517,7 +518,9 @@ typedef struct zpool_rewind_policy {

 /*
  * The following are configuration names used in the nvlist describing a pool's
- * configuration.
+ * configuration. New on-disk names should be prefixed with "<reverse-DNS>:"
+ * (e.g. "org.open-zfs:") to avoid conflicting names being developed
+ * independently.
*/ #define ZPOOL_CONFIG_VERSION "version" #define ZPOOL_CONFIG_POOL_NAME "name" @@ -531,6 +534,9 @@ typedef struct zpool_rewind_policy { #define ZPOOL_CONFIG_CHILDREN "children" #define ZPOOL_CONFIG_ID "id" #define ZPOOL_CONFIG_GUID "guid" +#define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object" +#define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births" +#define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev" #define ZPOOL_CONFIG_PATH "path" #define ZPOOL_CONFIG_DEVID "devid" #define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" @@ -539,7 +545,9 @@ typedef struct zpool_rewind_policy { #define ZPOOL_CONFIG_ASIZE "asize" #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ +#define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ +#define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" #define ZPOOL_CONFIG_ERRCOUNT "error_count" #define ZPOOL_CONFIG_NOT_PRESENT "not_present" @@ -615,6 +623,13 @@ typedef struct zpool_rewind_policy { #define VDEV_TYPE_SPARE "spare" #define VDEV_TYPE_LOG "log" #define VDEV_TYPE_L2CACHE "l2cache" +#define VDEV_TYPE_INDIRECT "indirect" + +/* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. */ +#define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \ + "com.delphix:indirect_obsolete_sm" +#define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \ + "com.delphix:obsolete_counts_are_precise" /* * This is needed in userland to report the minimum necessary device size. @@ -751,6 +766,20 @@ typedef struct pool_scan_stat { uint64_t pss_pass_scrub_spent_paused; } pool_scan_stat_t; +typedef struct pool_removal_stat { + uint64_t prs_state; /* dsl_scan_state_t */ + uint64_t prs_removing_vdev; + uint64_t prs_start_time; + uint64_t prs_end_time; + uint64_t prs_to_copy; /* bytes that need to be copied */ + uint64_t prs_copied; /* bytes copied so far */ + /* + * bytes of memory used for indirect mappings. + * This includes all removed vdevs. 
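+ *
+ * As an illustration, userland can derive percent-complete from
+ * prs_copied and prs_to_copy (hypothetical helper):
+ *
+ *	static int
+ *	ex_percent_done(const pool_removal_stat_t *prs)
+ *	{
+ *		if (prs->prs_to_copy == 0)
+ *			return (100);
+ *		return ((int)((prs->prs_copied * 100) / prs->prs_to_copy));
+ *	}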
+ */ + uint64_t prs_mapping_memory; +} pool_removal_stat_t; + typedef enum dsl_scan_state { DSS_NONE, DSS_SCANNING, @@ -916,6 +945,7 @@ typedef enum zfs_ioc { ZFS_IOC_NEXTBOOT, #endif ZFS_IOC_CHANNEL_PROGRAM, + ZFS_IOC_REMAP, ZFS_IOC_LAST } zfs_ioc_t; diff --git a/sys/conf/files b/sys/conf/files index 4caf4bae44c9..c4c1179409dd 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -218,12 +218,16 @@ cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c optional zfs compile-with cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c optional zfs compile-with "${ZFS_C}" +cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c optional zfs compile-with "${ZFS_C}" +cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c optional zfs compile-with "${ZFS_C}" +cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c optional zfs compile-with "${ZFS_C}" +cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c optional zfs compile-with "${ZFS_C}"