From 2cbea54929c419a841c9f728ce1244102e849d6d Mon Sep 17 00:00:00 2001 From: delphij Date: Tue, 17 Jun 2014 07:44:55 +0000 Subject: [PATCH 001/380] 4390 i/o errors when deleting filesystem/zvol can lead to space map corruption Reviewed by: George Wilson Reviewed by: Christopher Siden Reviewed by: Adam Leventhal Reviewed by: Dan McDonald Reviewed by: Saso Kiselkov Approved by: Dan McDonald illumos/illumos-dist@7fd05ac4dec0c343d2f68f310d3718b715ecfbaf --- common/zfs/zpool_prop.c | 2 + uts/common/fs/zfs/bptree.c | 101 +++++++++++++++++---- uts/common/fs/zfs/dmu_traverse.c | 126 +++++++++++--------------- uts/common/fs/zfs/dsl_pool.c | 9 ++ uts/common/fs/zfs/dsl_scan.c | 144 ++++++++++++++++++++---------- uts/common/fs/zfs/spa.c | 14 ++- uts/common/fs/zfs/spa_misc.c | 33 ++++++- uts/common/fs/zfs/sys/bptree.h | 3 +- uts/common/fs/zfs/sys/dmu.h | 1 - uts/common/fs/zfs/sys/dsl_dir.h | 1 + uts/common/fs/zfs/sys/dsl_pool.h | 1 + uts/common/fs/zfs/sys/dsl_scan.h | 1 + uts/common/fs/zfs/sys/zfs_debug.h | 3 +- uts/common/fs/zfs/zfs_debug.c | 2 +- uts/common/fs/zfs/zio.c | 7 -- uts/common/sys/fs/zfs.h | 1 + 16 files changed, 293 insertions(+), 156 deletions(-) diff --git a/common/zfs/zpool_prop.c b/common/zfs/zpool_prop.c index 72db87937110..51041a8766cc 100644 --- a/common/zfs/zpool_prop.c +++ b/common/zfs/zpool_prop.c @@ -81,6 +81,8 @@ zpool_prop_init(void) ZFS_TYPE_POOL, "", "FREE"); zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "FREEING"); + zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY, + ZFS_TYPE_POOL, "", "LEAKED"); zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "ALLOC"); zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0, diff --git a/uts/common/fs/zfs/bptree.c b/uts/common/fs/zfs/bptree.c index 83f365864dec..77067d24e98a 100644 --- a/uts/common/fs/zfs/bptree.c +++ b/uts/common/fs/zfs/bptree.c @@ -102,13 +102,27 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) return (dmu_object_free(os, obj, tx)); } +boolean_t +bptree_is_empty(objset_t *os, uint64_t obj) +{ + dmu_buf_t *db; + bptree_phys_t *bt; + boolean_t rv; + + VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + rv = (bt->bt_begin == bt->bt_end); + dmu_buf_rele(db, FTAG); + return (rv); +} + void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) { dmu_buf_t *db; bptree_phys_t *bt; - bptree_entry_phys_t bte; + bptree_entry_phys_t bte = { 0 }; /* * bptree objects are in the pool mos, therefore they can only be @@ -122,7 +136,6 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, bte.be_birth_txg = birth_txg; bte.be_bp = *bp; - bzero(&bte.be_zb, sizeof (bte.be_zb)); dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx); dmu_buf_will_dirty(db, tx); @@ -153,10 +166,27 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, return (err); } +/* + * If "free" is set: + * - It is assumed that "func" will be freeing the block pointers. + * - If "func" returns nonzero, the bookmark will be remembered and + * iteration will be restarted from this point on next invocation. + * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM), + * bptree_iterate will remember the bookmark, continue traversing + * any additional entries, and return 0. + * + * If "free" is not set, traversal will stop and return an error if + * an i/o error is encountered. + * + * In either case, if zfs_free_leak_on_eio is set, i/o errors will be + * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to + * traverse_dataset_destroyed()). + */ int bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, void *arg, dmu_tx_t *tx) { + boolean_t ioerr = B_FALSE; int err; uint64_t i; dmu_buf_t *db; @@ -182,49 +212,82 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, bptree_entry_phys_t bte; int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST; - ASSERT(!free || i == ba.ba_phys->bt_begin); - err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), &bte, DMU_READ_NO_PREFETCH); if (err != 0) break; - if (zfs_recover) + if (zfs_free_leak_on_eio) flags |= TRAVERSE_HARD; + zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld " + "bookmark %lld/%lld/%lld/%lld", + i, (longlong_t)bte.be_birth_txg, + (longlong_t)bte.be_zb.zb_objset, + (longlong_t)bte.be_zb.zb_object, + (longlong_t)bte.be_zb.zb_level, + (longlong_t)bte.be_zb.zb_blkid); err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, bte.be_birth_txg, &bte.be_zb, flags, bptree_visit_cb, &ba); if (free) { - if (err == ERESTART) { + /* + * The callback has freed the visited block pointers. + * Record our traversal progress on disk, either by + * updating this record's bookmark, or by logically + * removing this record by advancing bt_begin. + */ + if (err != 0) { /* save bookmark for future resume */ ASSERT3U(bte.be_zb.zb_objset, ==, ZB_DESTROYED_OBJSET); ASSERT0(bte.be_zb.zb_level); dmu_write(os, obj, i * sizeof (bte), sizeof (bte), &bte, tx); - break; - } - if (err != 0) { + if (err == EIO || err == ECKSUM || + err == ENXIO) { + /* + * Skip the rest of this tree and + * continue on to the next entry. + */ + err = 0; + ioerr = B_TRUE; + } else { + break; + } + } else if (ioerr) { /* - * We can not properly handle an i/o - * error, because the traversal code - * does not know how to resume from an - * arbitrary bookmark. + * This entry is finished, but there were + * i/o errors on previous entries, so we + * can't adjust bt_begin. Set this entry's + * be_birth_txg such that it will be + * treated as a no-op in future traversals. */ - zfs_panic_recover("error %u from " - "traverse_dataset_destroyed()", err); + bte.be_birth_txg = UINT64_MAX; + dmu_write(os, obj, i * sizeof (bte), + sizeof (bte), &bte, tx); } - ba.ba_phys->bt_begin++; - (void) dmu_free_range(os, obj, - i * sizeof (bte), sizeof (bte), tx); + if (!ioerr) { + ba.ba_phys->bt_begin++; + (void) dmu_free_range(os, obj, + i * sizeof (bte), sizeof (bte), tx); + } + } else if (err != 0) { + break; } } - ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end); + ASSERT(!free || err != 0 || ioerr || + ba.ba_phys->bt_begin == ba.ba_phys->bt_end); /* if all blocks are free there should be no used space */ if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { + if (zfs_free_leak_on_eio) { + ba.ba_phys->bt_bytes = 0; + ba.ba_phys->bt_comp = 0; + ba.ba_phys->bt_uncomp = 0; + } + ASSERT0(ba.ba_phys->bt_bytes); ASSERT0(ba.ba_phys->bt_comp); ASSERT0(ba.ba_phys->bt_uncomp); diff --git a/uts/common/fs/zfs/dmu_traverse.c b/uts/common/fs/zfs/dmu_traverse.c index db0869ca6f18..1c698b5ee03c 100644 --- a/uts/common/fs/zfs/dmu_traverse.c +++ b/uts/common/fs/zfs/dmu_traverse.c @@ -58,6 +58,7 @@ typedef struct traverse_data { zbookmark_t *td_resume; int td_flags; prefetch_data_t *td_pfd; + boolean_t td_paused; blkptr_cb_t *td_func; void *td_arg; } traverse_data_t; @@ -163,7 +164,6 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, * If we found the block we're trying to resume from, zero * the bookmark out to indicate that we have resumed. */ - ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object); if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { bzero(td->td_resume, sizeof (*zb)); if (td->td_flags & TRAVERSE_POST) @@ -173,14 +173,6 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, return (RESUME_SKIP_NONE); } -static void -traverse_pause(traverse_data_t *td, const zbookmark_t *zb) -{ - ASSERT(td->td_resume != NULL); - ASSERT0(zb->zb_level); - bcopy(zb, td->td_resume, sizeof (*td->td_resume)); -} - static void traverse_prefetch_metadata(traverse_data_t *td, const blkptr_t *bp, const zbookmark_t *zb) @@ -210,11 +202,10 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb) { zbookmark_t czb; - int err = 0, lasterr = 0; + int err = 0; arc_buf_t *buf = NULL; prefetch_data_t *pd = td->td_pfd; boolean_t hard = td->td_flags & TRAVERSE_HARD; - boolean_t pause = B_FALSE; switch (resume_skip_check(td, dnp, zb)) { case RESUME_SKIP_ALL: @@ -253,7 +244,9 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, if (BP_IS_HOLE(bp)) { err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); - return (err); + if (err != 0) + goto post; + return (0); } if (pd && !pd->pd_exited && @@ -273,8 +266,6 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, td->td_arg); if (err == TRAVERSE_VISIT_NO_CHILDREN) return (0); - if (err == ERESTART) - pause = B_TRUE; /* handle pausing at a common point */ if (err != 0) goto post; } @@ -288,7 +279,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) - return (err); + goto post; cbp = buf->b_data; for (i = 0; i < epb; i++) { @@ -304,11 +295,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, zb->zb_level - 1, zb->zb_blkid * epb + i); err = traverse_visitbp(td, dnp, &cbp[i], &czb); - if (err != 0) { - if (!hard) - break; - lasterr = err; - } + if (err != 0) + break; } } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { uint32_t flags = ARC_WAIT; @@ -318,7 +306,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) - return (err); + goto post; dnp = buf->b_data; for (i = 0; i < epb; i++) { @@ -330,11 +318,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, for (i = 0; i < epb; i++) { err = traverse_dnode(td, &dnp[i], zb->zb_objset, zb->zb_blkid * epb + i); - if (err != 0) { - if (!hard) - break; - lasterr = err; - } + if (err != 0) + break; } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { uint32_t flags = ARC_WAIT; @@ -344,7 +329,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) - return (err); + goto post; osp = buf->b_data; dnp = &osp->os_meta_dnode; @@ -359,19 +344,11 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, err = traverse_dnode(td, dnp, zb->zb_objset, DMU_META_DNODE_OBJECT); - if (err && hard) { - lasterr = err; - err = 0; - } if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { dnp = &osp->os_groupused_dnode; err = traverse_dnode(td, dnp, zb->zb_objset, DMU_GROUPUSED_OBJECT); } - if (err && hard) { - lasterr = err; - err = 0; - } if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { dnp = &osp->os_userused_dnode; err = traverse_dnode(td, dnp, zb->zb_objset, @@ -383,19 +360,37 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, (void) arc_buf_remove_ref(buf, &buf); post: - if (err == 0 && (td->td_flags & TRAVERSE_POST)) { + if (err == 0 && (td->td_flags & TRAVERSE_POST)) err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); - if (err == ERESTART) - pause = B_TRUE; + + if (hard && (err == EIO || err == ECKSUM)) { + /* + * Ignore this disk error as requested by the HARD flag, + * and continue traversal. + */ + err = 0; } - if (pause && td->td_resume != NULL) { - ASSERT3U(err, ==, ERESTART); - ASSERT(!hard); - traverse_pause(td, zb); + /* + * If we are stopping here, set td_resume. + */ + if (td->td_resume != NULL && err != 0 && !td->td_paused) { + td->td_resume->zb_objset = zb->zb_objset; + td->td_resume->zb_object = zb->zb_object; + td->td_resume->zb_level = 0; + /* + * If we have stopped on an indirect block (e.g. due to + * i/o error), we have not visited anything below it. + * Set the bookmark to the first level-0 block that we need + * to visit. This way, the resuming code does not need to + * deal with resuming from indirect blocks. + */ + td->td_resume->zb_blkid = zb->zb_blkid << + (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); + td->td_paused = B_TRUE; } - return (err != 0 ? err : lasterr); + return (err); } static void @@ -420,30 +415,21 @@ static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, uint64_t objset, uint64_t object) { - int j, err = 0, lasterr = 0; + int j, err = 0; zbookmark_t czb; - boolean_t hard = (td->td_flags & TRAVERSE_HARD); for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); - if (err != 0) { - if (!hard) - break; - lasterr = err; - } + if (err != 0) + break; } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); - if (err != 0) { - if (!hard) - return (err); - lasterr = err; - } } - return (err != 0 ? err : lasterr); + return (err); } /* ARGSUSED */ @@ -530,6 +516,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, td.td_arg = arg; td.td_pfd = &pd; td.td_flags = flags; + td.td_paused = B_FALSE; pd.pd_blks_max = zfs_pd_blks_max; pd.pd_flags = flags; @@ -603,7 +590,7 @@ int traverse_pool(spa_t *spa, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { - int err, lasterr = 0; + int err; uint64_t obj; dsl_pool_t *dp = spa_get_dsl(spa); objset_t *mos = dp->dp_meta_objset; @@ -616,16 +603,15 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, return (err); /* visit each dataset */ - for (obj = 1; err == 0 || (err != ESRCH && hard); + for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, txg_start)) { dmu_object_info_t doi; err = dmu_object_info(mos, obj, &doi); if (err != 0) { - if (!hard) - return (err); - lasterr = err; - continue; + if (hard) + continue; + break; } if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) { @@ -636,23 +622,19 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); dsl_pool_config_exit(dp, FTAG); if (err != 0) { - if (!hard) - return (err); - lasterr = err; - continue; + if (hard) + continue; + break; } if (ds->ds_phys->ds_prev_snap_txg > txg) txg = ds->ds_phys->ds_prev_snap_txg; err = traverse_dataset(ds, txg, flags, func, arg); dsl_dataset_rele(ds, FTAG); - if (err != 0) { - if (!hard) - return (err); - lasterr = err; - } + if (err != 0) + break; } } if (err == ESRCH) err = 0; - return (err != 0 ? err : lasterr); + return (err); } diff --git a/uts/common/fs/zfs/dsl_pool.c b/uts/common/fs/zfs/dsl_pool.c index b6bdc78f09c2..76ca96b1431a 100644 --- a/uts/common/fs/zfs/dsl_pool.c +++ b/uts/common/fs/zfs/dsl_pool.c @@ -250,6 +250,13 @@ dsl_pool_open(dsl_pool_t *dp) dp->dp_meta_objset, obj)); } + /* + * Note: errors ignored, because the leak dir will not exist if we + * have not encountered a leak yet. + */ + (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, + &dp->dp_leak_dir); + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, @@ -297,6 +304,8 @@ dsl_pool_close(dsl_pool_t *dp) dsl_dir_rele(dp->dp_mos_dir, dp); if (dp->dp_free_dir) dsl_dir_rele(dp->dp_free_dir, dp); + if (dp->dp_leak_dir) + dsl_dir_rele(dp->dp_leak_dir, dp); if (dp->dp_root_dir) dsl_dir_rele(dp->dp_root_dir, dp); diff --git a/uts/common/fs/zfs/dsl_scan.c b/uts/common/fs/zfs/dsl_scan.c index 484420ea3f17..860f9dfb4e86 100644 --- a/uts/common/fs/zfs/dsl_scan.c +++ b/uts/common/fs/zfs/dsl_scan.c @@ -52,9 +52,7 @@ typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); -static scan_cb_t dsl_scan_defrag_cb; static scan_cb_t dsl_scan_scrub_cb; -static scan_cb_t dsl_scan_remove_cb; static void dsl_scan_cancel_sync(void *, dmu_tx_t *); static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx); @@ -67,7 +65,7 @@ int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ -boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */ +boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */ @@ -1358,7 +1356,7 @@ dsl_scan_active(dsl_scan_t *scn) if (spa_shutting_down(spa)) return (B_FALSE); if (scn->scn_phys.scn_state == DSS_SCANNING || - scn->scn_async_destroying) + (scn->scn_async_destroying && !scn->scn_async_stalled)) return (B_TRUE); if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { @@ -1373,7 +1371,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; - int err; + int err = 0; /* * Check for scn_restart_txg before checking spa_load_state, so @@ -1391,7 +1389,10 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) dsl_scan_setup_sync(&func, tx); } - if (!dsl_scan_active(scn) || + /* + * If the scan is inactive due to a stalled async destroy, try again. + */ + if ((!scn->scn_async_stalled && !dsl_scan_active(scn)) || spa_sync_pass(dp->dp_spa) > 1) return; @@ -1401,10 +1402,11 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) spa->spa_scrub_active = B_TRUE; /* - * First process the free list. If we pause the free, don't do - * any scanning. This ensures that there is no free list when - * we are scanning, so the scan code doesn't have to worry about - * traversing it. + * First process the async destroys. If we pause, don't do + * any scrubbing or resilvering. This ensures that there are no + * async destroys while we are scanning, so the scan code doesn't + * have to worry about traversing it. It is also faster to free the + * blocks than to scrub them. */ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { scn->scn_is_bptree = B_FALSE; @@ -1414,48 +1416,92 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) dsl_scan_free_block_cb, scn, tx); VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); - if (err == 0 && spa_feature_is_active(spa, - SPA_FEATURE_ASYNC_DESTROY)) { - ASSERT(scn->scn_async_destroying); - scn->scn_is_bptree = B_TRUE; - scn->scn_zio_root = zio_root(dp->dp_spa, NULL, - NULL, ZIO_FLAG_MUSTSUCCEED); - err = bptree_iterate(dp->dp_meta_objset, - dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, - scn, tx); - VERIFY0(zio_wait(scn->scn_zio_root)); + if (err != 0 && err != ERESTART) + zfs_panic_recover("error %u from bpobj_iterate()", err); + } - if (err == 0) { - /* finished; deactivate async destroy feature */ - spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, - tx); - ASSERT(!spa_feature_is_active(spa, - SPA_FEATURE_ASYNC_DESTROY)); - VERIFY0(zap_remove(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_BPTREE_OBJ, tx)); - VERIFY0(bptree_free(dp->dp_meta_objset, - dp->dp_bptree_obj, tx)); - dp->dp_bptree_obj = 0; - scn->scn_async_destroying = B_FALSE; - } + if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { + ASSERT(scn->scn_async_destroying); + scn->scn_is_bptree = B_TRUE; + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_MUSTSUCCEED); + err = bptree_iterate(dp->dp_meta_objset, + dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); + VERIFY0(zio_wait(scn->scn_zio_root)); + + if (err == EIO || err == ECKSUM) { + err = 0; + } else if (err != 0 && err != ERESTART) { + zfs_panic_recover("error %u from " + "traverse_dataset_destroyed()", err); } - if (scn->scn_visited_this_txg) { - zfs_dbgmsg("freed %llu blocks in %llums from " - "free_bpobj/bptree txg %llu", - (longlong_t)scn->scn_visited_this_txg, - (longlong_t) - NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), - (longlong_t)tx->tx_txg); - scn->scn_visited_this_txg = 0; - /* - * Re-sync the ddt so that we can further modify - * it when doing bprewrite. - */ - ddt_sync(spa, tx->tx_txg); + + /* + * If we didn't make progress, mark the async destroy as + * stalled, so that we will not initiate a spa_sync() on + * its behalf. + */ + scn->scn_async_stalled = (scn->scn_visited_this_txg == 0); + + if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) { + /* finished; deactivate async destroy feature */ + spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx); + ASSERT(!spa_feature_is_active(spa, + SPA_FEATURE_ASYNC_DESTROY)); + VERIFY0(zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, tx)); + VERIFY0(bptree_free(dp->dp_meta_objset, + dp->dp_bptree_obj, tx)); + dp->dp_bptree_obj = 0; + scn->scn_async_destroying = B_FALSE; } - if (err == ERESTART) - return; + } + if (scn->scn_visited_this_txg) { + zfs_dbgmsg("freed %llu blocks in %llums from " + "free_bpobj/bptree txg %llu; err=%u", + (longlong_t)scn->scn_visited_this_txg, + (longlong_t) + NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), + (longlong_t)tx->tx_txg, err); + scn->scn_visited_this_txg = 0; + + /* + * Write out changes to the DDT that may be required as a + * result of the blocks freed. This ensures that the DDT + * is clean when a scrub/resilver runs. + */ + ddt_sync(spa, tx->tx_txg); + } + if (err != 0) + return; + if (!scn->scn_async_destroying && zfs_free_leak_on_eio && + (dp->dp_free_dir->dd_phys->dd_used_bytes != 0 || + dp->dp_free_dir->dd_phys->dd_compressed_bytes != 0 || + dp->dp_free_dir->dd_phys->dd_uncompressed_bytes != 0)) { + /* + * We have finished background destroying, but there is still + * some space left in the dp_free_dir. Transfer this leaked + * space to the dp_leak_dir. + */ + if (dp->dp_leak_dir == NULL) { + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, + LEAK_DIR_NAME, tx); + VERIFY0(dsl_pool_open_special_dir(dp, + LEAK_DIR_NAME, &dp->dp_leak_dir)); + rrw_exit(&dp->dp_config_rwlock, FTAG); + } + dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD, + dp->dp_free_dir->dd_phys->dd_used_bytes, + dp->dp_free_dir->dd_phys->dd_compressed_bytes, + dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + -dp->dp_free_dir->dd_phys->dd_used_bytes, + -dp->dp_free_dir->dd_phys->dd_compressed_bytes, + -dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx); + } + if (!scn->scn_async_destroying) { /* finished; verify that space accounting went to zero */ ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes); ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes); diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c index d4734d21d0f6..f708ce3ef22a 100644 --- a/uts/common/fs/zfs/spa.c +++ b/uts/common/fs/zfs/spa.c @@ -241,19 +241,25 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) } if (pool != NULL) { - dsl_dir_t *freedir = pool->dp_free_dir; - /* * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, * when opening pools before this version freedir will be NULL. */ - if (freedir != NULL) { + if (pool->dp_free_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, - freedir->dd_phys->dd_used_bytes, src); + pool->dp_free_dir->dd_phys->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 0, src); } + + if (pool->dp_leak_dir != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, + pool->dp_leak_dir->dd_phys->dd_used_bytes, src); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, + NULL, 0, src); + } } spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); diff --git a/uts/common/fs/zfs/spa_misc.c b/uts/common/fs/zfs/spa_misc.c index 1c5743ac4ed0..ea386591b27f 100644 --- a/uts/common/fs/zfs/spa_misc.c +++ b/uts/common/fs/zfs/spa_misc.c @@ -250,7 +250,38 @@ int zfs_flags = 0; * This should only be used as a last resort, as it typically results * in leaked space, or worse. */ -int zfs_recover = 0; +boolean_t zfs_recover = B_FALSE; + +/* + * If destroy encounters an EIO while reading metadata (e.g. indirect + * blocks), space referenced by the missing metadata can not be freed. + * Normally this causes the background destroy to become "stalled", as + * it is unable to make forward progress. While in this stalled state, + * all remaining space to free from the error-encountering filesystem is + * "temporarily leaked". Set this flag to cause it to ignore the EIO, + * permanently leak the space from indirect blocks that can not be read, + * and continue to free everything else that it can. + * + * The default, "stalling" behavior is useful if the storage partially + * fails (i.e. some but not all i/os fail), and then later recovers. In + * this case, we will be able to continue pool operations while it is + * partially failed, and when it recovers, we can continue to free the + * space, with no leaks. However, note that this case is actually + * fairly rare. + * + * Typically pools either (a) fail completely (but perhaps temporarily, + * e.g. a top-level vdev going offline), or (b) have localized, + * permanent errors (e.g. disk returns the wrong data due to bit flip or + * firmware bug). In case (a), this setting does not matter because the + * pool will be suspended and the sync thread will not be able to make + * forward progress regardless. In case (b), because the error is + * permanent, the best we can do is leak the minimum amount of space, + * which is what setting this flag will do. Therefore, it is reasonable + * for this flag to normally be set, but we chose the more conservative + * approach of not setting it, so that there is no possibility of + * leaking space in the "partial temporary" failure case. + */ +boolean_t zfs_free_leak_on_eio = B_FALSE; /* * Expiration time in milliseconds. This value has two meanings. First it is diff --git a/uts/common/fs/zfs/sys/bptree.h b/uts/common/fs/zfs/sys/bptree.h index 971507211875..a533cb949021 100644 --- a/uts/common/fs/zfs/sys/bptree.h +++ b/uts/common/fs/zfs/sys/bptree.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. */ #ifndef _SYS_BPTREE_H @@ -50,6 +50,7 @@ typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx); int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); +boolean_t bptree_is_empty(objset_t *os, uint64_t obj); void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx); diff --git a/uts/common/fs/zfs/sys/dmu.h b/uts/common/fs/zfs/sys/dmu.h index e9ab6d7534e2..9a6006ce6646 100644 --- a/uts/common/fs/zfs/sys/dmu.h +++ b/uts/common/fs/zfs/sys/dmu.h @@ -253,7 +253,6 @@ void zfs_znode_byteswap(void *buf, size_t size); #define DMU_USERUSED_OBJECT (-1ULL) #define DMU_GROUPUSED_OBJECT (-2ULL) -#define DMU_DEADLIST_OBJECT (-3ULL) /* * artificial blkids for bonus buffer and spill blocks diff --git a/uts/common/fs/zfs/sys/dsl_dir.h b/uts/common/fs/zfs/sys/dsl_dir.h index 752ccadb729d..a9c4f67515a9 100644 --- a/uts/common/fs/zfs/sys/dsl_dir.h +++ b/uts/common/fs/zfs/sys/dsl_dir.h @@ -160,6 +160,7 @@ boolean_t dsl_dir_is_zapified(dsl_dir_t *dd); #define ORIGIN_DIR_NAME "$ORIGIN" #define XLATION_DIR_NAME "$XLATION" #define FREE_DIR_NAME "$FREE" +#define LEAK_DIR_NAME "$LEAK" #ifdef ZFS_DEBUG #define dprintf_dd(dd, fmt, ...) do { \ diff --git a/uts/common/fs/zfs/sys/dsl_pool.h b/uts/common/fs/zfs/sys/dsl_pool.h index ea180c9a5b8d..a6fb201047bb 100644 --- a/uts/common/fs/zfs/sys/dsl_pool.h +++ b/uts/common/fs/zfs/sys/dsl_pool.h @@ -84,6 +84,7 @@ typedef struct dsl_pool { struct dsl_dir *dp_root_dir; struct dsl_dir *dp_mos_dir; struct dsl_dir *dp_free_dir; + struct dsl_dir *dp_leak_dir; struct dsl_dataset *dp_origin_snap; uint64_t dp_root_dir_obj; struct taskq *dp_vnrele_taskq; diff --git a/uts/common/fs/zfs/sys/dsl_scan.h b/uts/common/fs/zfs/sys/dsl_scan.h index bf8c5ac824a1..d9f396680792 100644 --- a/uts/common/fs/zfs/sys/dsl_scan.h +++ b/uts/common/fs/zfs/sys/dsl_scan.h @@ -114,6 +114,7 @@ typedef struct dsl_scan { /* for freeing blocks */ boolean_t scn_is_bptree; boolean_t scn_async_destroying; + boolean_t scn_async_stalled; /* for debugging / information */ uint64_t scn_visited_this_txg; diff --git a/uts/common/fs/zfs/sys/zfs_debug.h b/uts/common/fs/zfs/sys/zfs_debug.h index e6a1fa282878..3c8e2177bccc 100644 --- a/uts/common/fs/zfs/sys/zfs_debug.h +++ b/uts/common/fs/zfs/sys/zfs_debug.h @@ -47,7 +47,8 @@ extern "C" { #endif extern int zfs_flags; -extern int zfs_recover; +extern boolean_t zfs_recover; +extern boolean_t zfs_free_leak_on_eio; #define ZFS_DEBUG_DPRINTF (1<<0) #define ZFS_DEBUG_DBUF_VERIFY (1<<1) diff --git a/uts/common/fs/zfs/zfs_debug.c b/uts/common/fs/zfs/zfs_debug.c index 26ea561eb1d6..85fa7600d9b9 100644 --- a/uts/common/fs/zfs/zfs_debug.c +++ b/uts/common/fs/zfs/zfs_debug.c @@ -28,7 +28,7 @@ list_t zfs_dbgmsgs; int zfs_dbgmsg_size; kmutex_t zfs_dbgmsgs_lock; -int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */ +int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ void zfs_dbgmsg_init(void) diff --git a/uts/common/fs/zfs/zio.c b/uts/common/fs/zfs/zio.c index 8fdbbf843231..506897c02d41 100644 --- a/uts/common/fs/zfs/zio.c +++ b/uts/common/fs/zfs/zio.c @@ -3208,13 +3208,6 @@ zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, ASSERT(zb1->zb_objset == zb2->zb_objset); ASSERT(zb2->zb_level == 0); - /* - * A bookmark in the deadlist is considered to be after - * everything else. - */ - if (zb2->zb_object == DMU_DEADLIST_OBJECT) - return (B_TRUE); - /* The objset_phys_t isn't before anything. */ if (dnp == NULL) return (B_FALSE); diff --git a/uts/common/sys/fs/zfs.h b/uts/common/sys/fs/zfs.h index caad33c567fe..01deecdf02da 100644 --- a/uts/common/sys/fs/zfs.h +++ b/uts/common/sys/fs/zfs.h @@ -189,6 +189,7 @@ typedef enum { ZPOOL_PROP_COMMENT, ZPOOL_PROP_EXPANDSZ, ZPOOL_PROP_FREEING, + ZPOOL_PROP_LEAKED, ZPOOL_NUM_PROPS } zpool_prop_t; From bd47f3f79411ba6725c929367575f53e0e7e0c87 Mon Sep 17 00:00:00 2001 From: delphij Date: Tue, 17 Jun 2014 07:46:55 +0000 Subject: [PATCH 002/380] 4881 zfs send performance degradation when embedded block pointers are encountered Reviewed by: George Wilson Reviewed by: Christopher Siden Approved by: Dan McDonald illumos/illumos-gate@06315b795c0d54f0228e0b8af497a28752dd92da --- uts/common/fs/zfs/dmu_traverse.c | 35 ++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/uts/common/fs/zfs/dmu_traverse.c b/uts/common/fs/zfs/dmu_traverse.c index 1c698b5ee03c..ad2f09ed4a55 100644 --- a/uts/common/fs/zfs/dmu_traverse.c +++ b/uts/common/fs/zfs/dmu_traverse.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include @@ -197,6 +197,16 @@ traverse_prefetch_metadata(traverse_data_t *td, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); } +static boolean_t +prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp) +{ + ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA); + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || + BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) + return (B_FALSE); + return (B_TRUE); +} + static int traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb) @@ -242,16 +252,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, return (0); } - if (BP_IS_HOLE(bp)) { - err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); - if (err != 0) - goto post; - return (0); - } - - if (pd && !pd->pd_exited && - ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) || - BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) { + if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) { mutex_enter(&pd->pd_mtx); ASSERT(pd->pd_blks_fetched >= 0); while (pd->pd_blks_fetched == 0 && !pd->pd_exited) @@ -261,6 +262,13 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, mutex_exit(&pd->pd_mtx); } + if (BP_IS_HOLE(bp)) { + err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); + if (err != 0) + goto post; + return (0); + } + if (td->td_flags & TRAVERSE_PRE) { err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); @@ -444,10 +452,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (pfd->pd_cancel) return (SET_ERROR(EINTR)); - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || - !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || - BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) || - BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) + if (!prefetch_needed(pfd, bp)) return (0); mutex_enter(&pfd->pd_mtx); From 93a52c22ac7c8de75dd8911a7902e35222c1dd36 Mon Sep 17 00:00:00 2001 From: delphij Date: Tue, 17 Jun 2014 07:58:53 +0000 Subject: [PATCH 003/380] 4897 Space accounting mismatch in L2ARC/zpool Reviewed by: Matthew Ahrens Reviewed by: Boris Protopopov Approved by: Dan McDonald illumos/illumos-dist@3038a2b421b40dc5ac11cd88423696618584f85a --- uts/common/fs/zfs/arc.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/uts/common/fs/zfs/arc.c b/uts/common/fs/zfs/arc.c index 256ec3b34cfb..811a2d1b3930 100644 --- a/uts/common/fs/zfs/arc.c +++ b/uts/common/fs/zfs/arc.c @@ -23,7 +23,7 @@ * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. - * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright 2014 Nexenta Systems, Inc. All rights reserved. */ /* @@ -1592,6 +1592,8 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) list_remove(l2hdr->b_dev->l2ad_buflist, hdr); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); + vdev_space_update(l2hdr->b_dev->l2ad_vdev, + -l2hdr->b_asize, 0, 0); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); if (hdr->b_state == arc_l2c_only) l2arc_hdr_stat_remove(); @@ -3445,6 +3447,8 @@ arc_release(arc_buf_t *buf, void *tag) if (l2hdr) { ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); + vdev_space_update(l2hdr->b_dev->l2ad_vdev, + -l2hdr->b_asize, 0, 0); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); ARCSTAT_INCR(arcstat_l2_size, -buf_size); mutex_exit(&l2arc_buflist_mtx); @@ -4241,6 +4245,7 @@ l2arc_write_done(zio_t *zio) arc_buf_hdr_t *head, *ab, *ab_prev; l2arc_buf_hdr_t *abl2; kmutex_t *hash_lock; + int64_t bytes_dropped = 0; cb = zio->io_private; ASSERT(cb != NULL); @@ -4288,6 +4293,7 @@ l2arc_write_done(zio_t *zio) */ list_remove(buflist, ab); ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); + bytes_dropped += abl2->b_asize; ab->b_l2hdr = NULL; kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); @@ -4306,6 +4312,8 @@ l2arc_write_done(zio_t *zio) kmem_cache_free(hdr_cache, head); mutex_exit(&l2arc_buflist_mtx); + vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); + l2arc_do_free_on_write(); kmem_free(cb, sizeof (l2arc_write_callback_t)); @@ -4444,6 +4452,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) arc_buf_hdr_t *ab, *ab_prev; kmutex_t *hash_lock; uint64_t taddr; + int64_t bytes_evicted = 0; buflist = dev->l2ad_buflist; @@ -4542,6 +4551,7 @@ top: if (ab->b_l2hdr != NULL) { abl2 = ab->b_l2hdr; ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); + bytes_evicted += abl2->b_asize; ab->b_l2hdr = NULL; kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); @@ -4558,7 +4568,7 @@ top: } mutex_exit(&l2arc_buflist_mtx); - vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0); + vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0); dev->l2ad_evict = taddr; } @@ -4801,15 +4811,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); ARCSTAT_INCR(arcstat_l2_size, write_sz); ARCSTAT_INCR(arcstat_l2_asize, write_asize); - vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0); + vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0); /* * Bump device hand to the device start if it is approaching the end. * l2arc_evict() will already have evicted ahead for this case. */ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { - vdev_space_update(dev->l2ad_vdev, - dev->l2ad_end - dev->l2ad_hand, 0, 0); dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; From 929353c3a13aefafe6edb149e6e56421915d911a Mon Sep 17 00:00:00 2001 From: delphij Date: Tue, 17 Jun 2014 08:02:50 +0000 Subject: [PATCH 004/380] 4756 metaslab_group_preload() could deadlock Reviewed by: Matthew Ahrens Reviewed by: Christopher Siden Reviewed by: Dan McDonald Reviewed by: Saso Kiselkov Approved by: Garrett D'Amore illumos/illumos-gate@30beaff42d8240ebf5386e8b7a14e3d137a1631f --- uts/common/fs/zfs/metaslab.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/uts/common/fs/zfs/metaslab.c b/uts/common/fs/zfs/metaslab.c index 4c854b710589..bba1dbf5148c 100644 --- a/uts/common/fs/zfs/metaslab.c +++ b/uts/common/fs/zfs/metaslab.c @@ -1207,6 +1207,8 @@ metaslab_preload(void *arg) metaslab_t *msp = arg; spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); + mutex_enter(&msp->ms_lock); metaslab_load_wait(msp); if (!msp->ms_loaded) @@ -1231,19 +1233,36 @@ metaslab_group_preload(metaslab_group_t *mg) taskq_wait(mg->mg_taskq); return; } - mutex_enter(&mg->mg_lock); + mutex_enter(&mg->mg_lock); /* - * Prefetch the next potential metaslabs + * Load the next potential metaslabs */ - for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { + msp = avl_first(t); + while (msp != NULL) { + metaslab_t *msp_next = AVL_NEXT(t, msp); /* If we have reached our preload limit then we're done */ if (++m > metaslab_preload_limit) break; + /* + * We must drop the metaslab group lock here to preserve + * lock ordering with the ms_lock (when grabbing both + * the mg_lock and the ms_lock, the ms_lock must be taken + * first). As a result, it is possible that the ordering + * of the metaslabs within the avl tree may change before + * we reacquire the lock. The metaslab cannot be removed from + * the tree while we're in syncing context so it is safe to + * drop the mg_lock here. If the metaslabs are reordered + * nothing will break -- we just may end up loading a + * less than optimal one. + */ + mutex_exit(&mg->mg_lock); VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, msp, TQ_SLEEP) != NULL); + mutex_enter(&mg->mg_lock); + msp = msp_next; } mutex_exit(&mg->mg_lock); } From cfd11dccb95eac86d4ef3516628a99ee4fde89ab Mon Sep 17 00:00:00 2001 From: pfg Date: Thu, 26 Jun 2014 19:45:35 +0000 Subject: [PATCH 005/380] Bring a new header from OpenSolaris/Illumos. This is required for r266987 and probably other DTrace probes. --- common/util/strtolctype.h | 79 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 common/util/strtolctype.h diff --git a/common/util/strtolctype.h b/common/util/strtolctype.h new file mode 100644 index 000000000000..5675e42be7a0 --- /dev/null +++ b/common/util/strtolctype.h @@ -0,0 +1,79 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1988 AT&T */ +/* All Rights Reserved */ + +#ifndef _COMMON_UTIL_CTYPE_H +#define _COMMON_UTIL_CTYPE_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This header file contains a collection of macros that the strtou?ll? + * functions in common/util use to test characters. What we need is a kernel + * version of ctype.h. + * + * NOTE: These macros are used within several DTrace probe context functions. + * They must not be altered to make function calls or perform actions not + * safe in probe context. + */ + +#if defined(_KERNEL) && !defined(_BOOT) + +#define isalnum(ch) (isalpha(ch) || isdigit(ch)) +#define isalpha(ch) (isupper(ch) || islower(ch)) +#define isdigit(ch) ((ch) >= '0' && (ch) <= '9') +#define islower(ch) ((ch) >= 'a' && (ch) <= 'z') +#define isspace(ch) (((ch) == ' ') || ((ch) == '\r') || ((ch) == '\n') || \ + ((ch) == '\t') || ((ch) == '\f')) +#define isupper(ch) ((ch) >= 'A' && (ch) <= 'Z') +#define isxdigit(ch) (isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \ + ((ch) >= 'A' && (ch) <= 'F')) + +#endif /* _KERNEL && !_BOOT */ + +#define DIGIT(x) \ + (isdigit(x) ? (x) - '0' : islower(x) ? (x) + 10 - 'a' : (x) + 10 - 'A') + +#define MBASE ('z' - 'a' + 1 + 10) + +/* + * The following macro is a version of isalnum() that limits alphabetic + * characters to the ranges a-z and A-Z; locale dependent characters will not + * return 1. The members of a-z and A-Z are assumed to be in ascending order + * and contiguous. + */ +#define lisalnum(x) \ + (isdigit(x) || ((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z')) + +#ifdef __cplusplus +} +#endif + +#endif /* _COMMON_UTIL_CTYPE_H */ From b28d56edcac8773b148fa4452514d5532081ff01 Mon Sep 17 00:00:00 2001 From: pfg Date: Thu, 26 Jun 2014 20:02:29 +0000 Subject: [PATCH 006/380] Revert r267930. File looks better in vendor/ and was brought there in r267931. --- common/util/strtolctype.h | 79 --------------------------------------- 1 file changed, 79 deletions(-) delete mode 100644 common/util/strtolctype.h diff --git a/common/util/strtolctype.h b/common/util/strtolctype.h deleted file mode 100644 index 5675e42be7a0..000000000000 --- a/common/util/strtolctype.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* Copyright (c) 1988 AT&T */ -/* All Rights Reserved */ - -#ifndef _COMMON_UTIL_CTYPE_H -#define _COMMON_UTIL_CTYPE_H - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * This header file contains a collection of macros that the strtou?ll? - * functions in common/util use to test characters. What we need is a kernel - * version of ctype.h. - * - * NOTE: These macros are used within several DTrace probe context functions. - * They must not be altered to make function calls or perform actions not - * safe in probe context. - */ - -#if defined(_KERNEL) && !defined(_BOOT) - -#define isalnum(ch) (isalpha(ch) || isdigit(ch)) -#define isalpha(ch) (isupper(ch) || islower(ch)) -#define isdigit(ch) ((ch) >= '0' && (ch) <= '9') -#define islower(ch) ((ch) >= 'a' && (ch) <= 'z') -#define isspace(ch) (((ch) == ' ') || ((ch) == '\r') || ((ch) == '\n') || \ - ((ch) == '\t') || ((ch) == '\f')) -#define isupper(ch) ((ch) >= 'A' && (ch) <= 'Z') -#define isxdigit(ch) (isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \ - ((ch) >= 'A' && (ch) <= 'F')) - -#endif /* _KERNEL && !_BOOT */ - -#define DIGIT(x) \ - (isdigit(x) ? (x) - '0' : islower(x) ? (x) + 10 - 'a' : (x) + 10 - 'A') - -#define MBASE ('z' - 'a' + 1 + 10) - -/* - * The following macro is a version of isalnum() that limits alphabetic - * characters to the ranges a-z and A-Z; locale dependent characters will not - * return 1. The members of a-z and A-Z are assumed to be in ascending order - * and contiguous. - */ -#define lisalnum(x) \ - (isdigit(x) || ((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z')) - -#ifdef __cplusplus -} -#endif - -#endif /* _COMMON_UTIL_CTYPE_H */ From 229d137acfb5c3e2d13ab8a13fee0663c0ae78b9 Mon Sep 17 00:00:00 2001 From: delphij Date: Tue, 1 Jul 2014 21:14:35 +0000 Subject: [PATCH 007/380] 4914 zfs on-disk bookmark structure should be named *_phys_t Reviewed by: George Wilson Reviewed by: Christopher Siden Reviewed by: Richard Lowe Reviewed by: Saso Kiselkov Approved by: Robert Mustacchi illumos/illumos-gate@7802d7bf98dec568dadf72286893b1fe5abd8602 --- uts/common/fs/zfs/arc.c | 10 ++++---- uts/common/fs/zfs/bptree.c | 4 ++-- uts/common/fs/zfs/dbuf.c | 6 ++--- uts/common/fs/zfs/dmu.c | 6 ++--- uts/common/fs/zfs/dmu_diff.c | 4 ++-- uts/common/fs/zfs/dmu_objset.c | 4 ++-- uts/common/fs/zfs/dmu_send.c | 4 ++-- uts/common/fs/zfs/dmu_traverse.c | 28 +++++++++++----------- uts/common/fs/zfs/dsl_destroy.c | 4 ++-- uts/common/fs/zfs/dsl_scan.c | 35 ++++++++++++++-------------- uts/common/fs/zfs/spa.c | 4 ++-- uts/common/fs/zfs/spa_errlog.c | 20 ++++++++-------- uts/common/fs/zfs/sys/arc.h | 6 ++--- uts/common/fs/zfs/sys/bptree.h | 4 ++-- uts/common/fs/zfs/sys/dmu.h | 2 +- uts/common/fs/zfs/sys/dmu_traverse.h | 6 ++--- uts/common/fs/zfs/sys/dsl_scan.h | 4 ++-- uts/common/fs/zfs/sys/spa.h | 4 ++-- uts/common/fs/zfs/sys/spa_impl.h | 8 +++---- uts/common/fs/zfs/sys/zio.h | 24 +++++++++---------- uts/common/fs/zfs/zil.c | 8 +++---- uts/common/fs/zfs/zio.c | 12 +++++----- uts/common/fs/zfs/zio_inject.c | 4 ++-- uts/common/fs/zfs/zvol.c | 4 ++-- 24 files changed, 108 insertions(+), 107 deletions(-) diff --git a/uts/common/fs/zfs/arc.c b/uts/common/fs/zfs/arc.c index 811a2d1b3930..752cacdb126c 100644 --- a/uts/common/fs/zfs/arc.c +++ b/uts/common/fs/zfs/arc.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. */ @@ -664,7 +664,7 @@ typedef struct l2arc_read_callback { arc_buf_t *l2rcb_buf; /* read buffer */ spa_t *l2rcb_spa; /* spa */ blkptr_t l2rcb_bp; /* original blkptr */ - zbookmark_t l2rcb_zb; /* original bookmark */ + zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ enum zio_compress l2rcb_compress; /* applied compress */ } l2arc_read_callback_t; @@ -2899,7 +2899,7 @@ arc_read_done(zio_t *zio) int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags, - const zbookmark_t *zb) + const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; arc_buf_t *buf = NULL; @@ -3093,7 +3093,7 @@ top: */ ASSERT3U(hdr->b_size, ==, size); DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, - uint64_t, size, zbookmark_t *, zb); + uint64_t, size, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, @@ -3616,7 +3616,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, arc_done_func_t *done, void *private, zio_priority_t priority, - int zio_flags, const zbookmark_t *zb) + int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; diff --git a/uts/common/fs/zfs/bptree.c b/uts/common/fs/zfs/bptree.c index 77067d24e98a..c724ed074103 100644 --- a/uts/common/fs/zfs/bptree.c +++ b/uts/common/fs/zfs/bptree.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. */ #include @@ -149,7 +149,7 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, /* ARGSUSED */ static int bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { int err; struct bptree_args *ba = arg; diff --git a/uts/common/fs/zfs/dbuf.c b/uts/common/fs/zfs/dbuf.c index a7e12d5bcdf8..b7567d77df02 100644 --- a/uts/common/fs/zfs/dbuf.c +++ b/uts/common/fs/zfs/dbuf.c @@ -521,7 +521,7 @@ static void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) { dnode_t *dn; - zbookmark_t zb; + zbookmark_phys_t zb; uint32_t aflags = ARC_NOWAIT; DB_DNODE_ENTER(db); @@ -1856,7 +1856,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; - zbookmark_t zb; + zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, 0, blkid); @@ -2721,7 +2721,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; - zbookmark_t zb; + zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; int wp_flag = 0; diff --git a/uts/common/fs/zfs/dmu.c b/uts/common/fs/zfs/dmu.c index 4f621a2297af..1a62ddb48b15 100644 --- a/uts/common/fs/zfs/dmu.c +++ b/uts/common/fs/zfs/dmu.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. */ /* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ /* Copyright (c) 2013, Joyent, Inc. All rights reserved. */ @@ -1379,7 +1379,7 @@ dmu_sync_late_arrival_done(zio_t *zio) static int dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, - zio_prop_t *zp, zbookmark_t *zb) + zio_prop_t *zp, zbookmark_phys_t *zb) { dmu_sync_arg_t *dsa; dmu_tx_t *tx; @@ -1440,7 +1440,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dsl_dataset_t *ds = os->os_dsl_dataset; dbuf_dirty_record_t *dr; dmu_sync_arg_t *dsa; - zbookmark_t zb; + zbookmark_phys_t zb; zio_prop_t zp; dnode_t *dn; diff --git a/uts/common/fs/zfs/dmu_diff.c b/uts/common/fs/zfs/dmu_diff.c index a2130b1319b5..30fabbb07957 100644 --- a/uts/common/fs/zfs/dmu_diff.c +++ b/uts/common/fs/zfs/dmu_diff.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include @@ -107,7 +107,7 @@ report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp) /* ARGSUSED */ static int diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { struct diffarg *da = arg; int err = 0; diff --git a/uts/common/fs/zfs/dmu_objset.c b/uts/common/fs/zfs/dmu_objset.c index efed341d6f7f..9cfe42a32382 100644 --- a/uts/common/fs/zfs/dmu_objset.c +++ b/uts/common/fs/zfs/dmu_objset.c @@ -286,7 +286,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_rootbp = bp; if (!BP_IS_HOLE(os->os_rootbp)) { uint32_t aflags = ARC_WAIT; - zbookmark_t zb; + zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); @@ -1023,7 +1023,7 @@ void dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) { int txgoff; - zbookmark_t zb; + zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; list_t *list; diff --git a/uts/common/fs/zfs/dmu_send.c b/uts/common/fs/zfs/dmu_send.c index 5da3f8b0e75d..20d196a83303 100644 --- a/uts/common/fs/zfs/dmu_send.c +++ b/uts/common/fs/zfs/dmu_send.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ @@ -420,7 +420,7 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) /* ARGSUSED */ static int backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { dmu_sendarg_t *dsp = arg; dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; diff --git a/uts/common/fs/zfs/dmu_traverse.c b/uts/common/fs/zfs/dmu_traverse.c index ad2f09ed4a55..fb1ec0755cf1 100644 --- a/uts/common/fs/zfs/dmu_traverse.c +++ b/uts/common/fs/zfs/dmu_traverse.c @@ -55,7 +55,7 @@ typedef struct traverse_data { uint64_t td_objset; blkptr_t *td_rootbp; uint64_t td_min_txg; - zbookmark_t *td_resume; + zbookmark_phys_t *td_resume; int td_flags; prefetch_data_t *td_pfd; boolean_t td_paused; @@ -72,7 +72,7 @@ static int traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { traverse_data_t *td = arg; - zbookmark_t zb; + zbookmark_phys_t zb; if (BP_IS_HOLE(bp)) return (0); @@ -96,7 +96,7 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) if (lrc->lrc_txtype == TX_WRITE) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; - zbookmark_t zb; + zbookmark_phys_t zb; if (BP_IS_HOLE(bp)) return (0); @@ -150,7 +150,7 @@ typedef enum resume_skip { */ static resume_skip_t resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, - const zbookmark_t *zb) + const zbookmark_phys_t *zb) { if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) { /* @@ -175,7 +175,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, static void traverse_prefetch_metadata(traverse_data_t *td, - const blkptr_t *bp, const zbookmark_t *zb) + const blkptr_t *bp, const zbookmark_phys_t *zb) { uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; @@ -209,9 +209,9 @@ prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp) static int traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - const blkptr_t *bp, const zbookmark_t *zb) + const blkptr_t *bp, const zbookmark_phys_t *zb) { - zbookmark_t czb; + zbookmark_phys_t czb; int err = 0; arc_buf_t *buf = NULL; prefetch_data_t *pd = td->td_pfd; @@ -406,7 +406,7 @@ prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, uint64_t objset, uint64_t object) { int j; - zbookmark_t czb; + zbookmark_phys_t czb; for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); @@ -424,7 +424,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, uint64_t objset, uint64_t object) { int j, err = 0; - zbookmark_t czb; + zbookmark_phys_t czb; for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); @@ -443,7 +443,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, /* ARGSUSED */ static int traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { prefetch_data_t *pfd = arg; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; @@ -473,7 +473,7 @@ traverse_prefetch_thread(void *arg) { traverse_data_t *td_main = arg; traverse_data_t td = *td_main; - zbookmark_t czb; + zbookmark_phys_t czb; td.td_func = traverse_prefetcher; td.td_arg = td_main->td_pfd; @@ -495,12 +495,12 @@ traverse_prefetch_thread(void *arg) */ static int traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, - uint64_t txg_start, zbookmark_t *resume, int flags, + uint64_t txg_start, zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg) { traverse_data_t td; prefetch_data_t pd = { 0 }; - zbookmark_t czb; + zbookmark_phys_t czb; int err; ASSERT(ds == NULL || objset == ds->ds_object); @@ -581,7 +581,7 @@ traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, - uint64_t txg_start, zbookmark_t *resume, int flags, + uint64_t txg_start, zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg) { return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET, diff --git a/uts/common/fs/zfs/dsl_destroy.c b/uts/common/fs/zfs/dsl_destroy.c index 441036c25afa..0813f3288dde 100644 --- a/uts/common/fs/zfs/dsl_destroy.c +++ b/uts/common/fs/zfs/dsl_destroy.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2013 by Joyent, Inc. All rights reserved. */ @@ -534,7 +534,7 @@ struct killarg { /* ARGSUSED */ static int kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { struct killarg *ka = arg; dmu_tx_t *tx = ka->tx; diff --git a/uts/common/fs/zfs/dsl_scan.c b/uts/common/fs/zfs/dsl_scan.c index 860f9dfb4e86..1f705580d0bb 100644 --- a/uts/common/fs/zfs/dsl_scan.c +++ b/uts/common/fs/zfs/dsl_scan.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. */ #include @@ -50,7 +50,8 @@ #include #endif -typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); +typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, + const zbookmark_phys_t *); static scan_cb_t dsl_scan_scrub_cb; static void dsl_scan_cancel_sync(void *, dmu_tx_t *); @@ -349,7 +350,7 @@ dsl_scan_cancel(dsl_pool_t *dp) } static void dsl_scan_visitbp(blkptr_t *bp, - const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf, + const zbookmark_phys_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx); static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds, @@ -388,7 +389,7 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) } static boolean_t -dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb) +dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb) { uint64_t elapsed_nanosecs; int mintime; @@ -446,7 +447,7 @@ dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; zil_header_t *zh = zsa->zsa_zh; - zbookmark_t zb; + zbookmark_phys_t zb; if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); @@ -478,7 +479,7 @@ dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) zil_header_t *zh = zsa->zsa_zh; lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; - zbookmark_t zb; + zbookmark_phys_t zb; if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) @@ -528,7 +529,7 @@ static void dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, uint64_t objset, uint64_t object, uint64_t blkid) { - zbookmark_t czb; + zbookmark_phys_t czb; uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; if (zfs_no_scrub_prefetch) @@ -547,7 +548,7 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, static boolean_t dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, - const zbookmark_t *zb) + const zbookmark_phys_t *zb) { /* * We never skip over user/group accounting objects (obj<0) @@ -587,7 +588,7 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, static int dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, const blkptr_t *bp, - const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp) + const zbookmark_phys_t *zb, dmu_tx_t *tx, arc_buf_t **bufp) { dsl_pool_t *dp = scn->scn_dp; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; @@ -610,7 +611,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, zb->zb_object, zb->zb_blkid * epb + i); } for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) { - zbookmark_t czb; + zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, @@ -694,7 +695,7 @@ dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, int j; for (j = 0; j < dnp->dn_nblkptr; j++) { - zbookmark_t czb; + zbookmark_phys_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, dnp->dn_nlevels - 1, j); @@ -703,7 +704,7 @@ dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { - zbookmark_t czb; + zbookmark_phys_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, 0, DMU_SPILL_BLKID); dsl_scan_visitbp(&dnp->dn_spill, @@ -716,7 +717,7 @@ dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, * first 5; we want them to be useful. */ static void -dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, +dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx) @@ -780,7 +781,7 @@ static void dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) { - zbookmark_t zb; + zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); @@ -1207,7 +1208,7 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, const ddt_key_t *ddk = &dde->dde_key; ddt_phys_t *ddp = dde->dde_phys; blkptr_t bp; - zbookmark_t zb = { 0 }; + zbookmark_phys_t zb = { 0 }; if (scn->scn_phys.scn_state != DSS_SCANNING) return; @@ -1275,7 +1276,7 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) * In case we were paused right at the end of the ds, zero the * bookmark so we don't think that we're still trying to resume. */ - bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t)); + bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t)); /* keep pulling things out of the zap-object-as-queue */ while (zap_cursor_init(&zc, dp->dp_meta_objset, @@ -1672,7 +1673,7 @@ dsl_scan_scrub_done(zio_t *zio) static int dsl_scan_scrub_cb(dsl_pool_t *dp, - const blkptr_t *bp, const zbookmark_t *zb) + const blkptr_t *bp, const zbookmark_phys_t *zb) { dsl_scan_t *scn = dp->dp_scan; size_t size = BP_GET_PSIZE(bp); diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c index f708ce3ef22a..2aedc4279dba 100644 --- a/uts/common/fs/zfs/spa.c +++ b/uts/common/fs/zfs/spa.c @@ -793,7 +793,7 @@ spa_error_entry_compare(const void *a, const void *b) int ret; ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, - sizeof (zbookmark_t)); + sizeof (zbookmark_phys_t)); if (ret < 0) return (-1); @@ -1831,7 +1831,7 @@ spa_load_verify_done(zio_t *zio) /*ARGSUSED*/ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { zio_t *rio = arg; diff --git a/uts/common/fs/zfs/spa_errlog.c b/uts/common/fs/zfs/spa_errlog.c index 0dd6c7a489b7..168f3acc2e8e 100644 --- a/uts/common/fs/zfs/spa_errlog.c +++ b/uts/common/fs/zfs/spa_errlog.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. */ /* @@ -35,7 +35,7 @@ * deleted from the log when the scrub completes. * * The log is stored using a ZAP object whose key is a string form of the - * zbookmark tuple (objset, object, level, blkid), and whose contents is an + * zbookmark_phys tuple (objset, object, level, blkid), and whose contents is an * optional 'objset:object' human-readable string describing the data. When an * error is first logged, this string will be empty, indicating that no name is * known. This prevents us from having to issue a potentially large amount of @@ -59,7 +59,7 @@ * Convert a bookmark to a string. */ static void -bookmark_to_name(zbookmark_t *zb, char *buf, size_t len) +bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len) { (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, @@ -71,7 +71,7 @@ bookmark_to_name(zbookmark_t *zb, char *buf, size_t len) */ #ifdef _KERNEL static void -name_to_bookmark(char *buf, zbookmark_t *zb) +name_to_bookmark(char *buf, zbookmark_phys_t *zb) { zb->zb_objset = strtonum(buf, &buf); ASSERT(*buf == ':'); @@ -92,7 +92,7 @@ name_to_bookmark(char *buf, zbookmark_t *zb) void spa_log_error(spa_t *spa, zio_t *zio) { - zbookmark_t *zb = &zio->io_logical->io_bookmark; + zbookmark_phys_t *zb = &zio->io_logical->io_bookmark; spa_error_entry_t search; spa_error_entry_t *new; avl_tree_t *tree; @@ -165,7 +165,7 @@ process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) { zap_cursor_t zc; zap_attribute_t za; - zbookmark_t zb; + zbookmark_phys_t zb; if (obj == 0) return (0); @@ -182,8 +182,8 @@ process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) name_to_bookmark(za.za_name, &zb); if (copyout(&zb, (char *)addr + - (*count - 1) * sizeof (zbookmark_t), - sizeof (zbookmark_t)) != 0) { + (*count - 1) * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) { zap_cursor_fini(&zc); return (SET_ERROR(EFAULT)); } @@ -207,8 +207,8 @@ process_error_list(avl_tree_t *list, void *addr, size_t *count) return (SET_ERROR(ENOMEM)); if (copyout(&se->se_bookmark, (char *)addr + - (*count - 1) * sizeof (zbookmark_t), - sizeof (zbookmark_t)) != 0) + (*count - 1) * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) return (SET_ERROR(EFAULT)); *count -= 1; diff --git a/uts/common/fs/zfs/sys/arc.h b/uts/common/fs/zfs/sys/arc.h index b1c60905721d..15ef0f58d02b 100644 --- a/uts/common/fs/zfs/sys/arc.h +++ b/uts/common/fs/zfs/sys/arc.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ @@ -105,12 +105,12 @@ int arc_referenced(arc_buf_t *buf); int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, zio_priority_t priority, int flags, - uint32_t *arc_flags, const zbookmark_t *zb); + uint32_t *arc_flags, const zbookmark_phys_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, arc_done_func_t *done, void *private, zio_priority_t priority, - int zio_flags, const zbookmark_t *zb); + int zio_flags, const zbookmark_phys_t *zb); void arc_freed(spa_t *spa, const blkptr_t *bp); void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); diff --git a/uts/common/fs/zfs/sys/bptree.h b/uts/common/fs/zfs/sys/bptree.h index a533cb949021..327c128bf493 100644 --- a/uts/common/fs/zfs/sys/bptree.h +++ b/uts/common/fs/zfs/sys/bptree.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #ifndef _SYS_BPTREE_H @@ -43,7 +43,7 @@ typedef struct bptree_phys { typedef struct bptree_entry_phys { blkptr_t be_bp; uint64_t be_birth_txg; /* only delete blocks born after this txg */ - zbookmark_t be_zb; /* holds traversal resume point if needed */ + zbookmark_phys_t be_zb; /* holds traversal resume point if needed */ } bptree_entry_phys_t; typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); diff --git a/uts/common/fs/zfs/sys/dmu.h b/uts/common/fs/zfs/sys/dmu.h index 9a6006ce6646..f968f4bc1220 100644 --- a/uts/common/fs/zfs/sys/dmu.h +++ b/uts/common/fs/zfs/sys/dmu.h @@ -65,7 +65,7 @@ struct dsl_pool; struct dnode; struct drr_begin; struct drr_end; -struct zbookmark; +struct zbookmark_phys; struct spa; struct nvlist; struct arc_buf; diff --git a/uts/common/fs/zfs/sys/dmu_traverse.h b/uts/common/fs/zfs/sys/dmu_traverse.h index bc1590bc3aa8..544b721e4612 100644 --- a/uts/common/fs/zfs/sys/dmu_traverse.h +++ b/uts/common/fs/zfs/sys/dmu_traverse.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_TRAVERSE_H @@ -40,7 +40,7 @@ struct zilog; struct arc_buf; typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg); + const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg); #define TRAVERSE_PRE (1<<0) #define TRAVERSE_POST (1<<1) @@ -55,7 +55,7 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, - uint64_t txg_start, zbookmark_t *resume, int flags, + uint64_t txg_start, zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg); int traverse_pool(spa_t *spa, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); diff --git a/uts/common/fs/zfs/sys/dsl_scan.h b/uts/common/fs/zfs/sys/dsl_scan.h index d9f396680792..ee8512c07dac 100644 --- a/uts/common/fs/zfs/sys/dsl_scan.h +++ b/uts/common/fs/zfs/sys/dsl_scan.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #ifndef _SYS_DSL_SCAN_H @@ -62,7 +62,7 @@ typedef struct dsl_scan_phys { uint64_t scn_errors; /* scan I/O error count */ uint64_t scn_ddt_class_max; ddt_bookmark_t scn_ddt_bookmark; - zbookmark_t scn_bookmark; + zbookmark_phys_t scn_bookmark; uint64_t scn_flags; /* dsl_scan_flags_t */ } dsl_scan_phys_t; diff --git a/uts/common/fs/zfs/sys/spa.h b/uts/common/fs/zfs/sys/spa.h index 4bb3d0a9adf7..a2f200e49353 100644 --- a/uts/common/fs/zfs/sys/spa.h +++ b/uts/common/fs/zfs/sys/spa.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ @@ -799,7 +799,7 @@ extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, dmu_tx_t *tx, const char *fmt, ...); /* error handling */ -struct zbookmark; +struct zbookmark_phys; extern void spa_log_error(spa_t *spa, zio_t *zio); extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, zio_t *zio, uint64_t stateoroffset, uint64_t length); diff --git a/uts/common/fs/zfs/sys/spa_impl.h b/uts/common/fs/zfs/sys/spa_impl.h index 9d74efa1849b..739b95988c25 100644 --- a/uts/common/fs/zfs/sys/spa_impl.h +++ b/uts/common/fs/zfs/sys/spa_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ @@ -46,9 +46,9 @@ extern "C" { #endif typedef struct spa_error_entry { - zbookmark_t se_bookmark; - char *se_name; - avl_node_t se_avl; + zbookmark_phys_t se_bookmark; + char *se_name; + avl_node_t se_avl; } spa_error_entry_t; typedef struct spa_history_phys { diff --git a/uts/common/fs/zfs/sys/zio.h b/uts/common/fs/zfs/sys/zio.h index 88e58884cf47..4c241e9f2680 100644 --- a/uts/common/fs/zfs/sys/zio.h +++ b/uts/common/fs/zfs/sys/zio.h @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. */ @@ -260,16 +260,16 @@ extern const char *zio_type_name[ZIO_TYPES]; * Note: this structure is called a bookmark because its original purpose * was to remember where to resume a pool-wide traverse. * - * Note: this structure is passed between userland and the kernel. - * Therefore it must not change size or alignment between 32/64 bit - * compilation options. + * Note: this structure is passed between userland and the kernel, and is + * stored on disk (by virtue of being incorporated into other on-disk + * structures, e.g. dsl_scan_phys_t). */ -typedef struct zbookmark { +typedef struct zbookmark_phys { uint64_t zb_objset; uint64_t zb_object; int64_t zb_level; uint64_t zb_blkid; -} zbookmark_t; +} zbookmark_phys_t; #define SET_BOOKMARK(zb, objset, object, level, blkid) \ { \ @@ -380,7 +380,7 @@ typedef struct zio_link { struct zio { /* Core information about this I/O */ - zbookmark_t io_bookmark; + zbookmark_phys_t io_bookmark; zio_prop_t io_prop; zio_type_t io_type; enum zio_child io_child_type; @@ -459,17 +459,17 @@ extern zio_t *zio_root(spa_t *spa, extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb); + zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb); + zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb); + zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite); @@ -588,9 +588,9 @@ extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, /* Called from spa_sync(), but primarily an injection handler */ extern void spa_handle_ignored_writes(spa_t *spa); -/* zbookmark functions */ +/* zbookmark_phys functions */ boolean_t zbookmark_is_before(const struct dnode_phys *dnp, - const zbookmark_t *zb1, const zbookmark_t *zb2); + const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); #ifdef __cplusplus } diff --git a/uts/common/fs/zfs/zil.c b/uts/common/fs/zfs/zil.c index b873b12bd76e..a5fe3fb00f82 100644 --- a/uts/common/fs/zfs/zil.c +++ b/uts/common/fs/zfs/zil.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -183,7 +183,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; uint32_t aflags = ARC_WAIT; arc_buf_t *abuf = NULL; - zbookmark_t zb; + zbookmark_phys_t zb; int error; if (zilog->zl_header->zh_claim_txg == 0) @@ -256,7 +256,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) const blkptr_t *bp = &lr->lr_blkptr; uint32_t aflags = ARC_WAIT; arc_buf_t *abuf = NULL; - zbookmark_t zb; + zbookmark_phys_t zb; int error; if (BP_IS_HOLE(bp)) { @@ -866,7 +866,7 @@ zil_lwb_write_done(zio_t *zio) static void zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) { - zbookmark_t zb; + zbookmark_phys_t zb; SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, diff --git a/uts/common/fs/zfs/zio.c b/uts/common/fs/zfs/zio.c index 506897c02d41..e60f0eb614ac 100644 --- a/uts/common/fs/zfs/zio.c +++ b/uts/common/fs/zfs/zio.c @@ -489,7 +489,7 @@ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, enum zio_flag flags, - vdev_t *vd, uint64_t offset, const zbookmark_t *zb, + vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; @@ -598,7 +598,7 @@ zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb) + zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; @@ -616,7 +616,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb) + zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; @@ -654,7 +654,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb) + zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) { zio_t *zio; @@ -3200,8 +3200,8 @@ static zio_pipe_stage_t *zio_pipeline[] = { /* dnp is the dnode for zb1->zb_object */ boolean_t -zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, - const zbookmark_t *zb2) +zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1, + const zbookmark_phys_t *zb2) { uint64_t zb1nextL0, zb2thisobj; diff --git a/uts/common/fs/zfs/zio_inject.c b/uts/common/fs/zfs/zio_inject.c index 9f3db704b2fa..b2976111d65e 100644 --- a/uts/common/fs/zfs/zio_inject.c +++ b/uts/common/fs/zfs/zio_inject.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ /* @@ -64,7 +64,7 @@ static int inject_next_id = 1; * Returns true if the given record matches the I/O in progress. */ static boolean_t -zio_match_handler(zbookmark_t *zb, uint64_t type, +zio_match_handler(zbookmark_phys_t *zb, uint64_t type, zinject_record_t *record, int error) { /* diff --git a/uts/common/fs/zfs/zvol.c b/uts/common/fs/zfs/zvol.c index 19f6bb97536c..2c225e4e9dcc 100644 --- a/uts/common/fs/zfs/zvol.c +++ b/uts/common/fs/zfs/zvol.c @@ -24,7 +24,7 @@ * Portions Copyright 2010 Robert Milkowski * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. */ @@ -244,7 +244,7 @@ struct maparg { /*ARGSUSED*/ static int zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { struct maparg *ma = arg; zvol_extent_t *ze; From bb7e6d84964734155f4de126fd46261349151b82 Mon Sep 17 00:00:00 2001 From: delphij Date: Tue, 1 Jul 2014 21:16:27 +0000 Subject: [PATCH 008/380] 4936 lz4 could theoretically overflow a pointer with a certain input Reviewed by: Saso Kiselkov Reviewed by: Keith Wesolowski Approved by: Gordon Ross illumos/illumos-gate@58d0718061c87e3d647c891ec5281b93c08dba4e --- uts/common/fs/zfs/lz4.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/uts/common/fs/zfs/lz4.c b/uts/common/fs/zfs/lz4.c index 40cb0711e08f..656360a6f2e0 100644 --- a/uts/common/fs/zfs/lz4.c +++ b/uts/common/fs/zfs/lz4.c @@ -960,6 +960,9 @@ real_LZ4_uncompress(const char *source, char *dest, int osize) } /* copy literals */ cpy = op + length; + /* CORNER-CASE: cpy might overflow. */ + if (cpy < op) + goto _output_error; /* cpy was overflowed, bail! */ if unlikely(cpy > oend - COPYLENGTH) { if (cpy != oend) /* Error: we must necessarily stand at EOF */ @@ -1075,6 +1078,9 @@ LZ4_uncompress_unknownOutputSize(const char *source, char *dest, int isize, } /* copy literals */ cpy = op + length; + /* CORNER-CASE: cpy might overflow. */ + if (cpy < op) + goto _output_error; /* cpy was overflowed, bail! */ if ((cpy > oend - COPYLENGTH) || (ip + length > iend - COPYLENGTH)) { if (cpy > oend) From 6c37cbaf2596c77fb699e7fd869b87af872c621e Mon Sep 17 00:00:00 2001 From: delphij Date: Tue, 1 Jul 2014 21:19:10 +0000 Subject: [PATCH 009/380] 4924 LZ4 Compression for metadata Reviewed by Matthew Ahrens Reviewed by Saso Kiselkov Approved by: Christopher Siden illumos/illumos-gate@b8289d24d866c1af02d7007348f7f057693c15d3 --- common/zfs/zfeature_common.c | 3 ++- uts/common/fs/zfs/dmu.c | 14 ++++++++++++-- uts/common/fs/zfs/spa.c | 18 +++++++++++++++++- uts/common/fs/zfs/zfs_ioctl.c | 32 +------------------------------- 4 files changed, 32 insertions(+), 35 deletions(-) diff --git a/common/zfs/zfeature_common.c b/common/zfs/zfeature_common.c index 447c64cc2007..9b046ab07db1 100644 --- a/common/zfs/zfeature_common.c +++ b/common/zfs/zfeature_common.c @@ -23,6 +23,7 @@ * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. */ #ifdef _KERNEL @@ -169,7 +170,7 @@ zpool_feature_init(void) zfeature_register(SPA_FEATURE_LZ4_COMPRESS, "org.illumos:lz4_compress", "lz4_compress", "LZ4 compression algorithm support.", B_FALSE, B_FALSE, - B_FALSE, NULL); + B_TRUE, NULL); zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump", diff --git a/uts/common/fs/zfs/dmu.c b/uts/common/fs/zfs/dmu.c index 1a62ddb48b15..a1bd0d810301 100644 --- a/uts/common/fs/zfs/dmu.c +++ b/uts/common/fs/zfs/dmu.c @@ -24,6 +24,7 @@ */ /* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ /* Copyright (c) 2013, Joyent, Inc. All rights reserved. */ +/* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. */ #include #include @@ -44,6 +45,7 @@ #include #include #include +#include #ifdef _KERNEL #include #include @@ -1635,8 +1637,16 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) * XXX -- we should design a compression algorithm * that specializes in arrays of bps. */ - compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : - ZIO_COMPRESS_LZJB; + boolean_t lz4_ac = spa_feature_is_active(os->os_spa, + SPA_FEATURE_LZ4_COMPRESS); + + if (zfs_mdcomp_disable) { + compress = ZIO_COMPRESS_EMPTY; + } else if (lz4_ac) { + compress = ZIO_COMPRESS_LZ4; + } else { + compress = ZIO_COMPRESS_LZJB; + } /* * Metadata always gets checksummed. If the data diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c index 2aedc4279dba..c8cabb2c32e3 100644 --- a/uts/common/fs/zfs/spa.c +++ b/uts/common/fs/zfs/spa.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. - * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013, 2014, Nexenta Systems, Inc. All rights reserved. */ /* @@ -6106,6 +6106,22 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { spa_feature_create_zap_objects(spa, tx); } + + /* + * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable + * when possibility to use lz4 compression for metadata was added + * Old pools that have this feature enabled must be upgraded to have + * this feature active + */ + if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { + boolean_t lz4_en = spa_feature_is_enabled(spa, + SPA_FEATURE_LZ4_COMPRESS); + boolean_t lz4_ac = spa_feature_is_active(spa, + SPA_FEATURE_LZ4_COMPRESS); + + if (lz4_en && !lz4_ac) + spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); + } rrw_exit(&dp->dp_config_rwlock, FTAG); } diff --git a/uts/common/fs/zfs/zfs_ioctl.c b/uts/common/fs/zfs/zfs_ioctl.c index 01105be5c51c..83ec68854ed2 100644 --- a/uts/common/fs/zfs/zfs_ioctl.c +++ b/uts/common/fs/zfs/zfs_ioctl.c @@ -27,6 +27,7 @@ * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. */ /* @@ -2453,37 +2454,6 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, } break; } - case ZFS_PROP_COMPRESSION: - { - if (intval == ZIO_COMPRESS_LZ4) { - spa_t *spa; - - if ((err = spa_open(dsname, &spa, FTAG)) != 0) - return (err); - - /* - * Setting the LZ4 compression algorithm activates - * the feature. - */ - if (!spa_feature_is_active(spa, - SPA_FEATURE_LZ4_COMPRESS)) { - if ((err = zfs_prop_activate_feature(spa, - SPA_FEATURE_LZ4_COMPRESS)) != 0) { - spa_close(spa, FTAG); - return (err); - } - } - - spa_close(spa, FTAG); - } - /* - * We still want the default set action to be performed in the - * caller, we only performed zfeature settings here. - */ - err = -1; - break; - } - default: err = -1; } From fcbf7cd0f6a14e541b4b43773b2880e28e41d1d0 Mon Sep 17 00:00:00 2001 From: delphij Date: Tue, 1 Jul 2014 21:21:56 +0000 Subject: [PATCH 010/380] 4929 want prevsnap property Reviewed by: Adam Leventhal Reviewed by: Matt Amdur Reviewed by: Saso Kiselkov Reviewed by: Boris Protopopov Reviewed by: Richard Lowe Approved by: Dan McDonald illumos/illumos-gate@b461c7460e5e77cf65f00151162e654220c6e2fb --- common/zfs/zfs_prop.c | 2 ++ uts/common/fs/zfs/dsl_dataset.c | 8 +++++++- uts/common/sys/fs/zfs.h | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/common/zfs/zfs_prop.c b/common/zfs/zfs_prop.c index 7c12d6f3f471..0b4927243fb6 100644 --- a/common/zfs/zfs_prop.c +++ b/common/zfs/zfs_prop.c @@ -423,6 +423,8 @@ zfs_prop_init(void) PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID"); zprop_register_hidden(ZFS_PROP_INCONSISTENT, "inconsistent", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT"); + zprop_register_hidden(ZFS_PROP_PREV_SNAP, "prevsnap", PROP_TYPE_STRING, + PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PREVSNAP"); /* oddball properties */ zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, diff --git a/uts/common/fs/zfs/dsl_dataset.c b/uts/common/fs/zfs/dsl_dataset.c index c5872379694c..425c0d45515c 100644 --- a/uts/common/fs/zfs/dsl_dataset.c +++ b/uts/common/fs/zfs/dsl_dataset.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 RackTop Systems. */ @@ -1543,6 +1543,12 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) ds->ds_phys->ds_unique_bytes); get_clones_stat(ds, nv); } else { + if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) { + char buf[MAXNAMELEN]; + dsl_dataset_name(ds->ds_prev, buf); + dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, buf); + } + dsl_dir_stats(ds->ds_dir, nv); } diff --git a/uts/common/sys/fs/zfs.h b/uts/common/sys/fs/zfs.h index 01deecdf02da..f1615194b16a 100644 --- a/uts/common/sys/fs/zfs.h +++ b/uts/common/sys/fs/zfs.h @@ -147,6 +147,7 @@ typedef enum { ZFS_PROP_FILESYSTEM_COUNT, ZFS_PROP_SNAPSHOT_COUNT, ZFS_PROP_REDUNDANT_METADATA, + ZFS_PROP_PREV_SNAP, ZFS_NUM_PROPS } zfs_prop_t; From 129f80f73e8f04be6e2819c3960869a5ec14f512 Mon Sep 17 00:00:00 2001 From: delphij Date: Wed, 9 Jul 2014 08:14:13 +0000 Subject: [PATCH 011/380] 4950 files sometimes can't be removed from a full filesystem Reviewed by: Adam Leventhal Reviewed by: George Wilson Reviewed by: Sebastien Roy Reviewed by: Boris Protopopov Approved by: Dan McDonald illumos/illumos-gate@4bb73804952172060c9efb163b89c17f56804fe8 --- uts/common/fs/zfs/dmu.c | 7 +++++++ uts/common/fs/zfs/dmu_tx.c | 28 +++++++++++++++++++++++++++- uts/common/fs/zfs/sys/dmu.h | 1 + uts/common/fs/zfs/zfs_dir.c | 3 ++- uts/common/fs/zfs/zfs_vnops.c | 35 +++++++++++++++++++++-------------- uts/common/fs/zfs/zfs_znode.c | 11 ++++++----- uts/common/fs/zfs/zvol.c | 2 ++ 7 files changed, 66 insertions(+), 21 deletions(-) diff --git a/uts/common/fs/zfs/dmu.c b/uts/common/fs/zfs/dmu.c index a1bd0d810301..ff67a894bc25 100644 --- a/uts/common/fs/zfs/dmu.c +++ b/uts/common/fs/zfs/dmu.c @@ -666,6 +666,12 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_end - chunk_begin); + + /* + * Mark this transaction as typically resulting in a net + * reduction in space used. + */ + dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); @@ -717,6 +723,7 @@ dmu_free_long_object(objset_t *os, uint64_t object) tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, object); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); + dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { err = dmu_object_free(os, object, tx); diff --git a/uts/common/fs/zfs/dmu_tx.c b/uts/common/fs/zfs/dmu_tx.c index 70c91aca8d0c..55ce31eda85c 100644 --- a/uts/common/fs/zfs/dmu_tx.c +++ b/uts/common/fs/zfs/dmu_tx.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include @@ -583,6 +583,32 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) txh->txh_space_tounref += unref; } +/* + * This function marks the transaction as being a "net free". The end + * result is that refquotas will be disabled for this transaction, and + * this transaction will be able to use half of the pool space overhead + * (see dsl_pool_adjustedsize()). Therefore this function should only + * be called for transactions that we expect will not cause a net increase + * in the amount of space used (but it's OK if that is occasionally not true). + */ +void +dmu_tx_mark_netfree(dmu_tx_t *tx) +{ + dmu_tx_hold_t *txh; + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + DMU_NEW_OBJECT, THT_FREE, 0, 0); + + /* + * Pretend that this operation will free 1GB of space. This + * should be large enough to cancel out the largest write. + * We don't want to use something like UINT64_MAX, because that would + * cause overflows when doing math with these values (e.g. in + * dmu_tx_try_assign()). + */ + txh->txh_space_tofree = txh->txh_space_tounref = 1024 * 1024 * 1024; +} + void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) { diff --git a/uts/common/fs/zfs/sys/dmu.h b/uts/common/fs/zfs/sys/dmu.h index f968f4bc1220..b73de0649377 100644 --- a/uts/common/fs/zfs/sys/dmu.h +++ b/uts/common/fs/zfs/sys/dmu.h @@ -567,6 +567,7 @@ void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); +void dmu_tx_mark_netfree(dmu_tx_t *tx); /* * To register a commit callback, dmu_tx_callback_register() must be called. diff --git a/uts/common/fs/zfs/zfs_dir.c b/uts/common/fs/zfs/zfs_dir.c index cb3b4aca15c1..bd7424b55b5e 100644 --- a/uts/common/fs/zfs/zfs_dir.c +++ b/uts/common/fs/zfs/zfs_dir.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. */ #include @@ -554,6 +554,7 @@ zfs_purgedir(znode_t *dzp) dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* Is this really needed ? */ zfs_sa_upgrade_txholds(tx, xzp); + dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); diff --git a/uts/common/fs/zfs/zfs_vnops.c b/uts/common/fs/zfs/zfs_vnops.c index 767d3f765d07..6fd0bfed0636 100644 --- a/uts/common/fs/zfs/zfs_vnops.c +++ b/uts/common/fs/zfs/zfs_vnops.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. */ @@ -1301,7 +1301,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, * cr - credentials of caller. * flag - large file flag [UNUSED]. * ct - caller context - * vsecp - ACL to be set + * vsecp - ACL to be set * * OUT: vpp - vnode of created or trunc'd entry. * @@ -1577,7 +1577,7 @@ zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t acl_obj, xattr_obj; - uint64_t xattr_obj_unlinked = 0; + uint64_t xattr_obj_unlinked = 0; uint64_t obj = 0; zfs_dirlock_t *dl; dmu_tx_t *tx; @@ -1677,6 +1677,14 @@ top: /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + /* + * Mark this transaction as typically resulting in a net free of + * space, unless object removal will be delayed indefinitely + * (due to active holds on the vnode due to the file being open). + */ + if (may_delete_now) + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); @@ -1707,7 +1715,6 @@ top: } if (unlinked) { - /* * Hold z_lock so that we can make sure that the ACL obj * hasn't changed. Could have been deleted due to @@ -4692,13 +4699,13 @@ zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, * last page is pushed. The problem occurs when the msync() call is omitted, * which by far the most common case: * - * open() - * mmap() - * - * munmap() - * close() - *