From 9c43027b3f18769f2ace16eaa222ac8b301501f4 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Wed, 6 May 2015 09:07:55 -0700 Subject: [PATCH] Illumos 5269 - zpool import slow 5269 zpool import slow Reviewed by: Matthew Ahrens Reviewed by: George Wilson Reviewed by: Dan McDonald Approved by: Dan McDonald References: https://www.illumos.org/issues/5269 https://github.com/illumos/illumos-gate/commit/12380e1e Ported-by: DHE Signed-off-by: Brian Behlendorf Closes #3396 --- include/sys/dmu.h | 1 + include/sys/dmu_objset.h | 2 + include/sys/dsl_pool.h | 1 + include/sys/vdev.h | 1 + include/sys/zil.h | 9 +- module/zfs/dmu_objset.c | 221 ++++++++++++++++++++++++++++++++------- module/zfs/dsl_pool.c | 10 +- module/zfs/spa.c | 11 +- module/zfs/vdev.c | 21 ++++ module/zfs/zil.c | 23 ++-- 10 files changed, 240 insertions(+), 60 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index d3874335b78d..a010a68c443e 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -240,6 +240,7 @@ void zfs_znode_byteswap(void *buf, size_t size); #define DS_FIND_SNAPSHOTS (1<<0) #define DS_FIND_CHILDREN (1<<1) +#define DS_FIND_SERIALIZE (1<<2) /* * The maximum number of bytes that can be accessed as part of one diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index 8cb7bd02f4c2..bee4bbcfce03 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -142,6 +142,8 @@ struct objset { int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp); +int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj, + dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp); void dmu_objset_refresh_ownership(objset_t *os, void *tag); void dmu_objset_rele(objset_t *os, void *tag); void dmu_objset_disown(objset_t *os, void *tag); diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 34dc65ba40ea..b2b9128e5274 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -158,6 +158,7 @@ boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp); void dsl_pool_config_enter(dsl_pool_t *dp, void *tag); void dsl_pool_config_exit(dsl_pool_t *dp, void *tag); boolean_t dsl_pool_config_held(dsl_pool_t *dp); +boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp); taskq_t *dsl_pool_iput_taskq(dsl_pool_t *dp); diff --git a/include/sys/vdev.h b/include/sys/vdev.h index bef6aaf8f11b..365789e524d6 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -59,6 +59,7 @@ extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); extern boolean_t vdev_is_bootable(vdev_t *vd); extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); +extern int vdev_count_leaves(spa_t *spa); extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, uint64_t txg, uint64_t size); extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, diff --git a/include/sys/zil.h b/include/sys/zil.h index 362304135dac..65b14f1cd6a2 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -37,6 +37,9 @@ extern "C" { #endif +struct dsl_pool; +struct dsl_dataset; + /* * Intent log format: * @@ -466,8 +469,10 @@ extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); extern void zil_commit(zilog_t *zilog, uint64_t oid); extern int zil_vdev_offline(const char *osname, void *txarg); -extern int zil_claim(const char *osname, void *txarg); -extern int zil_check_log_chain(const char *osname, void *txarg); +extern int zil_claim(struct dsl_pool *dp, + struct dsl_dataset *ds, void *txarg); +extern int zil_check_log_chain(struct dsl_pool *dp, + struct dsl_dataset *ds, void *tx); extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); extern void zil_clean(zilog_t *zilog, uint64_t synced_txg); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 888c251ceab9..6ef6adbd9e93 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -24,6 +24,7 @@ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2015, STRATO AG, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -48,6 +49,7 @@ #include #include #include +#include /* * Needed to close a window in dnode_move() that allows the objset to be freed @@ -55,6 +57,16 @@ */ krwlock_t os_lock; +/* + * Tunable to overwrite the maximum number of threads for the parallization + * of dmu_objset_find_dp, needed to speed up the import of pools with many + * datasets. + * Default is 4 times the number of leaf vdevs. + */ +int dmu_find_threads = 0; + +static void dmu_objset_find_dp_cb(void *arg); + void dmu_objset_init(void) { @@ -504,6 +516,25 @@ dmu_objset_hold(const char *name, void *tag, objset_t **osp) return (err); } +static int +dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp) +{ + int err; + + err = dmu_objset_from_ds(ds, osp); + if (err != 0) { + dsl_dataset_disown(ds, tag); + } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { + dsl_dataset_disown(ds, tag); + return (SET_ERROR(EINVAL)); + } else if (!readonly && dsl_dataset_is_snapshot(ds)) { + dsl_dataset_disown(ds, tag); + return (SET_ERROR(EROFS)); + } + return (err); +} + /* * dsl_pool must not be held when this is called. * Upon successful return, there will be a longhold on the dataset, @@ -525,21 +556,26 @@ dmu_objset_own(const char *name, dmu_objset_type_t type, dsl_pool_rele(dp, FTAG); return (err); } - - err = dmu_objset_from_ds(ds, osp); + err = dmu_objset_own_impl(ds, type, readonly, tag, osp); dsl_pool_rele(dp, FTAG); - if (err != 0) { - dsl_dataset_disown(ds, tag); - } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { - dsl_dataset_disown(ds, tag); - return (SET_ERROR(EINVAL)); - } else if (!readonly && ds->ds_is_snapshot) { - dsl_dataset_disown(ds, tag); - return (SET_ERROR(EROFS)); - } + return (err); } +int +dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_own_obj(dp, obj, tag, &ds); + if (err != 0) + return (err); + + return (dmu_objset_own_impl(ds, type, readonly, tag, osp)); +} + void dmu_objset_rele(objset_t *os, void *tag) { @@ -1618,30 +1654,41 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, return (0); } -/* - * Find objsets under and including ddobj, call func(ds) on each. - */ -int -dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, - int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) +typedef struct dmu_objset_find_ctx { + taskq_t *dc_tq; + dsl_pool_t *dc_dp; + uint64_t dc_ddobj; + int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *); + void *dc_arg; + int dc_flags; + kmutex_t *dc_error_lock; + int *dc_error; +} dmu_objset_find_ctx_t; + +static void +dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp) { + dsl_pool_t *dp = dcp->dc_dp; + dmu_objset_find_ctx_t *child_dcp; dsl_dir_t *dd; dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; uint64_t thisobj; - int err; + int err = 0; - ASSERT(dsl_pool_config_held(dp)); + /* don't process if there already was an error */ + if (*dcp->dc_error != 0) + goto out; - err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); + err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd); if (err != 0) - return (err); + goto out; /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$') { dsl_dir_rele(dd, FTAG); - return (0); + goto out; } thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; @@ -1650,7 +1697,7 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, /* * Iterate over all children. */ - if (flags & DS_FIND_CHILDREN) { + if (dcp->dc_flags & DS_FIND_CHILDREN) { for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; @@ -1659,24 +1706,22 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); - err = dmu_objset_find_dp(dp, attr->za_first_integer, - func, arg, flags); - if (err != 0) - break; + child_dcp = kmem_alloc(sizeof (*child_dcp), KM_SLEEP); + *child_dcp = *dcp; + child_dcp->dc_ddobj = attr->za_first_integer; + if (dcp->dc_tq != NULL) + (void) taskq_dispatch(dcp->dc_tq, + dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP); + else + dmu_objset_find_dp_impl(child_dcp); } zap_cursor_fini(&zc); - - if (err != 0) { - dsl_dir_rele(dd, FTAG); - kmem_free(attr, sizeof (zap_attribute_t)); - return (err); - } } /* * Iterate over all snapshots. */ - if (flags & DS_FIND_SNAPSHOTS) { + if (dcp->dc_flags & DS_FIND_SNAPSHOTS) { dsl_dataset_t *ds; err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); @@ -1697,7 +1742,7 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, attr->za_first_integer, FTAG, &ds); if (err != 0) break; - err = func(dp, ds, arg); + err = dcp->dc_func(dp, ds, dcp->dc_arg); dsl_dataset_rele(ds, FTAG); if (err != 0) break; @@ -1710,17 +1755,115 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, kmem_free(attr, sizeof (zap_attribute_t)); if (err != 0) - return (err); + goto out; /* * Apply to self. */ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err != 0) - return (err); - err = func(dp, ds, arg); + goto out; + err = dcp->dc_func(dp, ds, dcp->dc_arg); dsl_dataset_rele(ds, FTAG); - return (err); + +out: + if (err != 0) { + mutex_enter(dcp->dc_error_lock); + /* only keep first error */ + if (*dcp->dc_error == 0) + *dcp->dc_error = err; + mutex_exit(dcp->dc_error_lock); + } + + kmem_free(dcp, sizeof (*dcp)); +} + +static void +dmu_objset_find_dp_cb(void *arg) +{ + dmu_objset_find_ctx_t *dcp = arg; + dsl_pool_t *dp = dcp->dc_dp; + + dsl_pool_config_enter(dp, FTAG); + + dmu_objset_find_dp_impl(dcp); + + dsl_pool_config_exit(dp, FTAG); +} + +/* + * Find objsets under and including ddobj, call func(ds) on each. + * The order for the enumeration is completely undefined. + * func is called with dsl_pool_config held. + */ +int +dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, + int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) +{ + int error = 0; + taskq_t *tq = NULL; + int ntasks; + dmu_objset_find_ctx_t *dcp; + kmutex_t err_lock; + + mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL); + dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP); + dcp->dc_tq = NULL; + dcp->dc_dp = dp; + dcp->dc_ddobj = ddobj; + dcp->dc_func = func; + dcp->dc_arg = arg; + dcp->dc_flags = flags; + dcp->dc_error_lock = &err_lock; + dcp->dc_error = &error; + + if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) { + /* + * In case a write lock is held we can't make use of + * parallelism, as down the stack of the worker threads + * the lock is asserted via dsl_pool_config_held. + * In case of a read lock this is solved by getting a read + * lock in each worker thread, which isn't possible in case + * of a writer lock. So we fall back to the synchronous path + * here. + * In the future it might be possible to get some magic into + * dsl_pool_config_held in a way that it returns true for + * the worker threads so that a single lock held from this + * thread suffices. For now, stay single threaded. + */ + dmu_objset_find_dp_impl(dcp); + + return (error); + } + + ntasks = dmu_find_threads; + if (ntasks == 0) + ntasks = vdev_count_leaves(dp->dp_spa) * 4; + tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks, + INT_MAX, 0); + if (tq == NULL) { + kmem_free(dcp, sizeof (*dcp)); + return (SET_ERROR(ENOMEM)); + } + dcp->dc_tq = tq; + + /* dcp will be freed by task */ + (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP); + + /* + * PORTING: this code relies on the property of taskq_wait to wait + * until no more tasks are queued and no more tasks are active. As + * we always queue new tasks from within other tasks, task_wait + * reliably waits for the full recursion to finish, even though we + * enqueue new tasks after taskq_wait has been called. + * On platforms other than illumos, taskq_wait may not have this + * property. + */ + taskq_wait(tq); + taskq_destroy(tq); + mutex_destroy(&err_lock); + + return (error); } /* diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 13961918e68d..7d84d8bf4e81 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -755,7 +755,7 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) ASSERT(dp->dp_origin_snap != NULL); VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb, - tx, DS_FIND_CHILDREN)); + tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); } /* ARGSUSED */ @@ -810,7 +810,7 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN)); + upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); } void @@ -1055,6 +1055,12 @@ dsl_pool_config_held(dsl_pool_t *dp) return (RRW_LOCK_HELD(&dp->dp_config_rwlock)); } +boolean_t +dsl_pool_config_held_writer(dsl_pool_t *dp) +{ + return (RRW_WRITE_HELD(&dp->dp_config_rwlock)); +} + #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(dsl_pool_config_enter); EXPORT_SYMBOL(dsl_pool_config_exit); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 2a9ef9ce53e4..c37feb733413 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1773,6 +1773,7 @@ static boolean_t spa_check_logs(spa_t *spa) { boolean_t rv = B_FALSE; + dsl_pool_t *dp = spa_get_dsl(spa); switch (spa->spa_log_state) { default: @@ -1780,8 +1781,8 @@ spa_check_logs(spa_t *spa) case SPA_LOG_MISSING: /* need to recheck in case slog has been restored */ case SPA_LOG_UNKNOWN: - rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain, - NULL, DS_FIND_CHILDREN) != 0); + rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, + zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); if (rv) spa_set_log_state(spa, SPA_LOG_MISSING); break; @@ -2763,6 +2764,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa->spa_load_max_txg == UINT64_MAX)) { dmu_tx_t *tx; int need_update = B_FALSE; + dsl_pool_t *dp = spa_get_dsl(spa); int c; ASSERT(state != SPA_LOAD_TRYIMPORT); @@ -2776,9 +2778,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, */ spa->spa_claiming = B_TRUE; - tx = dmu_tx_create_assigned(spa_get_dsl(spa), - spa_first_txg(spa)); - (void) dmu_objset_find(spa_name(spa), + tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); + (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index fe6631925662..d53537103b79 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -179,6 +179,27 @@ vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) return (NULL); } +static int +vdev_count_leaves_impl(vdev_t *vd) +{ + int n = 0; + int c; + + if (vd->vdev_ops->vdev_op_leaf) + return (1); + + for (c = 0; c < vd->vdev_children; c++) + n += vdev_count_leaves_impl(vd->vdev_child[c]); + + return (n); +} + +int +vdev_count_leaves(spa_t *spa) +{ + return (vdev_count_leaves_impl(spa->spa_root_vdev)); +} + void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 3ae171e592d2..ff4d2cec0a2b 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -660,7 +660,7 @@ zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) } int -zil_claim(const char *osname, void *txarg) +zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) { dmu_tx_t *tx = txarg; uint64_t first_txg = dmu_tx_get_txg(tx); @@ -669,15 +669,16 @@ zil_claim(const char *osname, void *txarg) objset_t *os; int error; - error = dmu_objset_own(osname, DMU_OST_ANY, B_FALSE, FTAG, &os); + error = dmu_objset_own_obj(dp, ds->ds_object, + DMU_OST_ANY, B_FALSE, FTAG, &os); if (error != 0) { /* * EBUSY indicates that the objset is inconsistent, in which * case it can not have a ZIL. */ if (error != EBUSY) { - cmn_err(CE_WARN, "can't open objset for %s, error %u", - osname, error); + cmn_err(CE_WARN, "can't open objset for %llu, error %u", + (unsigned long long)ds->ds_object, error); } return (0); @@ -725,8 +726,9 @@ zil_claim(const char *osname, void *txarg) * Checksum errors are ok as they indicate the end of the chain. * Any other error (no device or read failure) returns an error. */ +/* ARGSUSED */ int -zil_check_log_chain(const char *osname, void *tx) +zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) { zilog_t *zilog; objset_t *os; @@ -735,9 +737,10 @@ zil_check_log_chain(const char *osname, void *tx) ASSERT(tx == NULL); - error = dmu_objset_hold(osname, FTAG, &os); + error = dmu_objset_from_ds(ds, &os); if (error != 0) { - cmn_err(CE_WARN, "can't open objset for %s", osname); + cmn_err(CE_WARN, "can't open objset %llu, error %d", + (unsigned long long)ds->ds_object, error); return (0); } @@ -760,10 +763,8 @@ zil_check_log_chain(const char *osname, void *tx) valid = vdev_log_state_valid(vd); spa_config_exit(os->os_spa, SCL_STATE, FTAG); - if (!valid) { - dmu_objset_rele(os, FTAG); + if (!valid) return (0); - } } /* @@ -776,8 +777,6 @@ zil_check_log_chain(const char *osname, void *tx) error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa)); - dmu_objset_rele(os, FTAG); - return ((error == ECKSUM || error == ENOENT) ? 0 : error); }