From 55f75bf072909962e95f5c900c338b5ad7ce98b0 Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Mon, 12 Aug 2019 12:05:40 +0000 Subject: [PATCH] 8423 8199 7432 Implement large_dnode pool feature 8423 Implement large_dnode pool feature 8199 multi-threaded dmu_object_alloc() 7432 Large dnode pool feature llumos/illumos-gate@54811da5ac6b517992fdc173df5d605e4e61fdc0 https://github.com/illumos/illumos-gate/commit/54811da5ac6b517992fdc173df5d605e4e61fdc0 https://www.illumos.org/issues/8423 https://www.illumos.org/issues/8199 https://www.illumos.org/issues/7432 ZoL issues: Improved dnode allocation #6564 Clean up large dnode code #6262 Fix dnode_hold() freeing dnode behavior #8172 Fix dnode allocation race #6414, #6439 Partial: Raw sends must be able to decrease nlevels #6821, #6864 Remove unnecessary txg syncs from receive_object() Closes #7197 Author: Toomas Soome --- cmd/zdb/zdb.c | 56 ++- cmd/zdb/zdb_il.c | 16 +- cmd/zstreamdump/zstreamdump.c | 10 +- cmd/ztest/ztest.c | 237 ++++++++++-- common/zfs/zfeature_common.c | 11 + common/zfs/zfeature_common.h | 1 + common/zfs/zfs_prop.c | 15 + common/zfs/zpool_prop.c | 2 + man/man5/zpool-features.5 | 24 ++ uts/common/fs/zfs/dbuf.c | 69 ++-- uts/common/fs/zfs/dmu.c | 19 +- uts/common/fs/zfs/dmu_object.c | 295 +++++++++++--- uts/common/fs/zfs/dmu_objset.c | 52 ++- uts/common/fs/zfs/dmu_send.c | 107 ++++- uts/common/fs/zfs/dmu_traverse.c | 8 +- uts/common/fs/zfs/dmu_tx.c | 14 +- uts/common/fs/zfs/dnode.c | 581 ++++++++++++++++++++++------ uts/common/fs/zfs/dnode_sync.c | 22 +- uts/common/fs/zfs/dsl_scan.c | 10 +- uts/common/fs/zfs/sa.c | 20 +- uts/common/fs/zfs/spa.c | 11 +- uts/common/fs/zfs/spa_misc.c | 15 +- uts/common/fs/zfs/sys/arc.h | 1 + uts/common/fs/zfs/sys/dmu.h | 15 +- uts/common/fs/zfs/sys/dmu_objset.h | 7 +- uts/common/fs/zfs/sys/dnode.h | 246 +++++++++++- uts/common/fs/zfs/sys/dsl_dataset.h | 7 + uts/common/fs/zfs/sys/sa_impl.h | 6 +- uts/common/fs/zfs/sys/spa.h | 1 + uts/common/fs/zfs/sys/zap.h | 19 + uts/common/fs/zfs/sys/zfs_ioctl.h | 9 +- uts/common/fs/zfs/sys/zfs_znode.h | 1 + uts/common/fs/zfs/sys/zil.h | 17 +- uts/common/fs/zfs/zap.c | 13 +- uts/common/fs/zfs/zap_micro.c | 58 ++- uts/common/fs/zfs/zfs_acl.c | 18 +- uts/common/fs/zfs/zfs_ioctl.c | 18 + uts/common/fs/zfs/zfs_log.c | 2 + uts/common/fs/zfs/zfs_replay.c | 30 +- uts/common/fs/zfs/zfs_sa.c | 3 +- uts/common/fs/zfs/zfs_znode.c | 40 +- uts/common/fs/zfs/zil.c | 16 +- uts/common/sys/fs/zfs.h | 12 + 43 files changed, 1741 insertions(+), 393 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index faf019cd9bec..3b56a34fb707 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -2108,14 +2108,15 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { }; static void -dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) +dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header, + uint64_t *dnode_slots_used) { dmu_buf_t *db = NULL; dmu_object_info_t doi; dnode_t *dn; void *bonus = NULL; size_t bsize = 0; - char iblk[32], dblk[32], lsize[32], asize[32], fill[32]; + char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32]; char bonus_size[32]; char aux[50]; int error; @@ -2128,9 +2129,9 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ); if (*print_header) { - (void) printf("\n%10s %3s %5s %5s %5s %5s %6s %s\n", - "Object", "lvl", "iblk", "dblk", "dsize", "lsize", - "%full", "type"); + (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n", + 
"Object", "lvl", "iblk", "dblk", "dsize", "dnsize", + "lsize", "%full", "type"); *print_header = 0; } @@ -2147,11 +2148,15 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) } dmu_object_info_from_dnode(dn, &doi); + if (dnode_slots_used != NULL) + *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE; + zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk)); zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk)); zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize)); zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize)); zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size)); + zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize)); (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) / doi.doi_max_offset); @@ -2168,13 +2173,14 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) ZDB_COMPRESS_NAME(doi.doi_compress)); } - (void) printf("%10lld %3u %5s %5s %5s %5s %6s %s%s\n", - (u_longlong_t)object, doi.doi_indirection, iblk, dblk, - asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux); + (void) printf("%10" PRIu64 + " %3u %5s %5s %5s %5s %5s %6s %s%s\n", + object, doi.doi_indirection, iblk, dblk, + asize, dnsize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux); if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { - (void) printf("%10s %3s %5s %5s %5s %5s %6s %s\n", - "", "", "", "", "", bonus_size, "bonus", + (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n", + "", "", "", "", "", "", bonus_size, "bonus", ZDB_OT_NAME(doi.doi_bonus_type)); } @@ -2278,6 +2284,9 @@ dump_dir(objset_t *os) int print_header = 1; unsigned i; int error; + uint64_t total_slots_used = 0; + uint64_t max_slot_used = 0; + uint64_t dnode_slots; /* make sure nicenum has enough space */ CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ); @@ -2322,7 +2331,7 @@ dump_dir(objset_t *os) if (zopt_objects != 0) { for (i = 0; i < zopt_objects; i++) dump_object(os, zopt_object[i], verbosity, - &print_header); + &print_header, NULL); (void) printf("\n"); return; } @@ -2347,24 +2356,39 @@ dump_dir(objset_t *os) if (BP_IS_HOLE(os->os_rootbp)) return; - dump_object(os, 0, verbosity, &print_header); + dump_object(os, 0, verbosity, &print_header, NULL); object_count = 0; if (DMU_USERUSED_DNODE(os) != NULL && DMU_USERUSED_DNODE(os)->dn_type != 0) { - dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header); - dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header); + dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, + NULL); + dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header, + NULL); } object = 0; while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { - dump_object(os, object, verbosity, &print_header); + dump_object(os, object, verbosity, &print_header, &dnode_slots); object_count++; + total_slots_used += dnode_slots; + max_slot_used = object + dnode_slots - 1; } ASSERT3U(object_count, ==, usedobjs); (void) printf("\n"); + (void) printf(" Dnode slots:\n"); + (void) printf("\tTotal used: %10llu\n", + (u_longlong_t)total_slots_used); + (void) printf("\tMax used: %10llu\n", + (u_longlong_t)max_slot_used); + (void) printf("\tPercent empty: %10lf\n", + (double)(max_slot_used - total_slots_used)*100 / + (double)max_slot_used); + + (void) printf("\n"); + if (error != ESRCH) { (void) fprintf(stderr, "dmu_object_next() = %d\n", error); abort(); @@ -2553,7 +2577,7 @@ dump_path_impl(objset_t *os, uint64_t obj, char *name) return 
(dump_path_impl(os, child_obj, s + 1)); /*FALLTHROUGH*/ case DMU_OT_PLAIN_FILE_CONTENTS: - dump_object(os, child_obj, dump_opt['v'], &header); + dump_object(os, child_obj, dump_opt['v'], &header, NULL); return (0); default: (void) fprintf(stderr, "object %llu has non-file/directory " diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index a2ebe5857e4d..9daf9a70007b 100644 --- a/cmd/zdb/zdb_il.c +++ b/cmd/zdb/zdb_il.c @@ -84,13 +84,15 @@ zil_prt_rec_create(zilog_t *zilog, int txtype, void *arg) } (void) printf("%s%s", tab_prefix, ctime(&crtime)); - (void) printf("%sdoid %llu, foid %llu, mode %llo\n", tab_prefix, - (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_foid, - (longlong_t)lr->lr_mode); - (void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n", - tab_prefix, - (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid, - (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev); + (void) printf("%sdoid %" PRIu64 ", foid %" PRIu64 ", slots %" PRIu64 + ", mode %" PRIo64 "\n", + tab_prefix, lr->lr_doid, + LR_FOID_GET_OBJ(lr->lr_foid), + LR_FOID_GET_SLOTS(lr->lr_foid), + lr->lr_mode); + (void) printf("%suid %" PRIu64 ", gid %" PRIu64 ", gen %" PRIu64 + ", rdev %#" PRIx64 "\n", + tab_prefix, lr->lr_uid, lr->lr_gid, lr->lr_gen, lr->lr_rdev); } /* ARGSUSED */ diff --git a/cmd/zstreamdump/zstreamdump.c b/cmd/zstreamdump/zstreamdump.c index 54edb566ad2f..51c4c8e0e649 100644 --- a/cmd/zstreamdump/zstreamdump.c +++ b/cmd/zstreamdump/zstreamdump.c @@ -416,13 +416,15 @@ main(int argc, char *argv[]) drro->drr_toguid = BSWAP_64(drro->drr_toguid); } if (verbose) { - (void) printf("OBJECT object = %llu type = %u " - "bonustype = %u blksz = %u bonuslen = %u\n", - (u_longlong_t)drro->drr_object, + (void) printf("OBJECT object = %" PRIu64 + " type = %u bonustype = %u blksz = %u" + " bonuslen = %u dn_slots = %u\n", + drro->drr_object, drro->drr_type, drro->drr_bonustype, drro->drr_blksz, - drro->drr_bonuslen); + drro->drr_bonuslen, + drro->drr_dn_slots); } if (drro->drr_bonuslen > 0) { (void) ssread(buf, diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 0b9703eb5dd1..004912aa3b67 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -194,6 +194,7 @@ extern uint64_t zfs_deadman_synctime_ms; extern int metaslab_preload_limit; extern boolean_t zfs_compressed_arc_enabled; extern boolean_t zfs_abd_scatter_enabled; +extern int dmu_object_alloc_chunk_shift; extern boolean_t zfs_force_some_double_word_sm_entries; static ztest_shared_opts_t *ztest_shared_opts; @@ -224,6 +225,7 @@ typedef struct ztest_block_tag { uint64_t bt_magic; uint64_t bt_objset; uint64_t bt_object; + uint64_t bt_dnodesize; uint64_t bt_offset; uint64_t bt_gen; uint64_t bt_txg; @@ -274,6 +276,7 @@ typedef struct ztest_od { dmu_object_type_t od_crtype; uint64_t od_blocksize; uint64_t od_crblocksize; + uint64_t od_crdnodesize; uint64_t od_gen; uint64_t od_crgen; char od_name[ZFS_MAX_DATASET_NAME_LEN]; @@ -320,6 +323,7 @@ static ztest_shared_callstate_t *ztest_shared_callstate; ztest_func_t ztest_dmu_read_write; ztest_func_t ztest_dmu_write_parallel; ztest_func_t ztest_dmu_object_alloc_free; +ztest_func_t ztest_dmu_object_next_chunk; ztest_func_t ztest_dmu_commit_callbacks; ztest_func_t ztest_zap; ztest_func_t ztest_zap_parallel; @@ -349,6 +353,7 @@ ztest_func_t ztest_device_removal; ztest_func_t ztest_remap_blocks; ztest_func_t ztest_spa_checkpoint_create_discard; ztest_func_t ztest_initialize; +ztest_func_t ztest_verify_dnode_bt; uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* 
every 1/10 second */ @@ -360,6 +365,7 @@ ztest_info_t ztest_info[] = { { ztest_dmu_read_write, 1, &zopt_always }, { ztest_dmu_write_parallel, 10, &zopt_always }, { ztest_dmu_object_alloc_free, 1, &zopt_always }, + { ztest_dmu_object_next_chunk, 1, &zopt_sometimes }, { ztest_dmu_commit_callbacks, 1, &zopt_always }, { ztest_zap, 30, &zopt_always }, { ztest_zap_parallel, 100, &zopt_always }, @@ -392,7 +398,8 @@ ztest_info_t ztest_info[] = { { ztest_device_removal, 1, &zopt_sometimes }, { ztest_remap_blocks, 1, &zopt_sometimes }, { ztest_spa_checkpoint_create_discard, 1, &zopt_rarely }, - { ztest_initialize, 1, &zopt_sometimes } + { ztest_initialize, 1, &zopt_sometimes }, + { ztest_verify_dnode_bt, 1, &zopt_sometimes } }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) @@ -438,8 +445,8 @@ static spa_t *ztest_spa = NULL; static ztest_ds_t *ztest_ds; static kmutex_t ztest_vdev_lock; -static kmutex_t ztest_checkpoint_lock; static boolean_t ztest_device_removal_active = B_FALSE; +static kmutex_t ztest_checkpoint_lock; /* * The ztest_name_lock protects the pool and dataset namespace used by @@ -1008,6 +1015,36 @@ ztest_random_blocksize(void) return (1 << (SPA_MINBLOCKSHIFT + block_shift)); } +static int +ztest_random_dnodesize(void) +{ + int slots; + int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; + + if (max_slots == DNODE_MIN_SLOTS) + return (DNODE_MIN_SIZE); + + /* + * Weight the random distribution more heavily toward smaller + * dnode sizes since that is more likely to reflect real-world + * usage. + */ + ASSERT3U(max_slots, >, 4); + switch (ztest_random(10)) { + case 0: + slots = 5 + ztest_random(max_slots - 4); + break; + case 1 ... 4: + slots = 2 + ztest_random(3); + break; + default: + slots = 1; + break; + } + + return (slots << DNODE_SHIFT); +} + static int ztest_random_ibshift(void) { @@ -1285,11 +1322,13 @@ ztest_pattern_match(void *buf, uint64_t size, uint64_t value) static void ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, - uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) + uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, + uint64_t crtxg) { bt->bt_magic = BT_MAGIC; bt->bt_objset = dmu_objset_id(os); bt->bt_object = object; + bt->bt_dnodesize = dnodesize; bt->bt_offset = offset; bt->bt_gen = gen; bt->bt_txg = txg; @@ -1298,11 +1337,13 @@ ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, static void ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, - uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) + uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, + uint64_t crtxg) { ASSERT3U(bt->bt_magic, ==, BT_MAGIC); ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); ASSERT3U(bt->bt_object, ==, object); + ASSERT3U(bt->bt_dnodesize, ==, dnodesize); ASSERT3U(bt->bt_offset, ==, offset); ASSERT3U(bt->bt_gen, <=, gen); ASSERT3U(bt->bt_txg, <=, txg); @@ -1323,6 +1364,52 @@ ztest_bt_bonus(dmu_buf_t *db) return (bt); } +/* + * Generate a token to fill up unused bonus buffer space. Try to make + * it unique to the object, generation, and offset to verify that data + * is not getting overwritten by data from other dnodes. + */ +#define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ + (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) + +/* + * Fill up the unused bonus buffer region before the block tag with a + * verifiable pattern. 
Filling the whole bonus area with non-zero data + * helps ensure that all dnode traversal code properly skips the + * interior regions of large dnodes. + */ +void +ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, + objset_t *os, uint64_t gen) +{ + uint64_t *bonusp; + + ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); + + for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { + uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), + gen, bonusp - (uint64_t *)db->db_data); + *bonusp = token; + } +} + +/* + * Verify that the unused area of a bonus buffer is filled with the + * expected tokens. + */ +void +ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, + objset_t *os, uint64_t gen) +{ + uint64_t *bonusp; + + for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { + uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), + gen, bonusp - (uint64_t *)db->db_data); + VERIFY3U(*bonusp, ==, token); + } +} + /* * ZIL logging ops */ @@ -1331,7 +1418,7 @@ ztest_bt_bonus(dmu_buf_t *db) #define lrz_blocksize lr_uid #define lrz_ibshift lr_gid #define lrz_bonustype lr_rdev -#define lrz_bonuslen lr_crtime[1] +#define lrz_dnodesize lr_crtime[1] static void ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) @@ -1447,6 +1534,7 @@ ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) dmu_tx_t *tx; uint64_t txg; int error = 0; + int bonuslen; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -1469,26 +1557,27 @@ ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) return (ENOSPC); ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid); + bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); if (lr->lrz_type == DMU_OT_ZAP_OTHER) { if (lr->lr_foid == 0) { - lr->lr_foid = zap_create(os, + lr->lr_foid = zap_create_dnsize(os, lr->lrz_type, lr->lrz_bonustype, - lr->lrz_bonuslen, tx); + bonuslen, lr->lrz_dnodesize, tx); } else { - error = zap_create_claim(os, lr->lr_foid, + error = zap_create_claim_dnsize(os, lr->lr_foid, lr->lrz_type, lr->lrz_bonustype, - lr->lrz_bonuslen, tx); + bonuslen, lr->lrz_dnodesize, tx); } } else { if (lr->lr_foid == 0) { - lr->lr_foid = dmu_object_alloc(os, + lr->lr_foid = dmu_object_alloc_dnsize(os, lr->lrz_type, 0, lr->lrz_bonustype, - lr->lrz_bonuslen, tx); + bonuslen, lr->lrz_dnodesize, tx); } else { - error = dmu_object_claim(os, lr->lr_foid, + error = dmu_object_claim_dnsize(os, lr->lr_foid, lr->lrz_type, 0, lr->lrz_bonustype, - lr->lrz_bonuslen, tx); + bonuslen, lr->lrz_dnodesize, tx); } } @@ -1508,7 +1597,9 @@ ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); bbt = ztest_bt_bonus(db); dmu_buf_will_dirty(db, tx); - ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg); + ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, + lr->lr_gen, txg, txg); + ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); dmu_buf_rele(db, FTAG); VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, @@ -1658,7 +1749,7 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) VERIFY(dmu_read(os, lr->lr_foid, offset, sizeof (rbt), &rbt, prefetch) == 0); if (rbt.bt_magic == BT_MAGIC) { - ztest_bt_verify(&rbt, os, lr->lr_foid, + ztest_bt_verify(&rbt, os, lr->lr_foid, 0, offset, gen, txg, crtxg); } } @@ -1670,7 +1761,7 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) * as it was when the write was generated. 
*/ if (zd->zd_zilog->zl_replay) { - ztest_bt_verify(bt, os, lr->lr_foid, offset, + ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, MAX(gen, bt->bt_gen), MAX(txg, lrtxg), bt->bt_crtxg); } @@ -1679,7 +1770,8 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) * Set the bt's gen/txg to the bonus buffer's gen/txg * so that all of the usual ASSERTs will work. */ - ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg); + ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, + crtxg); } if (abuf == NULL) { @@ -1751,7 +1843,7 @@ ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) dmu_tx_t *tx; dmu_buf_t *db; ztest_block_tag_t *bbt; - uint64_t txg, lrtxg, crtxg; + uint64_t txg, lrtxg, crtxg, dnodesize; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -1774,6 +1866,7 @@ ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); crtxg = bbt->bt_crtxg; lrtxg = lr->lr_common.lrc_txg; + dnodesize = bbt->bt_dnodesize; if (zd->zd_zilog->zl_replay) { ASSERT(lr->lr_size != 0); @@ -1792,7 +1885,7 @@ ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) /* * Verify that the current bonus buffer is not newer than our txg. */ - ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, + ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, MAX(txg, lrtxg), crtxg); dmu_buf_will_dirty(db, tx); @@ -1802,7 +1895,9 @@ ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); bbt = ztest_bt_bonus(db); - ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg); + ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, + txg, crtxg); + ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); dmu_buf_rele(db, FTAG); @@ -2033,7 +2128,7 @@ ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) lr->lrz_blocksize = od->od_crblocksize; lr->lrz_ibshift = ztest_random_ibshift(); lr->lrz_bonustype = DMU_OT_UINT64_OTHER; - lr->lrz_bonuslen = dmu_bonus_max(); + lr->lrz_dnodesize = od->od_crdnodesize; lr->lr_gen = od->od_crgen; lr->lr_crtime[0] = time(NULL); @@ -2212,7 +2307,8 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) switch (io_type) { case ZTEST_IO_WRITE_TAG: - ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0); + ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, + offset, 0, 0, 0); (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); break; @@ -2273,13 +2369,15 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) */ static void ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index, - dmu_object_type_t type, uint64_t blocksize, uint64_t gen) + dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, + uint64_t gen) { od->od_dir = ZTEST_DIROBJ; od->od_object = 0; od->od_crtype = type; od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); + od->od_crdnodesize = dnodesize ? 
dnodesize : ztest_random_dnodesize(); od->od_crgen = gen; od->od_type = DMU_OT_NONE; @@ -3721,8 +3819,10 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) ztest_od_t od[4]; int batchsize = sizeof (od) / sizeof (od[0]); - for (int b = 0; b < batchsize; b++) - ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0); + for (int b = 0; b < batchsize; b++) { + ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, + 0, 0, 0); + } /* * Destroy the previous batch of objects, create a new batch, @@ -3736,6 +3836,26 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); } +/* + * Rewind the global allocator to verify object allocation backfilling. + */ +void +ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; + uint64_t object; + + /* + * Rewind the global allocator randomly back to a lower object number + * to force backfilling and reclamation of recently freed dnodes. + */ + mutex_enter(&os->os_obj_lock); + object = ztest_random(os->os_obj_next_chunk); + os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); + mutex_exit(&os->os_obj_lock); +} + /* * Verify that dmu_{read,write} work as expected. */ @@ -3782,8 +3902,10 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) /* * Read the directory info. If it's the first time, set things up. */ - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, chunksize); - ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, + chunksize); + ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, + chunksize); if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; @@ -4052,8 +4174,10 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) /* * Read the directory info. If it's the first time, set things up. */ - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0); - ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, + 0, 0); + ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, + chunksize); if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; @@ -4254,7 +4378,8 @@ ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) * to verify that parallel writes to an object -- even to the * same blocks within the object -- doesn't cause any trouble. 
*/ - ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0); + ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, + 0, 0, 0); if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; @@ -4273,7 +4398,8 @@ ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) uint64_t blocksize = ztest_random_blocksize(); void *data; - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, + 0, 0); if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) return; @@ -4319,7 +4445,7 @@ ztest_zap(ztest_ds_t *zd, uint64_t id) int error; char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) return; @@ -4451,7 +4577,7 @@ ztest_fzap(ztest_ds_t *zd, uint64_t id) ztest_od_t od[1]; uint64_t object, txg; - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) return; @@ -4497,7 +4623,8 @@ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) char name[20], string_value[20]; void *data; - ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0); + ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, + 0, 0, 0); if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; @@ -4685,7 +4812,7 @@ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) uint64_t old_txg, txg; int i, error; - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; @@ -4799,6 +4926,41 @@ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) dmu_tx_commit(tx); } +/* + * Visit each object in the dataset. Verify that its properties + * are consistent what was stored in the block tag when it was created, + * and that its unused bonus buffer space has not been overwritten. 
+ */ +void +ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + uint64_t obj; + int err = 0; + + for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { + ztest_block_tag_t *bt = NULL; + dmu_object_info_t doi; + dmu_buf_t *db; + + if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) + continue; + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_size >= sizeof (*bt)) + bt = ztest_bt_bonus(db); + + if (bt && bt->bt_magic == BT_MAGIC) { + ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, + bt->bt_offset, bt->bt_gen, bt->bt_txg, + bt->bt_crtxg); + ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); + } + + dmu_buf_rele(db, FTAG); + } +} + /* ARGSUSED */ void ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) @@ -5283,7 +5445,8 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) blocksize = ztest_random_blocksize(); blocksize = MIN(blocksize, 2048); /* because we write so many */ - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, + 0, 0); if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; @@ -6155,7 +6318,7 @@ ztest_freeze(void) numloops++ < ztest_opts.zo_maxloops && metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { ztest_od_t od; - ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0); + ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); ztest_io(zd, od.od_object, ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); diff --git a/common/zfs/zfeature_common.c b/common/zfs/zfeature_common.c index 07eb83a4d3e1..ae7a06802796 100644 --- a/common/zfs/zfeature_common.c +++ b/common/zfs/zfeature_common.c @@ -245,6 +245,17 @@ zpool_feature_init(void) "Support for blocks larger than 128KB.", ZFEATURE_FLAG_PER_DATASET, large_blocks_deps); + { + static const spa_feature_t large_dnode_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_LARGE_DNODE, + "org.zfsonlinux:large_dnode", "large_dnode", + "Variable on-disk size of dnodes.", + ZFEATURE_FLAG_PER_DATASET, large_dnode_deps); + } + static const spa_feature_t sha512_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE diff --git a/common/zfs/zfeature_common.h b/common/zfs/zfeature_common.h index a852f07018e3..3376b9921bac 100644 --- a/common/zfs/zfeature_common.h +++ b/common/zfs/zfeature_common.h @@ -53,6 +53,7 @@ typedef enum spa_feature { SPA_FEATURE_BOOKMARKS, SPA_FEATURE_FS_SS_LIMIT, SPA_FEATURE_LARGE_BLOCKS, + SPA_FEATURE_LARGE_DNODE, SPA_FEATURE_SHA512, SPA_FEATURE_SKEIN, SPA_FEATURE_EDONR, diff --git a/common/zfs/zfs_prop.c b/common/zfs/zfs_prop.c index c6e7bab9a410..ee792afc0afd 100644 --- a/common/zfs/zfs_prop.c +++ b/common/zfs/zfs_prop.c @@ -210,6 +210,17 @@ zfs_prop_init(void) { NULL } }; + static zprop_index_t dnsize_table[] = { + { "legacy", ZFS_DNSIZE_LEGACY }, + { "auto", ZFS_DNSIZE_AUTO }, + { "1k", ZFS_DNSIZE_1K }, + { "2k", ZFS_DNSIZE_2K }, + { "4k", ZFS_DNSIZE_4K }, + { "8k", ZFS_DNSIZE_8K }, + { "16k", ZFS_DNSIZE_16K }, + { NULL } + }; + static zprop_index_t redundant_metadata_table[] = { { "all", ZFS_REDUNDANT_METADATA_ALL }, { "most", ZFS_REDUNDANT_METADATA_MOST }, @@ -266,6 +277,10 @@ zfs_prop_init(void) PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "latency | throughput", "LOGBIAS", logbias_table); + zprop_register_index(ZFS_PROP_DNODESIZE, "dnodesize", + ZFS_DNSIZE_LEGACY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, + "legacy | auto | 1k | 
2k | 4k | 8k | 16k", "DNSIZE", dnsize_table); + /* inherit index (boolean) properties */ zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table); diff --git a/common/zfs/zpool_prop.c b/common/zfs/zpool_prop.c index 0a69a51207bc..cbbd2991859b 100644 --- a/common/zfs/zpool_prop.c +++ b/common/zfs/zpool_prop.c @@ -138,6 +138,8 @@ zpool_prop_init(void) PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE"); zprop_register_hidden(ZPOOL_PROP_TNAME, "tname", PROP_TYPE_STRING, PROP_ONETIME, ZFS_TYPE_POOL, "TNAME"); + zprop_register_hidden(ZPOOL_PROP_MAXDNODESIZE, "maxdnodesize", + PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXDNODESIZE"); } /* diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 index c1b17354071a..fda83e5482f3 100644 --- a/man/man5/zpool-features.5 +++ b/man/man5/zpool-features.5 @@ -533,6 +533,30 @@ set larger than 128KB, and will return to being \fBenabled\fR once all filesystems that have ever had their recordsize larger than 128KB are destroyed. .RE +.ne 2 +.na +\fB\fBlarge_dnode\fR\fR +.ad +.RS 4n +.TS +l l . +GUID org.zfsonlinux:large_dnode +READ\-ONLY COMPATIBLE no +DEPENDENCIES extensible_dataset +.TE + +The \fBlarge_dnode\fR feature allows the size of dnodes in a dataset to be +set larger than 512B. + +This feature becomes \fBactive\fR once a dataset contains an object with a +dnode larger than 512B, which occurs as a result of setting the \fBdnodesize\fR +dataset property to a value other than \fBlegacy\fR. The feature will return to +being \fBenabled\fR once all filesystems that have ever contained a dnode larger +than 512B are destroyed. Large dnodes allow more data to be stored in the +bonus buffer, thus potentially improving performance by avoiding the use of +spill blocks. +.RE + .sp .ne 2 .na diff --git a/uts/common/fs/zfs/dbuf.c b/uts/common/fs/zfs/dbuf.c index 4fcf14fba512..4bb53837716c 100644 --- a/uts/common/fs/zfs/dbuf.c +++ b/uts/common/fs/zfs/dbuf.c @@ -742,7 +742,6 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); } else if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn != NULL); - ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT0(db->db.db_offset); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); @@ -995,13 +994,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) ASSERT(db->db_buf == NULL); if (db->db_blkid == DMU_BONUS_BLKID) { + /* + * The bonus length stored in the dnode may be less than + * the maximum available space in the bonus buffer. 
+ */ int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); + int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); ASSERT3U(bonuslen, <=, db->db.db_size); - db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); - arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - if (bonuslen < DN_MAX_BONUSLEN) - bzero(db->db.db_data, DN_MAX_BONUSLEN); + db->db.db_data = zio_buf_alloc(max_bonuslen); + arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); + if (bonuslen < max_bonuslen) + bzero(db->db.db_data, max_bonuslen); if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); DB_DNODE_EXIT(db); @@ -1108,9 +1112,11 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) ASSERT(dr->dr_txg >= txg - 2); if (db->db_blkid == DMU_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ - dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); - arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); + dnode_t *dn = DB_DNODE(db); + int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); + dr->dt.dl.dr_data = zio_buf_alloc(bonuslen); + arc_space_consume(bonuslen, ARC_SPACE_BONUS); + bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = arc_buf_size(db->db_buf); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); @@ -2081,10 +2087,13 @@ dbuf_destroy(dmu_buf_impl_t *db) } if (db->db_blkid == DMU_BONUS_BLKID) { - ASSERT(db->db.db_data != NULL); - zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); - arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - db->db_state = DB_UNCACHED; + int slots = DB_DNODE(db)->dn_num_slots; + int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); + if (db->db.db_data != NULL) { + zio_buf_free(db->db.db_data, bonuslen); + arc_space_return(bonuslen, ARC_SPACE_BONUS); + db->db_state = DB_UNCACHED; + } } dbuf_clear_data(db); @@ -2188,7 +2197,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, mutex_enter(&dn->dn_mtx); if (dn->dn_have_spill && (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) - *bpp = &dn->dn_phys->dn_spill; + *bpp = DN_SPILL_BLKPTR(dn->dn_phys); else *bpp = NULL; dbuf_add_ref(dn->dn_dbuf, NULL); @@ -2289,7 +2298,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); - db->db.db_size = DN_MAX_BONUSLEN - + db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; @@ -3031,7 +3040,7 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) return; if (db->db_blkid == DMU_SPILL_BLKID) { - db->db_blkptr = &dn->dn_phys->dn_spill; + db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys); BP_ZERO(db->db_blkptr); return; } @@ -3162,13 +3171,17 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(*datap != NULL); ASSERT0(db->db_level); - ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); - bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); + ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, + DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); + bcopy(*datap, DN_BONUS(dn->dn_phys), + DN_MAX_BONUS_LEN(dn->dn_phys)); DB_DNODE_EXIT(db); if (*datap != db->db.db_data) { - zio_buf_free(*datap, DN_MAX_BONUSLEN); - arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); + int slots = DB_DNODE(db)->dn_num_slots; + int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); + zio_buf_free(*datap, bonuslen); + arc_space_return(bonuslen, 
ARC_SPACE_BONUS); } db->db_data_pending = NULL; drp = &db->db_last_dirty; @@ -3324,7 +3337,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(bp)) && - db->db_blkptr == &dn->dn_phys->dn_spill); + db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); } #endif @@ -3336,11 +3349,17 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) mutex_exit(&dn->dn_mtx); if (dn->dn_type == DMU_OT_DNODE) { - dnode_phys_t *dnp = db->db.db_data; - for (i = db->db.db_size >> DNODE_SHIFT; i > 0; - i--, dnp++) { - if (dnp->dn_type != DMU_OT_NONE) + i = 0; + while (i < db->db.db_size) { + dnode_phys_t *dnp = + (void *)(((char *)db->db.db_data) + i); + + i += DNODE_MIN_SIZE; + if (dnp->dn_type != DMU_OT_NONE) { fill++; + i += dnp->dn_extra_slots * + DNODE_MIN_SIZE; + } } } else { if (BP_IS_HOLE(bp)) { @@ -3493,7 +3512,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) dn = DB_DNODE(db); ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && - db->db_blkptr == &dn->dn_phys->dn_spill); + db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); DB_DNODE_EXIT(db); } #endif diff --git a/uts/common/fs/zfs/dmu.c b/uts/common/fs/zfs/dmu.c index 02027ef6bbd1..d338286a9b50 100644 --- a/uts/common/fs/zfs/dmu.c +++ b/uts/common/fs/zfs/dmu.c @@ -254,7 +254,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, int dmu_bonus_max(void) { - return (DN_MAX_BONUSLEN); + return (DN_OLD_MAX_BONUSLEN); } int @@ -2264,6 +2264,7 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) doi->doi_type = dn->dn_type; doi->doi_bonus_type = dn->dn_bonustype; doi->doi_bonus_size = dn->dn_bonuslen; + doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; @@ -2326,9 +2327,21 @@ dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, dn = DB_DNODE(db); *blksize = dn->dn_datablksz; - /* add 1 for dnode space */ + /* add in number of slots used for the dnode itself */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> - SPA_MINBLOCKSHIFT) + 1; + SPA_MINBLOCKSHIFT) + dn->dn_num_slots; + DB_DNODE_EXIT(db); +} + +void +dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + *dnsize = dn->dn_num_slots << DNODE_SHIFT; DB_DNODE_EXIT(db); } diff --git a/uts/common/fs/zfs/dmu_object.c b/uts/common/fs/zfs/dmu_object.c index b853081e8b7c..2fe866b89d29 100644 --- a/uts/common/fs/zfs/dmu_object.c +++ b/uts/common/fs/zfs/dmu_object.c @@ -30,53 +30,132 @@ #include #include #include +#include -uint64_t -dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, - int indirect_blockshift, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +/* + * Each of the concurrent object allocators will grab + * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to + * grab 128 slots, which is 4 blocks worth. This was experimentally + * determined to be the lowest value that eliminates the measurable effect + * of lock contention from this code path. 
+ */ +int dmu_object_alloc_chunk_shift = 7; + +static uint64_t +dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, + int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dmu_tx_t *tx) { uint64_t object; uint64_t L1_dnode_count = DNODES_PER_BLOCK << (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT); dnode_t *dn = NULL; + int dn_slots = dnodesize >> DNODE_SHIFT; + boolean_t restarted = B_FALSE; + uint64_t *cpuobj = &os->os_obj_next_percpu[CPU_SEQID % + os->os_obj_next_percpu_len]; + int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; + int error; + + if (dn_slots == 0) { + dn_slots = DNODE_MIN_SLOTS; + } else { + ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); + ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); + } + + /* + * The "chunk" of dnodes that is assigned to a CPU-specific + * allocator needs to be at least one block's worth, to avoid + * lock contention on the dbuf. It can be at most one L1 block's + * worth, so that the "rescan after polishing off a L1's worth" + * logic below will be sure to kick in. + */ + if (dnodes_per_chunk < DNODES_PER_BLOCK) + dnodes_per_chunk = DNODES_PER_BLOCK; + if (dnodes_per_chunk > L1_dnode_count) + dnodes_per_chunk = L1_dnode_count; + + object = *cpuobj; - mutex_enter(&os->os_obj_lock); for (;;) { - object = os->os_obj_next; /* - * Each time we polish off a L1 bp worth of dnodes (2^12 - * objects), move to another L1 bp that's still reasonably - * sparse (at most 1/4 full). Look from the beginning at most - * once per txg, but after that keep looking from here. - * os_scan_dnodes is set during txg sync if enough objects - * have been freed since the previous rescan to justify - * backfilling again. If we can't find a suitable block, just - * keep going from here. - * - * Note that dmu_traverse depends on the behavior that we use - * multiple blocks of the dnode object before going back to - * reuse objects. Any change to this algorithm should preserve - * that property or find another solution to the issues - * described in traverse_visitbp. + * If we finished a chunk of dnodes, get a new one from + * the global allocator. */ + if ((P2PHASE(object, dnodes_per_chunk) == 0) || + (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) < + dn_slots)) { + DNODE_STAT_BUMP(dnode_alloc_next_chunk); + mutex_enter(&os->os_obj_lock); + ASSERT0(P2PHASE(os->os_obj_next_chunk, + dnodes_per_chunk)); + object = os->os_obj_next_chunk; - if (P2PHASE(object, L1_dnode_count) == 0) { - uint64_t offset; - int error; - if (os->os_rescan_dnodes) { - offset = 0; - os->os_rescan_dnodes = B_FALSE; - } else { - offset = object << DNODE_SHIFT; + /* + * Each time we polish off a L1 bp worth of dnodes + * (2^12 objects), move to another L1 bp that's + * still reasonably sparse (at most 1/4 full). Look + * from the beginning at most once per txg. If we + * still can't allocate from that L1 block, search + * for an empty L0 block, which will quickly skip + * to the end of the metadnode if the no nearby L0 + * blocks are empty. This fallback avoids a + * pathology where full dnode blocks containing + * large dnodes appear sparse because they have a + * low blk_fill, leading to many failed allocation + * attempts. In the long term a better mechanism to + * search for sparse metadnode regions, such as + * spacemaps, could be implemented. + * + * os_scan_dnodes is set during txg sync if enough + * objects have been freed since the previous + * rescan to justify backfilling again. 
+ * + * Note that dmu_traverse depends on the behavior + * that we use multiple blocks of the dnode object + * before going back to reuse objects. Any change + * to this algorithm should preserve that property + * or find another solution to the issues described + * in traverse_visitbp. + */ + if (P2PHASE(object, L1_dnode_count) == 0) { + uint64_t offset; + uint64_t blkfill; + int minlvl; + if (os->os_rescan_dnodes) { + offset = 0; + os->os_rescan_dnodes = B_FALSE; + } else { + offset = object << DNODE_SHIFT; + } + blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2; + minlvl = restarted ? 1 : 2; + restarted = B_TRUE; + error = dnode_next_offset(DMU_META_DNODE(os), + DNODE_FIND_HOLE, &offset, minlvl, + blkfill, 0); + if (error == 0) { + object = offset >> DNODE_SHIFT; + } } - error = dnode_next_offset(DMU_META_DNODE(os), - DNODE_FIND_HOLE, - &offset, 2, DNODES_PER_BLOCK >> 2, 0); - if (error == 0) - object = offset >> DNODE_SHIFT; + /* + * Note: if "restarted", we may find a L0 that + * is not suitably aligned. + */ + os->os_obj_next_chunk = + P2ALIGN(object, dnodes_per_chunk) + + dnodes_per_chunk; + (void) atomic_swap_64(cpuobj, object); + mutex_exit(&os->os_obj_lock); } - os->os_obj_next = ++object; + + /* + * The value of (*cpuobj) before adding dn_slots is the object + * ID assigned to us. The value afterwards is the object ID + * assigned to whoever wants to do an allocation next. + */ + object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots; /* * XXX We should check for an i/o error here and return @@ -84,47 +163,94 @@ dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, * dmu_tx_assign(), but there is currently no mechanism * to do so. */ - (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, - FTAG, &dn); - if (dn) - break; + error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, + dn_slots, FTAG, &dn); + if (error == 0) { + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + /* + * Another thread could have allocated it; check + * again now that we have the struct lock. + */ + if (dn->dn_type == DMU_OT_NONE) { + dnode_allocate(dn, ot, blocksize, 0, + bonustype, bonuslen, dn_slots, tx); + rw_exit(&dn->dn_struct_rwlock); + dmu_tx_add_new_object(tx, dn); + dnode_rele(dn, FTAG); + return (object); + } + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + DNODE_STAT_BUMP(dnode_alloc_race); + } - if (dmu_object_next(os, &object, B_TRUE, 0) == 0) - os->os_obj_next = object - 1; + /* + * Skip to next known valid starting point on error. This + * is the start of the next block of dnodes. 
+ */ + if (dmu_object_next(os, &object, B_TRUE, 0) != 0) { + object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK); + DNODE_STAT_BUMP(dnode_alloc_next_block); + } + (void) atomic_swap_64(cpuobj, object); } - - dnode_allocate(dn, ot, blocksize, indirect_blockshift, - bonustype, bonuslen, tx); - mutex_exit(&os->os_obj_lock); - - dmu_tx_add_new_object(tx, dn); - dnode_rele(dn, FTAG); - - return (object); } uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - return (dmu_object_alloc_ibs(os, ot, blocksize, 0, - bonustype, bonuslen, tx)); + return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, + bonuslen, 0, tx)); +} + +uint64_t +dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, + int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, + dmu_tx_t *tx) +{ + return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift, + bonustype, bonuslen, 0, tx)); +} + +uint64_t +dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, + bonuslen, dnodesize, tx)); } int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype, + bonuslen, 0, tx)); +} + +int +dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dmu_tx_t *tx) { dnode_t *dn; + int dn_slots = dnodesize >> DNODE_SHIFT; int err; + if (dn_slots == 0) + dn_slots = DNODE_MIN_SLOTS; + ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); + ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); + if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) return (SET_ERROR(EBADF)); - err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn); + err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots, + FTAG, &dn); if (err) return (err); - dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); + dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx); dmu_tx_add_new_object(tx, dn); dnode_rele(dn, FTAG); @@ -135,19 +261,29 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype, + bonuslen, 0, tx)); +} + +int +dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, + dmu_tx_t *tx) { dnode_t *dn; + int dn_slots = dnodesize >> DNODE_SHIFT; int err; if (object == DMU_META_DNODE_OBJECT) return (SET_ERROR(EBADF)); - err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, FTAG, &dn); if (err) return (err); - dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx); + dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx); dnode_rele(dn, FTAG); return (err); @@ -161,7 +297,7 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); - err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, + err = dnode_hold_impl(os, 
object, DNODE_MUST_BE_ALLOCATED, 0, FTAG, &dn); if (err) return (err); @@ -186,9 +322,54 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) { - uint64_t offset = (*objectp + 1) << DNODE_SHIFT; + uint64_t offset; + uint64_t start_obj; + struct dsl_dataset *ds = os->os_dsl_dataset; int error; + if (*objectp == 0) { + start_obj = 1; + } else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) { + uint64_t i = *objectp + 1; + uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1); + dmu_object_info_t doi; + + /* + * Scan through the remaining meta dnode block. The contents + * of each slot in the block are known so it can be quickly + * checked. If the block is exhausted without a match then + * hand off to dnode_next_offset() for further scanning. + */ + while (i <= last_obj) { + error = dmu_object_info(os, i, &doi); + if (error == ENOENT) { + if (hole) { + *objectp = i; + return (0); + } else { + i++; + } + } else if (error == EEXIST) { + i++; + } else if (error == 0) { + if (hole) { + i += doi.doi_dnodesize >> DNODE_SHIFT; + } else { + *objectp = i; + return (0); + } + } else { + return (error); + } + } + + start_obj = i; + } else { + start_obj = *objectp + 1; + } + + offset = start_obj << DNODE_SHIFT; + error = dnode_next_offset(DMU_META_DNODE(os), (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); diff --git a/uts/common/fs/zfs/dmu_objset.c b/uts/common/fs/zfs/dmu_objset.c index c5267ac18dcd..db0fff702ef8 100644 --- a/uts/common/fs/zfs/dmu_objset.c +++ b/uts/common/fs/zfs/dmu_objset.c @@ -140,6 +140,12 @@ dmu_objset_id(objset_t *os) return (ds ? ds->ds_object : 0); } +uint64_t +dmu_objset_dnodesize(objset_t *os) +{ + return (os->os_dnodesize); +} + zfs_sync_type_t dmu_objset_syncprop(objset_t *os) { @@ -269,6 +275,34 @@ redundant_metadata_changed_cb(void *arg, uint64_t newval) os->os_redundant_metadata = newval; } +static void +dnodesize_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + switch (newval) { + case ZFS_DNSIZE_LEGACY: + os->os_dnodesize = DNODE_MIN_SIZE; + break; + case ZFS_DNSIZE_AUTO: + /* + * Choose a dnode size that will work well for most + * workloads if the user specified "auto". Future code + * improvements could dynamically select a dnode size + * based on observed workload patterns. 
+ */ + os->os_dnodesize = DNODE_MIN_SIZE * 2; + break; + case ZFS_DNSIZE_1K: + case ZFS_DNSIZE_2K: + case ZFS_DNSIZE_4K: + case ZFS_DNSIZE_8K: + case ZFS_DNSIZE_16K: + os->os_dnodesize = newval; + break; + } +} + static void logbias_changed_cb(void *arg, uint64_t newval) { @@ -477,6 +511,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), recordsize_changed_cb, os); } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DNODESIZE), + dnodesize_changed_cb, os); + } } if (needlock) dsl_pool_config_exit(dmu_objset_pool(os), FTAG); @@ -496,6 +535,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_sync = ZFS_SYNC_STANDARD; os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; + os->os_dnodesize = DNODE_MIN_SIZE; } /* * These properties will be filled in by the logic in zfs_get_zplprop() @@ -524,6 +564,9 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); + os->os_obj_next_percpu_len = boot_ncpus; + os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len * + sizeof (os->os_obj_next_percpu[0]), KM_SLEEP); dnode_special_open(os, &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, &os->os_meta_dnode); @@ -802,6 +845,9 @@ dmu_objset_evict_done(objset_t *os) rw_enter(&os_lock, RW_READER); rw_exit(&os_lock); + kmem_free(os->os_obj_next_percpu, + os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0])); + mutex_destroy(&os->os_lock); mutex_destroy(&os->os_userused_lock); mutex_destroy(&os->os_obj_lock); @@ -836,8 +882,8 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, mdn = DMU_META_DNODE(os); - dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, - DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); + dnode_allocate(mdn, DMU_OT_DNODE, DNODE_BLOCK_SIZE, DN_MAX_INDBLKSHIFT, + DMU_OT_NONE, 0, DNODE_MIN_SLOTS, tx); /* * We don't want to have to increase the meta-dnode's nlevels @@ -1496,7 +1542,7 @@ do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags, uint64_t user, uint64_t group, boolean_t subtract) { if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { - int64_t delta = DNODE_SIZE + used; + int64_t delta = DNODE_MIN_SIZE + used; if (subtract) delta = -delta; diff --git a/uts/common/fs/zfs/dmu_send.c b/uts/common/fs/zfs/dmu_send.c index 62abee3637eb..3d6858bfec2a 100644 --- a/uts/common/fs/zfs/dmu_send.c +++ b/uts/common/fs/zfs/dmu_send.c @@ -469,6 +469,7 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) drro->drr_bonustype = dnp->dn_bonustype; drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; drro->drr_bonuslen = dnp->dn_bonuslen; + drro->drr_dn_slots = dnp->dn_extra_slots + 1; drro->drr_checksumtype = dnp->dn_checksum; drro->drr_compress = dnp->dn_compress; drro->drr_toguid = dsp->dsa_toguid; @@ -621,7 +622,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { return (0); } else if (type == DMU_OT_DNODE) { - int blksz = BP_GET_LSIZE(bp); + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; @@ -633,8 +634,8 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) return (SET_ERROR(EIO)); dnode_phys_t *blk = abuf->b_data; - uint64_t dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT); - 
for (int i = 0; i < blksz >> DNODE_SHIFT; i++) { + uint64_t dnobj = zb->zb_blkid * epb; + for (int i = 0; i < epb; i += blk[i].dn_extra_slots + 1) { err = dump_dnode(dsa, dnobj + i, blk + i); if (err != 0) break; @@ -802,6 +803,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS]) featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; + if (to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) + featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE; if (embedok && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; @@ -1396,11 +1399,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) /* * The receiving code doesn't know how to translate large blocks * to smaller ones, so the pool must have the LARGE_BLOCKS - * feature enabled if the stream has LARGE_BLOCKS. + * feature enabled if the stream has LARGE_BLOCKS. Same with + * large dnodes. */ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE)) + return (SET_ERROR(ENOTSUP)); error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error == 0) { @@ -1605,6 +1612,9 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) dsl_dataset_t *ds; const char *tofs = drba->drba_cookie->drc_tofs; + /* 6 extra bytes for /%recv */ + char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + /* already checked */ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING); @@ -1632,8 +1642,18 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); - /* 6 extra bytes for /%recv */ - char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + /* + * The receiving code doesn't know how to translate large blocks + * to smaller ones, so the pool must have the LARGE_BLOCKS + * feature enabled if the stream has LARGE_BLOCKS. Same with + * large dnodes. + */ + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE)) + return (SET_ERROR(ENOTSUP)); (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs, recv_clone_name); @@ -2024,7 +2044,8 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) return (1); } else { return (1 + - ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT)); + ((DN_OLD_MAX_BONUSLEN - + MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT)); } } @@ -2082,15 +2103,17 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || - drro->drr_bonuslen > DN_MAX_BONUSLEN) { + drro->drr_bonuslen > + DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) || + drro->drr_dn_slots > + (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) { return (SET_ERROR(EINVAL)); } err = dmu_object_info(rwa->os, drro->drr_object, &doi); - if (err != 0 && err != ENOENT) + if (err != 0 && err != ENOENT && err != EEXIST) return (SET_ERROR(EINVAL)); - object = err == 0 ? 
drro->drr_object : DMU_NEW_OBJECT; if (drro->drr_object > rwa->max_object) rwa->max_object = drro->drr_object; @@ -2103,16 +2126,64 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, if (err == 0) { int nblkptr; + object = drro->drr_object; + nblkptr = deduce_nblkptr(drro->drr_bonustype, drro->drr_bonuslen); if (drro->drr_blksz != doi.doi_data_block_size || - nblkptr < doi.doi_nblkptr) { + nblkptr < doi.doi_nblkptr || + drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) { err = dmu_free_long_range(rwa->os, drro->drr_object, 0, DMU_OBJECT_END); if (err != 0) return (SET_ERROR(EINVAL)); } + } else if (err == EEXIST) { + /* + * The object requested is currently an interior slot of a + * multi-slot dnode. This will be resolved when the next txg + * is synced out, since the send stream will have told us + * to free this slot when we freed the associated dnode + * earlier in the stream. + */ + txg_wait_synced(dmu_objset_pool(rwa->os), 0); + object = drro->drr_object; + } else { + /* object is free and we are about to allocate a new one */ + object = DMU_NEW_OBJECT; + } + + /* + * If this is a multi-slot dnode there is a chance that this + * object will expand into a slot that is already used by + * another object from the previous snapshot. We must free + * these objects before we attempt to allocate the new dnode. + */ + if (drro->drr_dn_slots > 1) { + boolean_t need_sync = B_FALSE; + + for (uint64_t slot = drro->drr_object + 1; + slot < drro->drr_object + drro->drr_dn_slots; + slot++) { + dmu_object_info_t slot_doi; + + err = dmu_object_info(rwa->os, slot, &slot_doi); + if (err == ENOENT || err == EEXIST) + continue; + else if (err != 0) + return (err); + + err = dmu_free_long_object(rwa->os, slot); + + if (err != 0) + return (err); + + need_sync = B_TRUE; + } + + if (need_sync) + txg_wait_synced(dmu_objset_pool(rwa->os), 0); } tx = dmu_tx_create(rwa->os); @@ -2125,9 +2196,10 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, if (object == DMU_NEW_OBJECT) { /* currently free, want to be allocated */ - err = dmu_object_claim(rwa->os, drro->drr_object, + err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, drro->drr_type, drro->drr_blksz, - drro->drr_bonustype, drro->drr_bonuslen, tx); + drro->drr_bonustype, drro->drr_bonuslen, + drro->drr_dn_slots << DNODE_SHIFT, tx); } else if (drro->drr_type != doi.doi_type || drro->drr_blksz != doi.doi_data_block_size || drro->drr_bonustype != doi.doi_bonus_type || @@ -2179,13 +2251,18 @@ receive_freeobjects(struct receive_writer_arg *rwa, if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) return (SET_ERROR(EINVAL)); - for (obj = drrfo->drr_firstobj; + for (obj = drrfo->drr_firstobj == 0 ? 
1 : drrfo->drr_firstobj; obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0; next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) { int err; - if (dmu_object_info(rwa->os, obj, NULL) != 0) + err = dmu_object_info(rwa->os, obj, NULL); + if (err == ENOENT) { + obj++; continue; + } else if (err != 0) { + return (err); + } err = dmu_free_long_object(rwa->os, obj); if (err != 0) diff --git a/uts/common/fs/zfs/dmu_traverse.c b/uts/common/fs/zfs/dmu_traverse.c index 050cd69811a6..d54042b04108 100644 --- a/uts/common/fs/zfs/dmu_traverse.c +++ b/uts/common/fs/zfs/dmu_traverse.c @@ -327,13 +327,13 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, goto post; dnode_phys_t *child_dnp = buf->b_data; - for (i = 0; i < epb; i++) { + for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) { prefetch_dnode_metadata(td, &child_dnp[i], zb->zb_objset, zb->zb_blkid * epb + i); } /* recursively visitbp() blocks below this */ - for (i = 0; i < epb; i++) { + for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) { err = traverse_dnode(td, &child_dnp[i], zb->zb_objset, zb->zb_blkid * epb + i); if (err != 0) @@ -435,7 +435,7 @@ prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); - traverse_prefetch_metadata(td, &dnp->dn_spill, &czb); + traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb); } } @@ -470,7 +470,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); - err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); + err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb); } if (err == 0 && (td->td_flags & TRAVERSE_POST)) { diff --git a/uts/common/fs/zfs/dmu_tx.c b/uts/common/fs/zfs/dmu_tx.c index 557f7f2e8162..4f181fa54739 100644 --- a/uts/common/fs/zfs/dmu_tx.c +++ b/uts/common/fs/zfs/dmu_tx.c @@ -280,7 +280,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { - (void) refcount_add_many(&txh->txh_space_towrite, DNODE_SIZE, FTAG); + (void) refcount_add_many(&txh->txh_space_towrite, DNODE_MIN_SIZE, FTAG); } void @@ -1246,11 +1246,13 @@ dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) { - dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, - tx->tx_objset, object, THT_SPILL, 0, 0); + dmu_tx_hold_t *txh; - (void) refcount_add_many(&txh->txh_space_towrite, - SPA_OLD_MAXBLOCKSIZE, FTAG); + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, + THT_SPILL, 0, 0); + if (txh != NULL) + (void) refcount_add_many(&txh->txh_space_towrite, + SPA_OLD_MAXBLOCKSIZE, FTAG); } void @@ -1274,7 +1276,7 @@ dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) dmu_tx_sa_registration_hold(sa, tx); - if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill) + if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill) return; (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, diff --git a/uts/common/fs/zfs/dnode.c b/uts/common/fs/zfs/dnode.c index 2720cdbce4d6..1305a4f64aa9 100644 --- a/uts/common/fs/zfs/dnode.c +++ b/uts/common/fs/zfs/dnode.c @@ -40,20 +40,40 @@ #include #include -static kmem_cache_t *dnode_cache; -/* - * Define DNODE_STATS to turn on statistic gathering. By default, it is only - * turned on when DEBUG is also defined. 
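A note on the send and traversal changes above: both paths now step through a dnode block by each dnode's slot count rather than one physical slot at a time, so the interior slots of large dnodes are never visited on their own. A minimal sketch of that pattern (the helper name is hypothetical, not part of the patch):

static void
visit_dnode_block(const dnode_phys_t *dnp, int epb)
{
	/* epb is dnodes-per-block: block size >> DNODE_SHIFT */
	for (int i = 0; i < epb; i += dnp[i].dn_extra_slots + 1) {
		if (dnp[i].dn_type == DMU_OT_NONE)
			continue;	/* free slot, stride of one */
		/* process dnp[i]; slots i+1..i+dn_extra_slots are interior */
	}
}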
- */ -#ifdef DEBUG -#define DNODE_STATS -#endif /* DEBUG */ +dnode_stats_t dnode_stats = { + { "dnode_hold_dbuf_hold", KSTAT_DATA_UINT64 }, + { "dnode_hold_dbuf_read", KSTAT_DATA_UINT64 }, + { "dnode_hold_alloc_hits", KSTAT_DATA_UINT64 }, + { "dnode_hold_alloc_misses", KSTAT_DATA_UINT64 }, + { "dnode_hold_alloc_interior", KSTAT_DATA_UINT64 }, + { "dnode_hold_alloc_lock_retry", KSTAT_DATA_UINT64 }, + { "dnode_hold_alloc_lock_misses", KSTAT_DATA_UINT64 }, + { "dnode_hold_alloc_type_none", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_hits", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_misses", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_lock_misses", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_overflow", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_refcount", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_txg", KSTAT_DATA_UINT64 }, + { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 }, + { "dnode_allocate", KSTAT_DATA_UINT64 }, + { "dnode_reallocate", KSTAT_DATA_UINT64 }, + { "dnode_buf_evict", KSTAT_DATA_UINT64 }, + { "dnode_alloc_next_chunk", KSTAT_DATA_UINT64 }, + { "dnode_alloc_race", KSTAT_DATA_UINT64 }, + { "dnode_alloc_next_block", KSTAT_DATA_UINT64 }, + { "dnode_move_invalid", KSTAT_DATA_UINT64 }, + { "dnode_move_recheck1", KSTAT_DATA_UINT64 }, + { "dnode_move_recheck2", KSTAT_DATA_UINT64 }, + { "dnode_move_special", KSTAT_DATA_UINT64 }, + { "dnode_move_handle", KSTAT_DATA_UINT64 }, + { "dnode_move_rwlock", KSTAT_DATA_UINT64 }, + { "dnode_move_active", KSTAT_DATA_UINT64 }, +}; -#ifdef DNODE_STATS -#define DNODE_STAT_ADD(stat) ((stat)++) -#else -#define DNODE_STAT_ADD(stat) /* nothing */ -#endif /* DNODE_STATS */ +static kstat_t *dnode_ksp; +static kmem_cache_t *dnode_cache; static dnode_phys_t dnode_phys_zero; @@ -218,12 +238,25 @@ dnode_init(void) 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); #ifdef _KERNEL kmem_cache_set_move(dnode_cache, dnode_move); + + dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc", + KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (dnode_ksp != NULL) { + dnode_ksp->ks_data = &dnode_stats; + kstat_install(dnode_ksp); + } #endif /* _KERNEL */ } void dnode_fini(void) { + if (dnode_ksp != NULL) { + kstat_delete(dnode_ksp); + dnode_ksp = NULL; + } + kmem_cache_destroy(dnode_cache); dnode_cache = NULL; } @@ -250,6 +283,7 @@ dnode_verify(dnode_t *dn) } if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) { int i; + int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); ASSERT3U(dn->dn_indblkshift, >=, 0); ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT); if (dn->dn_datablkshift) { @@ -261,12 +295,12 @@ dnode_verify(dnode_t *dn) ASSERT(DMU_OT_IS_VALID(dn->dn_type)); ASSERT3U(dn->dn_nblkptr, >=, 1); ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); - ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); + ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen); ASSERT3U(dn->dn_datablksz, ==, dn->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0); ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) + - dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); + dn->dn_bonuslen, <=, max_bonuslen); for (i = 0; i < TXG_SIZE; i++) { ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels); } @@ -297,6 +331,7 @@ dnode_byteswap(dnode_phys_t *dnp) dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec); dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen); + dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots); dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid); 
dnp->dn_used = BSWAP_64(dnp->dn_used); @@ -323,7 +358,8 @@ dnode_byteswap(dnode_phys_t *dnp) * dnode buffer). */ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); - size_t len = DN_MAX_BONUSLEN - off; + int slots = dnp->dn_extra_slots + 1; + size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off; ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype)); dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype); @@ -332,23 +368,25 @@ dnode_byteswap(dnode_phys_t *dnp) /* Swap SPILL block if we have one */ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) - byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t)); + byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t)); } void dnode_buf_byteswap(void *vbuf, size_t size) { - dnode_phys_t *buf = vbuf; - int i; + int i = 0; ASSERT3U(sizeof (dnode_phys_t), ==, (1<>= DNODE_SHIFT; - for (i = 0; i < size; i++) { - dnode_byteswap(buf); - buf++; + while (i < size) { + dnode_phys_t *dnp = (void *)(((char *)vbuf) + i); + dnode_byteswap(dnp); + + i += DNODE_MIN_SIZE; + if (dnp->dn_type != DMU_OT_NONE) + i += dnp->dn_extra_slots * DNODE_MIN_SIZE; } } @@ -359,7 +397,7 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) dnode_setdirty(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - ASSERT3U(newsize, <=, DN_MAX_BONUSLEN - + ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - (dn->dn_nblkptr-1) * sizeof (blkptr_t)); dn->dn_bonuslen = newsize; if (newsize == 0) @@ -439,6 +477,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, dn->dn_compress = dnp->dn_compress; dn->dn_bonustype = dnp->dn_bonustype; dn->dn_bonuslen = dnp->dn_bonuslen; + dn->dn_num_slots = dnp->dn_extra_slots + 1; dn->dn_maxblkid = dnp->dn_maxblkid; dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); dn->dn_id_flags = 0; @@ -446,14 +485,10 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, dmu_zfetch_init(&dn->dn_zfetch, dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); + ASSERT(zrl_is_locked(&dnh->dnh_zrlock)); + ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode)); mutex_enter(&os->os_lock); - if (dnh->dnh_dnode != NULL) { - /* Lost the allocation race. 
*/ - mutex_exit(&os->os_lock); - kmem_cache_free(dnode_cache, dn); - return (dnh->dnh_dnode); - } /* * Exclude special dnodes from os_dnodes so an empty os_dnodes @@ -476,6 +511,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); + return (dn); } @@ -501,7 +537,8 @@ dnode_destroy(dnode_t *dn) mutex_exit(&os->os_lock); /* the dnode can no longer move, so we can release the handle */ - zrl_remove(&dn->dn_handle->dnh_zrlock); + if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock)) + zrl_remove(&dn->dn_handle->dnh_zrlock); dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; @@ -538,10 +575,13 @@ dnode_destroy(dnode_t *dn) void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) + dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx) { int i; + ASSERT3U(dn_slots, >, 0); + ASSERT3U(dn_slots << DNODE_SHIFT, <=, + spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))); ASSERT3U(blocksize, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); if (blocksize == 0) @@ -554,8 +594,10 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); - dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset, - dn->dn_object, tx->tx_txg, blocksize, ibs); + dprintf("os=%p obj=%" PRIu64 " txg=%" PRIu64 + " blocksize=%d ibs=%d dn_slots=%d\n", + dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots); + DNODE_STAT_BUMP(dnode_allocate); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); @@ -566,7 +608,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, (bonustype == DMU_OT_SA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); - ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); + ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots)); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT0(dn->dn_maxblkid); ASSERT0(dn->dn_allocated_txg); @@ -592,11 +634,15 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dnode_setdblksz(dn, blocksize); dn->dn_indblkshift = ibs; dn->dn_nlevels = 1; + dn->dn_num_slots = dn_slots; if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ dn->dn_nblkptr = 1; - else - dn->dn_nblkptr = 1 + - ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + else { + dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR, + 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >> + SPA_BLKPTRSHIFT)); + } + dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; @@ -621,7 +667,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) + dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx) { int nblkptr; @@ -635,7 +681,13 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, (bonustype != DMU_OT_NONE && bonuslen != 0) || (bonustype == DMU_OT_SA && bonuslen == 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); - ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); + ASSERT3U(bonuslen, <=, + DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)))); + + dn_slots = dn_slots > 0 ? 
dn_slots : DNODE_MIN_SLOTS; + + dnode_free_interior_slots(dn); + DNODE_STAT_BUMP(dnode_reallocate); /* clean up any unreferenced dbufs */ dnode_evict_dbufs(dn); @@ -658,7 +710,9 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ nblkptr = 1; else - nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + nblkptr = MIN(DN_MAX_NBLKPTR, + 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >> + SPA_BLKPTRSHIFT)); if (dn->dn_bonustype != bonustype) dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype; if (dn->dn_nblkptr != nblkptr) @@ -676,6 +730,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, mutex_enter(&dn->dn_mtx); dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; + dn->dn_num_slots = dn_slots; dn->dn_nblkptr = nblkptr; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; @@ -684,7 +739,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, /* fix up the bonus db_size */ if (dn->dn_bonus) { dn->dn_bonus->db.db_size = - DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); + DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - + (dn->dn_nblkptr - 1) * sizeof (blkptr_t); ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); } @@ -692,18 +748,6 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, mutex_exit(&dn->dn_mtx); } -#ifdef DNODE_STATS -static struct { - uint64_t dms_dnode_invalid; - uint64_t dms_dnode_recheck1; - uint64_t dms_dnode_recheck2; - uint64_t dms_dnode_special; - uint64_t dms_dnode_handle; - uint64_t dms_dnode_rwlock; - uint64_t dms_dnode_active; -} dnode_move_stats; -#endif /* DNODE_STATS */ - #ifdef _KERNEL static void dnode_move_impl(dnode_t *odn, dnode_t *ndn) @@ -733,6 +777,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ndn->dn_datablkszsec = odn->dn_datablkszsec; ndn->dn_datablksz = odn->dn_datablksz; ndn->dn_maxblkid = odn->dn_maxblkid; + ndn->dn_num_slots = odn->dn_num_slots; bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0], sizeof (odn->dn_next_type)); bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], @@ -863,7 +908,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) */ os = odn->dn_objset; if (!POINTER_IS_VALID(os)) { - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid); + DNODE_STAT_BUMP(dnode_move_invalid); return (KMEM_CBRC_DONT_KNOW); } @@ -873,7 +918,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) rw_enter(&os_lock, RW_WRITER); if (os != odn->dn_objset) { rw_exit(&os_lock); - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1); + DNODE_STAT_BUMP(dnode_move_recheck1); return (KMEM_CBRC_DONT_KNOW); } @@ -891,7 +936,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) if (os != odn->dn_objset) { mutex_exit(&os->os_lock); rw_exit(&os_lock); - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2); + DNODE_STAT_BUMP(dnode_move_recheck2); return (KMEM_CBRC_DONT_KNOW); } @@ -904,7 +949,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) rw_exit(&os_lock); if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { mutex_exit(&os->os_lock); - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special); + DNODE_STAT_BUMP(dnode_move_special); return (KMEM_CBRC_NO); } ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ @@ -919,7 +964,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) */ if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { mutex_exit(&os->os_lock); - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle); + 
DNODE_STAT_BUMP(dnode_move_handle); return (KMEM_CBRC_LATER); } @@ -935,7 +980,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { zrl_exit(&odn->dn_handle->dnh_zrlock); mutex_exit(&os->os_lock); - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock); + DNODE_STAT_BUMP(dnode_move_rwlock); return (KMEM_CBRC_LATER); } @@ -961,7 +1006,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) rw_exit(&odn->dn_struct_rwlock); zrl_exit(&odn->dn_handle->dnh_zrlock); mutex_exit(&os->os_lock); - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active); + DNODE_STAT_BUMP(dnode_move_active); return (KMEM_CBRC_LATER); } @@ -985,6 +1030,132 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) } #endif /* _KERNEL */ +static void +dnode_slots_hold(dnode_children_t *children, int idx, int slots) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + zrl_add(&dnh->dnh_zrlock); + } +} + +static void +dnode_slots_rele(dnode_children_t *children, int idx, int slots) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + + if (zrl_is_locked(&dnh->dnh_zrlock)) + zrl_exit(&dnh->dnh_zrlock); + else + zrl_remove(&dnh->dnh_zrlock); + } +} + +static int +dnode_slots_tryenter(dnode_children_t *children, int idx, int slots) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + + if (!zrl_tryenter(&dnh->dnh_zrlock)) { + for (int j = idx; j < i; j++) { + dnh = &children->dnc_children[j]; + zrl_exit(&dnh->dnh_zrlock); + } + + return (0); + } + } + + return (1); +} + +static void +dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + dnh->dnh_dnode = ptr; + } +} + +static boolean_t +dnode_check_slots_free(dnode_children_t *children, int idx, int slots) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + dnode_t *dn = dnh->dnh_dnode; + + if (dn == DN_SLOT_FREE) { + continue; + } else if (DN_SLOT_IS_PTR(dn)) { + mutex_enter(&dn->dn_mtx); + dmu_object_type_t type = dn->dn_type; + mutex_exit(&dn->dn_mtx); + + if (type != DMU_OT_NONE) + return (B_FALSE); + + continue; + } else { + return (B_FALSE); + } + + return (B_FALSE); + } + + return (B_TRUE); +} + +static void +dnode_reclaim_slots(dnode_children_t *children, int idx, int slots) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + + ASSERT(zrl_is_locked(&dnh->dnh_zrlock)); + + if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { + ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE); + dnode_destroy(dnh->dnh_dnode); + dnh->dnh_dnode = DN_SLOT_FREE; + } + } +} + +void +dnode_free_interior_slots(dnode_t *dn) +{ + dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db); + int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT; + int idx = (dn->dn_object & (epb - 1)) + 1; + int slots = dn->dn_num_slots - 1; + + if (slots == 0) + return; + + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + while (!dnode_slots_tryenter(children, idx, slots)) + 
DNODE_STAT_BUMP(dnode_free_interior_lock_retry); + + dnode_set_slots(children, idx, slots, DN_SLOT_FREE); + dnode_slots_rele(children, idx, slots); +} + void dnode_special_close(dnode_handle_t *dnh) { @@ -992,7 +1163,7 @@ dnode_special_close(dnode_handle_t *dnh) /* * Wait for final references to the dnode to clear. This can - * only happen if the arc is asyncronously evicting state that + * only happen if the arc is asynchronously evicting state that * has a hold on this dnode while we are trying to evict this * dnode. */ @@ -1012,19 +1183,24 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, { dnode_t *dn; - dn = dnode_create(os, dnp, NULL, object, dnh); zrl_init(&dnh->dnh_zrlock); + zrl_tryenter(&dnh->dnh_zrlock); + + dn = dnode_create(os, dnp, NULL, object, dnh); DNODE_VERIFY(dn); + + zrl_exit(&dnh->dnh_zrlock); } static void dnode_buf_evict_async(void *dbu) { - dnode_children_t *children_dnodes = dbu; - int i; + dnode_children_t *dnc = dbu; - for (i = 0; i < children_dnodes->dnc_count; i++) { - dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; + DNODE_STAT_BUMP(dnode_buf_evict); + + for (int i = 0; i < dnc->dnc_count; i++) { + dnode_handle_t *dnh = &dnc->dnc_children[i]; dnode_t *dn; /* @@ -1032,8 +1208,9 @@ dnode_buf_evict_async(void *dbu) * another valid address, so there is no need here to guard * against changes to or from NULL. */ - if (dnh->dnh_dnode == NULL) { + if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) { zrl_destroy(&dnh->dnh_zrlock); + dnh->dnh_dnode = DN_SLOT_UNINIT; continue; } @@ -1048,22 +1225,40 @@ dnode_buf_evict_async(void *dbu) ASSERT(refcount_is_zero(&dn->dn_holds)); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); - dnode_destroy(dn); /* implicit zrl_remove() */ + dnode_destroy(dn); /* implicit zrl_remove() for first slot */ zrl_destroy(&dnh->dnh_zrlock); - dnh->dnh_dnode = NULL; + dnh->dnh_dnode = DN_SLOT_UNINIT; } - kmem_free(children_dnodes, sizeof (dnode_children_t) + - children_dnodes->dnc_count * sizeof (dnode_handle_t)); + kmem_free(dnc, sizeof (dnode_children_t) + + dnc->dnc_count * sizeof (dnode_handle_t)); } /* + * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used + * to ensure the hole at the specified object offset is large enough to + * hold the dnode being created. The slots parameter is also used to ensure + * a dnode does not span multiple dnode blocks. In both of these cases, if + * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases + * are only possible when using DNODE_MUST_BE_FREE. + * + * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0. + * dnode_hold_impl() will check if the requested dnode is already consumed + * as an extra dnode slot by an large dnode, in which case it returns + * ENOENT. + * * errors: - * EINVAL - invalid object number. - * EIO - i/o error. + * EINVAL - invalid object number or flags. + * ENOSPC - hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE) + * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE) + * - Refers to a freeing dnode (DNODE_MUST_BE_FREE) + * - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED) + * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED) + * - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED) + * EIO - i/o error error when reading the meta dnode dbuf. * succeeds even for free dnodes. 
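The comment above spells out the contract for the reworked dnode_hold_impl(): DNODE_MUST_BE_FREE takes the number of consecutive slots needed, while DNODE_MUST_BE_ALLOCATED requires slots == 0. A usage sketch for claiming a 1K dnode (the wrapper is hypothetical, not part of the change):

static int
hold_free_1k_dnode(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
{
	int slots = 1024 >> DNODE_SHIFT;	/* two DNODE_MIN_SIZE slots */

	return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, slots,
	    tag, dnp));
}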
*/ int -dnode_hold_impl(objset_t *os, uint64_t object, int flag, +dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, void *tag, dnode_t **dnp) { int epb, idx, err; @@ -1072,9 +1267,13 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, uint64_t blk; dnode_t *mdn, *dn; dmu_buf_impl_t *db; - dnode_children_t *children_dnodes; + dnode_children_t *dnc; + dnode_phys_t *dn_block; dnode_handle_t *dnh; + ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0)); + ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0)); + /* * If you are holding the spa config lock as writer, you shouldn't * be asking the DMU to do *anything* unless it's the root pool @@ -1121,10 +1320,13 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, db = dbuf_hold(mdn, blk, FTAG); if (drop_struct_lock) rw_exit(&mdn->dn_struct_rwlock); - if (db == NULL) + if (db == NULL) { + DNODE_STAT_BUMP(dnode_hold_dbuf_hold); return (SET_ERROR(EIO)); + } err = dbuf_read(db, NULL, DB_RF_CANFAIL); if (err) { + DNODE_STAT_BUMP(dnode_hold_dbuf_read); dbuf_rele(db, FTAG); return (err); } @@ -1132,62 +1334,194 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, ASSERT3U(db->db.db_size, >=, 1<db.db_size >> DNODE_SHIFT; - idx = object & (epb-1); + idx = object & (epb - 1); + dn_block = (dnode_phys_t *)db->db.db_data; ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); - children_dnodes = dmu_buf_get_user(&db->db); - if (children_dnodes == NULL) { - int i; + dnc = dmu_buf_get_user(&db->db); + dnh = NULL; + if (dnc == NULL) { dnode_children_t *winner; - children_dnodes = kmem_zalloc(sizeof (dnode_children_t) + - epb * sizeof (dnode_handle_t), KM_SLEEP); - children_dnodes->dnc_count = epb; - dnh = &children_dnodes->dnc_children[0]; - for (i = 0; i < epb; i++) { - zrl_init(&dnh[i].dnh_zrlock); - } - dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL, - dnode_buf_evict_async, NULL); - winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu); - if (winner != NULL) { + int skip = 0; - for (i = 0; i < epb; i++) { - zrl_destroy(&dnh[i].dnh_zrlock); + dnc = kmem_zalloc(sizeof (dnode_children_t) + + epb * sizeof (dnode_handle_t), KM_SLEEP); + dnc->dnc_count = epb; + dnh = &dnc->dnc_children[0]; + + /* Initialize dnode slot status from dnode_phys_t */ + for (int i = 0; i < epb; i++) { + zrl_init(&dnh[i].dnh_zrlock); + + if (skip) { + skip--; + continue; } - kmem_free(children_dnodes, sizeof (dnode_children_t) + + if (dn_block[i].dn_type != DMU_OT_NONE) { + int interior = dn_block[i].dn_extra_slots; + + dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED); + dnode_set_slots(dnc, i + 1, interior, + DN_SLOT_INTERIOR); + skip = interior; + } else { + dnh[i].dnh_dnode = DN_SLOT_FREE; + skip = 0; + } + } + + dmu_buf_init_user(&dnc->dnc_dbu, NULL, + dnode_buf_evict_async, NULL); + winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu); + if (winner != NULL) { + + for (int i = 0; i < epb; i++) + zrl_destroy(&dnh[i].dnh_zrlock); + + kmem_free(dnc, sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t)); - children_dnodes = winner; + dnc = winner; } } - ASSERT(children_dnodes->dnc_count == epb); - dnh = &children_dnodes->dnc_children[idx]; - zrl_add(&dnh->dnh_zrlock); - dn = dnh->dnh_dnode; - if (dn == NULL) { - dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; + ASSERT(dnc->dnc_count == epb); + dn = DN_SLOT_UNINIT; - dn = dnode_create(os, phys, db, object, dnh); - } + if (flag & DNODE_MUST_BE_ALLOCATED) { + slots = 1; - mutex_enter(&dn->dn_mtx); - type = dn->dn_type; - if (dn->dn_free_txg || - ((flag & DNODE_MUST_BE_ALLOCATED) && 
type == DMU_OT_NONE) || - ((flag & DNODE_MUST_BE_FREE) && - (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) { - mutex_exit(&dn->dn_mtx); - zrl_remove(&dnh->dnh_zrlock); + while (dn == DN_SLOT_UNINIT) { + dnode_slots_hold(dnc, idx, slots); + dnh = &dnc->dnc_children[idx]; + + if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { + dn = dnh->dnh_dnode; + break; + } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) { + DNODE_STAT_BUMP(dnode_hold_alloc_interior); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(EEXIST)); + } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) { + DNODE_STAT_BUMP(dnode_hold_alloc_misses); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(ENOENT)); + } + + dnode_slots_rele(dnc, idx, slots); + if (!dnode_slots_tryenter(dnc, idx, slots)) { + DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry); + continue; + } + + /* + * Someone else won the race and called dnode_create() + * after we checked DN_SLOT_IS_PTR() above but before + * we acquired the lock. + */ + if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { + DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses); + dn = dnh->dnh_dnode; + } else { + dn = dnode_create(os, dn_block + idx, db, + object, dnh); + } + } + + mutex_enter(&dn->dn_mtx); + if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) { + DNODE_STAT_BUMP(dnode_hold_alloc_type_none); + mutex_exit(&dn->dn_mtx); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(ENOENT)); + } + + DNODE_STAT_BUMP(dnode_hold_alloc_hits); + } else if (flag & DNODE_MUST_BE_FREE) { + + if (idx + slots - 1 >= DNODES_PER_BLOCK) { + DNODE_STAT_BUMP(dnode_hold_free_overflow); + dbuf_rele(db, FTAG); + return (SET_ERROR(ENOSPC)); + } + + while (dn == DN_SLOT_UNINIT) { + dnode_slots_hold(dnc, idx, slots); + + if (!dnode_check_slots_free(dnc, idx, slots)) { + DNODE_STAT_BUMP(dnode_hold_free_misses); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(ENOSPC)); + } + + dnode_slots_rele(dnc, idx, slots); + if (!dnode_slots_tryenter(dnc, idx, slots)) { + DNODE_STAT_BUMP(dnode_hold_free_lock_retry); + continue; + } + + if (!dnode_check_slots_free(dnc, idx, slots)) { + DNODE_STAT_BUMP(dnode_hold_free_lock_misses); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(ENOSPC)); + } + + /* + * Allocated but otherwise free dnodes which would + * be in the interior of a multi-slot dnodes need + * to be freed. Single slot dnodes can be safely + * re-purposed as a performance optimization. + */ + if (slots > 1) + dnode_reclaim_slots(dnc, idx + 1, slots - 1); + + dnh = &dnc->dnc_children[idx]; + if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { + dn = dnh->dnh_dnode; + } else { + dn = dnode_create(os, dn_block + idx, db, + object, dnh); + } + } + + mutex_enter(&dn->dn_mtx); + if (!refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) { + DNODE_STAT_BUMP(dnode_hold_free_refcount); + mutex_exit(&dn->dn_mtx); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(EEXIST)); + } + + dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR); + DNODE_STAT_BUMP(dnode_hold_free_hits); + } else { dbuf_rele(db, FTAG); - return ((flag & DNODE_MUST_BE_ALLOCATED) ? ENOENT : EEXIST); + return (SET_ERROR(EINVAL)); } + + if (dn->dn_free_txg) { + DNODE_STAT_BUMP(dnode_hold_free_txg); + type = dn->dn_type; + mutex_exit(&dn->dn_mtx); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ? 
+ ENOENT : EEXIST)); + } + if (refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dnh); + mutex_exit(&dn->dn_mtx); /* Now we can rely on the hold to prevent the dnode from moving. */ - zrl_remove(&dnh->dnh_zrlock); + dnode_slots_rele(dnc, idx, slots); DNODE_VERIFY(dn); ASSERT3P(dn->dn_dbuf, ==, db); @@ -1204,7 +1538,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) { - return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); + return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag, + dnp)); } /* @@ -1936,17 +2271,21 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, error = SET_ERROR(ESRCH); } else if (lvl == 0) { dnode_phys_t *dnp = data; - span = DNODE_SHIFT; - ASSERT(dn->dn_type == DMU_OT_DNODE); - for (i = (*offset >> span) & (blkfill - 1); - i >= 0 && i < blkfill; i += inc) { + ASSERT(dn->dn_type == DMU_OT_DNODE); + ASSERT(!(flags & DNODE_FIND_BACKWARDS)); + + for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1); + i < blkfill; i += dnp[i].dn_extra_slots + 1) { if ((dnp[i].dn_type == DMU_OT_NONE) == hole) break; - *offset += (1ULL << span) * inc; } - if (i < 0 || i == blkfill) + + if (i == blkfill) error = SET_ERROR(ESRCH); + + *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) + + (i << DNODE_SHIFT); } else { blkptr_t *bp = data; uint64_t start = *offset; diff --git a/uts/common/fs/zfs/dnode_sync.c b/uts/common/fs/zfs/dnode_sync.c index 02f263c82e42..a37607e0e307 100644 --- a/uts/common/fs/zfs/dnode_sync.c +++ b/uts/common/fs/zfs/dnode_sync.c @@ -553,7 +553,8 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) ASSERT(dn->dn_free_txg > 0); if (dn->dn_allocated_txg != dn->dn_free_txg) dmu_buf_will_dirty(&dn->dn_dbuf->db, tx); - bzero(dn->dn_phys, sizeof (dnode_phys_t)); + bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots); + dnode_free_interior_slots(dn); mutex_enter(&dn->dn_mtx); dn->dn_type = DMU_OT_NONE; @@ -561,6 +562,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_have_spill = B_FALSE; + dn->dn_num_slots = 1; mutex_exit(&dn->dn_mtx); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); @@ -587,7 +589,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); ASSERT(dnp->dn_type != DMU_OT_NONE || - bcmp(dnp, &zerodn, DNODE_SIZE) == 0); + bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0); DNODE_VERIFY(dn); ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); @@ -619,6 +621,9 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnp->dn_bonustype = dn->dn_bonustype; dnp->dn_bonuslen = dn->dn_bonuslen; } + + dnp->dn_extra_slots = dn->dn_num_slots - 1; + ASSERT(dnp->dn_nlevels > 1 || BP_IS_HOLE(&dnp->dn_blkptr[0]) || BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) || @@ -651,7 +656,8 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnp->dn_bonuslen = 0; else dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff]; - ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN); + ASSERT(dnp->dn_bonuslen <= + DN_SLOTS_TO_BONUSLEN(dnp->dn_extra_slots + 1)); dn->dn_next_bonuslen[txgoff] = 0; } @@ -691,7 +697,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); if (kill_spill) { - free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx); + free_blocks(dn, DN_SPILL_BLKPTR(dn->dn_phys), 1, tx); mutex_enter(&dn->dn_mtx); dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR; mutex_exit(&dn->dn_mtx); @@ -721,6 +727,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) return; } + if (dn->dn_num_slots > 
DNODE_MIN_SLOTS) { + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + mutex_enter(&ds->ds_lock); + ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_DNODE] = + B_TRUE; + mutex_exit(&ds->ds_lock); + } + if (dn->dn_next_nlevels[txgoff]) { dnode_increase_indirection(dn, tx); dn->dn_next_nlevels[txgoff] = 0; diff --git a/uts/common/fs/zfs/dsl_scan.c b/uts/common/fs/zfs/dsl_scan.c index 6fd97d9bfcd6..c19e43bd9fa7 100644 --- a/uts/common/fs/zfs/dsl_scan.c +++ b/uts/common/fs/zfs/dsl_scan.c @@ -773,14 +773,18 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, scn->scn_phys.scn_errors++; return (err); } - for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { + for (i = 0, cdnp = buf->b_data; i < epb; + i += cdnp->dn_extra_slots + 1, + cdnp += cdnp->dn_extra_slots + 1) { for (j = 0; j < cdnp->dn_nblkptr; j++) { blkptr_t *cbp = &cdnp->dn_blkptr[j]; dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, zb->zb_blkid * epb + i, j); } } - for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { + for (i = 0, cdnp = buf->b_data; i < epb; + i += cdnp->dn_extra_slots + 1, + cdnp += cdnp->dn_extra_slots + 1) { dsl_scan_visitdnode(scn, ds, ostype, cdnp, zb->zb_blkid * epb + i, tx); } @@ -843,7 +847,7 @@ dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, zbookmark_phys_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, 0, DMU_SPILL_BLKID); - dsl_scan_visitbp(&dnp->dn_spill, + dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp), &czb, dnp, ds, scn, ostype, tx); } } diff --git a/uts/common/fs/zfs/sa.c b/uts/common/fs/zfs/sa.c index f36483d26531..8cb8199088eb 100644 --- a/uts/common/fs/zfs/sa.c +++ b/uts/common/fs/zfs/sa.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -543,12 +544,11 @@ sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) */ static int sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, - dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total, - boolean_t *will_spill) + dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index, + int *total, boolean_t *will_spill) { int var_size = 0; int i; - int full_space; int hdrsize; int extra_hdrsize; @@ -567,7 +567,6 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 : sizeof (sa_hdr_phys_t); - full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size; ASSERT(IS_P2ALIGNED(full_space, 8)); for (i = 0; i != attr_count; i++) { @@ -653,6 +652,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, int buf_space; sa_attr_type_t *attrs, *attrs_start; int i, lot_count; + int dnodesize; int hdrsize; int spillhdrsize = 0; int used; @@ -660,20 +660,24 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, sa_lot_t *lot; int len_idx; int spill_used; + int bonuslen; boolean_t spilling; dmu_buf_will_dirty(hdl->sa_bonus, tx); bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus); + dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize); + bonuslen = DN_BONUS_SIZE(dnodesize); + /* first determine bonus header size and sum of all attributes */ hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus, - SA_BONUS, &i, &used, &spilling); + SA_BONUS, bonuslen, &i, &used, &spilling); if (used > SPA_OLD_MAXBLOCKSIZE) return (SET_ERROR(EFBIG)); VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ? 
- MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) : + MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) : used + hdrsize, tx)); ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) || @@ -690,8 +694,8 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_buf_will_dirty(hdl->sa_spill, tx); spillhdrsize = sa_find_sizes(sa, &attr_desc[i], - attr_count - i, hdl->sa_spill, SA_SPILL, &i, - &spill_used, &dummy); + attr_count - i, hdl->sa_spill, SA_SPILL, + hdl->sa_spill->db_size, &i, &spill_used, &dummy); if (spill_used > SPA_OLD_MAXBLOCKSIZE) return (SET_ERROR(EFBIG)); diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c index b975e5939d16..3ae96d35030f 100644 --- a/uts/common/fs/zfs/spa.c +++ b/uts/common/fs/zfs/spa.c @@ -350,6 +350,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); } + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { + spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, + DNODE_MAX_SIZE, ZPROP_SRC_NONE); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, + DNODE_MIN_SIZE, ZPROP_SRC_NONE); + } + if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, @@ -577,8 +585,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) /* * Must be ZPL, and its property settings - * must be supported by GRUB (compression - * is not gzip, and large blocks are not used). + * must be supported. */ if (dmu_objset_type(os) != DMU_OST_ZFS) { diff --git a/uts/common/fs/zfs/spa_misc.c b/uts/common/fs/zfs/spa_misc.c index 8ba49fed4155..4481fa52003f 100644 --- a/uts/common/fs/zfs/spa_misc.c +++ b/uts/common/fs/zfs/spa_misc.c @@ -990,10 +990,10 @@ spa_aux_activate(vdev_t *vd, avl_tree_t *avl) /* * Spares are tracked globally due to the following constraints: * - * - A spare may be part of multiple pools. - * - A spare may be added to a pool even if it's actively in use within + * - A spare may be part of multiple pools. + * - A spare may be added to a pool even if it's actively in use within * another pool. - * - A spare in use in any pool can only be the source of a replacement if + * - A spare in use in any pool can only be the source of a replacement if * the target is a spare in the same pool. * * We keep track of all spares on the system through the use of a reference @@ -2104,6 +2104,15 @@ spa_maxblocksize(spa_t *spa) return (SPA_OLD_MAXBLOCKSIZE); } +int +spa_maxdnodesize(spa_t *spa) +{ + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) + return (DNODE_MAX_SIZE); + else + return (DNODE_MIN_SIZE); +} + /* * Returns the txg that the last device removal completed. No indirect mappings * have been added since this txg. 
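Because spa_maxdnodesize() reports DNODE_MAX_SIZE only once the large_dnode feature is enabled, callers can bound a requested dnode size against the pool before allocating, as receive_object() does above. A possible validation helper (illustrative only, not part of the patch):

static boolean_t
dnodesize_is_valid(spa_t *spa, int dnodesize)
{
	return (dnodesize >= DNODE_MIN_SIZE &&
	    dnodesize <= spa_maxdnodesize(spa) &&
	    P2PHASE(dnodesize, DNODE_MIN_SIZE) == 0);
}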
diff --git a/uts/common/fs/zfs/sys/arc.h b/uts/common/fs/zfs/sys/arc.h index 45db7701e1aa..641ae93e9c23 100644 --- a/uts/common/fs/zfs/sys/arc.h +++ b/uts/common/fs/zfs/sys/arc.h @@ -149,6 +149,7 @@ typedef enum arc_space_type { ARC_SPACE_HDRS, ARC_SPACE_L2HDRS, ARC_SPACE_OTHER, + ARC_SPACE_BONUS, ARC_SPACE_NUMTYPES } arc_space_type_t; diff --git a/uts/common/fs/zfs/sys/dmu.h b/uts/common/fs/zfs/sys/dmu.h index 887a5ff7e3b0..535c13fe05c1 100644 --- a/uts/common/fs/zfs/sys/dmu.h +++ b/uts/common/fs/zfs/sys/dmu.h @@ -358,6 +358,15 @@ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonus_type, int bonus_len, + int dnodesize, dmu_tx_t *tx); +int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonus_type, int bonus_len, + int dnodesize, dmu_tx_t *tx); +int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, + dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, + int bonuslen, int dnodesize, dmu_tx_t *txp); int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, @@ -804,7 +813,8 @@ typedef struct dmu_object_info { uint8_t doi_checksum; uint8_t doi_compress; uint8_t doi_nblkptr; - uint8_t doi_pad[4]; + int8_t doi_pad[4]; + uint64_t doi_dnodesize; uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ uint64_t doi_max_offset; uint64_t doi_fill_count; /* number of non-empty blocks */ @@ -846,6 +856,8 @@ void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512); +void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize); + typedef struct dmu_objset_stats { uint64_t dds_num_clones; /* number of clones of this */ uint64_t dds_creation_txg; @@ -903,6 +915,7 @@ extern struct dsl_dataset *dmu_objset_ds(objset_t *os); extern void dmu_objset_name(objset_t *os, char *buf); extern dmu_objset_type_t dmu_objset_type(objset_t *os); extern uint64_t dmu_objset_id(objset_t *os); +extern uint64_t dmu_objset_dnodesize(objset_t *os); extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, diff --git a/uts/common/fs/zfs/sys/dmu_objset.h b/uts/common/fs/zfs/sys/dmu_objset.h index 25ff8642177d..3028f0436566 100644 --- a/uts/common/fs/zfs/sys/dmu_objset.h +++ b/uts/common/fs/zfs/sys/dmu_objset.h @@ -91,6 +91,7 @@ struct objset { list_node_t os_evicting_node; /* can change, under dsl_dir's locks: */ + uint64_t os_dnodesize; /* default dnode size for new objects */ enum zio_checksum os_checksum; enum zio_compress os_compress; uint8_t os_copies; @@ -129,7 +130,11 @@ struct objset { /* Protected by os_obj_lock */ kmutex_t os_obj_lock; - uint64_t os_obj_next; + uint64_t os_obj_next_chunk; + + /* Per-CPU next object to allocate, protected by atomic ops. 
*/ + uint64_t *os_obj_next_percpu; + int os_obj_next_percpu_len; /* Protected by os_lock */ kmutex_t os_lock; diff --git a/uts/common/fs/zfs/sys/dnode.h b/uts/common/fs/zfs/sys/dnode.h index 89a7b2ef60e4..68872a8e9e88 100644 --- a/uts/common/fs/zfs/sys/dnode.h +++ b/uts/common/fs/zfs/sys/dnode.h @@ -86,12 +86,26 @@ extern "C" { /* * Derived constants. */ -#define DNODE_SIZE (1 << DNODE_SHIFT) -#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) -#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT)) -#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) -#define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1) -#define DN_KILL_SPILLBLK (1) +#define DNODE_MIN_SIZE (1 << DNODE_SHIFT) +#define DNODE_MAX_SIZE (1 << DNODE_BLOCK_SHIFT) +#define DNODE_BLOCK_SIZE (1 << DNODE_BLOCK_SHIFT) +#define DNODE_MIN_SLOTS (DNODE_MIN_SIZE >> DNODE_SHIFT) +#define DNODE_MAX_SLOTS (DNODE_MAX_SIZE >> DNODE_SHIFT) +#define DN_BONUS_SIZE(dnsize) ((dnsize) - DNODE_CORE_SIZE - \ + (1 << SPA_BLKPTRSHIFT)) +#define DN_SLOTS_TO_BONUSLEN(slots) DN_BONUS_SIZE((slots) << DNODE_SHIFT) +#define DN_OLD_MAX_BONUSLEN (DN_BONUS_SIZE(DNODE_MIN_SIZE)) +#define DN_MAX_NBLKPTR ((DNODE_MIN_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) +#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) +#define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1) +#define DN_KILL_SPILLBLK (1) + +#define DN_SLOT_UNINIT ((void *)NULL) /* Uninitialized */ +#define DN_SLOT_FREE ((void *)1UL) /* Free slot */ +#define DN_SLOT_ALLOCATED ((void *)2UL) /* Allocated slot */ +#define DN_SLOT_INTERIOR ((void *)3UL) /* Interior allocated slot */ +#define DN_SLOT_IS_PTR(dn) ((void *)dn > DN_SLOT_INTERIOR) +#define DN_SLOT_IS_VALID(dn) ((void *)dn != NULL) #define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) #define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) @@ -109,6 +123,10 @@ extern "C" { #define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) +#define DN_MAX_BONUS_LEN(dnp) \ + ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? \ + (uint8_t *)DN_SPILL_BLKPTR(dnp) - (uint8_t *)DN_BONUS(dnp) : \ + (uint8_t *)(dnp + (dnp->dn_extra_slots + 1)) - (uint8_t *)DN_BONUS(dnp)) #define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \ (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT) @@ -132,6 +150,57 @@ enum dnode_dirtycontext { /* Does dnode have a SA spill blkptr in bonus? */ #define DNODE_FLAG_SPILL_BLKPTR (1<<2) +/* + * VARIABLE-LENGTH (LARGE) DNODES + * + * The motivation for variable-length dnodes is to eliminate the overhead + * associated with using spill blocks. Spill blocks are used to store + * system attribute data (i.e. file metadata) that does not fit in the + * dnode's bonus buffer. By allowing a larger bonus buffer area the use of + * a spill block can be avoided. Spill blocks potentially incur an + * additional read I/O for every dnode in a dnode block. As a worst case + * example, reading 32 dnodes from a 16k dnode block and all of the spill + * blocks could issue 33 separate reads. Now suppose those dnodes have size + * 1024 and therefore don't need spill blocks. Then the worst case number + * of blocks read is reduced to from 33 to two--one per dnode block. + * + * ZFS-on-Linux systems that make heavy use of extended attributes benefit + * from this feature. 
In particular, ZFS-on-Linux supports the xattr=sa + * dataset property which allows file extended attribute data to be stored + * in the dnode bonus buffer as an alternative to the traditional + * directory-based format. Workloads such as SELinux and the Lustre + * distributed filesystem often store enough xattr data to force spill + * blocks when xattr=sa is in effect. Large dnodes may therefore provide a + * performance benefit to such systems. Other use cases that benefit from + * this feature include files with large ACLs and symbolic links with long + * target names. + * + * The size of a dnode may be a multiple of 512 bytes up to the size of a + * dnode block (currently 16384 bytes). The dn_extra_slots field of the + * on-disk dnode_phys_t structure describes the size of the physical dnode + * on disk. The field represents how many "extra" dnode_phys_t slots a + * dnode consumes in its dnode block. This convention results in a value of + * 0 for 512 byte dnodes which preserves on-disk format compatibility with + * older software which doesn't support large dnodes. + * + * Similarly, the in-memory dnode_t structure has a dn_num_slots field + * to represent the total number of dnode_phys_t slots consumed on disk. + * Thus dn->dn_num_slots is 1 greater than the corresponding + * dnp->dn_extra_slots. This difference in convention was adopted + * because, unlike on-disk structures, backward compatibility is not a + * concern for in-memory objects, so we used a more natural way to + * represent size for a dnode_t. + * + * The default size for newly created dnodes is determined by the value of + * the "dnodesize" dataset property. By default the property is set to + * "legacy" which is compatible with older software. Setting the property + * to "auto" will allow the filesystem to choose the most suitable dnode + * size. Currently this just sets the default dnode size to 1k, but future + * code improvements could dynamically choose a size based on observed + * workload patterns. Dnodes of varying sizes can coexist within the same + * dataset and even within the same dnode block. 
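To make the slot accounting concrete, the arithmetic for a 1K dnode works out as follows using the macros defined earlier in this header (the helper name is hypothetical):

static int
dnodesize_to_max_bonuslen(int dnodesize)
{
	int slots = dnodesize >> DNODE_SHIFT;	/* 1024 >> 9 == 2 slots */

	/*
	 * DN_SLOTS_TO_BONUSLEN(2) == 1024 - DNODE_CORE_SIZE (64) -
	 * sizeof (blkptr_t) (128) == 832 bytes of bonus space.
	 */
	return (DN_SLOTS_TO_BONUSLEN(slots));
}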
+ */ + typedef struct dnode_phys { uint8_t dn_type; /* dmu_object_type_t */ uint8_t dn_indblkshift; /* ln2(indirect block size) */ @@ -143,19 +212,32 @@ typedef struct dnode_phys { uint8_t dn_flags; /* DNODE_FLAG_* */ uint16_t dn_datablkszsec; /* data block size in 512b sectors */ uint16_t dn_bonuslen; /* length of dn_bonus */ - uint8_t dn_pad2[4]; + uint8_t dn_extra_slots; /* # of subsequent slots consumed */ + uint8_t dn_pad2[3]; /* accounting is protected by dn_dirty_mtx */ uint64_t dn_maxblkid; /* largest allocated block ID */ uint64_t dn_used; /* bytes (or sectors) of disk space */ uint64_t dn_pad3[4]; - - blkptr_t dn_blkptr[1]; - uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)]; - blkptr_t dn_spill; + union { + blkptr_t dn_blkptr[1+DN_OLD_MAX_BONUSLEN/sizeof (blkptr_t)]; + struct { + blkptr_t __dn_ignore1; + uint8_t dn_bonus[DN_OLD_MAX_BONUSLEN]; + }; + struct { + blkptr_t __dn_ignore2; + uint8_t __dn_ignore3[DN_OLD_MAX_BONUSLEN - + sizeof (blkptr_t)]; + blkptr_t dn_spill; + }; + }; } dnode_phys_t; +#define DN_SPILL_BLKPTR(dnp) (blkptr_t *)((char *)(dnp) + \ + (((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT)) + struct dnode { /* * Protects the structure of the dnode, including the number of levels @@ -192,6 +274,7 @@ struct dnode { uint32_t dn_datablksz; /* in bytes */ uint64_t dn_maxblkid; uint8_t dn_next_type[TXG_SIZE]; + uint8_t dn_num_slots; /* metadnode slots consumed on disk */ uint8_t dn_next_nblkptr[TXG_SIZE]; uint8_t dn_next_nlevels[TXG_SIZE]; uint8_t dn_next_indblkshift[TXG_SIZE]; @@ -287,7 +370,7 @@ void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx); int dnode_hold(struct objset *dd, uint64_t object, void *ref, dnode_t **dnp); -int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, +int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots, void *ref, dnode_t **dnp); boolean_t dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); @@ -295,9 +378,9 @@ void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); void dnode_sync(dnode_t *dn, dmu_tx_t *tx); void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); + dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx); void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); + dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx); void dnode_free(dnode_t *dn, dmu_tx_t *tx); void dnode_byteswap(dnode_phys_t *dnp); void dnode_buf_byteswap(void *buf, size_t size); @@ -313,6 +396,7 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off, int minlvl, uint64_t blkfill, uint64_t txg); void dnode_evict_dbufs(dnode_t *dn); void dnode_evict_bonus(dnode_t *dn); +void dnode_free_interior_slots(dnode_t *dn); boolean_t dnode_needs_remap(const dnode_t *dn); #define DNODE_IS_CACHEABLE(_dn) \ @@ -324,6 +408,140 @@ boolean_t dnode_needs_remap(const dnode_t *dn); ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA) +/* + * Used for dnodestats kstat. + */ +typedef struct dnode_stats { + /* + * Number of failed attempts to hold a meta dnode dbuf. + */ + kstat_named_t dnode_hold_dbuf_hold; + /* + * Number of failed attempts to read a meta dnode dbuf. 
+ */ + kstat_named_t dnode_hold_dbuf_read; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was able + * to hold the requested object number which was allocated. This is + * the common case when looking up any allocated object number. + */ + kstat_named_t dnode_hold_alloc_hits; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not + * able to hold the request object number because it was not allocated. + */ + kstat_named_t dnode_hold_alloc_misses; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not + * able to hold the request object number because the object number + * refers to an interior large dnode slot. + */ + kstat_named_t dnode_hold_alloc_interior; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) needed + * to retry acquiring slot zrl locks due to contention. + */ + kstat_named_t dnode_hold_alloc_lock_retry; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) did not + * need to create the dnode because another thread did so after + * dropping the read lock but before acquiring the write lock. + */ + kstat_named_t dnode_hold_alloc_lock_misses; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) found + * a free dnode instantiated by dnode_create() but not yet allocated + * by dnode_allocate(). + */ + kstat_named_t dnode_hold_alloc_type_none; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was able + * to hold the requested range of free dnode slots. + */ + kstat_named_t dnode_hold_free_hits; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not + * able to hold the requested range of free dnode slots because + * at least one slot was allocated. + */ + kstat_named_t dnode_hold_free_misses; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not + * able to hold the requested range of free dnode slots because + * after acquiring the zrl lock at least one slot was allocated. + */ + kstat_named_t dnode_hold_free_lock_misses; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) needed + * to retry acquiring slot zrl locks due to contention. + */ + kstat_named_t dnode_hold_free_lock_retry; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested + * a range of dnode slots which were held by another thread. + */ + kstat_named_t dnode_hold_free_refcount; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested + * a range of dnode slots which would overflow the dnode_phys_t. + */ + kstat_named_t dnode_hold_free_overflow; + /* + * Number of times a dnode_hold(...) was attempted on a dnode + * which had already been unlinked in an earlier txg. + */ + kstat_named_t dnode_hold_free_txg; + /* + * Number of times dnode_free_interior_slots() needed to retry + * acquiring a slot zrl lock due to contention. + */ + kstat_named_t dnode_free_interior_lock_retry; + /* + * Number of new dnodes allocated by dnode_allocate(). + */ + kstat_named_t dnode_allocate; + /* + * Number of dnodes re-allocated by dnode_reallocate(). + */ + kstat_named_t dnode_reallocate; + /* + * Number of meta dnode dbufs evicted. + */ + kstat_named_t dnode_buf_evict; + /* + * Number of times dmu_object_alloc*() reached the end of the existing + * object ID chunk and advanced to a new one. + */ + kstat_named_t dnode_alloc_next_chunk; + /* + * Number of times multiple threads attempted to allocate a dnode + * from the same block of free dnodes. 
+ */ + kstat_named_t dnode_alloc_race; + /* + * Number of times dmu_object_alloc*() was forced to advance to the + * next meta dnode dbuf due to an error from dmu_object_next(). + */ + kstat_named_t dnode_alloc_next_block; + /* + * Statistics for tracking dnodes which have been moved. + */ + kstat_named_t dnode_move_invalid; + kstat_named_t dnode_move_recheck1; + kstat_named_t dnode_move_recheck2; + kstat_named_t dnode_move_special; + kstat_named_t dnode_move_handle; + kstat_named_t dnode_move_rwlock; + kstat_named_t dnode_move_active; +} dnode_stats_t; + +extern dnode_stats_t dnode_stats; + +#define DNODE_STAT_INCR(stat, val) \ + atomic_add_64(&dnode_stats.stat.value.ui64, (val)); +#define DNODE_STAT_BUMP(stat) \ + DNODE_STAT_INCR(stat, 1); + #ifdef ZFS_DEBUG /* diff --git a/uts/common/fs/zfs/sys/dsl_dataset.h b/uts/common/fs/zfs/sys/dsl_dataset.h index 03dca17bee6d..15a64a832630 100644 --- a/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/uts/common/fs/zfs/sys/dsl_dataset.h @@ -86,6 +86,13 @@ struct dsl_pool; */ #define DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks" +/* + * This field is present (with value=0) if this dataset may contain large + * dnodes (>512B). If it is present, then this dataset is counted in the + * refcount of the SPA_FEATURE_LARGE_DNODE feature. + */ +#define DS_FIELD_LARGE_DNODE "org.zfsonlinux:large_dnode" + /* * These fields are set on datasets that are in the middle of a resumable * receive, and allow the sender to resume the send if it is interrupted. diff --git a/uts/common/fs/zfs/sys/sa_impl.h b/uts/common/fs/zfs/sys/sa_impl.h index 50874c6bf083..4bea074b545f 100644 --- a/uts/common/fs/zfs/sys/sa_impl.h +++ b/uts/common/fs/zfs/sys/sa_impl.h @@ -101,7 +101,7 @@ typedef struct sa_lot { sa_attr_type_t *lot_attrs; /* array of attr #'s */ uint32_t lot_var_sizes; /* how many aren't fixed size */ uint32_t lot_attr_count; /* total attr count */ - list_t lot_idx_tab; /* should be only a couple of entries */ + list_t lot_idx_tab; /* should be only a couple of entries */ int lot_instance; /* used with lot_hash to identify entry */ } sa_lot_t; @@ -134,7 +134,7 @@ typedef struct sa_idx_tab { * adding a completely new attribute is a very rare operation. */ struct sa_os { - kmutex_t sa_lock; + kmutex_t sa_lock; boolean_t sa_need_attr_registration; boolean_t sa_force_spill; uint64_t sa_master_obj; @@ -237,7 +237,7 @@ struct sa_handle { #define SA_BONUSTYPE_FROM_DB(db) \ (dmu_get_bonustype((dmu_buf_t *)db)) -#define SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t)) +#define SA_BLKPTR_SPACE (DN_OLD_MAX_BONUSLEN - sizeof (blkptr_t)) #define SA_LAYOUT_NUM(x, type) \ ((!IS_SA_BONUSTYPE(type) ? 
0 : (((IS_SA_BONUSTYPE(type)) && \ diff --git a/uts/common/fs/zfs/sys/spa.h b/uts/common/fs/zfs/sys/spa.h index dc5da8fd778d..3ecec3df3956 100644 --- a/uts/common/fs/zfs/sys/spa.h +++ b/uts/common/fs/zfs/sys/spa.h @@ -843,6 +843,7 @@ extern boolean_t spa_is_root(spa_t *spa); extern boolean_t spa_writeable(spa_t *spa); extern boolean_t spa_has_pending_synctask(spa_t *spa); extern int spa_maxblocksize(spa_t *spa); +extern int spa_maxdnodesize(spa_t *spa); extern boolean_t spa_has_checkpoint(spa_t *spa); extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa); extern boolean_t spa_suspend_async_destroy(spa_t *spa); diff --git a/uts/common/fs/zfs/sys/zap.h b/uts/common/fs/zfs/sys/zap.h index 10cb6b449bde..2c909ff53aa2 100644 --- a/uts/common/fs/zfs/sys/zap.h +++ b/uts/common/fs/zfs/sys/zap.h @@ -115,16 +115,30 @@ typedef enum zap_flags { /* * Create a new zapobj with no attributes and return its object number. + * + * dnodesize specifies the on-disk size of the dnode for the new zapobj. + * Valid values are multiples of 512 up to DNODE_MAX_SIZE. */ uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_dnsize(objset_t *ds, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_norm_dnsize(objset_t *ds, int normflags, + dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dmu_tx_t *tx); uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_flags_dnsize(objset_t *os, int normflags, + zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, + int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dmu_tx_t *tx); uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, dmu_tx_t *tx); +uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, + uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx); /* * Initialize an already-allocated object. 
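The _dnsize() variants declared above take the on-disk dnode size in bytes rather than a slot count; the non-dnsize wrappers simply pass 0, which preserves the legacy 512-byte dnode. A minimal sketch of a caller, assuming an open transaction (the function name and the 1K size are illustrative, not part of this patch):

	static uint64_t
	example_create_wide_zap(objset_t *os, dmu_tx_t *tx)
	{
		/* 1024 is a multiple of 512 and no larger than DNODE_MAX_SIZE */
		return (zap_create_dnsize(os, DMU_OT_DIRECTORY_CONTENTS,
		    DMU_OT_NONE, 0, 1024, tx));
	}
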
@@ -138,9 +152,14 @@ void mzap_create_impl(objset_t *os, uint64_t obj, int normflags, */ int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +int zap_create_claim_dnsize(objset_t *ds, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); int zap_create_claim_norm(objset_t *ds, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj, + int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); /* * The zapobj passed in must be a valid ZAP object for all of the diff --git a/uts/common/fs/zfs/sys/zfs_ioctl.h b/uts/common/fs/zfs/sys/zfs_ioctl.h index f3df29218d41..824d1d8bb70f 100644 --- a/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -93,7 +93,7 @@ typedef enum drr_headertype { #define DMU_BACKUP_FEATURE_RESUMING (1 << 20) /* flag #21 is reserved for a Delphix feature */ #define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22) -/* flag #23 is reserved for the large dnode feature */ +#define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 23) /* flag #24 is reserved for the raw send feature */ /* flag #25 is reserved for the ZSTD compression feature */ @@ -104,7 +104,7 @@ typedef enum drr_headertype { DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \ DMU_BACKUP_FEATURE_RESUMING | \ - DMU_BACKUP_FEATURE_LARGE_BLOCKS | \ + DMU_BACKUP_FEATURE_LARGE_BLOCKS | DMU_BACKUP_FEATURE_LARGE_DNODE | \ DMU_BACKUP_FEATURE_COMPRESSED) /* Are all features in the given flag word currently supported? 
*/ @@ -120,7 +120,7 @@ typedef enum dmu_send_resume_token_version { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * | reserved | feature-flags |C|S| + * | reserved | feature-flags |C|S| * +-------+-------+-------+-------+-------+-------+-------+-------+ * * The low order two bits indicate the header type: SUBSTREAM (0x1) @@ -197,7 +197,8 @@ typedef struct dmu_replay_record { uint32_t drr_bonuslen; uint8_t drr_checksumtype; uint8_t drr_compress; - uint8_t drr_pad[6]; + uint8_t drr_dn_slots; + uint8_t drr_pad[5]; uint64_t drr_toguid; /* bonus content follows */ } drr_object; diff --git a/uts/common/fs/zfs/sys/zfs_znode.h b/uts/common/fs/zfs/sys/zfs_znode.h index 8c4f8f7dc850..a9f9876530bd 100644 --- a/uts/common/fs/zfs/sys/zfs_znode.h +++ b/uts/common/fs/zfs/sys/zfs_znode.h @@ -185,6 +185,7 @@ typedef struct znode { uint_t z_blksz; /* block size in bytes */ uint_t z_seq; /* modification sequence number */ uint64_t z_mapcnt; /* number of pages mapped to file */ + uint64_t z_dnodesize; /* dnode size */ uint64_t z_gen; /* generation (cached) */ uint64_t z_size; /* file size (cached) */ uint64_t z_atime[2]; /* atime (cached) */ diff --git a/uts/common/fs/zfs/sys/zil.h b/uts/common/fs/zfs/sys/zil.h index b1567acd4add..e6b18da95be8 100644 --- a/uts/common/fs/zfs/sys/zil.h +++ b/uts/common/fs/zfs/sys/zil.h @@ -157,7 +157,7 @@ typedef enum zil_create { #define TX_ACL 13 /* Set ACL */ #define TX_CREATE_ACL 14 /* create with ACL */ #define TX_CREATE_ATTR 15 /* create + attrs */ -#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */ +#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */ #define TX_MKDIR_ACL 17 /* mkdir with ACL */ #define TX_MKDIR_ATTR 18 /* mkdir with attr */ #define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ @@ -184,6 +184,19 @@ typedef enum zil_create { (txtype) == TX_ACL || \ (txtype) == TX_WRITE2) +/* + * The number of dnode slots consumed by the object is stored in the 8 + * unused upper bits of the object ID. We subtract 1 from the value + * stored on disk for compatibility with implementations that don't + * support large dnodes. The slot count for a single-slot dnode will + * contain 0 for those bits to preserve the log record format for + * "small" dnodes. + */ +#define LR_FOID_GET_SLOTS(oid) (BF64_GET((oid), 56, 8) + 1) +#define LR_FOID_SET_SLOTS(oid, x) BF64_SET((oid), 56, 8, (x) - 1) +#define LR_FOID_GET_OBJ(oid) BF64_GET((oid), 0, DN_MAX_OBJECT_SHIFT) +#define LR_FOID_SET_OBJ(oid, x) BF64_SET((oid), 0, DN_MAX_OBJECT_SHIFT, (x)) + /* * Format of log records. 
* The fields are carefully defined to allow them to be aligned @@ -422,7 +435,7 @@ extern void zil_commit_impl(zilog_t *zilog, uint64_t oid); extern int zil_reset(const char *osname, void *txarg); extern int zil_claim(struct dsl_pool *dp, struct dsl_dataset *ds, void *txarg); -extern int zil_check_log_chain(struct dsl_pool *dp, +extern int zil_check_log_chain(struct dsl_pool *dp, struct dsl_dataset *ds, void *tx); extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); extern void zil_clean(zilog_t *zilog, uint64_t synced_txg); diff --git a/uts/common/fs/zfs/zap.c b/uts/common/fs/zfs/zap.c index e9ed41e556ea..7a1994f603c1 100644 --- a/uts/common/fs/zfs/zap.c +++ b/uts/common/fs/zfs/zap.c @@ -948,8 +948,17 @@ uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, dmu_tx_t *tx) { - uint64_t new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx); - VERIFY(new_obj != 0); + return (zap_create_link_dnsize(os, ot, parent_obj, name, 0, tx)); +} + +uint64_t +zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, + const char *name, int dnodesize, dmu_tx_t *tx) +{ + uint64_t new_obj; + + VERIFY((new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0, + dnodesize, tx)) > 0); VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj, tx)); diff --git a/uts/common/fs/zfs/zap_micro.c b/uts/common/fs/zfs/zap_micro.c index b07079ed4408..d093fe1e736e 100644 --- a/uts/common/fs/zfs/zap_micro.c +++ b/uts/common/fs/zfs/zap_micro.c @@ -693,8 +693,16 @@ int zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - return (zap_create_claim_norm(os, obj, - 0, ot, bonustype, bonuslen, tx)); + return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen, + 0, tx)); +} + +int +zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + return (zap_create_claim_norm_dnsize(os, obj, + 0, ot, bonustype, bonuslen, dnodesize, tx)); } int @@ -702,8 +710,19 @@ zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); - int err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); + return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype, + bonuslen, 0, tx)); +} + +int +zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags, + dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dmu_tx_t *tx) +{ + int err; + + err = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen, + dnodesize, tx); if (err != 0) return (err); mzap_create_impl(os, obj, normflags, 0, tx); @@ -717,12 +736,29 @@ zap_create(objset_t *os, dmu_object_type_t ot, return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); } +uint64_t +zap_create_dnsize(objset_t *os, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen, + dnodesize, tx)); +} + uint64_t zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); - uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); + return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen, + 0, tx)); +} + +uint64_t 
+zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen, + dnodesize, tx); mzap_create_impl(os, obj, normflags, 0, tx); return (obj); @@ -734,7 +770,17 @@ zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); - uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); + return (zap_create_flags_dnsize(os, normflags, flags, ot, + leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx)); +} + +uint64_t +zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen, + dnodesize, tx); ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT && leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT && diff --git a/uts/common/fs/zfs/zfs_acl.c b/uts/common/fs/zfs/zfs_acl.c index 5906db36567e..149103206a8e 100644 --- a/uts/common/fs/zfs/zfs_acl.c +++ b/uts/common/fs/zfs/zfs_acl.c @@ -895,7 +895,7 @@ zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, int entry_type; mode_t mode; mode_t seen = 0; - zfs_ace_hdr_t *acep = NULL; + zfs_ace_hdr_t *acep = NULL; uint64_t who; uint16_t iflags, type; uint32_t access_mask; @@ -1262,7 +1262,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE, otype == DMU_OT_ACL ? - DN_MAX_BONUSLEN : 0, tx); + DN_OLD_MAX_BONUSLEN : 0, tx); } else { (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid, aclp->z_acl_bytes, 0, tx); @@ -1337,12 +1337,12 @@ zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim, uint64_t who; int new_count, new_bytes; int ace_size; - int entry_type; + int entry_type; uint16_t iflags, type; uint32_t access_mask; zfs_acl_node_t *newnode; - size_t abstract_size = aclp->z_ops.ace_abstract_size(); - void *zacep; + size_t abstract_size = aclp->z_ops.ace_abstract_size(); + void *zacep; boolean_t isdir; trivial_acl_t masks; @@ -1786,7 +1786,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) zfs_acl_t *aclp; ulong_t mask; int error; - int count = 0; + int count = 0; int largeace = 0; mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | @@ -2107,7 +2107,7 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, zfs_acl_t *aclp; int error; uid_t uid = crgetuid(cr); - uint64_t who; + uint64_t who; uint16_t type, iflags; uint16_t entry_type; uint32_t access_mask; @@ -2380,9 +2380,9 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) uint32_t working_mode; int error; int is_attr; - boolean_t check_privs; + boolean_t check_privs; znode_t *xzp; - znode_t *check_zp = zp; + znode_t *check_zp = zp; mode_t needed_bits; uid_t owner; diff --git a/uts/common/fs/zfs/zfs_ioctl.c b/uts/common/fs/zfs/zfs_ioctl.c index 87435f18ac1a..833cc26302ab 100644 --- a/uts/common/fs/zfs/zfs_ioctl.c +++ b/uts/common/fs/zfs/zfs_ioctl.c @@ -4055,6 +4055,24 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) } break; + case ZFS_PROP_DNODESIZE: + /* Dnode sizes above 512 need the feature to be enabled */ + if (nvpair_value_uint64(pair, &intval) == 0 && + intval != ZFS_DNSIZE_LEGACY) { + spa_t *spa; + + if ((err = 
spa_open(dsname, &spa, FTAG)) != 0) + return (err); + + if (!spa_feature_is_enabled(spa, + SPA_FEATURE_LARGE_DNODE)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + spa_close(spa, FTAG); + } + break; + case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); diff --git a/uts/common/fs/zfs/zfs_log.c b/uts/common/fs/zfs/zfs_log.c index fbac2d99c289..1afaa8434bef 100644 --- a/uts/common/fs/zfs/zfs_log.c +++ b/uts/common/fs/zfs/zfs_log.c @@ -280,6 +280,8 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, lr = (lr_create_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; lr->lr_foid = zp->z_id; + /* Store dnode slot count in 8 bits above object id. */ + LR_FOID_SET_SLOTS(lr->lr_foid, zp->z_dnodesize >> DNODE_SHIFT); lr->lr_mode = zp->z_mode; if (!IS_EPHEMERAL(zp->z_uid)) { lr->lr_uid = (uint64_t)zp->z_uid; diff --git a/uts/common/fs/zfs/zfs_replay.c b/uts/common/fs/zfs/zfs_replay.c index de8d9c10b616..f75ec48cd7a6 100644 --- a/uts/common/fs/zfs/zfs_replay.c +++ b/uts/common/fs/zfs/zfs_replay.c @@ -278,6 +278,8 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) void *fuidstart; size_t xvatlen = 0; uint64_t txtype; + uint64_t objid; + uint64_t dnodesize; int error; txtype = (lr->lr_common.lrc_txtype & ~TX_CI); @@ -303,19 +305,24 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); + objid = LR_FOID_GET_OBJ(lr->lr_foid); + dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; + xva_init(&xva); zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, - lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); + lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); /* * All forms of zfs create (create, mkdir, mkxattrdir, symlink) * eventually end up in zfs_mknode(), which assigns the object's - * creation time and generation number. The generic VOP_CREATE() - * doesn't have either concept, so we smuggle the values inside - * the vattr's otherwise unused va_ctime and va_nblocks fields. + * creation time, generation number, and dnode size. The generic + * zfs_create() has no concept of these attributes, so we smuggle + * the values inside the vattr's otherwise unused va_ctime, + * va_nblocks, and va_fsid fields. */ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); xva.xva_vattr.va_nblocks = lr->lr_gen; + xva.xva_vattr.va_fsid = dnodesize; error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL); if (error != ENOENT) @@ -432,21 +439,26 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); + uint64_t objid = LR_FOID_GET_OBJ(lr->lr_foid); + int dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; + xva_init(&xva); zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, - lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); + lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); /* * All forms of zfs create (create, mkdir, mkxattrdir, symlink) * eventually end up in zfs_mknode(), which assigns the object's - * creation time and generation number. The generic VOP_CREATE() - * doesn't have either concept, so we smuggle the values inside - * the vattr's otherwise unused va_ctime and va_nblocks fields. + * creation time, generation number, and dnode slot count. 
The + * generic zfs_create() has no concept of these attributes, so + * we smuggle the values inside the vattr's otherwise unused + * va_ctime, va_nblocks and va_fsid fields. */ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); xva.xva_vattr.va_nblocks = lr->lr_gen; + xva.xva_vattr.va_fsid = dnodesize; - error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL); + error = dmu_object_info(zfsvfs->z_os, objid, NULL); if (error != ENOENT) goto out; diff --git a/uts/common/fs/zfs/zfs_sa.c b/uts/common/fs/zfs/zfs_sa.c index 3a472aa11a45..a39cff1a7b9b 100644 --- a/uts/common/fs/zfs/zfs_sa.c +++ b/uts/common/fs/zfs/zfs_sa.c @@ -97,8 +97,7 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) { - VERIFY(dmu_set_bonus(db, - len + ZFS_OLD_ZNODE_PHYS_SIZE, tx) == 0); + VERIFY0(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx)); if (len) { bcopy(link, (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, len); diff --git a/uts/common/fs/zfs/zfs_znode.c b/uts/common/fs/zfs/zfs_znode.c index 536216deafe9..b56cb7bd700d 100644 --- a/uts/common/fs/zfs/zfs_znode.c +++ b/uts/common/fs/zfs/zfs_znode.c @@ -60,6 +60,7 @@ #include #include +#include #include #include #include @@ -799,9 +800,10 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, timestruc_t now; uint64_t gen, obj; int bonuslen; + int dnodesize; sa_handle_t *sa_hdl; dmu_object_type_t obj_type; - sa_bulk_attr_t sa_attrs[ZPL_END]; + sa_bulk_attr_t *sa_attrs; int cnt = 0; zfs_acl_locator_cb_t locate = { 0 }; @@ -811,15 +813,20 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, obj = vap->va_nodeid; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ + dnodesize = vap->va_fsid; /* ditto */ } else { obj = 0; gethrestime(&now); gen = dmu_tx_get_txg(tx); + dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); } + if (dnodesize == 0) + dnodesize = DNODE_MIN_SIZE; + obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; bonuslen = (obj_type == DMU_OT_SA) ? - DN_MAX_BONUSLEN : ZFS_OLD_ZNODE_PHYS_SIZE; + DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; /* * Create a new DMU object. 
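To make the foid encoding used by zfs_log_create() and zfs_replay_create() concrete, a small round-trip sketch follows; the object number 0x1234 and the 2K dnode size are made-up values. A 2K dnode spans four 512-byte slots, which is stored in the upper 8 bits as 3 so that legacy single-slot records keep those bits zero.

	uint64_t foid = 0x1234;				/* hypothetical object number */
	LR_FOID_SET_SLOTS(foid, 2048 >> DNODE_SHIFT);	/* 4 slots, stored as 3 */
	ASSERT3U(LR_FOID_GET_OBJ(foid), ==, 0x1234);
	ASSERT3U(LR_FOID_GET_SLOTS(foid) << DNODE_SHIFT, ==, 2048);
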
@@ -832,28 +839,28 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, */ if (vap->va_type == VDIR) { if (zfsvfs->z_replay) { - VERIFY0(zap_create_claim_norm(zfsvfs->z_os, obj, + VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, - obj_type, bonuslen, tx)); + obj_type, bonuslen, dnodesize, tx)); } else { - obj = zap_create_norm(zfsvfs->z_os, + obj = zap_create_norm_dnsize(zfsvfs->z_os, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, - obj_type, bonuslen, tx); + obj_type, bonuslen, dnodesize, tx); } } else { if (zfsvfs->z_replay) { - VERIFY0(dmu_object_claim(zfsvfs->z_os, obj, + VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, - obj_type, bonuslen, tx)); + obj_type, bonuslen, dnodesize, tx)); } else { - obj = dmu_object_alloc(zfsvfs->z_os, + obj = dmu_object_alloc_dnsize(zfsvfs->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, - obj_type, bonuslen, tx); + obj_type, bonuslen, dnodesize, tx); } } ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); - VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); + VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); /* * If this is the root, fix up the half-initialized parent pointer @@ -925,6 +932,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, * order for DMU_OT_ZNODE is critical since it needs to be constructed * in the old znode_phys_t format. Don't change this ordering */ + sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), @@ -950,10 +958,10 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, - &acl_ids->z_fuid, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, - &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), + NULL, &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), + NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), @@ -1019,6 +1027,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, (*zpp)->z_pflags = pflags; (*zpp)->z_mode = mode; + (*zpp)->z_dnodesize = dnodesize; if (vap->va_mask & AT_XVATTR) zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); @@ -1027,6 +1036,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } + kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); } diff --git a/uts/common/fs/zfs/zil.c b/uts/common/fs/zfs/zil.c index 9ce72d16f6a0..44df4c96f3ce 100644 --- a/uts/common/fs/zfs/zil.c +++ b/uts/common/fs/zfs/zil.c @@ -63,9 +63,9 @@ * representation, and the on-disk representation). The on-disk format * consists of 3 parts: * - * - a single, per-dataset, ZIL header; which points to a chain of - * - zero or more ZIL blocks; each of which contains - * - zero or more ZIL records + * - a single, per-dataset, ZIL header; which points to a chain of + * - zero or more ZIL blocks; each of which contains + * - zero or more ZIL records * * A ZIL record holds the information necessary to replay a single * system call transaction. 
A ZIL block can hold many ZIL records, and @@ -1355,7 +1355,7 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) uint64_t zil_block_buckets[] = { 4096, /* non TX_WRITE */ 8192+4096, /* data base */ - 32*1024 + 4096, /* NFS writes */ + 32*1024 + 4096, /* NFS writes */ UINT64_MAX }; @@ -1840,7 +1840,8 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) list_insert_tail(&itxs->i_sync_list, itx); } else { avl_tree_t *t = &itxs->i_async_tree; - uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid; + uint64_t foid = + LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid); itx_async_node_t *ian; avl_index_t where; @@ -3088,7 +3089,8 @@ zil_close(zilog_t *zilog) if (zilog_is_dirty(zilog)) zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg); - VERIFY(!zilog_is_dirty(zilog)); + if (txg < spa_freeze_txg(zilog->zl_spa)) + VERIFY(!zilog_is_dirty(zilog)); zilog->zl_get_data = NULL; @@ -3303,7 +3305,7 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) */ if (TX_OOO(txtype)) { error = dmu_object_info(zilog->zl_os, - ((lr_ooo_t *)lr)->lr_foid, NULL); + LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL); if (error == ENOENT || error == EEXIST) return (0); } diff --git a/uts/common/sys/fs/zfs.h b/uts/common/sys/fs/zfs.h index 4c26edaecde6..b0dbfe0f2537 100644 --- a/uts/common/sys/fs/zfs.h +++ b/uts/common/sys/fs/zfs.h @@ -148,6 +148,7 @@ typedef enum { ZFS_PROP_DEDUP, ZFS_PROP_MLSLABEL, ZFS_PROP_SYNC, + ZFS_PROP_DNODESIZE, ZFS_PROP_REFRATIO, ZFS_PROP_WRITTEN, ZFS_PROP_CLONES, @@ -211,6 +212,7 @@ typedef enum { ZPOOL_PROP_BOOTSIZE, ZPOOL_PROP_CHECKPOINT, ZPOOL_PROP_TNAME, + ZPOOL_PROP_MAXDNODESIZE, ZPOOL_NUM_PROPS } zpool_prop_t; @@ -365,6 +367,16 @@ typedef enum { ZFS_SYNC_DISABLED = 2 } zfs_sync_type_t; +typedef enum { + ZFS_DNSIZE_LEGACY = 0, + ZFS_DNSIZE_AUTO = 1, + ZFS_DNSIZE_1K = 1024, + ZFS_DNSIZE_2K = 2048, + ZFS_DNSIZE_4K = 4096, + ZFS_DNSIZE_8K = 8192, + ZFS_DNSIZE_16K = 16384 +} zfs_dnsize_type_t; + typedef enum { ZFS_REDUNDANT_METADATA_ALL, ZFS_REDUNDANT_METADATA_MOST