From aee0fec1fb90a00bd54e6118b677781df7f27daf Mon Sep 17 00:00:00 2001 From: Martin Matuska Date: Mon, 11 Feb 2013 08:07:56 +0000 Subject: [PATCH 1/7] Update vendor/illumos/dist and vendor-sys/illumos/dist to illumos-gate 13953:0cc6917308f7 Illumos dtrace issues: 3529 iostat should display time used by dtrace --- cmd/stat/common/statcommon.h | 6 +++++- uts/common/dtrace/dtrace.c | 9 +++++++-- uts/common/sys/cpuvar.h | 4 ++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/cmd/stat/common/statcommon.h b/cmd/stat/common/statcommon.h index 9ee077431ddb..fd3e9d2d9ba7 100644 --- a/cmd/stat/common/statcommon.h +++ b/cmd/stat/common/statcommon.h @@ -20,7 +20,10 @@ */ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +/* * Common routines for acquiring snapshots of kstats for * iostat, mpstat, and vmstat. */ @@ -221,6 +224,7 @@ struct snapshot { size_t s_iodevs_is_name_maxlen; struct sys_snapshot s_sys; struct biostats s_biostats; + size_t s_nr_active_cpus; }; /* print a message and exit with failure */ diff --git a/uts/common/dtrace/dtrace.c b/uts/common/dtrace/dtrace.c index 7a5e67ea7065..4bee43839fd0 100644 --- a/uts/common/dtrace/dtrace.c +++ b/uts/common/dtrace/dtrace.c @@ -5861,7 +5861,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, size_t size; int vtime, onintr; volatile uint16_t *flags; - hrtime_t now; + hrtime_t now, end; /* * Kick out immediately if this CPU is still being born (in which case @@ -5876,6 +5876,8 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, cpuid = CPU->cpu_id; onintr = CPU_ON_INTR(CPU); + CPU->cpu_dtrace_probes++; + if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE && probe->dtpr_predcache == curthread->t_predcache) { /* @@ -6455,8 +6457,11 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, buf->dtb_offset = offs + ecb->dte_size; } + end = dtrace_gethrtime(); if (vtime) - curthread->t_dtrace_start = dtrace_gethrtime(); + curthread->t_dtrace_start = end; + + CPU->cpu_dtrace_nsec += end - now; dtrace_interrupt_enable(cookie); } diff --git a/uts/common/sys/cpuvar.h b/uts/common/sys/cpuvar.h index d4075d580be7..6c07bcbc8a8c 100644 --- a/uts/common/sys/cpuvar.h +++ b/uts/common/sys/cpuvar.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_CPUVAR_H @@ -187,6 +188,9 @@ typedef struct cpu { uintptr_t cpu_dtrace_caller; /* DTrace: caller, if any */ hrtime_t cpu_dtrace_chillmark; /* DTrace: chill mark time */ hrtime_t cpu_dtrace_chilled; /* DTrace: total chill time */ + uint64_t cpu_dtrace_probes; /* DTrace: total probes fired */ + hrtime_t cpu_dtrace_nsec; /* DTrace: ns in dtrace_probe */ + volatile uint16_t cpu_mstate; /* cpu microstate */ volatile uint16_t cpu_mstate_gen; /* generation counter */ volatile hrtime_t cpu_mstate_start; /* cpu microstate start time */ From 28c0619e5e4478c645d77b874f1fd66817afaab3 Mon Sep 17 00:00:00 2001 From: Martin Matuska Date: Mon, 18 Feb 2013 11:48:08 +0000 Subject: [PATCH 2/7] Update vendor/illumos/dist and vendor-sys/illumos/dist to illumos-gate 13957:512faafc0eaf Illumos ZFS issues: 3537 want pool io kstats --- lib/libzpool/common/kernel.c | 34 ++++++++++++++++- lib/libzpool/common/sys/zfs_context.h | 10 ++++- uts/common/fs/zfs/spa_misc.c | 12 ++++++ uts/common/fs/zfs/sys/spa_impl.h | 2 + uts/common/fs/zfs/vdev_queue.c | 54 +++++++++++++++++++++++++-- 5 files changed, 105 insertions(+), 7 deletions(-) diff --git a/lib/libzpool/common/kernel.c b/lib/libzpool/common/kernel.c index c698c23ef69c..cc0d5428beeb 100644 --- a/lib/libzpool/common/kernel.c +++ b/lib/libzpool/common/kernel.c @@ -80,8 +80,8 @@ zk_thread_create(void (*func)(), void *arg) */ /*ARGSUSED*/ kstat_t * -kstat_create(char *module, int instance, char *name, char *class, - uchar_t type, ulong_t ndata, uchar_t ks_flag) +kstat_create(const char *module, int instance, const char *name, + const char *class, uchar_t type, ulong_t ndata, uchar_t ks_flag) { return (NULL); } @@ -96,6 +96,36 @@ void kstat_delete(kstat_t *ksp) {} +/*ARGSUSED*/ +void +kstat_waitq_enter(kstat_io_t *kiop) +{} + +/*ARGSUSED*/ +void +kstat_waitq_exit(kstat_io_t *kiop) +{} + +/*ARGSUSED*/ +void +kstat_runq_enter(kstat_io_t *kiop) +{} + +/*ARGSUSED*/ +void +kstat_runq_exit(kstat_io_t *kiop) +{} + +/*ARGSUSED*/ +void +kstat_waitq_to_runq(kstat_io_t *kiop) +{} + +/*ARGSUSED*/ +void +kstat_runq_back_to_waitq(kstat_io_t *kiop) +{} + /* * ========================================================================= * mutexes diff --git a/lib/libzpool/common/sys/zfs_context.h b/lib/libzpool/common/sys/zfs_context.h index c638482d2ecb..bcb27cf1a5cd 100644 --- a/lib/libzpool/common/sys/zfs_context.h +++ b/lib/libzpool/common/sys/zfs_context.h @@ -255,10 +255,16 @@ extern void cv_broadcast(kcondvar_t *cv); /* * kstat creation, installation and deletion */ -extern kstat_t *kstat_create(char *, int, - char *, char *, uchar_t, ulong_t, uchar_t); +extern kstat_t *kstat_create(const char *, int, + const char *, const char *, uchar_t, ulong_t, uchar_t); extern void kstat_install(kstat_t *); extern void kstat_delete(kstat_t *); +extern void kstat_waitq_enter(kstat_io_t *); +extern void kstat_waitq_exit(kstat_io_t *); +extern void kstat_runq_enter(kstat_io_t *); +extern void kstat_runq_exit(kstat_io_t *); +extern void kstat_waitq_to_runq(kstat_io_t *); +extern void kstat_runq_back_to_waitq(kstat_io_t *); /* * Kernel memory diff --git a/uts/common/fs/zfs/spa_misc.c b/uts/common/fs/zfs/spa_misc.c index 0e2d0efda34c..405d93c6ced4 100644 --- a/uts/common/fs/zfs/spa_misc.c +++ b/uts/common/fs/zfs/spa_misc.c @@ -480,6 +480,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_iokstat_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); @@ -559,6 +560,13 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) KM_SLEEP) == 0); } + spa->spa_iokstat = kstat_create("zfs", 0, name, + "disk", KSTAT_TYPE_IO, 1, 0); + if (spa->spa_iokstat) { + spa->spa_iokstat->ks_lock = &spa->spa_iokstat_lock; + kstat_install(spa->spa_iokstat); + } + return (spa); } @@ -608,6 +616,9 @@ spa_remove(spa_t *spa) spa_config_lock_destroy(spa); + kstat_delete(spa->spa_iokstat); + spa->spa_iokstat = NULL; + for (int t = 0; t < TXG_SIZE; t++) bplist_destroy(&spa->spa_free_bplist[t]); @@ -625,6 +636,7 @@ spa_remove(spa_t *spa) mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); + mutex_destroy(&spa->spa_iokstat_lock); kmem_free(spa, sizeof (spa_t)); } diff --git a/uts/common/fs/zfs/sys/spa_impl.h b/uts/common/fs/zfs/sys/spa_impl.h index 42ce5556d377..ffd676e2ed77 100644 --- a/uts/common/fs/zfs/sys/spa_impl.h +++ b/uts/common/fs/zfs/sys/spa_impl.h @@ -231,6 +231,8 @@ struct spa { uint64_t spa_deadman_calls; /* number of deadman calls */ uint64_t spa_sync_starttime; /* starting time fo spa_sync */ uint64_t spa_deadman_synctime; /* deadman expiration timer */ + kmutex_t spa_iokstat_lock; /* protects spa_iokstat_* */ + struct kstat *spa_iokstat; /* kstat of io to this pool */ /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/uts/common/fs/zfs/vdev_queue.c b/uts/common/fs/zfs/vdev_queue.c index 2b06040c51b7..a806e9336209 100644 --- a/uts/common/fs/zfs/vdev_queue.c +++ b/uts/common/fs/zfs/vdev_queue.c @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -142,15 +143,62 @@ vdev_queue_fini(vdev_t *vd) static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { + spa_t *spa = zio->io_spa; avl_add(&vq->vq_deadline_tree, zio); avl_add(zio->io_vdev_tree, zio); + + if (spa->spa_iokstat != NULL) { + mutex_enter(&spa->spa_iokstat_lock); + kstat_waitq_enter(spa->spa_iokstat->ks_data); + mutex_exit(&spa->spa_iokstat_lock); + } } static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { + spa_t *spa = zio->io_spa; avl_remove(&vq->vq_deadline_tree, zio); avl_remove(zio->io_vdev_tree, zio); + + if (spa->spa_iokstat != NULL) { + mutex_enter(&spa->spa_iokstat_lock); + kstat_waitq_exit(spa->spa_iokstat->ks_data); + mutex_exit(&spa->spa_iokstat_lock); + } +} + +static void +vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) +{ + spa_t *spa = zio->io_spa; + avl_add(&vq->vq_pending_tree, zio); + if (spa->spa_iokstat != NULL) { + mutex_enter(&spa->spa_iokstat_lock); + kstat_runq_enter(spa->spa_iokstat->ks_data); + mutex_exit(&spa->spa_iokstat_lock); + } +} + +static void +vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) +{ + spa_t *spa = zio->io_spa; + avl_remove(&vq->vq_pending_tree, zio); + if (spa->spa_iokstat != NULL) { + kstat_io_t *ksio = spa->spa_iokstat->ks_data; + + mutex_enter(&spa->spa_iokstat_lock); + kstat_runq_exit(spa->spa_iokstat->ks_data); + if (zio->io_type == ZIO_TYPE_READ) { + ksio->reads++; + ksio->nread += zio->io_size; + } else if (zio->io_type == ZIO_TYPE_WRITE) { + ksio->writes++; + ksio->nwritten += zio->io_size; + } + mutex_exit(&spa->spa_iokstat_lock); + } } static void @@ -317,7 +365,7 @@ again: zio_execute(dio); } while (dio != lio); - avl_add(&vq->vq_pending_tree, aio); + vdev_queue_pending_add(vq, aio); return (aio); } @@ -339,7 +387,7 @@ again: goto again; } - avl_add(&vq->vq_pending_tree, fio); + vdev_queue_pending_add(vq, fio); return (fio); } @@ -395,7 +443,7 @@ vdev_queue_io_done(zio_t *zio) mutex_enter(&vq->vq_lock); - avl_remove(&vq->vq_pending_tree, zio); + vdev_queue_pending_remove(vq, zio); vq->vq_io_complete_ts = ddi_get_lbolt64(); vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp; From b41b004f8768d731aa63e9fe454e873f26421fa5 Mon Sep 17 00:00:00 2001 From: Martin Matuska Date: Sat, 23 Feb 2013 08:57:47 +0000 Subject: [PATCH 3/7] Update vendor-sys/illumos/dist to illumos-gate 13958:1fd91513472c Illumos ZFS issues: 3561 arc_meta_limit should be exposed via kstats 3116 zpool reguid may log negative guids to internal SPA history --- uts/common/fs/zfs/arc.c | 18 ++++++++++++------ uts/common/fs/zfs/spa.c | 2 +- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/uts/common/fs/zfs/arc.c b/uts/common/fs/zfs/arc.c index 258a89ff3ff6..ca3baeaeebb9 100644 --- a/uts/common/fs/zfs/arc.c +++ b/uts/common/fs/zfs/arc.c @@ -294,6 +294,9 @@ typedef struct arc_stats { kstat_named_t arcstat_duplicate_buffers; kstat_named_t arcstat_duplicate_buffers_size; kstat_named_t arcstat_duplicate_reads; + kstat_named_t arcstat_meta_used; + kstat_named_t arcstat_meta_limit; + kstat_named_t arcstat_meta_max; } arc_stats_t; static arc_stats_t arc_stats = { @@ -352,7 +355,10 @@ static arc_stats_t arc_stats = { { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "duplicate_buffers", KSTAT_DATA_UINT64 }, { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, - { "duplicate_reads", KSTAT_DATA_UINT64 } + { "duplicate_reads", KSTAT_DATA_UINT64 }, + { "arc_meta_used", KSTAT_DATA_UINT64 }, + { "arc_meta_limit", KSTAT_DATA_UINT64 }, + { "arc_meta_max", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -414,13 +420,13 @@ static arc_state_t *arc_l2c_only; #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ +#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ +#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ +#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; static uint64_t arc_loaned_bytes; -static uint64_t arc_meta_used; -static uint64_t arc_meta_limit; -static uint64_t arc_meta_max = 0; typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; @@ -1218,7 +1224,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type) break; } - atomic_add_64(&arc_meta_used, space); + ARCSTAT_INCR(arcstat_meta_used, space); atomic_add_64(&arc_size, space); } @@ -1245,7 +1251,7 @@ arc_space_return(uint64_t space, arc_space_type_t type) ASSERT(arc_meta_used >= space); if (arc_meta_max < arc_meta_used) arc_meta_max = arc_meta_used; - atomic_add_64(&arc_meta_used, -space); + ARCSTAT_INCR(arcstat_meta_used, -space); ASSERT(arc_size >= space); atomic_add_64(&arc_size, -space); } diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c index 09a322bad6b7..fb68e91eed16 100644 --- a/uts/common/fs/zfs/spa.c +++ b/uts/common/fs/zfs/spa.c @@ -715,7 +715,7 @@ spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx) vdev_config_dirty(rvd); spa_config_exit(spa, SCL_STATE, FTAG); - spa_history_log_internal(spa, "guid change", tx, "old=%lld new=%lld", + spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", oldguid, *newguid); } From de1744ec7cbd92d6f4cd12058a9682fb01717851 Mon Sep 17 00:00:00 2001 From: Martin Matuska Date: Sat, 23 Feb 2013 09:00:35 +0000 Subject: [PATCH 4/7] Update vendor/illumos/dist and vendor-sys/illumos/dist to illumos-gate 13959:e03e14ddfb4c Illumos ZFS issues: 3552 condensing one space map burns 3 seconds of CPU in spa_sync() thread 3564 spa_sync() spends 5-10% of its time in metaslab_sync() (when not condensing) --- cmd/zdb/zdb.c | 12 +- uts/common/fs/zfs/metaslab.c | 352 +++++++++++++++++++------- uts/common/fs/zfs/space_map.c | 33 ++- uts/common/fs/zfs/sys/metaslab_impl.h | 32 ++- uts/common/fs/zfs/sys/space_map.h | 7 +- uts/common/fs/zfs/vdev.c | 1 + 6 files changed, 326 insertions(+), 111 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 9075c47ad1a3..2234a71528bf 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -571,7 +571,7 @@ static void dump_metaslab_stats(metaslab_t *msp) { char maxbuf[32]; - space_map_t *sm = &msp->ms_map; + space_map_t *sm = msp->ms_map; avl_tree_t *t = sm->sm_pp_root; int free_pct = sm->sm_space * 100 / sm->sm_size; @@ -587,7 +587,7 @@ dump_metaslab(metaslab_t *msp) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; - space_map_t *sm = &msp->ms_map; + space_map_t *sm = msp->ms_map; space_map_obj_t *smo = &msp->ms_smo; char freebuf[32]; @@ -2216,11 +2216,11 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; mutex_enter(&msp->ms_lock); - space_map_unload(&msp->ms_map); - VERIFY(space_map_load(&msp->ms_map, + space_map_unload(msp->ms_map); + VERIFY(space_map_load(msp->ms_map, &zdb_space_map_ops, SM_ALLOC, &msp->ms_smo, spa->spa_meta_objset) == 0); - msp->ms_map.sm_ppd = vd; + msp->ms_map->sm_ppd = vd; mutex_exit(&msp->ms_lock); } } @@ -2243,7 +2243,7 @@ zdb_leak_fini(spa_t *spa) for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; mutex_enter(&msp->ms_lock); - space_map_unload(&msp->ms_map); + space_map_unload(msp->ms_map); mutex_exit(&msp->ms_lock); } } diff --git a/uts/common/fs/zfs/metaslab.c b/uts/common/fs/zfs/metaslab.c index ceb7c6fa5733..9c4f97a3131d 100644 --- a/uts/common/fs/zfs/metaslab.c +++ b/uts/common/fs/zfs/metaslab.c @@ -47,6 +47,14 @@ uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ +/* + * The in-core space map representation is more compact than its on-disk form. + * The zfs_condense_pct determines how much more compact the in-core + * space_map representation must be before we compact it on-disk. + * Values should be greater than or equal to 100. + */ +int zfs_condense_pct = 200; + /* * This value defines the number of allowed allocation failures per vdev. * If a device reaches this threshold in a given txg then we consider skipping @@ -206,9 +214,9 @@ metaslab_compare(const void *x1, const void *x2) /* * If the weights are identical, use the offset to force uniqueness. */ - if (m1->ms_map.sm_start < m2->ms_map.sm_start) + if (m1->ms_map->sm_start < m2->ms_map->sm_start) return (-1); - if (m1->ms_map.sm_start > m2->ms_map.sm_start) + if (m1->ms_map->sm_start > m2->ms_map->sm_start) return (1); ASSERT3P(m1, ==, m2); @@ -723,14 +731,15 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, * addition of new space; and for debugging, it ensures that we'd * data fault on any attempt to use this metaslab before it's ready. */ - space_map_create(&msp->ms_map, start, size, + msp->ms_map = kmem_zalloc(sizeof (space_map_t), KM_SLEEP); + space_map_create(msp->ms_map, start, size, vd->vdev_ashift, &msp->ms_lock); metaslab_group_add(mg, msp); if (metaslab_debug && smo->smo_object != 0) { mutex_enter(&msp->ms_lock); - VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops, + VERIFY(space_map_load(msp->ms_map, mg->mg_class->mc_ops, SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0); mutex_exit(&msp->ms_lock); } @@ -758,22 +767,27 @@ metaslab_fini(metaslab_t *msp) metaslab_group_t *mg = msp->ms_group; vdev_space_update(mg->mg_vd, - -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size); + -msp->ms_smo.smo_alloc, 0, -msp->ms_map->sm_size); metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); - space_map_unload(&msp->ms_map); - space_map_destroy(&msp->ms_map); + space_map_unload(msp->ms_map); + space_map_destroy(msp->ms_map); + kmem_free(msp->ms_map, sizeof (*msp->ms_map)); for (int t = 0; t < TXG_SIZE; t++) { - space_map_destroy(&msp->ms_allocmap[t]); - space_map_destroy(&msp->ms_freemap[t]); + space_map_destroy(msp->ms_allocmap[t]); + space_map_destroy(msp->ms_freemap[t]); + kmem_free(msp->ms_allocmap[t], sizeof (*msp->ms_allocmap[t])); + kmem_free(msp->ms_freemap[t], sizeof (*msp->ms_freemap[t])); } - for (int t = 0; t < TXG_DEFER_SIZE; t++) - space_map_destroy(&msp->ms_defermap[t]); + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + space_map_destroy(msp->ms_defermap[t]); + kmem_free(msp->ms_defermap[t], sizeof (*msp->ms_defermap[t])); + } ASSERT0(msp->ms_deferspace); @@ -792,7 +806,7 @@ static uint64_t metaslab_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; - space_map_t *sm = &msp->ms_map; + space_map_t *sm = msp->ms_map; space_map_obj_t *smo = &msp->ms_smo; vdev_t *vd = mg->mg_vd; uint64_t weight, space; @@ -852,7 +866,7 @@ metaslab_prefetch(metaslab_group_t *mg) * Prefetch the next potential metaslabs */ for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) { - space_map_t *sm = &msp->ms_map; + space_map_t *sm = msp->ms_map; space_map_obj_t *smo = &msp->ms_smo; /* If we have reached our prefetch limit then we're done */ @@ -873,7 +887,7 @@ static int metaslab_activate(metaslab_t *msp, uint64_t activation_weight) { metaslab_group_t *mg = msp->ms_group; - space_map_t *sm = &msp->ms_map; + space_map_t *sm = msp->ms_map; space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; ASSERT(MUTEX_HELD(&msp->ms_lock)); @@ -890,7 +904,7 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight) return (error); } for (int t = 0; t < TXG_DEFER_SIZE; t++) - space_map_walk(&msp->ms_defermap[t], + space_map_walk(msp->ms_defermap[t], space_map_claim, sm); } @@ -921,11 +935,157 @@ metaslab_passivate(metaslab_t *msp, uint64_t size) * this metaslab again. In that case, it had better be empty, * or we would be leaving space on the table. */ - ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0); + ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map->sm_space == 0); metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); } +/* + * Determine if the in-core space map representation can be condensed on-disk. + * We would like to use the following criteria to make our decision: + * + * 1. The size of the space map object should not dramatically increase as a + * result of writing out our in-core free map. + * + * 2. The minimal on-disk space map representation is zfs_condense_pct/100 + * times the size than the in-core representation (i.e. zfs_condense_pct = 110 + * and in-core = 1MB, minimal = 1.1.MB). + * + * Checking the first condition is tricky since we don't want to walk + * the entire AVL tree calculating the estimated on-disk size. Instead we + * use the size-ordered AVL tree in the space map and calculate the + * size required for the largest segment in our in-core free map. If the + * size required to represent that segment on disk is larger than the space + * map object then we avoid condensing this map. + * + * To determine the second criterion we use a best-case estimate and assume + * each segment can be represented on-disk as a single 64-bit entry. We refer + * to this best-case estimate as the space map's minimal form. + */ +static boolean_t +metaslab_should_condense(metaslab_t *msp) +{ + space_map_t *sm = msp->ms_map; + space_map_obj_t *smo = &msp->ms_smo_syncing; + space_seg_t *ss; + uint64_t size, entries, segsz; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(sm->sm_loaded); + + /* + * Use the sm_pp_root AVL tree, which is ordered by size, to obtain + * the largest segment in the in-core free map. If the tree is + * empty then we should condense the map. + */ + ss = avl_last(sm->sm_pp_root); + if (ss == NULL) + return (B_TRUE); + + /* + * Calculate the number of 64-bit entries this segment would + * require when written to disk. If this single segment would be + * larger on-disk than the entire current on-disk structure, then + * clearly condensing will increase the on-disk structure size. + */ + size = (ss->ss_end - ss->ss_start) >> sm->sm_shift; + entries = size / (MIN(size, SM_RUN_MAX)); + segsz = entries * sizeof (uint64_t); + + return (segsz <= smo->smo_objsize && + smo->smo_objsize >= (zfs_condense_pct * + sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) / 100); +} + +/* + * Condense the on-disk space map representation to its minimized form. + * The minimized form consists of a small number of allocations followed by + * the in-core free map. + */ +static void +metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + space_map_t *freemap = msp->ms_freemap[txg & TXG_MASK]; + space_map_t condense_map; + space_map_t *sm = msp->ms_map; + objset_t *mos = spa_meta_objset(spa); + space_map_obj_t *smo = &msp->ms_smo_syncing; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(sm->sm_loaded); + + spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, " + "smo size %llu, segments %lu", txg, + (msp->ms_map->sm_start / msp->ms_map->sm_size), msp, + smo->smo_objsize, avl_numnodes(&sm->sm_root)); + + /* + * Create an map that is a 100% allocated map. We remove segments + * that have been freed in this txg, any deferred frees that exist, + * and any allocation in the future. Removing segments should be + * a relatively inexpensive operation since we expect these maps to + * a small number of nodes. + */ + space_map_create(&condense_map, sm->sm_start, sm->sm_size, + sm->sm_shift, sm->sm_lock); + space_map_add(&condense_map, condense_map.sm_start, + condense_map.sm_size); + + /* + * Remove what's been freed in this txg from the condense_map. + * Since we're in sync_pass 1, we know that all the frees from + * this txg are in the freemap. + */ + space_map_walk(freemap, space_map_remove, &condense_map); + + for (int t = 0; t < TXG_DEFER_SIZE; t++) + space_map_walk(msp->ms_defermap[t], + space_map_remove, &condense_map); + + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) + space_map_walk(msp->ms_allocmap[(txg + t) & TXG_MASK], + space_map_remove, &condense_map); + + /* + * We're about to drop the metaslab's lock thus allowing + * other consumers to change it's content. Set the + * space_map's sm_condensing flag to ensure that + * allocations on this metaslab do not occur while we're + * in the middle of committing it to disk. This is only critical + * for the ms_map as all other space_maps use per txg + * views of their content. + */ + sm->sm_condensing = B_TRUE; + + mutex_exit(&msp->ms_lock); + space_map_truncate(smo, mos, tx); + mutex_enter(&msp->ms_lock); + + /* + * While we would ideally like to create a space_map representation + * that consists only of allocation records, doing so can be + * prohibitively expensive because the in-core free map can be + * large, and therefore computationally expensive to subtract + * from the condense_map. Instead we sync out two maps, a cheap + * allocation only map followed by the in-core free map. While not + * optimal, this is typically close to optimal, and much cheaper to + * compute. + */ + space_map_sync(&condense_map, SM_ALLOC, smo, mos, tx); + space_map_vacate(&condense_map, NULL, NULL); + space_map_destroy(&condense_map); + + space_map_sync(sm, SM_FREE, smo, mos, tx); + sm->sm_condensing = B_FALSE; + + spa_dbgmsg(spa, "condensed: txg %llu, msp[%llu] %p, " + "smo size %llu", txg, + (msp->ms_map->sm_start / msp->ms_map->sm_size), msp, + smo->smo_objsize); +} + /* * Write a metaslab to disk in the context of the specified transaction group. */ @@ -935,17 +1095,29 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); - space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; - space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; - space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; - space_map_t *sm = &msp->ms_map; + space_map_t *allocmap = msp->ms_allocmap[txg & TXG_MASK]; + space_map_t **freemap = &msp->ms_freemap[txg & TXG_MASK]; + space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + space_map_t *sm = msp->ms_map; space_map_obj_t *smo = &msp->ms_smo_syncing; dmu_buf_t *db; dmu_tx_t *tx; ASSERT(!vd->vdev_ishole); - if (allocmap->sm_space == 0 && freemap->sm_space == 0) + /* + * This metaslab has just been added so there's no work to do now. + */ + if (*freemap == NULL) { + ASSERT3P(allocmap, ==, NULL); + return; + } + + ASSERT3P(allocmap, !=, NULL); + ASSERT3P(*freemap, !=, NULL); + ASSERT3P(*freed_map, !=, NULL); + + if (allocmap->sm_space == 0 && (*freemap)->sm_space == 0) return; /* @@ -973,49 +1145,36 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) mutex_enter(&msp->ms_lock); - space_map_walk(freemap, space_map_add, freed_map); - - if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= - 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) { - /* - * The in-core space map representation is twice as compact - * as the on-disk one, so it's time to condense the latter - * by generating a pure allocmap from first principles. - * - * This metaslab is 100% allocated, - * minus the content of the in-core map (sm), - * minus what's been freed this txg (freed_map), - * minus deferred frees (ms_defermap[]), - * minus allocations from txgs in the future - * (because they haven't been committed yet). - */ - space_map_vacate(allocmap, NULL, NULL); - space_map_vacate(freemap, NULL, NULL); - - space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size); - - space_map_walk(sm, space_map_remove, allocmap); - space_map_walk(freed_map, space_map_remove, allocmap); - - for (int t = 0; t < TXG_DEFER_SIZE; t++) - space_map_walk(&msp->ms_defermap[t], - space_map_remove, allocmap); - - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) - space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], - space_map_remove, allocmap); - - mutex_exit(&msp->ms_lock); - space_map_truncate(smo, mos, tx); - mutex_enter(&msp->ms_lock); + if (sm->sm_loaded && spa_sync_pass(spa) == 1 && + metaslab_should_condense(msp)) { + metaslab_condense(msp, txg, tx); + } else { + space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); + space_map_sync(*freemap, SM_FREE, smo, mos, tx); } - space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); - space_map_sync(freemap, SM_FREE, smo, mos, tx); + space_map_vacate(allocmap, NULL, NULL); + + /* + * For sync pass 1, we avoid walking the entire space map and + * instead will just swap the pointers for freemap and + * freed_map. We can safely do this since the freed_map is + * guaranteed to be empty on the initial pass. + */ + if (spa_sync_pass(spa) == 1) { + ASSERT0((*freed_map)->sm_space); + ASSERT0(avl_numnodes(&(*freed_map)->sm_root)); + space_map_swap(freemap, freed_map); + } else { + space_map_vacate(*freemap, space_map_add, *freed_map); + } + + ASSERT0(msp->ms_allocmap[txg & TXG_MASK]->sm_space); + ASSERT0(msp->ms_freemap[txg & TXG_MASK]->sm_space); mutex_exit(&msp->ms_lock); - VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); + VERIFY0(dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, sizeof (*smo)); bcopy(smo, db->db_data, sizeof (*smo)); @@ -1033,9 +1192,9 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) { space_map_obj_t *smo = &msp->ms_smo; space_map_obj_t *smosync = &msp->ms_smo_syncing; - space_map_t *sm = &msp->ms_map; - space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; - space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; + space_map_t *sm = msp->ms_map; + space_map_t *freed_map = msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + space_map_t *defer_map = msp->ms_defermap[txg % TXG_DEFER_SIZE]; metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; int64_t alloc_delta, defer_delta; @@ -1046,19 +1205,30 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) /* * If this metaslab is just becoming available, initialize its - * allocmaps and freemaps and add its capacity to the vdev. + * allocmaps, freemaps, and defermap and add its capacity to the vdev. */ - if (freed_map->sm_size == 0) { + if (freed_map == NULL) { + ASSERT(defer_map == NULL); for (int t = 0; t < TXG_SIZE; t++) { - space_map_create(&msp->ms_allocmap[t], sm->sm_start, + msp->ms_allocmap[t] = kmem_zalloc(sizeof (space_map_t), + KM_SLEEP); + space_map_create(msp->ms_allocmap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); - space_map_create(&msp->ms_freemap[t], sm->sm_start, + msp->ms_freemap[t] = kmem_zalloc(sizeof (space_map_t), + KM_SLEEP); + space_map_create(msp->ms_freemap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); } - for (int t = 0; t < TXG_DEFER_SIZE; t++) - space_map_create(&msp->ms_defermap[t], sm->sm_start, + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + msp->ms_defermap[t] = kmem_zalloc(sizeof (space_map_t), + KM_SLEEP); + space_map_create(msp->ms_defermap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); + } + + freed_map = msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + defer_map = msp->ms_defermap[txg % TXG_DEFER_SIZE]; vdev_space_update(vd, 0, 0, sm->sm_size); } @@ -1068,8 +1238,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); - ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); - ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); + ASSERT(msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0); + ASSERT(msp->ms_freemap[txg & TXG_MASK]->sm_space == 0); /* * If there's a space_map_load() in progress, wait for it to complete @@ -1103,7 +1273,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) int evictable = 1; for (int t = 1; t < TXG_CONCURRENT_STATES; t++) - if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) + if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space) evictable = 0; if (evictable && !metaslab_debug) @@ -1128,7 +1298,7 @@ metaslab_sync_reassess(metaslab_group_t *mg) for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; - if (msp->ms_map.sm_start > mg->mg_bonus_area) + if (msp->ms_map->sm_start > mg->mg_bonus_area) break; mutex_enter(&msp->ms_lock); @@ -1149,7 +1319,7 @@ metaslab_distance(metaslab_t *msp, dva_t *dva) { uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; - uint64_t start = msp->ms_map.sm_start >> ms_shift; + uint64_t start = msp->ms_map->sm_start >> ms_shift; if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) return (1ULL << 63); @@ -1236,6 +1406,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, mutex_enter(&msp->ms_lock); + /* + * If this metaslab is currently condensing then pick again as + * we can't manipulate this metaslab until it's committed + * to disk. + */ + if (msp->ms_map->sm_condensing) { + mutex_exit(&msp->ms_lock); + continue; + } + /* * Ensure that the metaslab we have selected is still * capable of handling our request. It's possible that @@ -1262,20 +1442,20 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, continue; } - if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL) + if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL) break; atomic_inc_64(&mg->mg_alloc_failures); - metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); + metaslab_passivate(msp, space_map_maxsize(msp->ms_map)); mutex_exit(&msp->ms_lock); } - if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) + if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize); + space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, asize); mutex_exit(&msp->ms_lock); @@ -1507,13 +1687,13 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) mutex_enter(&msp->ms_lock); if (now) { - space_map_remove(&msp->ms_allocmap[txg & TXG_MASK], + space_map_remove(msp->ms_allocmap[txg & TXG_MASK], offset, size); - space_map_free(&msp->ms_map, offset, size); + space_map_free(msp->ms_map, offset, size); } else { - if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) + if (msp->ms_freemap[txg & TXG_MASK]->sm_space == 0) vdev_dirty(vd, VDD_METASLAB, msp, txg); - space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); + space_map_add(msp->ms_freemap[txg & TXG_MASK], offset, size); } mutex_exit(&msp->ms_lock); @@ -1548,10 +1728,10 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) mutex_enter(&msp->ms_lock); - if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) + if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map->sm_loaded) error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); - if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) + if (error == 0 && !space_map_contains(msp->ms_map, offset, size)) error = ENOENT; if (error || txg == 0) { /* txg == 0 indicates dry run */ @@ -1559,12 +1739,12 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) return (error); } - space_map_claim(&msp->ms_map, offset, size); + space_map_claim(msp->ms_map, offset, size); if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ - if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) + if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0) vdev_dirty(vd, VDD_METASLAB, msp, txg); - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, size); } mutex_exit(&msp->ms_lock); diff --git a/uts/common/fs/zfs/space_map.c b/uts/common/fs/zfs/space_map.c index 17dd860eacfb..30a35c85dab2 100644 --- a/uts/common/fs/zfs/space_map.c +++ b/uts/common/fs/zfs/space_map.c @@ -107,6 +107,7 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size) int merge_before, merge_after; ASSERT(MUTEX_HELD(sm->sm_lock)); + VERIFY(!sm->sm_condensing); VERIFY(size != 0); VERIFY3U(start, >=, sm->sm_start); VERIFY3U(end, <=, sm->sm_start + sm->sm_size); @@ -175,6 +176,7 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) int left_over, right_over; ASSERT(MUTEX_HELD(sm->sm_lock)); + VERIFY(!sm->sm_condensing); VERIFY(size != 0); VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); @@ -243,6 +245,20 @@ space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end); } +void +space_map_swap(space_map_t **msrc, space_map_t **mdst) +{ + space_map_t *sm; + + ASSERT(MUTEX_HELD((*msrc)->sm_lock)); + ASSERT0((*mdst)->sm_space); + ASSERT0(avl_numnodes(&(*mdst)->sm_root)); + + sm = *msrc; + *msrc = *mdst; + *mdst = sm; +} + void space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) { @@ -424,9 +440,9 @@ space_map_sync(space_map_t *sm, uint8_t maptype, space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) { spa_t *spa = dmu_objset_spa(os); - void *cookie = NULL; + avl_tree_t *t = &sm->sm_root; space_seg_t *ss; - uint64_t bufsize, start, size, run_len, delta, sm_space; + uint64_t bufsize, start, size, run_len, total, sm_space, nodes; uint64_t *entry, *entry_map, *entry_map_end; ASSERT(MUTEX_HELD(sm->sm_lock)); @@ -455,13 +471,14 @@ space_map_sync(space_map_t *sm, uint8_t maptype, SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) | SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); - delta = 0; + total = 0; + nodes = avl_numnodes(&sm->sm_root); sm_space = sm->sm_space; - while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) { + for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) { size = ss->ss_end - ss->ss_start; start = (ss->ss_start - sm->sm_start) >> sm->sm_shift; - delta += size; + total += size; size >>= sm->sm_shift; while (size) { @@ -483,7 +500,6 @@ space_map_sync(space_map_t *sm, uint8_t maptype, start += run_len; size -= run_len; } - kmem_cache_free(space_seg_cache, ss); } if (entry != entry_map) { @@ -499,12 +515,11 @@ space_map_sync(space_map_t *sm, uint8_t maptype, * Ensure that the space_map's accounting wasn't changed * while we were in the middle of writing it out. */ + VERIFY3U(nodes, ==, avl_numnodes(&sm->sm_root)); VERIFY3U(sm->sm_space, ==, sm_space); + VERIFY3U(sm->sm_space, ==, total); zio_buf_free(entry_map, bufsize); - - sm->sm_space -= delta; - VERIFY0(sm->sm_space); } void diff --git a/uts/common/fs/zfs/sys/metaslab_impl.h b/uts/common/fs/zfs/sys/metaslab_impl.h index f1f1b38eeed6..138e14ef59fc 100644 --- a/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/uts/common/fs/zfs/sys/metaslab_impl.h @@ -66,20 +66,38 @@ struct metaslab_group { }; /* - * Each metaslab's free space is tracked in space map object in the MOS, - * which is only updated in syncing context. Each time we sync a txg, + * Each metaslab maintains an in-core free map (ms_map) that contains the + * current list of free segments. As blocks are allocated, the allocated + * segment is removed from the ms_map and added to a per txg allocation map. + * As blocks are freed, they are added to the per txg free map. These per + * txg maps allow us to process all allocations and frees in syncing context + * where it is safe to update the on-disk space maps. + * + * Each metaslab's free space is tracked in a space map object in the MOS, + * which is only updated in syncing context. Each time we sync a txg, * we append the allocs and frees from that txg to the space map object. * When the txg is done syncing, metaslab_sync_done() updates ms_smo - * to ms_smo_syncing. Everything in ms_smo is always safe to allocate. + * to ms_smo_syncing. Everything in ms_smo is always safe to allocate. + * + * To load the in-core free map we read the space map object from disk. + * This object contains a series of alloc and free records that are + * combined to make up the list of all free segments in this metaslab. These + * segments are represented in-core by the ms_map and are stored in an + * AVL tree. + * + * As the space map objects grows (as a result of the appends) it will + * eventually become space-inefficient. When the space map object is + * zfs_condense_pct/100 times the size of the minimal on-disk representation, + * we rewrite it in its minimized form. */ struct metaslab { kmutex_t ms_lock; /* metaslab lock */ space_map_obj_t ms_smo; /* synced space map object */ space_map_obj_t ms_smo_syncing; /* syncing space map object */ - space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */ - space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */ - space_map_t ms_defermap[TXG_DEFER_SIZE]; /* deferred frees */ - space_map_t ms_map; /* in-core free space map */ + space_map_t *ms_allocmap[TXG_SIZE]; /* allocated this txg */ + space_map_t *ms_freemap[TXG_SIZE]; /* freed this txg */ + space_map_t *ms_defermap[TXG_DEFER_SIZE]; /* deferred frees */ + space_map_t *ms_map; /* in-core free space map */ int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ metaslab_group_t *ms_group; /* metaslab group */ diff --git a/uts/common/fs/zfs/sys/space_map.h b/uts/common/fs/zfs/sys/space_map.h index 463b6bb4f2a3..2da50fb7b3b5 100644 --- a/uts/common/fs/zfs/sys/space_map.h +++ b/uts/common/fs/zfs/sys/space_map.h @@ -40,17 +40,17 @@ extern "C" { typedef struct space_map_ops space_map_ops_t; typedef struct space_map { - avl_tree_t sm_root; /* AVL tree of map segments */ + avl_tree_t sm_root; /* offset-ordered segment AVL tree */ uint64_t sm_space; /* sum of all segments in the map */ uint64_t sm_start; /* start of map */ uint64_t sm_size; /* size of map */ uint8_t sm_shift; /* unit shift */ - uint8_t sm_pad[3]; /* unused */ uint8_t sm_loaded; /* map loaded? */ uint8_t sm_loading; /* map loading? */ + uint8_t sm_condensing; /* map condensing? */ kcondvar_t sm_load_cv; /* map load completion */ space_map_ops_t *sm_ops; /* space map block picker ops vector */ - avl_tree_t *sm_pp_root; /* picker-private AVL tree */ + avl_tree_t *sm_pp_root; /* size-ordered, picker-private tree */ void *sm_ppd; /* picker-private data */ kmutex_t *sm_lock; /* pointer to lock that protects map */ } space_map_t; @@ -149,6 +149,7 @@ extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size); extern boolean_t space_map_contains(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_swap(space_map_t **msrc, space_map_t **mdest); extern void space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest); extern void space_map_walk(space_map_t *sm, diff --git a/uts/common/fs/zfs/vdev.c b/uts/common/fs/zfs/vdev.c index 4f5c4e9c4df0..41f9a07524d7 100644 --- a/uts/common/fs/zfs/vdev.c +++ b/uts/common/fs/zfs/vdev.c @@ -1836,6 +1836,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) space_map_truncate(smo, mos, tx); space_map_sync(&smsync, SM_ALLOC, smo, mos, tx); + space_map_vacate(&smsync, NULL, NULL); space_map_destroy(&smsync); From cca1f5d90df9f1566556598bfd6afc12ff929e68 Mon Sep 17 00:00:00 2001 From: Martin Matuska Date: Sat, 23 Feb 2013 09:02:46 +0000 Subject: [PATCH 5/7] Update vendor-sys/illumos/dist to illumos-gate 13966:0e1d84ebb004 Illumos ZFS issues: 3578 transferring the freed map to the defer map should be constant time 3579 ztest trips assertion in metaslab_weight() --- uts/common/fs/zfs/metaslab.c | 38 +++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/uts/common/fs/zfs/metaslab.c b/uts/common/fs/zfs/metaslab.c index 9c4f97a3131d..76c66a00a33d 100644 --- a/uts/common/fs/zfs/metaslab.c +++ b/uts/common/fs/zfs/metaslab.c @@ -813,6 +813,16 @@ metaslab_weight(metaslab_t *msp) ASSERT(MUTEX_HELD(&msp->ms_lock)); + /* + * This vdev is in the process of being removed so there is nothing + * for us to do here. + */ + if (vd->vdev_removing) { + ASSERT0(smo->smo_alloc); + ASSERT0(vd->vdev_ms_shift); + return (0); + } + /* * The baseline weight is the metaslab's free space. */ @@ -1193,8 +1203,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) space_map_obj_t *smo = &msp->ms_smo; space_map_obj_t *smosync = &msp->ms_smo_syncing; space_map_t *sm = msp->ms_map; - space_map_t *freed_map = msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; - space_map_t *defer_map = msp->ms_defermap[txg % TXG_DEFER_SIZE]; + space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + space_map_t **defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; int64_t alloc_delta, defer_delta; @@ -1207,8 +1217,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) * If this metaslab is just becoming available, initialize its * allocmaps, freemaps, and defermap and add its capacity to the vdev. */ - if (freed_map == NULL) { - ASSERT(defer_map == NULL); + if (*freed_map == NULL) { + ASSERT(*defer_map == NULL); for (int t = 0; t < TXG_SIZE; t++) { msp->ms_allocmap[t] = kmem_zalloc(sizeof (space_map_t), KM_SLEEP); @@ -1227,14 +1237,14 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) sm->sm_size, sm->sm_shift, sm->sm_lock); } - freed_map = msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; - defer_map = msp->ms_defermap[txg % TXG_DEFER_SIZE]; + freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; vdev_space_update(vd, 0, 0, sm->sm_size); } alloc_delta = smosync->smo_alloc - smo->smo_alloc; - defer_delta = freed_map->sm_space - defer_map->sm_space; + defer_delta = (*freed_map)->sm_space - (*defer_map)->sm_space; vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); @@ -1244,12 +1254,18 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) /* * If there's a space_map_load() in progress, wait for it to complete * so that we have a consistent view of the in-core space map. - * Then, add defer_map (oldest deferred frees) to this map and - * transfer freed_map (this txg's frees) to defer_map. */ space_map_load_wait(sm); - space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm); - space_map_vacate(freed_map, space_map_add, defer_map); + + /* + * Move the frees from the defer_map to this map (if it's loaded). + * Swap the freed_map and the defer_map -- this is safe to do + * because we've just emptied out the defer_map. + */ + space_map_vacate(*defer_map, sm->sm_loaded ? space_map_free : NULL, sm); + ASSERT0((*defer_map)->sm_space); + ASSERT0(avl_numnodes(&(*defer_map)->sm_root)); + space_map_swap(freed_map, defer_map); *smo = *smosync; From 3e63c757553472a19bdb02b0b85c61c927326975 Mon Sep 17 00:00:00 2001 From: Martin Matuska Date: Sat, 23 Feb 2013 09:06:36 +0000 Subject: [PATCH 6/7] Update vendor/illumos/dist and vendor-sys/illumos/dist to illumos-gate 13967:92bec6d87f59 Illumos ZFS issues: 3557 dumpvp_size is not updated correctly when a dump zvol's size is changed 3558 setting the volsize on a dump device does not return back ENOSPC 3559 setting a volsize larger than the space available sometimes succeeds --- cmd/zfs/zfs_main.c | 12 +++- lib/libzfs/common/libzfs_dataset.c | 22 +++++-- uts/common/fs/zfs/sys/zvol.h | 2 +- uts/common/fs/zfs/zfs_ioctl.c | 3 +- uts/common/fs/zfs/zvol.c | 97 +++++++++++++++++------------- 5 files changed, 83 insertions(+), 53 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index ee9d289a11a2..e5e35e6f166c 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -775,10 +775,12 @@ zfs_do_create(int argc, char **argv) if (type == ZFS_TYPE_VOLUME && !noreserve) { zpool_handle_t *zpool_handle; + nvlist_t *real_props; uint64_t spa_version; char *p; zfs_prop_t resv_prop; char *strval; + char msg[1024]; if (p = strchr(argv[0], '/')) *p = '\0'; @@ -794,7 +796,15 @@ zfs_do_create(int argc, char **argv) resv_prop = ZFS_PROP_REFRESERVATION; else resv_prop = ZFS_PROP_RESERVATION; - volsize = zvol_volsize_to_reservation(volsize, props); + + (void) snprintf(msg, sizeof (msg), + gettext("cannot create '%s'"), argv[0]); + if (props && (real_props = zfs_valid_proplist(g_zfs, type, + props, 0, NULL, msg)) == NULL) + goto error; + + volsize = zvol_volsize_to_reservation(volsize, real_props); + nvlist_free(real_props); if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop), &strval) != 0) { diff --git a/lib/libzfs/common/libzfs_dataset.c b/lib/libzfs/common/libzfs_dataset.c index 35d27d9d1ffb..3ad1ed69d57b 100644 --- a/lib/libzfs/common/libzfs_dataset.c +++ b/lib/libzfs/common/libzfs_dataset.c @@ -1283,6 +1283,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) uint64_t old_reservation; uint64_t new_reservation; zfs_prop_t resv_prop; + nvlist_t *props; /* * If this is an existing volume, and someone is setting the volsize, @@ -1292,16 +1293,25 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) if (zfs_which_resv_prop(zhp, &resv_prop) < 0) return (-1); old_reservation = zfs_prop_get_int(zhp, resv_prop); - if ((zvol_volsize_to_reservation(old_volsize, zhp->zfs_props) != - old_reservation) || nvlist_lookup_uint64(nvl, - zfs_prop_to_name(resv_prop), &new_reservation) != ENOENT) { + + props = fnvlist_alloc(); + fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); + + if ((zvol_volsize_to_reservation(old_volsize, props) != + old_reservation) || nvlist_exists(nvl, + zfs_prop_to_name(resv_prop))) { + fnvlist_free(props); return (0); } if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), - &new_volsize) != 0) + &new_volsize) != 0) { + fnvlist_free(props); return (-1); - new_reservation = zvol_volsize_to_reservation(new_volsize, - zhp->zfs_props); + } + new_reservation = zvol_volsize_to_reservation(new_volsize, props); + fnvlist_free(props); + if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop), new_reservation) != 0) { (void) no_memory(zhp->zfs_hdl); diff --git a/uts/common/fs/zfs/sys/zvol.h b/uts/common/fs/zfs/sys/zvol.h index 0059bf510260..db1f05ae9ca9 100644 --- a/uts/common/fs/zfs/sys/zvol.h +++ b/uts/common/fs/zfs/sys/zvol.h @@ -43,7 +43,7 @@ extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); extern int zvol_create_minor(const char *); extern int zvol_remove_minor(const char *); extern void zvol_remove_minors(const char *); -extern int zvol_set_volsize(const char *, major_t, uint64_t); +extern int zvol_set_volsize(const char *, uint64_t); extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr); extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks); diff --git a/uts/common/fs/zfs/zfs_ioctl.c b/uts/common/fs/zfs/zfs_ioctl.c index 090d6c9417a4..38adc1940ff8 100644 --- a/uts/common/fs/zfs/zfs_ioctl.c +++ b/uts/common/fs/zfs/zfs_ioctl.c @@ -2368,8 +2368,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, err = dsl_dataset_set_reservation(dsname, source, intval); break; case ZFS_PROP_VOLSIZE: - err = zvol_set_volsize(dsname, ddi_driver_major(zfs_dip), - intval); + err = zvol_set_volsize(dsname, intval); break; case ZFS_PROP_VERSION: { diff --git a/uts/common/fs/zfs/zvol.c b/uts/common/fs/zfs/zvol.c index da1b0efda5b3..b413f5ed8b00 100644 --- a/uts/common/fs/zfs/zvol.c +++ b/uts/common/fs/zfs/zvol.c @@ -145,10 +145,11 @@ static int zvol_dump_fini(zvol_state_t *zv); static int zvol_dump_init(zvol_state_t *zv, boolean_t resize); static void -zvol_size_changed(uint64_t volsize, major_t maj, minor_t min) +zvol_size_changed(zvol_state_t *zv, uint64_t volsize) { - dev_t dev = makedevice(maj, min); + dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor); + zv->zv_volsize = volsize; VERIFY(ddi_prop_update_int64(dev, zfs_dip, "Size", volsize) == DDI_SUCCESS); VERIFY(ddi_prop_update_int64(dev, zfs_dip, @@ -610,22 +611,22 @@ zvol_first_open(zvol_state_t *zv) if (error) return (error); + zv->zv_objset = os; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) { ASSERT(error == 0); dmu_objset_disown(os, zvol_tag); return (error); } - zv->zv_objset = os; + error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf); if (error) { dmu_objset_disown(os, zvol_tag); return (error); } - zv->zv_volsize = volsize; + + zvol_size_changed(zv, volsize); zv->zv_zilog = zil_open(os, zvol_get_data); - zvol_size_changed(zv->zv_volsize, ddi_driver_major(zfs_dip), - zv->zv_minor); VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly, NULL) == 0); @@ -747,56 +748,37 @@ zvol_remove_minors(const char *name) mutex_exit(&zfsdev_state_lock); } -int -zvol_set_volsize(const char *name, major_t maj, uint64_t volsize) +static int +zvol_set_volsize_impl(objset_t *os, zvol_state_t *zv, uint64_t volsize) { - zvol_state_t *zv = NULL; - objset_t *os; - int error; - dmu_object_info_t doi; uint64_t old_volsize = 0ULL; - uint64_t readonly; - - mutex_enter(&zfsdev_state_lock); - zv = zvol_minor_lookup(name); - if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) { - mutex_exit(&zfsdev_state_lock); - return (error); - } - - if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 || - (error = zvol_check_volsize(volsize, - doi.doi_data_block_size)) != 0) - goto out; - - VERIFY(dsl_prop_get_integer(name, "readonly", &readonly, - NULL) == 0); - if (readonly) { - error = EROFS; - goto out; - } + int error; + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); error = zvol_update_volsize(os, volsize); + /* * Reinitialize the dump area to the new size. If we * failed to resize the dump area then restore it back to - * its original size. + * its original size. We must set the new volsize prior + * to calling dumpvp_resize() to ensure that the devices' + * size(9P) is not visible by the dump subsystem. */ if (zv && error == 0) { + old_volsize = zv->zv_volsize; + zvol_size_changed(zv, volsize); + if (zv->zv_flags & ZVOL_DUMPIFIED) { - old_volsize = zv->zv_volsize; - zv->zv_volsize = volsize; if ((error = zvol_dumpify(zv)) != 0 || (error = dumpvp_resize()) != 0) { + int dumpify_error; + (void) zvol_update_volsize(os, old_volsize); - zv->zv_volsize = old_volsize; - error = zvol_dumpify(zv); + zvol_size_changed(zv, old_volsize); + dumpify_error = zvol_dumpify(zv); + error = dumpify_error ? dumpify_error : error; } } - if (error == 0) { - zv->zv_volsize = volsize; - zvol_size_changed(volsize, maj, zv->zv_minor); - } } /* @@ -819,12 +801,41 @@ zvol_set_volsize(const char *name, major_t maj, uint64_t volsize) nvlist_free(attr); kmem_free(physpath, MAXPATHLEN); } + return (error); +} +int +zvol_set_volsize(const char *name, uint64_t volsize) +{ + zvol_state_t *zv = NULL; + objset_t *os; + int error; + dmu_object_info_t doi; + uint64_t readonly; + + mutex_enter(&zfsdev_state_lock); + zv = zvol_minor_lookup(name); + if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) { + mutex_exit(&zfsdev_state_lock); + return (error); + } + + if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 || + (error = zvol_check_volsize(volsize, + doi.doi_data_block_size)) != 0) + goto out; + + VERIFY3U(dsl_prop_get_integer(name, + zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL), ==, 0); + if (readonly) { + error = EROFS; + goto out; + } + + error = zvol_set_volsize_impl(os, zv, volsize); out: dmu_objset_rele(os, FTAG); - mutex_exit(&zfsdev_state_lock); - return (error); } From 0f857eb26863fa7eb17f1246ecae17cc81890101 Mon Sep 17 00:00:00 2001 From: Martin Matuska Date: Tue, 26 Feb 2013 08:51:39 +0000 Subject: [PATCH 7/7] Update vendor-sys/illumos/dist to illumos-gate 13968:e4988c7d0403 Illumos ZFS issues: 3552 condensing one space map burns 3 seconds of CPU in spa_sync() thread (fix race condition) --- uts/common/fs/zfs/metaslab.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/uts/common/fs/zfs/metaslab.c b/uts/common/fs/zfs/metaslab.c index 76c66a00a33d..bf9889e183d5 100644 --- a/uts/common/fs/zfs/metaslab.c +++ b/uts/common/fs/zfs/metaslab.c @@ -1383,6 +1383,13 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, mutex_exit(&mg->mg_lock); return (-1ULL); } + + /* + * If the selected metaslab is condensing, skip it. + */ + if (msp->ms_map->sm_condensing) + continue; + was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; if (activation_weight == METASLAB_WEIGHT_PRIMARY) break; @@ -1422,16 +1429,6 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, mutex_enter(&msp->ms_lock); - /* - * If this metaslab is currently condensing then pick again as - * we can't manipulate this metaslab until it's committed - * to disk. - */ - if (msp->ms_map->sm_condensing) { - mutex_exit(&msp->ms_lock); - continue; - } - /* * Ensure that the metaslab we have selected is still * capable of handling our request. It's possible that @@ -1458,6 +1455,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, continue; } + /* + * If this metaslab is currently condensing then pick again as + * we can't manipulate this metaslab until it's committed + * to disk. + */ + if (msp->ms_map->sm_condensing) { + mutex_exit(&msp->ms_lock); + continue; + } + if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL) break;