MFV r336948: 9112 Improve allocation performance on high-end systems

On high-end systems running async sequential write workloads, especially
NUMA systems with flash or NVMe storage, one significant performance
bottleneck is selecting a metaslab to do allocations from. This process
can be parallelized, providing significant performance increases for
these workloads.

illumos/illumos-gate@f78cdc34af

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Approved by: Gordon Ross <gwr@nexenta.com>
Author: Paul Dagnelie <pcd@delphix.com>
commit 9cd6f162c0
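The core of the change: instead of a single allocation lock and queue per pool, allocation state becomes an array indexed by an allocator number, and each I/O is tagged with one allocator so concurrent writers rarely touch the same metaslab-selection state. The sketch below is illustrative only; the type and function names (alloc_state_t, pick_allocator) are hypothetical stand-ins, not the actual ZFS symbols.

/* Illustrative sketch of per-allocator allocation state; names are hypothetical. */
#include <pthread.h>
#include <stdint.h>

#define NUM_ALLOCATORS	4	/* cf. the spa_allocators tunable in this diff */

typedef struct alloc_state {
	/* One lock and one pending-I/O queue per allocator, not one global. */
	pthread_mutex_t	as_locks[NUM_ALLOCATORS];
	void		*as_queues[NUM_ALLOCATORS];
} alloc_state_t;

/*
 * Map an I/O to a stable allocator index; hashing on its objset/object
 * keeps related I/Os on the same allocator and preserves write locality.
 */
static int
pick_allocator(uint64_t objset, uint64_t object)
{
	uint64_t h = (objset << 32) ^ (object * 2654435761ULL);

	return ((int)(h % NUM_ALLOCATORS));
}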
@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
||||
* Copyright (c) 2014 Integros [integros.com]
|
||||
*/
|
||||
@ -275,6 +275,8 @@ static uint64_t metaslab_weight(metaslab_t *);
|
||||
static void metaslab_set_fragmentation(metaslab_t *);
|
||||
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
|
||||
static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
|
||||
static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
|
||||
static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
|
||||
|
||||
kmem_cache_t *metaslab_alloc_trace_cache;
|
||||
|
||||
@ -294,7 +296,12 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
|
||||
mc->mc_rotor = NULL;
|
||||
mc->mc_ops = ops;
|
||||
mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
refcount_create_tracked(&mc->mc_alloc_slots);
|
||||
mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
|
||||
sizeof (refcount_t), KM_SLEEP);
|
||||
mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
|
||||
sizeof (uint64_t), KM_SLEEP);
|
||||
for (int i = 0; i < spa->spa_alloc_count; i++)
|
||||
refcount_create_tracked(&mc->mc_alloc_slots[i]);
|
||||
|
||||
return (mc);
|
||||
}
|
||||
@ -308,7 +315,12 @@ metaslab_class_destroy(metaslab_class_t *mc)
|
||||
ASSERT(mc->mc_space == 0);
|
||||
ASSERT(mc->mc_dspace == 0);
|
||||
|
||||
refcount_destroy(&mc->mc_alloc_slots);
|
||||
for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
|
||||
refcount_destroy(&mc->mc_alloc_slots[i]);
|
||||
kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
|
||||
sizeof (refcount_t));
|
||||
kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
|
||||
sizeof (uint64_t));
|
||||
mutex_destroy(&mc->mc_lock);
|
||||
kmem_free(mc, sizeof (metaslab_class_t));
|
||||
}
|
||||
@ -532,6 +544,30 @@ metaslab_compare(const void *x1, const void *x2)
|
||||
const metaslab_t *m1 = x1;
|
||||
const metaslab_t *m2 = x2;
|
||||
|
||||
int sort1 = 0;
|
||||
int sort2 = 0;
|
||||
if (m1->ms_allocator != -1 && m1->ms_primary)
|
||||
sort1 = 1;
|
||||
else if (m1->ms_allocator != -1 && !m1->ms_primary)
|
||||
sort1 = 2;
|
||||
if (m2->ms_allocator != -1 && m2->ms_primary)
|
||||
sort2 = 1;
|
||||
else if (m2->ms_allocator != -1 && !m2->ms_primary)
|
||||
sort2 = 2;
|
||||
|
||||
/*
|
||||
* Sort inactive metaslabs first, then primaries, then secondaries. When
* selecting a metaslab to allocate from, an allocator first tries its
* primary, then secondary active metaslab. If it doesn't have active
* metaslabs, or can't allocate from them, it searches for an inactive
* metaslab to activate. If it can't find a suitable one, it will steal
* a primary or secondary metaslab from another allocator.
*/
|
||||
if (sort1 < sort2)
|
||||
return (-1);
|
||||
if (sort1 > sort2)
|
||||
return (1);
|
||||
|
||||
if (m1->ms_weight < m2->ms_weight)
|
||||
return (1);
|
||||
if (m1->ms_weight > m2->ms_weight)
|
||||
@ -683,12 +719,16 @@ metaslab_group_alloc_update(metaslab_group_t *mg)
|
||||
}
|
||||
|
||||
metaslab_group_t *
|
||||
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
|
||||
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
|
||||
{
|
||||
metaslab_group_t *mg;
|
||||
|
||||
mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
|
||||
mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
|
||||
KM_SLEEP);
|
||||
mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
|
||||
KM_SLEEP);
|
||||
avl_create(&mg->mg_metaslab_tree, metaslab_compare,
|
||||
sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
|
||||
mg->mg_vd = vd;
|
||||
@ -696,7 +736,16 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
|
||||
mg->mg_activation_count = 0;
|
||||
mg->mg_initialized = B_FALSE;
|
||||
mg->mg_no_free_space = B_TRUE;
|
||||
refcount_create_tracked(&mg->mg_alloc_queue_depth);
|
||||
mg->mg_allocators = allocators;
|
||||
|
||||
mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (refcount_t),
|
||||
KM_SLEEP);
|
||||
mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
|
||||
sizeof (uint64_t), KM_SLEEP);
|
||||
for (int i = 0; i < allocators; i++) {
|
||||
refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
|
||||
mg->mg_cur_max_alloc_queue_depth[i] = 0;
|
||||
}
|
||||
|
||||
mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
|
||||
minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
|
||||
@ -718,8 +767,20 @@ metaslab_group_destroy(metaslab_group_t *mg)
|
||||
|
||||
taskq_destroy(mg->mg_taskq);
|
||||
avl_destroy(&mg->mg_metaslab_tree);
|
||||
kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
|
||||
kmem_free(mg->mg_secondaries, mg->mg_allocators *
|
||||
sizeof (metaslab_t *));
|
||||
mutex_destroy(&mg->mg_lock);
|
||||
refcount_destroy(&mg->mg_alloc_queue_depth);
|
||||
|
||||
for (int i = 0; i < mg->mg_allocators; i++) {
|
||||
refcount_destroy(&mg->mg_alloc_queue_depth[i]);
|
||||
mg->mg_cur_max_alloc_queue_depth[i] = 0;
|
||||
}
|
||||
kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
|
||||
sizeof (refcount_t));
|
||||
kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
|
||||
sizeof (uint64_t));
|
||||
|
||||
kmem_free(mg, sizeof (metaslab_group_t));
|
||||
}
|
||||
|
||||
@ -799,6 +860,22 @@ metaslab_group_passivate(metaslab_group_t *mg)
|
||||
taskq_wait(mg->mg_taskq);
|
||||
spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
|
||||
metaslab_group_alloc_update(mg);
|
||||
for (int i = 0; i < mg->mg_allocators; i++) {
|
||||
metaslab_t *msp = mg->mg_primaries[i];
|
||||
if (msp != NULL) {
|
||||
mutex_enter(&msp->ms_lock);
|
||||
metaslab_passivate(msp,
|
||||
metaslab_weight_from_range_tree(msp));
|
||||
mutex_exit(&msp->ms_lock);
|
||||
}
|
||||
msp = mg->mg_secondaries[i];
|
||||
if (msp != NULL) {
|
||||
mutex_enter(&msp->ms_lock);
|
||||
metaslab_passivate(msp,
|
||||
metaslab_weight_from_range_tree(msp));
|
||||
mutex_exit(&msp->ms_lock);
|
||||
}
|
||||
}
|
||||
|
||||
mgprev = mg->mg_prev;
|
||||
mgnext = mg->mg_next;
|
||||
@ -939,6 +1016,17 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
|
||||
mutex_exit(&mg->mg_lock);
|
||||
}
|
||||
|
||||
static void
|
||||
metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
|
||||
{
|
||||
ASSERT(MUTEX_HELD(&mg->mg_lock));
|
||||
ASSERT(msp->ms_group == mg);
|
||||
avl_remove(&mg->mg_metaslab_tree, msp);
|
||||
msp->ms_weight = weight;
|
||||
avl_add(&mg->mg_metaslab_tree, msp);
|
||||
|
||||
}
|
||||
|
||||
static void
|
||||
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
|
||||
{
|
||||
@ -950,10 +1038,7 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
|
||||
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
||||
|
||||
mutex_enter(&mg->mg_lock);
|
||||
ASSERT(msp->ms_group == mg);
|
||||
avl_remove(&mg->mg_metaslab_tree, msp);
|
||||
msp->ms_weight = weight;
|
||||
avl_add(&mg->mg_metaslab_tree, msp);
|
||||
metaslab_group_sort_impl(mg, msp, weight);
|
||||
mutex_exit(&mg->mg_lock);
|
||||
}
|
||||
|
||||
@ -1001,7 +1086,7 @@ metaslab_group_fragmentation(metaslab_group_t *mg)
|
||||
*/
|
||||
static boolean_t
|
||||
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
|
||||
uint64_t psize)
|
||||
uint64_t psize, int allocator)
|
||||
{
|
||||
spa_t *spa = mg->mg_vd->vdev_spa;
|
||||
metaslab_class_t *mc = mg->mg_class;
|
||||
@ -1030,7 +1115,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
|
||||
if (mg->mg_allocatable) {
|
||||
metaslab_group_t *mgp;
|
||||
int64_t qdepth;
|
||||
uint64_t qmax = mg->mg_max_alloc_queue_depth;
|
||||
uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
|
||||
|
||||
if (!mc->mc_alloc_throttle_enabled)
|
||||
return (B_TRUE);
|
||||
@ -1042,7 +1127,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
|
||||
if (mg->mg_no_free_space)
|
||||
return (B_FALSE);
|
||||
|
||||
qdepth = refcount_count(&mg->mg_alloc_queue_depth);
|
||||
qdepth = refcount_count(&mg->mg_alloc_queue_depth[allocator]);
|
||||
|
||||
/*
|
||||
* If this metaslab group is below its qmax or it's
|
||||
@ -1061,9 +1146,10 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
|
||||
* groups at the same time when we make this check.
|
||||
*/
|
||||
for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
|
||||
qmax = mgp->mg_max_alloc_queue_depth;
|
||||
qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
|
||||
|
||||
qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
|
||||
qdepth = refcount_count(
|
||||
&mgp->mg_alloc_queue_depth[allocator]);
|
||||
|
||||
/*
|
||||
* If there is another metaslab group that
|
||||
@ -1471,6 +1557,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
|
||||
ms->ms_id = id;
|
||||
ms->ms_start = id << vd->vdev_ms_shift;
|
||||
ms->ms_size = 1ULL << vd->vdev_ms_shift;
|
||||
ms->ms_allocator = -1;
|
||||
ms->ms_new = B_TRUE;
|
||||
|
||||
/*
|
||||
* We only open space map objects that already exist. All others
|
||||
@ -1567,6 +1655,7 @@ metaslab_fini(metaslab_t *msp)
|
||||
cv_destroy(&msp->ms_load_cv);
|
||||
mutex_destroy(&msp->ms_lock);
|
||||
mutex_destroy(&msp->ms_sync_lock);
|
||||
ASSERT3U(msp->ms_allocator, ==, -1);
|
||||
|
||||
kmem_free(msp, sizeof (metaslab_t));
|
||||
}
|
||||
@ -1963,19 +2052,59 @@ metaslab_weight(metaslab_t *msp)
|
||||
}
|
||||
|
||||
static int
|
||||
metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
|
||||
metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
|
||||
int allocator, uint64_t activation_weight)
|
||||
{
|
||||
/*
|
||||
* If we're activating for the claim code, we don't want to actually
|
||||
* set the metaslab up for a specific allocator.
|
||||
*/
|
||||
if (activation_weight == METASLAB_WEIGHT_CLAIM)
|
||||
return (0);
|
||||
metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
|
||||
mg->mg_primaries : mg->mg_secondaries);
|
||||
|
||||
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
||||
mutex_enter(&mg->mg_lock);
|
||||
if (arr[allocator] != NULL) {
|
||||
mutex_exit(&mg->mg_lock);
|
||||
return (EEXIST);
|
||||
}
|
||||
|
||||
arr[allocator] = msp;
|
||||
ASSERT3S(msp->ms_allocator, ==, -1);
|
||||
msp->ms_allocator = allocator;
|
||||
msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
|
||||
mutex_exit(&mg->mg_lock);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int
|
||||
metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
|
||||
{
|
||||
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
||||
|
||||
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
|
||||
int error = 0;
|
||||
metaslab_load_wait(msp);
|
||||
if (!msp->ms_loaded) {
|
||||
int error = metaslab_load(msp);
|
||||
if (error) {
|
||||
if ((error = metaslab_load(msp)) != 0) {
|
||||
metaslab_group_sort(msp->ms_group, msp, 0);
|
||||
return (error);
|
||||
}
|
||||
}
|
||||
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
|
||||
/*
|
||||
* The metaslab was activated for another allocator
|
||||
* while we were waiting, we should reselect.
|
||||
*/
|
||||
return (EBUSY);
|
||||
}
|
||||
if ((error = metaslab_activate_allocator(msp->ms_group, msp,
|
||||
allocator, activation_weight)) != 0) {
|
||||
return (error);
|
||||
}
|
||||
|
||||
msp->ms_activation_weight = msp->ms_weight;
|
||||
metaslab_group_sort(msp->ms_group, msp,
|
||||
@ -1987,6 +2116,34 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
|
||||
return (0);
|
||||
}
|
||||
|
||||
static void
|
||||
metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
|
||||
uint64_t weight)
|
||||
{
|
||||
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
||||
if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
|
||||
metaslab_group_sort(mg, msp, weight);
|
||||
return;
|
||||
}
|
||||
|
||||
mutex_enter(&mg->mg_lock);
|
||||
ASSERT3P(msp->ms_group, ==, mg);
|
||||
if (msp->ms_primary) {
|
||||
ASSERT3U(0, <=, msp->ms_allocator);
|
||||
ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
|
||||
ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
|
||||
ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
|
||||
mg->mg_primaries[msp->ms_allocator] = NULL;
|
||||
} else {
|
||||
ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
|
||||
ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
|
||||
mg->mg_secondaries[msp->ms_allocator] = NULL;
|
||||
}
|
||||
msp->ms_allocator = -1;
|
||||
metaslab_group_sort_impl(mg, msp, weight);
|
||||
mutex_exit(&mg->mg_lock);
|
||||
}
|
||||
|
||||
static void
|
||||
metaslab_passivate(metaslab_t *msp, uint64_t weight)
|
||||
{
|
||||
@ -2002,7 +2159,7 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight)
|
||||
ASSERT0(weight & METASLAB_ACTIVE_MASK);
|
||||
|
||||
msp->ms_activation_weight = 0;
|
||||
metaslab_group_sort(msp->ms_group, msp, weight);
|
||||
metaslab_passivate_allocator(msp->ms_group, msp, weight);
|
||||
ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
|
||||
}
|
||||
|
||||
@ -2556,11 +2713,18 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
|
||||
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
|
||||
}
|
||||
|
||||
if (msp->ms_new) {
|
||||
msp->ms_new = B_FALSE;
|
||||
mutex_enter(&mg->mg_lock);
|
||||
mg->mg_ms_ready++;
|
||||
mutex_exit(&mg->mg_lock);
|
||||
}
|
||||
/*
|
||||
* Calculate the new weights before unloading any metaslabs.
|
||||
* This will give us the most accurate weighting.
|
||||
*/
|
||||
metaslab_group_sort(mg, msp, metaslab_weight(msp));
|
||||
metaslab_group_sort(mg, msp, metaslab_weight(msp) |
|
||||
(msp->ms_weight & METASLAB_ACTIVE_MASK));
|
||||
|
||||
/*
|
||||
* If the metaslab is loaded and we've not tried to load or allocate
|
||||
@ -2572,6 +2736,10 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
|
||||
VERIFY0(range_tree_space(
|
||||
msp->ms_allocating[(txg + t) & TXG_MASK]));
|
||||
}
|
||||
if (msp->ms_allocator != -1) {
|
||||
metaslab_passivate(msp, msp->ms_weight &
|
||||
~METASLAB_ACTIVE_MASK);
|
||||
}
|
||||
|
||||
if (!metaslab_debug_unload)
|
||||
metaslab_unload(msp);
|
||||
@ -2665,7 +2833,8 @@ metaslab_alloc_trace_fini(void)
|
||||
*/
|
||||
static void
|
||||
metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
|
||||
metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
|
||||
metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
|
||||
int allocator)
|
||||
{
|
||||
if (!metaslab_trace_enabled)
|
||||
return;
|
||||
@ -2698,6 +2867,7 @@ metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
|
||||
mat->mat_dva_id = dva_id;
|
||||
mat->mat_offset = offset;
|
||||
mat->mat_weight = 0;
|
||||
mat->mat_allocator = allocator;
|
||||
|
||||
if (msp != NULL)
|
||||
mat->mat_weight = msp->ms_weight;
|
||||
@ -2738,35 +2908,56 @@ metaslab_trace_fini(zio_alloc_list_t *zal)
|
||||
*/
|
||||
|
||||
static void
|
||||
metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
|
||||
metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
|
||||
int allocator)
|
||||
{
|
||||
if (!(flags & METASLAB_ASYNC_ALLOC) ||
|
||||
flags & METASLAB_DONT_THROTTLE)
|
||||
(flags & METASLAB_DONT_THROTTLE))
|
||||
return;
|
||||
|
||||
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
|
||||
if (!mg->mg_class->mc_alloc_throttle_enabled)
|
||||
return;
|
||||
|
||||
(void) refcount_add(&mg->mg_alloc_queue_depth, tag);
|
||||
(void) refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
|
||||
}
|
||||
|
||||
static void
|
||||
metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
|
||||
{
|
||||
uint64_t max = mg->mg_max_alloc_queue_depth;
|
||||
uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
|
||||
while (cur < max) {
|
||||
if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
|
||||
cur, cur + 1) == cur) {
|
||||
atomic_inc_64(
|
||||
&mg->mg_class->mc_alloc_max_slots[allocator]);
|
||||
return;
|
||||
}
|
||||
cur = mg->mg_cur_max_alloc_queue_depth[allocator];
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
|
||||
metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
|
||||
int allocator, boolean_t io_complete)
|
||||
{
|
||||
if (!(flags & METASLAB_ASYNC_ALLOC) ||
|
||||
flags & METASLAB_DONT_THROTTLE)
|
||||
(flags & METASLAB_DONT_THROTTLE))
|
||||
return;
|
||||
|
||||
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
|
||||
if (!mg->mg_class->mc_alloc_throttle_enabled)
|
||||
return;
|
||||
|
||||
(void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
|
||||
(void) refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
|
||||
if (io_complete)
|
||||
metaslab_group_increment_qdepth(mg, allocator);
|
||||
}
|
||||
|
||||
void
|
||||
metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
|
||||
metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
|
||||
int allocator)
|
||||
{
|
||||
#ifdef ZFS_DEBUG
|
||||
const dva_t *dva = bp->blk_dva;
|
||||
@ -2775,7 +2966,8 @@ metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
|
||||
for (int d = 0; d < ndvas; d++) {
|
||||
uint64_t vdev = DVA_GET_VDEV(&dva[d]);
|
||||
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
|
||||
VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
|
||||
VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth[allocator],
|
||||
tag));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -2817,91 +3009,146 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
|
||||
return (start);
|
||||
}
|
||||
|
||||
/*
|
||||
* Find the metaslab with the highest weight that is less than what we've
|
||||
* already tried. In the common case, this means that we will examine each
|
||||
* metaslab at most once. Note that concurrent callers could reorder metaslabs
|
||||
* by activation/passivation once we have dropped the mg_lock. If a metaslab is
|
||||
* activated by another thread, and we fail to allocate from the metaslab we
|
||||
* have selected, we may not try the newly-activated metaslab, and instead
|
||||
* activate another metaslab. This is not optimal, but generally does not cause
|
||||
* any problems (a possible exception being if every metaslab is completely full
|
||||
* except for the newly-activated metaslab which we fail to examine).
|
||||
*/
|
||||
static metaslab_t *
|
||||
find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
|
||||
dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator,
|
||||
zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
|
||||
{
|
||||
avl_index_t idx;
|
||||
avl_tree_t *t = &mg->mg_metaslab_tree;
|
||||
metaslab_t *msp = avl_find(t, search, &idx);
|
||||
if (msp == NULL)
|
||||
msp = avl_nearest(t, idx, AVL_AFTER);
|
||||
|
||||
for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
|
||||
int i;
|
||||
if (!metaslab_should_allocate(msp, asize)) {
|
||||
metaslab_trace_add(zal, mg, msp, asize, d,
|
||||
TRACE_TOO_SMALL, allocator);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the selected metaslab is condensing, skip it.
|
||||
*/
|
||||
if (msp->ms_condensing)
|
||||
continue;
|
||||
|
||||
*was_active = msp->ms_allocator != -1;
|
||||
/*
|
||||
* If we're activating as primary, this is our first allocation
|
||||
* from this disk, so we don't need to check how close we are.
|
||||
* If the metaslab under consideration was already active,
|
||||
* we're getting desperate enough to steal another allocator's
|
||||
* metaslab, so we still don't care about distances.
|
||||
*/
|
||||
if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
|
||||
break;
|
||||
|
||||
uint64_t target_distance = min_distance
|
||||
+ (space_map_allocated(msp->ms_sm) != 0 ? 0 :
|
||||
min_distance >> 1);
|
||||
|
||||
for (i = 0; i < d; i++) {
|
||||
if (metaslab_distance(msp, &dva[i]) < target_distance)
|
||||
break;
|
||||
}
|
||||
if (i == d)
|
||||
break;
|
||||
}
|
||||
|
||||
if (msp != NULL) {
|
||||
search->ms_weight = msp->ms_weight;
|
||||
search->ms_start = msp->ms_start + 1;
|
||||
search->ms_allocator = msp->ms_allocator;
|
||||
search->ms_primary = msp->ms_primary;
|
||||
}
|
||||
return (msp);
|
||||
}
|
||||
|
||||
/* ARGSUSED */
|
||||
static uint64_t
|
||||
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
|
||||
uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
|
||||
uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
|
||||
int allocator)
|
||||
{
|
||||
metaslab_t *msp = NULL;
|
||||
uint64_t offset = -1ULL;
|
||||
uint64_t activation_weight;
|
||||
uint64_t target_distance;
|
||||
int i;
|
||||
boolean_t tertiary = B_FALSE;
|
||||
|
||||
activation_weight = METASLAB_WEIGHT_PRIMARY;
|
||||
for (i = 0; i < d; i++) {
|
||||
if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
|
||||
for (int i = 0; i < d; i++) {
|
||||
if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
|
||||
DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
|
||||
activation_weight = METASLAB_WEIGHT_SECONDARY;
|
||||
} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
|
||||
DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
|
||||
tertiary = B_TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If we don't have enough metaslabs active to fill the entire array, we
|
||||
* just use the 0th slot.
|
||||
*/
|
||||
if (mg->mg_ms_ready < mg->mg_allocators * 2) {
|
||||
tertiary = B_FALSE;
|
||||
allocator = 0;
|
||||
}
|
||||
|
||||
ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
|
||||
|
||||
metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
|
||||
search->ms_weight = UINT64_MAX;
|
||||
search->ms_start = 0;
|
||||
/*
|
||||
* At the end of the metaslab tree are the already-active metaslabs,
|
||||
* first the primaries, then the secondaries. When we resume searching
|
||||
* through the tree, we need to consider ms_allocator and ms_primary so
|
||||
* we start in the location right after where we left off, and don't
|
||||
* accidentally loop forever considering the same metaslabs.
|
||||
*/
|
||||
search->ms_allocator = -1;
|
||||
search->ms_primary = B_TRUE;
|
||||
for (;;) {
|
||||
boolean_t was_active;
|
||||
avl_tree_t *t = &mg->mg_metaslab_tree;
|
||||
avl_index_t idx;
|
||||
boolean_t was_active = B_FALSE;
|
||||
|
||||
mutex_enter(&mg->mg_lock);
|
||||
|
||||
/*
|
||||
* Find the metaslab with the highest weight that is less
|
||||
* than what we've already tried. In the common case, this
|
||||
* means that we will examine each metaslab at most once.
|
||||
* Note that concurrent callers could reorder metaslabs
|
||||
* by activation/passivation once we have dropped the mg_lock.
|
||||
* If a metaslab is activated by another thread, and we fail
|
||||
* to allocate from the metaslab we have selected, we may
|
||||
* not try the newly-activated metaslab, and instead activate
|
||||
* another metaslab. This is not optimal, but generally
|
||||
* does not cause any problems (a possible exception being
|
||||
* if every metaslab is completely full except for the
|
||||
* the newly-activated metaslab which we fail to examine).
|
||||
*/
|
||||
msp = avl_find(t, search, &idx);
|
||||
if (msp == NULL)
|
||||
msp = avl_nearest(t, idx, AVL_AFTER);
|
||||
for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
|
||||
|
||||
if (!metaslab_should_allocate(msp, asize)) {
|
||||
metaslab_trace_add(zal, mg, msp, asize, d,
|
||||
TRACE_TOO_SMALL);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the selected metaslab is condensing, skip it.
|
||||
*/
|
||||
if (msp->ms_condensing)
|
||||
continue;
|
||||
|
||||
was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
|
||||
if (activation_weight == METASLAB_WEIGHT_PRIMARY)
|
||||
break;
|
||||
|
||||
target_distance = min_distance +
|
||||
(space_map_allocated(msp->ms_sm) != 0 ? 0 :
|
||||
min_distance >> 1);
|
||||
|
||||
for (i = 0; i < d; i++) {
|
||||
if (metaslab_distance(msp, &dva[i]) <
|
||||
target_distance)
|
||||
break;
|
||||
}
|
||||
if (i == d)
|
||||
break;
|
||||
if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
|
||||
mg->mg_primaries[allocator] != NULL) {
|
||||
msp = mg->mg_primaries[allocator];
|
||||
was_active = B_TRUE;
|
||||
} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
|
||||
mg->mg_secondaries[allocator] != NULL && !tertiary) {
|
||||
msp = mg->mg_secondaries[allocator];
|
||||
was_active = B_TRUE;
|
||||
} else {
|
||||
msp = find_valid_metaslab(mg, activation_weight, dva, d,
|
||||
min_distance, asize, allocator, zal, search,
|
||||
&was_active);
|
||||
}
|
||||
|
||||
mutex_exit(&mg->mg_lock);
|
||||
if (msp == NULL) {
|
||||
kmem_free(search, sizeof (*search));
|
||||
return (-1ULL);
|
||||
}
|
||||
search->ms_weight = msp->ms_weight;
|
||||
search->ms_start = msp->ms_start + 1;
|
||||
|
||||
mutex_enter(&msp->ms_lock);
|
||||
|
||||
/*
|
||||
* Ensure that the metaslab we have selected is still
|
||||
* capable of handling our request. It's possible that
|
||||
@ -2915,18 +3162,32 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
|
||||
activation_weight == METASLAB_WEIGHT_PRIMARY) {
|
||||
metaslab_passivate(msp,
|
||||
msp->ms_weight & ~METASLAB_ACTIVE_MASK);
|
||||
/*
|
||||
* If the metaslab is freshly activated for an allocator that
|
||||
* isn't the one we're allocating from, or if it's a primary and
|
||||
* we're seeking a secondary (or vice versa), we go back and
|
||||
* select a new metaslab.
|
||||
*/
|
||||
if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
|
||||
(msp->ms_allocator != -1) &&
|
||||
(msp->ms_allocator != allocator || ((activation_weight ==
|
||||
METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
|
||||
mutex_exit(&msp->ms_lock);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (metaslab_activate(msp, activation_weight) != 0) {
|
||||
if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
|
||||
metaslab_passivate(msp, msp->ms_weight &
|
||||
~METASLAB_WEIGHT_CLAIM);
|
||||
mutex_exit(&msp->ms_lock);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (metaslab_activate(msp, allocator, activation_weight) != 0) {
|
||||
mutex_exit(&msp->ms_lock);
|
||||
continue;
|
||||
}
|
||||
|
||||
msp->ms_selected_txg = txg;
|
||||
|
||||
/*
|
||||
@ -2939,7 +3200,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
|
||||
if (!metaslab_should_allocate(msp, asize)) {
|
||||
/* Passivate this metaslab and select a new one. */
|
||||
metaslab_trace_add(zal, mg, msp, asize, d,
|
||||
TRACE_TOO_SMALL);
|
||||
TRACE_TOO_SMALL, allocator);
|
||||
goto next;
|
||||
}
|
||||
|
||||
@ -2950,13 +3211,15 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
|
||||
*/
|
||||
if (msp->ms_condensing) {
|
||||
metaslab_trace_add(zal, mg, msp, asize, d,
|
||||
TRACE_CONDENSING);
|
||||
TRACE_CONDENSING, allocator);
|
||||
metaslab_passivate(msp, msp->ms_weight &
|
||||
~METASLAB_ACTIVE_MASK);
|
||||
mutex_exit(&msp->ms_lock);
|
||||
continue;
|
||||
}
|
||||
|
||||
offset = metaslab_block_alloc(msp, asize, txg);
|
||||
metaslab_trace_add(zal, mg, msp, asize, d, offset);
|
||||
metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
|
||||
|
||||
if (offset != -1ULL) {
|
||||
/* Proactively passivate the metaslab, if needed */
|
||||
@ -3012,19 +3275,20 @@ next:
|
||||
|
||||
static uint64_t
|
||||
metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
|
||||
uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
|
||||
uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
|
||||
int allocator)
|
||||
{
|
||||
uint64_t offset;
|
||||
ASSERT(mg->mg_initialized);
|
||||
|
||||
offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
|
||||
min_distance, dva, d);
|
||||
min_distance, dva, d, allocator);
|
||||
|
||||
mutex_enter(&mg->mg_lock);
|
||||
if (offset == -1ULL) {
|
||||
mg->mg_failed_allocations++;
|
||||
metaslab_trace_add(zal, mg, NULL, asize, d,
|
||||
TRACE_GROUP_FAILURE);
|
||||
TRACE_GROUP_FAILURE, allocator);
|
||||
if (asize == SPA_GANGBLOCKSIZE) {
|
||||
/*
|
||||
* This metaslab group was unable to allocate
|
||||
@ -3059,7 +3323,7 @@ int ditto_same_vdev_distance_shift = 3;
|
||||
int
|
||||
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
|
||||
dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
|
||||
zio_alloc_list_t *zal)
|
||||
zio_alloc_list_t *zal, int allocator)
|
||||
{
|
||||
metaslab_group_t *mg, *rotor;
|
||||
vdev_t *vd;
|
||||
@ -3071,7 +3335,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
|
||||
* For testing, make some blocks above a certain size be gang blocks.
|
||||
*/
|
||||
if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
|
||||
metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
|
||||
metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
|
||||
allocator);
|
||||
return (SET_ERROR(ENOSPC));
|
||||
}
|
||||
|
||||
@ -3157,12 +3422,12 @@ top:
|
||||
*/
|
||||
if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
|
||||
allocatable = metaslab_group_allocatable(mg, rotor,
|
||||
psize);
|
||||
psize, allocator);
|
||||
}
|
||||
|
||||
if (!allocatable) {
|
||||
metaslab_trace_add(zal, mg, NULL, psize, d,
|
||||
TRACE_NOT_ALLOCATABLE);
|
||||
TRACE_NOT_ALLOCATABLE, allocator);
|
||||
goto next;
|
||||
}
|
||||
|
||||
@ -3177,7 +3442,7 @@ top:
|
||||
vd->vdev_state < VDEV_STATE_HEALTHY) &&
|
||||
d == 0 && !try_hard && vd->vdev_children == 0) {
|
||||
metaslab_trace_add(zal, mg, NULL, psize, d,
|
||||
TRACE_VDEV_ERROR);
|
||||
TRACE_VDEV_ERROR, allocator);
|
||||
goto next;
|
||||
}
|
||||
|
||||
@ -3201,7 +3466,7 @@ top:
|
||||
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
|
||||
|
||||
uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
|
||||
distance, dva, d);
|
||||
distance, dva, d, allocator);
|
||||
|
||||
if (offset != -1ULL) {
|
||||
/*
|
||||
@ -3264,7 +3529,7 @@ next:
|
||||
|
||||
bzero(&dva[d], sizeof (dva_t));
|
||||
|
||||
metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
|
||||
metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
|
||||
return (SET_ERROR(ENOSPC));
|
||||
}
|
||||
|
||||
@ -3565,18 +3830,20 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
|
||||
* the reservation.
|
||||
*/
|
||||
boolean_t
|
||||
metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
|
||||
int flags)
|
||||
metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
|
||||
zio_t *zio, int flags)
|
||||
{
|
||||
uint64_t available_slots = 0;
|
||||
boolean_t slot_reserved = B_FALSE;
|
||||
uint64_t max = mc->mc_alloc_max_slots[allocator];
|
||||
|
||||
ASSERT(mc->mc_alloc_throttle_enabled);
|
||||
mutex_enter(&mc->mc_lock);
|
||||
|
||||
uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
|
||||
if (reserved_slots < mc->mc_alloc_max_slots)
|
||||
available_slots = mc->mc_alloc_max_slots - reserved_slots;
|
||||
uint64_t reserved_slots =
|
||||
refcount_count(&mc->mc_alloc_slots[allocator]);
|
||||
if (reserved_slots < max)
|
||||
available_slots = max - reserved_slots;
|
||||
|
||||
if (slots <= available_slots || GANG_ALLOCATION(flags)) {
|
||||
/*
|
||||
@ -3584,7 +3851,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
|
||||
* them individually when an I/O completes.
|
||||
*/
|
||||
for (int d = 0; d < slots; d++) {
|
||||
reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
|
||||
reserved_slots =
|
||||
refcount_add(&mc->mc_alloc_slots[allocator],
|
||||
zio);
|
||||
}
|
||||
zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
|
||||
slot_reserved = B_TRUE;
|
||||
@ -3595,12 +3864,14 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
|
||||
}
|
||||
|
||||
void
|
||||
metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
|
||||
metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
|
||||
int allocator, zio_t *zio)
|
||||
{
|
||||
ASSERT(mc->mc_alloc_throttle_enabled);
|
||||
mutex_enter(&mc->mc_lock);
|
||||
for (int d = 0; d < slots; d++) {
|
||||
(void) refcount_remove(&mc->mc_alloc_slots, zio);
|
||||
(void) refcount_remove(&mc->mc_alloc_slots[allocator],
|
||||
zio);
|
||||
}
|
||||
mutex_exit(&mc->mc_lock);
|
||||
}
|
||||
@ -3622,7 +3893,13 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
|
||||
mutex_enter(&msp->ms_lock);
|
||||
|
||||
if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
|
||||
error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
|
||||
error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
|
||||
/*
|
||||
* No need to fail in that case; someone else has activated the
|
||||
* metaslab, but that doesn't preclude us from using it.
|
||||
*/
|
||||
if (error == EBUSY)
|
||||
error = 0;
|
||||
|
||||
if (error == 0 &&
|
||||
!range_tree_contains(msp->ms_allocatable, offset, size))
|
||||
@ -3727,7 +4004,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
|
||||
int
|
||||
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
|
||||
int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
|
||||
zio_alloc_list_t *zal, zio_t *zio)
|
||||
zio_alloc_list_t *zal, zio_t *zio, int allocator)
|
||||
{
|
||||
dva_t *dva = bp->blk_dva;
|
||||
dva_t *hintdva = hintbp->blk_dva;
|
||||
@ -3750,12 +4027,13 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
|
||||
|
||||
for (int d = 0; d < ndvas; d++) {
|
||||
error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
|
||||
txg, flags, zal);
|
||||
txg, flags, zal, allocator);
|
||||
if (error != 0) {
|
||||
for (d--; d >= 0; d--) {
|
||||
metaslab_unalloc_dva(spa, &dva[d], txg);
|
||||
metaslab_group_alloc_decrement(spa,
|
||||
DVA_GET_VDEV(&dva[d]), zio, flags);
|
||||
DVA_GET_VDEV(&dva[d]), zio, flags,
|
||||
allocator, B_FALSE);
|
||||
bzero(&dva[d], sizeof (dva_t));
|
||||
}
|
||||
spa_config_exit(spa, SCL_ALLOC, FTAG);
|
||||
@ -3766,7 +4044,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
|
||||
* based on the newly allocated dva.
|
||||
*/
|
||||
metaslab_group_alloc_increment(spa,
|
||||
DVA_GET_VDEV(&dva[d]), zio, flags);
|
||||
DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -7776,9 +7776,11 @@ spa_sync(spa_t *spa, uint64_t txg)
|
||||
spa->spa_syncing_txg = txg;
|
||||
spa->spa_sync_pass = 0;
|
||||
|
||||
mutex_enter(&spa->spa_alloc_lock);
|
||||
VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
|
||||
mutex_exit(&spa->spa_alloc_lock);
|
||||
for (int i = 0; i < spa->spa_alloc_count; i++) {
|
||||
mutex_enter(&spa->spa_alloc_locks[i]);
|
||||
VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
|
||||
mutex_exit(&spa->spa_alloc_locks[i]);
|
||||
}
|
||||
|
||||
/*
|
||||
* If there are any pending vdev state changes, convert them
|
||||
@ -7844,7 +7846,7 @@ spa_sync(spa_t *spa, uint64_t txg)
|
||||
* The max queue depth will not change in the middle of syncing
|
||||
* out this txg.
|
||||
*/
|
||||
uint64_t queue_depth_total = 0;
|
||||
uint64_t slots_per_allocator = 0;
|
||||
for (int c = 0; c < rvd->vdev_children; c++) {
|
||||
vdev_t *tvd = rvd->vdev_child[c];
|
||||
metaslab_group_t *mg = tvd->vdev_mg;
|
||||
@ -7858,18 +7860,23 @@ spa_sync(spa_t *spa, uint64_t txg)
|
||||
* allocations look at mg_max_alloc_queue_depth, and async
|
||||
* allocations all happen from spa_sync().
|
||||
*/
|
||||
ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
|
||||
for (int i = 0; i < spa->spa_alloc_count; i++)
|
||||
ASSERT0(refcount_count(&(mg->mg_alloc_queue_depth[i])));
|
||||
mg->mg_max_alloc_queue_depth = max_queue_depth;
|
||||
queue_depth_total += mg->mg_max_alloc_queue_depth;
|
||||
|
||||
for (int i = 0; i < spa->spa_alloc_count; i++) {
|
||||
mg->mg_cur_max_alloc_queue_depth[i] =
|
||||
zfs_vdev_def_queue_depth;
|
||||
}
|
||||
slots_per_allocator += zfs_vdev_def_queue_depth;
|
||||
}
|
||||
metaslab_class_t *mc = spa_normal_class(spa);
|
||||
ASSERT0(refcount_count(&mc->mc_alloc_slots));
|
||||
mc->mc_alloc_max_slots = queue_depth_total;
|
||||
for (int i = 0; i < spa->spa_alloc_count; i++) {
|
||||
ASSERT0(refcount_count(&mc->mc_alloc_slots[i]));
|
||||
mc->mc_alloc_max_slots[i] = slots_per_allocator;
|
||||
}
|
||||
mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
|
||||
|
||||
ASSERT3U(mc->mc_alloc_max_slots, <=,
|
||||
max_queue_depth * rvd->vdev_children);
|
||||
|
||||
for (int c = 0; c < rvd->vdev_children; c++) {
|
||||
vdev_t *vd = rvd->vdev_child[c];
|
||||
vdev_indirect_state_sync_verify(vd);
|
||||
@ -8052,9 +8059,11 @@ spa_sync(spa_t *spa, uint64_t txg)
|
||||
|
||||
dsl_pool_sync_done(dp, txg);
|
||||
|
||||
mutex_enter(&spa->spa_alloc_lock);
|
||||
VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
|
||||
mutex_exit(&spa->spa_alloc_lock);
|
||||
for (int i = 0; i < spa->spa_alloc_count; i++) {
|
||||
mutex_enter(&spa->spa_alloc_locks[i]);
|
||||
VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
|
||||
mutex_exit(&spa->spa_alloc_locks[i]);
|
||||
}
|
||||
|
||||
/*
|
||||
* Update usable space statistics.
|
||||
|
@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
||||
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
|
||||
* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
|
||||
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
||||
@ -434,6 +434,8 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN,
|
||||
&spa_min_slop, 0,
|
||||
"Minimal value of reserved space");
|
||||
|
||||
int spa_allocators = 4;
|
||||
|
||||
/*PRINTFLIKE2*/
|
||||
void
|
||||
spa_load_failed(spa_t *spa, const char *fmt, ...)
|
||||
@ -705,7 +707,6 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
|
||||
mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
|
||||
cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
|
||||
cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
|
||||
@ -779,8 +780,16 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
|
||||
spa_active_count++;
|
||||
}
|
||||
|
||||
avl_create(&spa->spa_alloc_tree, zio_bookmark_compare,
|
||||
sizeof (zio_t), offsetof(zio_t, io_alloc_node));
|
||||
spa->spa_alloc_count = spa_allocators;
|
||||
spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count *
|
||||
sizeof (kmutex_t), KM_SLEEP);
|
||||
spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count *
|
||||
sizeof (avl_tree_t), KM_SLEEP);
|
||||
for (int i = 0; i < spa->spa_alloc_count; i++) {
|
||||
mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL);
|
||||
avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare,
|
||||
sizeof (zio_t), offsetof(zio_t, io_alloc_node));
|
||||
}
|
||||
|
||||
/*
|
||||
* Every pool starts with the default cachefile
|
||||
@ -860,7 +869,15 @@ spa_remove(spa_t *spa)
|
||||
kmem_free(dp, sizeof (spa_config_dirent_t));
|
||||
}
|
||||
|
||||
avl_destroy(&spa->spa_alloc_tree);
|
||||
for (int i = 0; i < spa->spa_alloc_count; i++) {
|
||||
avl_destroy(&spa->spa_alloc_trees[i]);
|
||||
mutex_destroy(&spa->spa_alloc_locks[i]);
|
||||
}
|
||||
kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count *
|
||||
sizeof (kmutex_t));
|
||||
kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count *
|
||||
sizeof (avl_tree_t));
|
||||
|
||||
list_destroy(&spa->spa_config_list);
|
||||
|
||||
nvlist_free(spa->spa_label_features);
|
||||
@ -895,7 +912,6 @@ spa_remove(spa_t *spa)
|
||||
cv_destroy(&spa->spa_scrub_io_cv);
|
||||
cv_destroy(&spa->spa_suspend_cv);
|
||||
|
||||
mutex_destroy(&spa->spa_alloc_lock);
|
||||
mutex_destroy(&spa->spa_async_lock);
|
||||
mutex_destroy(&spa->spa_errlist_lock);
|
||||
mutex_destroy(&spa->spa_errlog_lock);
|
||||
|
@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_METASLAB_H
|
||||
@ -65,9 +65,10 @@ uint64_t metaslab_block_maxsize(metaslab_t *);
|
||||
#define METASLAB_DONT_THROTTLE 0x10
|
||||
|
||||
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
|
||||
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *);
|
||||
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *,
|
||||
int);
|
||||
int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
|
||||
dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *);
|
||||
dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int);
|
||||
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
|
||||
void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t);
|
||||
void metaslab_free_dva(spa_t *, const dva_t *, boolean_t);
|
||||
@ -88,9 +89,9 @@ int metaslab_class_validate(metaslab_class_t *);
|
||||
void metaslab_class_histogram_verify(metaslab_class_t *);
|
||||
uint64_t metaslab_class_fragmentation(metaslab_class_t *);
|
||||
uint64_t metaslab_class_expandable_space(metaslab_class_t *);
|
||||
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int,
|
||||
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
|
||||
zio_t *, int);
|
||||
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);
|
||||
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);
|
||||
|
||||
void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
|
||||
int64_t, int64_t);
|
||||
@ -100,7 +101,7 @@ uint64_t metaslab_class_get_dspace(metaslab_class_t *);
|
||||
uint64_t metaslab_class_get_deferred(metaslab_class_t *);
|
||||
uint64_t metaslab_class_get_minblocksize(metaslab_class_t *mc);
|
||||
|
||||
metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
|
||||
metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int);
|
||||
void metaslab_group_destroy(metaslab_group_t *);
|
||||
void metaslab_group_activate(metaslab_group_t *);
|
||||
void metaslab_group_passivate(metaslab_group_t *);
|
||||
@ -109,8 +110,9 @@ uint64_t metaslab_group_get_space(metaslab_group_t *);
|
||||
void metaslab_group_histogram_verify(metaslab_group_t *);
|
||||
uint64_t metaslab_group_fragmentation(metaslab_group_t *);
|
||||
void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
|
||||
void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int);
|
||||
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *);
|
||||
void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
|
||||
boolean_t);
|
||||
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
@ -24,7 +24,7 @@
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_METASLAB_IMPL_H
|
||||
@ -52,6 +52,7 @@ typedef struct metaslab_alloc_trace {
|
||||
uint64_t mat_weight;
|
||||
uint32_t mat_dva_id;
|
||||
uint64_t mat_offset;
|
||||
int mat_allocator;
|
||||
} metaslab_alloc_trace_t;
|
||||
|
||||
/*
|
||||
@ -72,9 +73,11 @@ typedef enum trace_alloc_type {
|
||||
|
||||
#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
|
||||
#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
|
||||
#define METASLAB_WEIGHT_TYPE (1ULL << 61)
|
||||
#define METASLAB_WEIGHT_CLAIM (1ULL << 61)
|
||||
#define METASLAB_WEIGHT_TYPE (1ULL << 60)
|
||||
#define METASLAB_ACTIVE_MASK \
|
||||
(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
|
||||
(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \
|
||||
METASLAB_WEIGHT_CLAIM)
|
||||
|
||||
/*
|
||||
* The metaslab weight is used to encode the amount of free space in a
|
||||
@ -97,37 +100,39 @@ typedef enum trace_alloc_type {
|
||||
*
|
||||
* 64 56 48 40 32 24 16 8 0
|
||||
* +-------+-------+-------+-------+-------+-------+-------+-------+
|
||||
* |PS1| weighted-free space |
|
||||
* |PSC1| weighted-free space |
|
||||
* +-------+-------+-------+-------+-------+-------+-------+-------+
|
||||
*
|
||||
* PS - indicates primary and secondary activation
|
||||
* C - indicates activation for claimed block zio
|
||||
* space - the fragmentation-weighted space
|
||||
*
|
||||
* Segment-based weight:
|
||||
*
|
||||
* 64 56 48 40 32 24 16 8 0
|
||||
* +-------+-------+-------+-------+-------+-------+-------+-------+
|
||||
* |PS0| idx| count of segments in region |
|
||||
* |PSC0| idx| count of segments in region |
|
||||
* +-------+-------+-------+-------+-------+-------+-------+-------+
|
||||
*
|
||||
* PS - indicates primary and secondary activation
|
||||
* C - indicates activation for claimed block zio
|
||||
* idx - index for the highest bucket in the histogram
|
||||
* count - number of segments in the specified bucket
|
||||
*/
|
||||
#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 62, 2)
|
||||
#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 62, 2, x)
|
||||
#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3)
|
||||
#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x)
|
||||
|
||||
#define WEIGHT_IS_SPACEBASED(weight) \
|
||||
((weight) == 0 || BF64_GET((weight), 61, 1))
|
||||
#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 61, 1, 1)
|
||||
((weight) == 0 || BF64_GET((weight), 60, 1))
|
||||
#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1)
|
||||
|
||||
/*
|
||||
* These macros are only applicable to segment-based weighting.
|
||||
*/
|
||||
#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 55, 6)
|
||||
#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 55, 6, x)
|
||||
#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 55)
|
||||
#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 55, x)
|
||||
#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6)
|
||||
#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x)
|
||||
#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54)
|
||||
#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x)
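The macro changes above make room for the new claim activation type: bits 63..61 now encode primary, secondary, and claim activation, and the space-based/type flag moves down to bit 60. Restated with plain masks rather than the BF64_* helpers (a small self-contained example, not ZFS code):

#include <stdint.h>

#define W_PRIMARY	(1ULL << 63)
#define W_SECONDARY	(1ULL << 62)
#define W_CLAIM		(1ULL << 61)	/* new in this change */
#define W_ACTIVE_MASK	(W_PRIMARY | W_SECONDARY | W_CLAIM)

/* A weight now counts as active if any of bits 63..61 is set. */
static inline int
weight_is_active(uint64_t weight)
{
	return ((weight & W_ACTIVE_MASK) != 0);
}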
|
||||
|
||||
/*
|
||||
* A metaslab class encompasses a category of allocatable top-level vdevs.
|
||||
@ -178,8 +183,8 @@ struct metaslab_class {
|
||||
* allowed to reserve slots even if we've reached the maximum
|
||||
* number of allocations allowed.
|
||||
*/
|
||||
uint64_t mc_alloc_max_slots;
|
||||
refcount_t mc_alloc_slots;
|
||||
uint64_t *mc_alloc_max_slots;
|
||||
refcount_t *mc_alloc_slots;
|
||||
|
||||
uint64_t mc_alloc_groups; /* # of allocatable groups */
|
||||
|
||||
@ -202,9 +207,12 @@ struct metaslab_class {
|
||||
*/
|
||||
struct metaslab_group {
|
||||
kmutex_t mg_lock;
|
||||
metaslab_t **mg_primaries;
|
||||
metaslab_t **mg_secondaries;
|
||||
avl_tree_t mg_metaslab_tree;
|
||||
uint64_t mg_aliquot;
|
||||
boolean_t mg_allocatable; /* can we allocate? */
|
||||
uint64_t mg_ms_ready;
|
||||
|
||||
/*
|
||||
* A metaslab group is considered to be initialized only after
|
||||
@ -224,15 +232,33 @@ struct metaslab_group {
|
||||
metaslab_group_t *mg_next;
|
||||
|
||||
/*
|
||||
* Each metaslab group can handle mg_max_alloc_queue_depth allocations
|
||||
* which are tracked by mg_alloc_queue_depth. It's possible for a
|
||||
* metaslab group to handle more allocations than its max. This
|
||||
* can occur when gang blocks are required or when other groups
|
||||
* are unable to handle their share of allocations.
|
||||
* In order for the allocation throttle to function properly, we cannot
* have too many IOs going to each disk by default; the throttle
* operates by allocating more work to disks that finish quickly, so
* allocating larger chunks to each disk reduces its effectiveness.
* However, if the number of IOs going to each allocator is too small,
* we will not perform proper aggregation at the vdev_queue layer,
* also resulting in decreased performance. Therefore, we will use a
* ramp-up strategy.
*
* Each allocator in each metaslab group has a current queue depth
* (mg_alloc_queue_depth[allocator]) and a current max queue depth
* (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group
* has an absolute max queue depth (mg_max_alloc_queue_depth). We
* add IOs to an allocator until the mg_alloc_queue_depth for that
* allocator hits the cur_max. Every time an IO completes for a given
* allocator on a given metaslab group, we increment its cur_max until
* it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to
* help protect against disks that decrease in performance over time.
*
* It's possible for an allocator to handle more allocations than
* its max. This can occur when gang blocks are required or when other
* groups are unable to handle their share of allocations.
*/
|
||||
uint64_t mg_max_alloc_queue_depth;
|
||||
refcount_t mg_alloc_queue_depth;
|
||||
|
||||
uint64_t *mg_cur_max_alloc_queue_depth;
|
||||
refcount_t *mg_alloc_queue_depth;
|
||||
int mg_allocators;
|
||||
/*
|
||||
* A metalab group that can no longer allocate the minimum block
|
||||
* size will set mg_no_free_space. Once a metaslab group is out
|
||||
@ -356,6 +382,13 @@ struct metaslab {
|
||||
uint64_t ms_alloc_txg; /* last successful alloc (debug only) */
|
||||
uint64_t ms_max_size; /* maximum allocatable size */
|
||||
|
||||
/*
|
||||
* -1 if it's not active in an allocator, otherwise set to the allocator
|
||||
* this metaslab is active for.
|
||||
*/
|
||||
int ms_allocator;
|
||||
boolean_t ms_primary; /* Only valid if ms_allocator is not -1 */
|
||||
|
||||
/*
|
||||
* The metaslab block allocators can optionally use a size-ordered
|
||||
* range tree and/or an array of LBAs. Not all allocators use
|
||||
@ -370,6 +403,8 @@ struct metaslab {
|
||||
metaslab_group_t *ms_group; /* metaslab group */
|
||||
avl_node_t ms_group_node; /* node in metaslab group tree */
|
||||
txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
|
||||
|
||||
boolean_t ms_new;
|
||||
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@ -238,8 +238,16 @@ struct spa {
|
||||
uint64_t spa_last_synced_guid; /* last synced guid */
|
||||
list_t spa_config_dirty_list; /* vdevs with dirty config */
|
||||
list_t spa_state_dirty_list; /* vdevs with dirty state */
|
||||
kmutex_t spa_alloc_lock;
|
||||
avl_tree_t spa_alloc_tree;
|
||||
/*
|
||||
* spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are
* stored in spa_alloc_count. There is one tree and one lock for each
* allocator, to help improve allocation performance in write-heavy
* workloads.
*/
|
||||
kmutex_t *spa_alloc_locks;
|
||||
avl_tree_t *spa_alloc_trees;
|
||||
int spa_alloc_count;
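To illustrate how these per-allocator arrays are intended to be used, here is a minimal hypothetical sketch of queueing a throttled I/O under its own allocator's lock (zio_queues_t and queue_insert are stand-ins, not ZFS APIs); only I/Os that hash to the same allocator contend on a given lock.

#include <pthread.h>

typedef struct zio_queues {
	pthread_mutex_t	*zq_locks;	/* spa_alloc_locks analogue */
	void		**zq_trees;	/* spa_alloc_trees analogue */
	int		 zq_count;	/* spa_alloc_count analogue */
} zio_queues_t;

/* Insert an I/O into the queue owned by its allocator. */
static void
queue_insert(zio_queues_t *zq, int allocator, void *io,
    void (*insert)(void *tree, void *io))
{
	pthread_mutex_lock(&zq->zq_locks[allocator]);
	insert(zq->zq_trees[allocator], io);
	pthread_mutex_unlock(&zq->zq_locks[allocator]);
}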
|
||||
|
||||
spa_aux_vdev_t spa_spares; /* hot spares */
|
||||
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
|
||||
nvlist_t *spa_label_features; /* Features for reading MOS */
|
||||
|
@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_VDEV_IMPL_H
|
||||
@ -59,6 +59,7 @@ typedef struct vdev_cache_entry vdev_cache_entry_t;
|
||||
struct abd;
|
||||
|
||||
extern int zfs_vdev_queue_depth_pct;
|
||||
extern int zfs_vdev_def_queue_depth;
|
||||
extern uint32_t zfs_vdev_async_write_max_active;
|
||||
|
||||
/*
|
||||
|
@ -22,7 +22,7 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
||||
* Copyright 2016 Toomas Soome <tsoome@me.com>
|
||||
*/
|
||||
@ -489,6 +489,7 @@ struct zio {
|
||||
void *io_waiter;
|
||||
kmutex_t io_lock;
|
||||
kcondvar_t io_cv;
|
||||
int io_allocator;
|
||||
|
||||
/* FMA state */
|
||||
zio_cksum_report_t *io_cksum_report;
|
||||
@ -550,8 +551,8 @@ extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
|
||||
extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
|
||||
const blkptr_t *bp, uint64_t size, enum zio_flag flags);
|
||||
|
||||
extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
|
||||
blkptr_t *old_bp, uint64_t size, boolean_t *slog);
|
||||
extern int zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg,
|
||||
blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t *slog);
|
||||
extern void zio_flush(zio_t *zio, vdev_t *vd);
|
||||
extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset,
|
||||
uint64_t size);
|
||||
|
@ -752,7 +752,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
|
||||
alloctype == VDEV_ALLOC_SPLIT ||
|
||||
alloctype == VDEV_ALLOC_ROOTPOOL);
|
||||
vd->vdev_mg = metaslab_group_create(islog ?
|
||||
spa_log_class(spa) : spa_normal_class(spa), vd);
|
||||
spa_log_class(spa) : spa_normal_class(spa), vd,
|
||||
spa->spa_alloc_count);
|
||||
}
|
||||
|
||||
if (vd->vdev_ops->vdev_op_leaf &&
|
||||
@ -1140,7 +1141,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
|
||||
|
||||
vd->vdev_ms = mspp;
|
||||
vd->vdev_ms_count = newc;
|
||||
|
||||
for (m = oldc; m < newc; m++) {
|
||||
uint64_t object = 0;
|
||||
|
||||
|
@ -24,7 +24,7 @@
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2014 Integros [integros.com]
|
||||
*/
|
||||
|
||||
@ -195,6 +195,14 @@ int zfs_vdev_queue_depth_pct = 1000;
|
||||
int zfs_vdev_queue_depth_pct = 300;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* When performing allocations for a given metaslab, we want to make sure that
* there are enough IOs to aggregate together to improve throughput. We want to
* ensure that there are at least 128k worth of IOs that can be aggregated, and
* we assume that the average allocation size is 4k, so we need the queue depth
* to be 32 per allocator to get good aggregation of sequential writes.
*/
|
||||
int zfs_vdev_def_queue_depth = 32;
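A compile-time restatement of the arithmetic in the comment above, using the assumptions it states (a 128 KiB aggregation target and a 4 KiB average allocation):

/* 128 KiB of aggregatable I/O at ~4 KiB per allocation -> depth of 32. */
#define AGGREGATION_TARGET_BYTES	(128 * 1024)
#define ASSUMED_AVG_ALLOC_BYTES		(4 * 1024)

_Static_assert(AGGREGATION_TARGET_BYTES / ASSUMED_AVG_ALLOC_BYTES == 32,
    "default per-allocator queue depth");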
|
||||
|
||||
#ifdef __FreeBSD__
|
||||
#ifdef _KERNEL
|
||||
|
@ -807,8 +807,15 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,

ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);

+ /*
+ * We use allocator 0 for this I/O because we don't expect device remap
+ * to be the steady state of the system, so parallelizing is not as
+ * critical as it is for other allocation types. We also want to ensure
+ * that the IOs are allocated together as much as possible, to reduce
+ * mapping sizes.
+ */
int error = metaslab_alloc_dva(spa, mg->mg_class, size,
- &dst, 0, NULL, txg, 0, zal);
+ &dst, 0, NULL, txg, 0, zal, 0);
if (error != 0)
return (error);

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/

@ -665,7 +665,8 @@ zil_create(zilog_t *zilog)
BP_ZERO(&blk);
}

- error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
+ error = zio_alloc_zil(zilog->zl_spa,
+ zilog->zl_os->os_dsl_dataset->ds_object, txg, &blk, NULL,
ZIL_MIN_BLKSZ, &slog);

if (error == 0)
@ -1342,7 +1343,8 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
BP_ZERO(bp);

/* pass the old blkptr in order to spread log blocks across devs */
- error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
+ error = zio_alloc_zil(spa, zilog->zl_os->os_dsl_dataset->ds_object,
+ txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
if (error == 0) {
ASSERT3U(bp->blk_birth, ==, txg);
bp->blk_cksum = lwb->lwb_blk.blk_cksum;
@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@ -44,6 +44,7 @@
#include <sys/dsl_scan.h>
#include <sys/metaslab_impl.h>
#include <sys/abd.h>
+ #include <sys/cityhash.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
@ -2335,7 +2336,8 @@ zio_write_gang_block(zio_t *pio)
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));

flags |= METASLAB_ASYNC_ALLOC;
- VERIFY(refcount_held(&mc->mc_alloc_slots, pio));
+ VERIFY(refcount_held(&mc->mc_alloc_slots[pio->io_allocator],
+ pio));

/*
* The logical zio has already placed a reservation for
@ -2346,12 +2348,12 @@ zio_write_gang_block(zio_t *pio)
* additional reservations for gang blocks.
*/
VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
- pio, flags));
+ pio->io_allocator, pio, flags));
}

error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
- &pio->io_alloc_list, pio);
+ &pio->io_alloc_list, pio, pio->io_allocator);
if (error) {
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
@ -2365,7 +2367,7 @@ zio_write_gang_block(zio_t *pio)
* stage.
*/
metaslab_class_throttle_unreserve(mc,
- gbh_copies - copies, pio);
+ gbh_copies - copies, pio->io_allocator, pio);
}
pio->io_error = error;
return (ZIO_PIPELINE_CONTINUE);
@ -2423,7 +2425,7 @@ zio_write_gang_block(zio_t *pio)
* slot for them here.
*/
VERIFY(metaslab_class_throttle_reserve(mc,
- zp.zp_copies, cio, flags));
+ zp.zp_copies, cio->io_allocator, cio, flags));
}
zio_nowait(cio);
}
@ -2913,13 +2915,13 @@ zio_ddt_free(zio_t *zio)
*/

static zio_t *
- zio_io_to_allocate(spa_t *spa)
+ zio_io_to_allocate(spa_t *spa, int allocator)
{
zio_t *zio;

- ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));
+ ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator]));

- zio = avl_first(&spa->spa_alloc_tree);
+ zio = avl_first(&spa->spa_alloc_trees[allocator]);
if (zio == NULL)
return (NULL);

@ -2929,12 +2931,13 @@ zio_io_to_allocate(spa_t *spa)
* Try to place a reservation for this zio. If we're unable to
* reserve then we throttle.
*/
+ ASSERT3U(zio->io_allocator, ==, allocator);
if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
- zio->io_prop.zp_copies, zio, 0)) {
+ zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
return (NULL);
}

- avl_remove(&spa->spa_alloc_tree, zio);
+ avl_remove(&spa->spa_alloc_trees[allocator], zio);
ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);

return (zio);
@ -2958,13 +2961,23 @@ zio_dva_throttle(zio_t *zio)
ASSERT3U(zio->io_queued_timestamp, >, 0);
ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);

- mutex_enter(&spa->spa_alloc_lock);
+ zbookmark_phys_t *bm = &zio->io_bookmark;
+ /*
+ * We want to try to use as many allocators as possible to help improve
+ * performance, but we also want logically adjacent IOs to be physically
+ * adjacent to improve sequential read performance. We chunk each object
+ * into 2^20 block regions, and then hash based on the objset, object,
+ * level, and region to accomplish both of these goals.
+ */
+ zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
+ bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
+ mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);

ASSERT(zio->io_type == ZIO_TYPE_WRITE);
- avl_add(&spa->spa_alloc_tree, zio);
+ avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);

- nio = zio_io_to_allocate(zio->io_spa);
- mutex_exit(&spa->spa_alloc_lock);
+ nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator);
+ mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);

if (nio == zio)
return (ZIO_PIPELINE_CONTINUE);
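The comment and hash above are the heart of the parallel allocator selection: block IDs that fall in the same 2^20-block region of the same object map to the same allocator, so logically adjacent writes queue together, while other regions, objects, and objsets spread across spa_alloc_count allocators. A sketch of that calculation, illustrative only (this helper is hypothetical, not part of the diff):

/*
 * Hypothetical illustration of the allocator choice made in
 * zio_dva_throttle() above.
 */
static int
example_allocator(spa_t *spa, const zbookmark_phys_t *bm)
{
	return (cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level,
	    bm->zb_blkid >> 20) % spa->spa_alloc_count);
}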
@ -2985,13 +2998,13 @@ zio_dva_throttle(zio_t *zio)
}

void
- zio_allocate_dispatch(spa_t *spa)
+ zio_allocate_dispatch(spa_t *spa, int allocator)
{
zio_t *zio;

- mutex_enter(&spa->spa_alloc_lock);
- zio = zio_io_to_allocate(spa);
- mutex_exit(&spa->spa_alloc_lock);
+ mutex_enter(&spa->spa_alloc_locks[allocator]);
+ zio = zio_io_to_allocate(spa, allocator);
+ mutex_exit(&spa->spa_alloc_locks[allocator]);
if (zio == NULL)
return;
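Both callers above follow the same per-allocator pattern: take only that allocator's lock and consult only that allocator's tree, so allocators never contend with one another. A simplified sketch, illustrative only (what happens to the returned zio, i.e. re-entering the ZIO pipeline, is elided here):

/*
 * Illustrative sketch of the per-allocator dispatch pattern; each
 * allocator index has its own lock and its own tree of pending writes.
 */
static zio_t *
pop_allocatable_zio(spa_t *spa, int allocator)
{
	zio_t *zio;

	mutex_enter(&spa->spa_alloc_locks[allocator]);
	zio = zio_io_to_allocate(spa, allocator);
	mutex_exit(&spa->spa_alloc_locks[allocator]);
	return (zio);
}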
@ -3032,7 +3045,7 @@ zio_dva_allocate(zio_t *zio)

error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
- &zio->io_alloc_list, zio);
+ &zio->io_alloc_list, zio, zio->io_allocator);

if (error != 0) {
spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
@ -3092,8 +3105,8 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
* Try to allocate an intent log block. Return 0 on success, errno on failure.
*/
int
- zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
- uint64_t size, boolean_t *slog)
+ zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, blkptr_t *new_bp,
+ blkptr_t *old_bp, uint64_t size, boolean_t *slog)
{
int error = 1;
zio_alloc_list_t io_alloc_list;
@ -3101,14 +3114,22 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
ASSERT(txg > spa_syncing_txg(spa));

metaslab_trace_init(&io_alloc_list);
+ /*
+ * When allocating a zil block, we don't have information about
+ * the final destination of the block except the objset it's part
+ * of, so we just hash the objset ID to pick the allocator to get
+ * some parallelism.
+ */
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
- txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL);
+ txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL,
+ cityhash4(0, 0, 0, objset) % spa->spa_alloc_count);
if (error == 0) {
*slog = TRUE;
} else {
error = metaslab_alloc(spa, spa_normal_class(spa), size,
new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID,
- &io_alloc_list, NULL);
+ &io_alloc_list, NULL, cityhash4(0, 0, 0, objset) %
+ spa->spa_alloc_count);
if (error == 0)
*slog = FALSE;
}
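The ZIL path has only the objset ID to steer by, so the allocator choice is a stable hash of that ID: one dataset's log blocks stay with one allocator while different datasets fan out across all of them. A hypothetical helper (not part of the commit) capturing the expression used twice above:

/* Hypothetical helper mirroring the objset-hash allocator choice above. */
static int
zil_allocator_for_objset(spa_t *spa, uint64_t objset)
{
	return (cityhash4(0, 0, 0, objset) % spa->spa_alloc_count);
}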
@ -3678,8 +3699,8 @@ zio_ready(zio_t *zio)
*/
metaslab_class_throttle_unreserve(
spa_normal_class(zio->io_spa),
- zio->io_prop.zp_copies, zio);
- zio_allocate_dispatch(zio->io_spa);
+ zio->io_prop.zp_copies, zio->io_allocator, zio);
+ zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
}
}
@ -3762,18 +3783,19 @@ zio_dva_throttle_done(zio_t *zio)
ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);

mutex_enter(&pio->io_lock);
- metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
+ metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
+ pio->io_allocator, B_TRUE);
mutex_exit(&pio->io_lock);

metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
- 1, pio);
+ 1, pio->io_allocator, pio);

/*
* Call into the pipeline to see if there is more work that
* needs to be done. If there is work to be done it will be
* dispatched to another taskq thread.
*/
- zio_allocate_dispatch(zio->io_spa);
+ zio_allocate_dispatch(zio->io_spa, pio->io_allocator);
}

static int
@ -3816,8 +3838,10 @@ zio_done(zio_t *zio)
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(bp != NULL);
- metaslab_group_alloc_verify(spa, zio->io_bp, zio);
- VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio));
+ metaslab_group_alloc_verify(spa, zio->io_bp, zio,
+ zio->io_allocator);
+ VERIFY(refcount_not_held(&mc->mc_alloc_slots[zio->io_allocator],
+ zio));
}

for (int c = 0; c < ZIO_CHILD_TYPES; c++)