Update ZFS metaslab code from OpenSolaris.
This provides a noticeable write speedup, especially on pools with
less than 30% free space.

Detailed information (OpenSolaris onnv changesets and Bug IDs):

11146:7e58f40bcb1c
	6826241 Sync write IOPS drops dramatically during TXG sync
	6869229 zfs should switch to shiny new metaslabs more frequently

11728:59fdb3b856f6
	6918420 zdb -m has issues printing metaslab statistics

12047:7c1fcc8419ca
	6917066 zfs block picking can be improved

Approved by:	delphij (mentor)
Obtained from:	OpenSolaris (Bug ID 6826241, 6869229, 6918420, 6917066)
MFC after:	2 weeks
parent c87f1ad43c
commit abe5837f7c
@@ -491,35 +491,37 @@ dump_metaslab_stats(metaslab_t *msp)
 static void
 dump_metaslab(metaslab_t *msp)
 {
-	char freebuf[5];
-	space_map_obj_t *smo = &msp->ms_smo;
 	vdev_t *vd = msp->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
+	space_map_t *sm = &msp->ms_map;
+	space_map_obj_t *smo = &msp->ms_smo;
+	char freebuf[5];
 
-	nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf);
+	nicenum(sm->sm_size - smo->smo_alloc, freebuf);
 
 	(void) printf(
 	    "\tvdev %5llu   offset %12llx   spacemap %6llu   free    %5s\n",
-	    (u_longlong_t)vd->vdev_id, (u_longlong_t)msp->ms_map.sm_start,
-	    (u_longlong_t)smo->smo_object, freebuf);
+	    (u_longlong_t)(sm->sm_start / sm->sm_size),
+	    (u_longlong_t)sm->sm_start, (u_longlong_t)smo->smo_object, freebuf);
 
 	if (dump_opt['m'] > 1) {
 		mutex_enter(&msp->ms_lock);
-		VERIFY(space_map_load(&msp->ms_map, zfs_metaslab_ops,
-		    SM_FREE, &msp->ms_smo, spa->spa_meta_objset) == 0);
+		space_map_load_wait(sm);
+		if (!sm->sm_loaded)
+			VERIFY(space_map_load(sm, zfs_metaslab_ops,
+			    SM_FREE, smo, spa->spa_meta_objset) == 0);
 		dump_metaslab_stats(msp);
-		space_map_unload(&msp->ms_map);
+		space_map_unload(sm);
 		mutex_exit(&msp->ms_lock);
 	}
 
 	if (dump_opt['d'] > 5 || dump_opt['m'] > 2) {
-		ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
+		ASSERT(sm->sm_size == (1ULL << vd->vdev_ms_shift));
 
 		mutex_enter(&msp->ms_lock);
-		dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
+		dump_spacemap(spa->spa_meta_objset, smo, sm);
 		mutex_exit(&msp->ms_lock);
 	}
 }
 
 static void
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -37,7 +36,7 @@ uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
 /*
  * Minimum size which forces the dynamic allocator to change
- * it's allocation strategy. Once the space map cannot satisfy
+ * it's allocation strategy.  Once the space map cannot satisfy
  * an allocation of this size then it switches to using more
  * aggressive strategy (i.e search by size rather than offset).
  */
@@ -49,7 +48,23 @@ uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
  * Once the space_map's free space drops below this level we dynamically
  * switch to using best-fit allocations.
  */
-int metaslab_df_free_pct = 30;
+int metaslab_df_free_pct = 4;
+
+/*
+ * A metaslab is considered "free" if it contains a contiguous
+ * segment which is greater than metaslab_min_alloc_size.
+ */
+uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
+
+/*
+ * Max number of space_maps to prefetch.
+ */
+int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
+
+/*
+ * Percentage bonus multiplier for metaslabs that are in the bonus area.
+ */
+int metaslab_smo_bonus_pct = 150;
 
 /*
  * ==========================================================================
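The lowered metaslab_df_free_pct (30 -> 4) keeps the dynamic allocator in cheap cursor-based first-fit mode much longer; the other tunables feed the new "fragmented" test added further down in this diff. A rough userland sketch of the decision those tunables drive (the values and the helper name df_fragmented are illustrative, not part of the commit):

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel tunables above. */
static uint64_t metaslab_df_alloc_threshold = 128 * 1024;	/* ~SPA_MAXBLOCKSIZE */
static int metaslab_df_free_pct = 4;

/*
 * Mirrors the metaslab_df_fragmented() logic introduced later in this diff:
 * a space map stays in first-fit mode only while it can still satisfy a
 * threshold-sized allocation AND enough of it is free.
 */
static int
df_fragmented(uint64_t max_contig, uint64_t space_free, uint64_t space_total)
{
	int free_pct = (int)(space_free * 100 / space_total);

	if (max_contig >= metaslab_df_alloc_threshold &&
	    free_pct >= metaslab_df_free_pct)
		return (0);		/* healthy: keep first-fit */
	return (1);			/* fragmented: prefer best-fit */
}

int
main(void)
{
	/* 1 GB metaslab, ~3% free, largest run 64 KB: fragmented. */
	printf("%d\n", df_fragmented(64 << 10, 30ULL << 20, 1ULL << 30));
	/* 1 GB metaslab, 50% free, largest run 1 MB: healthy. */
	printf("%d\n", df_fragmented(1 << 20, 512ULL << 20, 1ULL << 30));
	return (0);
}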
@@ -218,6 +233,32 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 	mutex_exit(&mg->mg_lock);
 }
 
+/*
+ * ==========================================================================
+ * Common allocator routines
+ * ==========================================================================
+ */
+static int
+metaslab_segsize_compare(const void *x1, const void *x2)
+{
+	const space_seg_t *s1 = x1;
+	const space_seg_t *s2 = x2;
+	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
+	uint64_t ss_size2 = s2->ss_end - s2->ss_start;
+
+	if (ss_size1 < ss_size2)
+		return (-1);
+	if (ss_size1 > ss_size2)
+		return (1);
+
+	if (s1->ss_start < s2->ss_start)
+		return (-1);
+	if (s1->ss_start > s2->ss_start)
+		return (1);
+
+	return (0);
+}
+
 /*
  * This is a helper function that can be used by the allocator to find
  * a suitable block to allocate. This will search the specified AVL
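The size-then-offset comparator gives the size-sorted AVL tree a strict total order, so equal-sized segments are still distinguishable by offset. A minimal userland demonstration of the same ordering, using qsort and an array in place of the kernel AVL tree (all scaffolding here is hypothetical):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct {		/* stand-in for space_seg_t */
	uint64_t ss_start;
	uint64_t ss_end;
} seg_t;

/* Same ordering as metaslab_segsize_compare: by size, then by offset. */
static int
segsize_compare(const void *x1, const void *x2)
{
	const seg_t *s1 = x1, *s2 = x2;
	uint64_t sz1 = s1->ss_end - s1->ss_start;
	uint64_t sz2 = s2->ss_end - s2->ss_start;

	if (sz1 < sz2)
		return (-1);
	if (sz1 > sz2)
		return (1);
	if (s1->ss_start < s2->ss_start)
		return (-1);
	if (s1->ss_start > s2->ss_start)
		return (1);
	return (0);
}

int
main(void)
{
	seg_t segs[] = { { 100, 164 }, { 0, 32 }, { 200, 232 } };

	qsort(segs, 3, sizeof (seg_t), segsize_compare);
	/*
	 * Sorted: [0,32) [200,232) [100,164).  The largest segment sorts
	 * last, which is why metaslab_pp_maxsize() can just take avl_last().
	 */
	for (int i = 0; i < 3; i++)
		printf("[%ju, %ju)\n", (uintmax_t)segs[i].ss_start,
		    (uintmax_t)segs[i].ss_end);
	return (0);
}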
@@ -258,101 +299,8 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
 	return (metaslab_block_picker(t, cursor, size, align));
 }
 
-/*
- * ==========================================================================
- * The first-fit block allocator
- * ==========================================================================
- */
-static void
-metaslab_ff_load(space_map_t *sm)
-{
-	ASSERT(sm->sm_ppd == NULL);
-	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
-	sm->sm_pp_root = NULL;
-}
-
-static void
-metaslab_ff_unload(space_map_t *sm)
-{
-	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
-	sm->sm_ppd = NULL;
-}
-
-static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
-{
-	avl_tree_t *t = &sm->sm_root;
-	uint64_t align = size & -size;
-	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
-
-	return (metaslab_block_picker(t, cursor, size, align));
-}
-
-/* ARGSUSED */
-static void
-metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size)
-{
-	/* No need to update cursor */
-}
-
-/* ARGSUSED */
-static void
-metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size)
-{
-	/* No need to update cursor */
-}
-
-static space_map_ops_t metaslab_ff_ops = {
-	metaslab_ff_load,
-	metaslab_ff_unload,
-	metaslab_ff_alloc,
-	metaslab_ff_claim,
-	metaslab_ff_free,
-	NULL	/* maxsize */
-};
-
-/*
- * Dynamic block allocator -
- * Uses the first fit allocation scheme until space get low and then
- * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
- * and metaslab_df_free_pct to determine when to switch the allocation scheme.
- */
-
-uint64_t
-metaslab_df_maxsize(space_map_t *sm)
-{
-	avl_tree_t *t = sm->sm_pp_root;
-	space_seg_t *ss;
-
-	if (t == NULL || (ss = avl_last(t)) == NULL)
-		return (0ULL);
-
-	return (ss->ss_end - ss->ss_start);
-}
-
-static int
-metaslab_df_seg_compare(const void *x1, const void *x2)
-{
-	const space_seg_t *s1 = x1;
-	const space_seg_t *s2 = x2;
-	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
-	uint64_t ss_size2 = s2->ss_end - s2->ss_start;
-
-	if (ss_size1 < ss_size2)
-		return (-1);
-	if (ss_size1 > ss_size2)
-		return (1);
-
-	if (s1->ss_start < s2->ss_start)
-		return (-1);
-	if (s1->ss_start > s2->ss_start)
-		return (1);
-
-	return (0);
-}
-
 static void
-metaslab_df_load(space_map_t *sm)
+metaslab_pp_load(space_map_t *sm)
 {
 	space_seg_t *ss;
 
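Both the first-fit and dynamic allocators keep one rotor cursor per power-of-two alignment class, selected out of the 64-slot sm_ppd array via `size & -size` and highbit(). A self-contained illustration of that indexing; highbit() is reimplemented here since it is a Solaris kernel routine, and everything else is example scaffolding:

#include <stdint.h>
#include <stdio.h>

/* Position of the highest set bit, 1-based; 0 if no bits are set. */
static int
highbit(uint64_t v)
{
	int h = 0;

	while (v != 0) {
		h++;
		v >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint64_t cursors[64] = { 0 };	/* stand-in for sm->sm_ppd */
	uint64_t sizes[] = { 512, 4096, 131072 };

	for (int i = 0; i < 3; i++) {
		uint64_t size = sizes[i];
		uint64_t align = size & -size;	/* lowest set bit */
		uint64_t *cursor = cursors + highbit(align) - 1;

		/*
		 * align is the largest power of two dividing size, so each
		 * alignment class advances its own private cursor slot.
		 */
		printf("size %7ju -> cursor slot %d (value %ju)\n",
		    (uintmax_t)size, highbit(align) - 1, (uintmax_t)*cursor);
	}
	return (0);
}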
@@ -360,7 +308,7 @@ metaslab_df_load(space_map_t *sm)
 	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
 
 	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
-	avl_create(sm->sm_pp_root, metaslab_df_seg_compare,
+	avl_create(sm->sm_pp_root, metaslab_segsize_compare,
 	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
 
 	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
@@ -368,7 +316,7 @@ metaslab_df_load(space_map_t *sm)
 }
 
 static void
-metaslab_df_unload(space_map_t *sm)
+metaslab_pp_unload(space_map_t *sm)
 {
 	void *cookie = NULL;
 
@@ -384,13 +332,82 @@ metaslab_df_unload(space_map_t *sm)
 	sm->sm_pp_root = NULL;
 }
 
+/* ARGSUSED */
+static void
+metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	/* No need to update cursor */
+}
+
+/* ARGSUSED */
+static void
+metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	/* No need to update cursor */
+}
+
+/*
+ * Return the maximum contiguous segment within the metaslab.
+ */
+uint64_t
+metaslab_pp_maxsize(space_map_t *sm)
+{
+	avl_tree_t *t = sm->sm_pp_root;
+	space_seg_t *ss;
+
+	if (t == NULL || (ss = avl_last(t)) == NULL)
+		return (0ULL);
+
+	return (ss->ss_end - ss->ss_start);
+}
+
+/*
+ * ==========================================================================
+ * The first-fit block allocator
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+{
+	avl_tree_t *t = &sm->sm_root;
+	uint64_t align = size & -size;
+	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+
+	return (metaslab_block_picker(t, cursor, size, align));
+}
+
+/* ARGSUSED */
+boolean_t
+metaslab_ff_fragmented(space_map_t *sm)
+{
+	return (B_TRUE);
+}
+
+static space_map_ops_t metaslab_ff_ops = {
+	metaslab_pp_load,
+	metaslab_pp_unload,
+	metaslab_ff_alloc,
+	metaslab_pp_claim,
+	metaslab_pp_free,
+	metaslab_pp_maxsize,
+	metaslab_ff_fragmented
+};
+
+/*
+ * ==========================================================================
+ * Dynamic block allocator -
+ * Uses the first fit allocation scheme until space get low and then
+ * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
+ * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ * ==========================================================================
+ */
 static uint64_t
 metaslab_df_alloc(space_map_t *sm, uint64_t size)
 {
 	avl_tree_t *t = &sm->sm_root;
 	uint64_t align = size & -size;
 	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
-	uint64_t max_size = metaslab_df_maxsize(sm);
+	uint64_t max_size = metaslab_pp_maxsize(sm);
 	int free_pct = sm->sm_space * 100 / sm->sm_size;
 
 	ASSERT(MUTEX_HELD(sm->sm_lock));
@@ -412,30 +429,158 @@ metaslab_df_alloc(space_map_t *sm, uint64_t size)
 	return (metaslab_block_picker(t, cursor, size, 1ULL));
 }
 
-/* ARGSUSED */
-static void
-metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size)
+static boolean_t
+metaslab_df_fragmented(space_map_t *sm)
 {
-	/* No need to update cursor */
-}
+	uint64_t max_size = metaslab_pp_maxsize(sm);
+	int free_pct = sm->sm_space * 100 / sm->sm_size;
 
-/* ARGSUSED */
-static void
-metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size)
-{
-	/* No need to update cursor */
+	if (max_size >= metaslab_df_alloc_threshold &&
+	    free_pct >= metaslab_df_free_pct)
+		return (B_FALSE);
+
+	return (B_TRUE);
 }
 
 static space_map_ops_t metaslab_df_ops = {
-	metaslab_df_load,
-	metaslab_df_unload,
+	metaslab_pp_load,
+	metaslab_pp_unload,
 	metaslab_df_alloc,
-	metaslab_df_claim,
-	metaslab_df_free,
-	metaslab_df_maxsize
+	metaslab_pp_claim,
+	metaslab_pp_free,
+	metaslab_pp_maxsize,
+	metaslab_df_fragmented
 };
 
-space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+/*
+ * ==========================================================================
+ * Other experimental allocators
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
+{
+	avl_tree_t *t = &sm->sm_root;
+	uint64_t *cursor = (uint64_t *)sm->sm_ppd;
+	uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
+	uint64_t max_size = metaslab_pp_maxsize(sm);
+	uint64_t rsize = size;
+	uint64_t offset = 0;
+
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+
+	if (max_size < size)
+		return (-1ULL);
+
+	ASSERT3U(*extent_end, >=, *cursor);
+
+	/*
+	 * If we're running low on space switch to using the size
+	 * sorted AVL tree (best-fit).
+	 */
+	if ((*cursor + size) > *extent_end) {
+
+		t = sm->sm_pp_root;
+		*cursor = *extent_end = 0;
+
+		if (max_size > 2 * SPA_MAXBLOCKSIZE)
+			rsize = MIN(metaslab_min_alloc_size, max_size);
+		offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
+		if (offset != -1)
+			*cursor = offset + size;
+	} else {
+		offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
+	}
+	ASSERT3U(*cursor, <=, *extent_end);
+	return (offset);
+}
+
+static boolean_t
+metaslab_cdf_fragmented(space_map_t *sm)
+{
+	uint64_t max_size = metaslab_pp_maxsize(sm);
+
+	if (max_size > (metaslab_min_alloc_size * 10))
+		return (B_FALSE);
+	return (B_TRUE);
+}
+
+static space_map_ops_t metaslab_cdf_ops = {
+	metaslab_pp_load,
+	metaslab_pp_unload,
+	metaslab_cdf_alloc,
+	metaslab_pp_claim,
+	metaslab_pp_free,
+	metaslab_pp_maxsize,
+	metaslab_cdf_fragmented
+};
+
+uint64_t metaslab_ndf_clump_shift = 4;
+
+static uint64_t
+metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
+{
+	avl_tree_t *t = &sm->sm_root;
+	avl_index_t where;
+	space_seg_t *ss, ssearch;
+	uint64_t hbit = highbit(size);
+	uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
+	uint64_t max_size = metaslab_pp_maxsize(sm);
+
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+
+	if (max_size < size)
+		return (-1ULL);
+
+	ssearch.ss_start = *cursor;
+	ssearch.ss_end = *cursor + size;
+
+	ss = avl_find(t, &ssearch, &where);
+	if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
+		t = sm->sm_pp_root;
+
+		ssearch.ss_start = 0;
+		ssearch.ss_end = MIN(max_size,
+		    1ULL << (hbit + metaslab_ndf_clump_shift));
+		ss = avl_find(t, &ssearch, &where);
+		if (ss == NULL)
+			ss = avl_nearest(t, where, AVL_AFTER);
+		ASSERT(ss != NULL);
+	}
+
+	if (ss != NULL) {
+		if (ss->ss_start + size <= ss->ss_end) {
+			*cursor = ss->ss_start + size;
+			return (ss->ss_start);
+		}
+	}
+	return (-1ULL);
+}
+
+static boolean_t
+metaslab_ndf_fragmented(space_map_t *sm)
+{
+	uint64_t max_size = metaslab_pp_maxsize(sm);
+
+	if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
+		return (B_FALSE);
+	return (B_TRUE);
+}
+
+
+static space_map_ops_t metaslab_ndf_ops = {
+	metaslab_pp_load,
+	metaslab_pp_unload,
+	metaslab_ndf_alloc,
+	metaslab_pp_claim,
+	metaslab_pp_free,
+	metaslab_pp_maxsize,
+	metaslab_ndf_fragmented
+};
+
+space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
 
 /*
  * ==========================================================================
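All of the allocators now share the metaslab_pp_* routines and plug only their own alloc and fragmented hooks into space_map_ops_t, with a single global pointer (zfs_metaslab_ops) selecting the active policy. A compact userland sketch of that vtable pattern, with trimmed-down types and hypothetical names:

#include <stdint.h>
#include <stdio.h>

typedef struct space_map space_map_t;	/* opaque for this sketch */

/* Trimmed-down space_map_ops_t: only the per-policy hooks are shown. */
typedef struct {
	uint64_t (*smop_alloc)(space_map_t *, uint64_t);
	int	 (*smop_fragmented)(space_map_t *);
} sm_ops_t;

static uint64_t
ff_alloc(space_map_t *sm, uint64_t sz) { (void)sm; (void)sz; return (0); }
static int
ff_fragmented(space_map_t *sm) { (void)sm; return (1); }

static uint64_t
ndf_alloc(space_map_t *sm, uint64_t sz) { (void)sm; (void)sz; return (0); }
static int
ndf_fragmented(space_map_t *sm) { (void)sm; return (0); }

static sm_ops_t ff_ops = { ff_alloc, ff_fragmented };
static sm_ops_t ndf_ops = { ndf_alloc, ndf_fragmented };

/* Like zfs_metaslab_ops: one global pointer selects the active policy. */
static sm_ops_t *metaslab_ops = &ndf_ops;

int
main(void)
{
	/* Callers never name a policy; they go through the ops pointer. */
	printf("fragmented? %d\n", metaslab_ops->smop_fragmented(NULL));
	metaslab_ops = &ff_ops;		/* swapping policies is one store */
	printf("fragmented? %d\n", metaslab_ops->smop_fragmented(NULL));
	return (0);
}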
@@ -522,7 +667,6 @@ metaslab_fini(metaslab_t *msp)
 #define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
 #define	METASLAB_ACTIVE_MASK		\
 	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
-#define	METASLAB_SMO_BONUS_MULTIPLIER	2
 
 static uint64_t
 metaslab_weight(metaslab_t *msp)
@@ -555,25 +699,60 @@ metaslab_weight(metaslab_t *msp)
 	ASSERT(weight >= space && weight <= 2 * space);
 
 	/*
-	 * For locality, assign higher weight to metaslabs we've used before.
+	 * For locality, assign higher weight to metaslabs which have
+	 * a lower offset than what we've already activated.
 	 */
-	if (smo->smo_object != 0)
-		weight *= METASLAB_SMO_BONUS_MULTIPLIER;
+	if (sm->sm_start <= mg->mg_bonus_area)
+		weight *= (metaslab_smo_bonus_pct / 100);
 	ASSERT(weight >= space &&
-	    weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);
+	    weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
 
+	if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
+		/*
+		 * If this metaslab is one we're actively using, adjust its
+		 * weight to make it preferable to any inactive metaslab so
+		 * we'll polish it off.
+		 */
+		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+	}
+	return (weight);
+}
+
+static void
+metaslab_prefetch(metaslab_group_t *mg)
+{
+	spa_t *spa = mg->mg_vd->vdev_spa;
+	metaslab_t *msp;
+	avl_tree_t *t = &mg->mg_metaslab_tree;
+	int m;
+
+	mutex_enter(&mg->mg_lock);
+
 	/*
-	 * If this metaslab is one we're actively using, adjust its weight to
-	 * make it preferable to any inactive metaslab so we'll polish it off.
+	 * Prefetch the next potential metaslabs
 	 */
-	weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+	for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
+		space_map_t *sm = &msp->ms_map;
+		space_map_obj_t *smo = &msp->ms_smo;
 
-	return (weight);
+		/* If we have reached our prefetch limit then we're done */
+		if (m >= metaslab_prefetch_limit)
+			break;
+
+		if (!sm->sm_loaded && smo->smo_object != 0) {
+			mutex_exit(&mg->mg_lock);
+			dmu_prefetch(spa->spa_meta_objset, smo->smo_object,
+			    0ULL, smo->smo_objsize);
+			mutex_enter(&mg->mg_lock);
+		}
+	}
+	mutex_exit(&mg->mg_lock);
 }
 
 static int
 metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
 {
+	metaslab_group_t *mg = msp->ms_group;
 	space_map_t *sm = &msp->ms_map;
 	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
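metaslab_weight() packs its whole policy into one 64-bit value: the low bits carry the space- and locality-derived weight, while the top bits (METASLAB_WEIGHT_SECONDARY is 1ULL << 62 above; PRIMARY is not shown in this hunk and is assumed here to be 1ULL << 63, its upstream value) mark active metaslabs so they always sort above inactive ones. A small hedged sketch of that encoding:

#include <stdint.h>
#include <stdio.h>

#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)	/* assumed, see note */
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)

static int metaslab_smo_bonus_pct = 150;	/* tunable from the diff */

/* Toy weight: free space, bonus-area multiplier, then active bits. */
static uint64_t
weight(uint64_t space, int in_bonus_area, uint64_t prev_weight, int keep_active)
{
	uint64_t w = space;

	if (in_bonus_area)
		w *= (metaslab_smo_bonus_pct / 100);	/* integer divide, as upstream */
	if (keep_active)
		w |= (prev_weight & METASLAB_ACTIVE_MASK);
	return (w);
}

int
main(void)
{
	uint64_t active = weight(1 << 20, 1, METASLAB_WEIGHT_PRIMARY, 1);
	uint64_t idle = weight(16 << 20, 1, 0, 0);

	/* The active metaslab wins despite having far less free space. */
	printf("%d\n", active > idle);
	return (0);
}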
@@ -587,6 +766,15 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
 		return (error);
 	}
 
+	/*
+	 * Track the bonus area as we activate new metaslabs.
+	 */
+	if (sm->sm_start > mg->mg_bonus_area) {
+		mutex_enter(&mg->mg_lock);
+		mg->mg_bonus_area = sm->sm_start;
+		mutex_exit(&mg->mg_lock);
+	}
+
 	/*
 	 * If we were able to load the map then make sure
 	 * that this map is still able to satisfy our request.
|
||||
mutex_exit(&msp->ms_lock);
|
||||
}
|
||||
|
||||
void
|
||||
metaslab_sync_reassess(metaslab_group_t *mg)
|
||||
{
|
||||
vdev_t *vd = mg->mg_vd;
|
||||
|
||||
/*
|
||||
* Re-evaluate all metaslabs which have lower offsets than the
|
||||
* bonus area.
|
||||
*/
|
||||
for (int m = 0; m < vd->vdev_ms_count; m++) {
|
||||
metaslab_t *msp = vd->vdev_ms[m];
|
||||
|
||||
if (msp->ms_map.sm_start > mg->mg_bonus_area)
|
||||
break;
|
||||
|
||||
mutex_enter(&msp->ms_lock);
|
||||
metaslab_group_sort(mg, msp, metaslab_weight(msp));
|
||||
mutex_exit(&msp->ms_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Prefetch the next potential metaslabs
|
||||
*/
|
||||
metaslab_prefetch(mg);
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
metaslab_distance(metaslab_t *msp, dva_t *dva)
|
||||
{
|
||||
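metaslab_prefetch() drops mg_lock around dmu_prefetch() because the prefetch may issue I/O and must not be called with the group lock held; the loop re-takes the lock before probing the next entry and is bounded by metaslab_prefetch_limit. A generic pthreads sketch of that drop-relock pattern; all of the scaffolding here is illustrative, not from the commit:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int items[4] = { 1, 2, 3, 4 };
static const int prefetch_limit = 3;	/* like metaslab_prefetch_limit */

static void
slow_prefetch(int item)
{
	/* Stands in for dmu_prefetch(): may block, so no locks held. */
	usleep(1000);
	printf("prefetched %d\n", item);
}

static void
prefetch_all(void)
{
	pthread_mutex_lock(&lock);
	for (int m = 0; m < 4; m++) {
		if (m >= prefetch_limit)	/* bounded, like the kernel loop */
			break;
		int item = items[m];		/* read state under the lock */

		pthread_mutex_unlock(&lock);	/* drop across the blocking call */
		slow_prefetch(item);
		pthread_mutex_lock(&lock);	/* re-acquire before the next probe */
	}
	pthread_mutex_unlock(&lock);
}

int
main(void)
{
	prefetch_all();
	return (0);
}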
@@ -868,7 +1082,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
 			break;
 
-		metaslab_passivate(msp, size - 1);
+		metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
 
 		mutex_exit(&msp->ms_lock);
 	}
@@ -74,35 +74,38 @@ enum zti_modes {
 	zti_mode_fixed,			/* value is # of threads (min 1) */
 	zti_mode_online_percent,	/* value is % of online CPUs */
 	zti_mode_tune,			/* fill from zio_taskq_tune_* */
+	zti_mode_null,			/* don't create a taskq */
 	zti_nmodes
 };
 
-#define	ZTI_THREAD_FIX(n)	{ zti_mode_fixed, (n) }
-#define	ZTI_THREAD_PCT(n)	{ zti_mode_online_percent, (n) }
-#define	ZTI_THREAD_TUNE		{ zti_mode_tune, 0 }
+#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
+#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
+#define	ZTI_TUNE	{ zti_mode_tune, 0 }
+#define	ZTI_NULL	{ zti_mode_null, 0 }
 
-#define	ZTI_THREAD_ONE		ZTI_THREAD_FIX(1)
+#define	ZTI_ONE		ZTI_FIX(1)
 
 typedef struct zio_taskq_info {
-	const char *zti_name;
-	struct {
-		enum zti_modes zti_mode;
-		uint_t zti_value;
-	} zti_nthreads[ZIO_TASKQ_TYPES];
+	enum zti_modes zti_mode;
+	uint_t zti_value;
 } zio_taskq_info_t;
 
 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
-	"issue", "intr"
+	"issue", "issue_high", "intr", "intr_high"
 };
 
-const zio_taskq_info_t zio_taskqs[ZIO_TYPES] = {
-	/*			ISSUE		INTR */
-	{ "spa_zio_null",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
-	{ "spa_zio_read",	{ ZTI_THREAD_FIX(8),	ZTI_THREAD_TUNE } },
-	{ "spa_zio_write",	{ ZTI_THREAD_TUNE,	ZTI_THREAD_FIX(8) } },
-	{ "spa_zio_free",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
-	{ "spa_zio_claim",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
-	{ "spa_zio_ioctl",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
+/*
+ * Define the taskq threads for the following I/O types:
+ * NULL, READ, WRITE, FREE, CLAIM, and IOCTL
+ */
+const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
+	/*	ISSUE		ISSUE_HIGH	INTR		INTR_HIGH */
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
+	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_TUNE,	ZTI_NULL },
+	{ ZTI_TUNE,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
 };
 
 enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
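The per-type taskq table becomes two-dimensional: rows are I/O types, columns are the four queue flavors, and ZTI_NULL entries mean "create no taskq". A hedged userland sketch of how spa_activate() walks such a table (enum values, row count, and names are local to this example):

#include <stdio.h>

enum zti_modes { zti_mode_fixed, zti_mode_null };
enum { Q_ISSUE, Q_ISSUE_HIGH, Q_INTR, Q_INTR_HIGH, Q_TYPES };
enum { T_NULL, T_READ, T_WRITE, T_TYPES };

typedef struct { enum zti_modes mode; unsigned value; } ztinfo_t;

#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_NULL	{ zti_mode_null, 0 }
#define	ZTI_ONE		ZTI_FIX(1)

static const char *qnames[Q_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};
static const char *tnames[T_TYPES] = { "zio_null", "zio_read", "zio_write" };

/* Three rows in the shape of the real zio_taskqs table. */
static const ztinfo_t taskqs[T_TYPES][Q_TYPES] = {
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_FIX(8),	ZTI_NULL },
	{ ZTI_FIX(8),	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
};

int
main(void)
{
	for (int t = 0; t < T_TYPES; t++)
		for (int q = 0; q < Q_TYPES; q++) {
			const ztinfo_t *zti = &taskqs[t][q];

			if (zti->mode == zti_mode_null)
				continue;	/* spa_zio_taskq[t][q] = NULL */
			printf("create %s_%s with %u threads\n",
			    tnames[t], qnames[q], zti->value);
		}
	return (0);
}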
@@ -581,14 +584,14 @@ spa_activate(spa_t *spa, int mode)
 	spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
 
 	for (int t = 0; t < ZIO_TYPES; t++) {
-		const zio_taskq_info_t *ztip = &zio_taskqs[t];
 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
-			enum zti_modes mode = ztip->zti_nthreads[q].zti_mode;
-			uint_t value = ztip->zti_nthreads[q].zti_value;
+			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
+			enum zti_modes mode = ztip->zti_mode;
+			uint_t value = ztip->zti_value;
 			char name[32];
 
 			(void) snprintf(name, sizeof (name),
-			    "%s_%s", ztip->zti_name, zio_taskq_types[q]);
+			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);
 
 			if (mode == zti_mode_tune) {
 				mode = zio_taskq_tune_mode;
@@ -613,6 +616,10 @@ spa_activate(spa_t *spa, int mode)
 				    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
 				break;
 
+			case zti_mode_null:
+				spa->spa_zio_taskq[t][q] = NULL;
+				break;
+
 			case zti_mode_tune:
 			default:
 				panic("unrecognized mode for "
@@ -659,7 +666,8 @@ spa_deactivate(spa_t *spa)
 
 	for (int t = 0; t < ZIO_TYPES; t++) {
 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
-			taskq_destroy(spa->spa_zio_taskq[t][q]);
+			if (spa->spa_zio_taskq[t][q] != NULL)
+				taskq_destroy(spa->spa_zio_taskq[t][q]);
 			spa->spa_zio_taskq[t][q] = NULL;
 		}
 	}
@@ -368,10 +368,8 @@ space_map_unload(space_map_t *sm)
 uint64_t
 space_map_maxsize(space_map_t *sm)
 {
-	if (sm->sm_loaded && sm->sm_ops != NULL)
-		return (sm->sm_ops->smop_max(sm));
-	else
-		return (-1ULL);
+	ASSERT(sm->sm_ops != NULL);
+	return (sm->sm_ops->smop_max(sm));
 }
 
 uint64_t
@@ -46,6 +46,7 @@ extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
 extern void metaslab_fini(metaslab_t *msp);
 extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
 extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
+extern void metaslab_sync_reassess(metaslab_group_t *mg);
 
 #define	METASLAB_HINTBP_FAVOR	0x0
 #define	METASLAB_HINTBP_AVOID	0x1
@@ -46,6 +46,7 @@ struct metaslab_group {
 	kmutex_t		mg_lock;
 	avl_tree_t		mg_metaslab_tree;
 	uint64_t		mg_aliquot;
+	uint64_t		mg_bonus_area;
 	int64_t			mg_bias;
 	metaslab_class_t	*mg_class;
 	vdev_t			*mg_vd;
@@ -87,7 +87,9 @@ typedef enum spa_log_state {
 
 enum zio_taskq_type {
 	ZIO_TASKQ_ISSUE = 0,
+	ZIO_TASKQ_ISSUE_HIGH,
 	ZIO_TASKQ_INTERRUPT,
+	ZIO_TASKQ_INTERRUPT_HIGH,
 	ZIO_TASKQ_TYPES
 };
 
@@ -77,6 +77,7 @@ struct space_map_ops {
 	void	(*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
 	void	(*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
 	uint64_t (*smop_max)(space_map_t *sm);
+	boolean_t (*smop_fragmented)(space_map_t *sm);
 };
 
 /*
|
@ -107,14 +107,15 @@ enum zio_compress {
|
||||
#define ZIO_PRIORITY_NOW (zio_priority_table[0])
|
||||
#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1])
|
||||
#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2])
|
||||
#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[3])
|
||||
#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4])
|
||||
#define ZIO_PRIORITY_FREE (zio_priority_table[5])
|
||||
#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6])
|
||||
#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7])
|
||||
#define ZIO_PRIORITY_RESILVER (zio_priority_table[8])
|
||||
#define ZIO_PRIORITY_SCRUB (zio_priority_table[9])
|
||||
#define ZIO_PRIORITY_TABLE_SIZE 10
|
||||
#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[3])
|
||||
#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[4])
|
||||
#define ZIO_PRIORITY_AGG (zio_priority_table[5])
|
||||
#define ZIO_PRIORITY_FREE (zio_priority_table[6])
|
||||
#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[7])
|
||||
#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[8])
|
||||
#define ZIO_PRIORITY_RESILVER (zio_priority_table[9])
|
||||
#define ZIO_PRIORITY_SCRUB (zio_priority_table[10])
|
||||
#define ZIO_PRIORITY_TABLE_SIZE 11
|
||||
|
||||
#define ZIO_FLAG_MUSTSUCCEED 0x00000
|
||||
#define ZIO_FLAG_CANFAIL 0x00001
|
||||
|
@@ -1773,9 +1773,13 @@ void
 vdev_sync_done(vdev_t *vd, uint64_t txg)
 {
 	metaslab_t *msp;
+	boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
 
 	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
 		metaslab_sync_done(msp, txg);
+
+	if (reassess)
+		metaslab_sync_reassess(vd->vdev_mg);
 }
 
 void
@@ -233,7 +233,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
 	ASSERT(size <= zfs_vdev_aggregation_limit);
 
 	aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
-	    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW,
+	    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
 	    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
 	    vdev_queue_agg_io_done, NULL);
 
@@ -49,11 +49,12 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
 	0,	/* ZIO_PRIORITY_NOW */
 	0,	/* ZIO_PRIORITY_SYNC_READ */
 	0,	/* ZIO_PRIORITY_SYNC_WRITE */
-	6,	/* ZIO_PRIORITY_ASYNC_READ */
-	4,	/* ZIO_PRIORITY_ASYNC_WRITE */
-	4,	/* ZIO_PRIORITY_FREE */
-	0,	/* ZIO_PRIORITY_CACHE_FILL */
 	0,	/* ZIO_PRIORITY_LOG_WRITE */
+	1,	/* ZIO_PRIORITY_CACHE_FILL */
+	1,	/* ZIO_PRIORITY_AGG */
+	4,	/* ZIO_PRIORITY_FREE */
+	4,	/* ZIO_PRIORITY_ASYNC_WRITE */
+	6,	/* ZIO_PRIORITY_ASYNC_READ */
 	10,	/* ZIO_PRIORITY_RESILVER */
 	20,	/* ZIO_PRIORITY_SCRUB */
 };
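zio_priority_table maps each symbolic priority to a small integer: lower values are serviced sooner, and the new ZIO_PRIORITY_AGG slot gives aggregated I/O a near-immediate priority of 1 instead of reusing NOW. A toy illustration of how such a table orders pending I/Os; the queue model is invented for the example (in ZFS the value actually feeds a deadline computation in vdev_queue):

#include <stdio.h>
#include <stdlib.h>

/* Mirrors the new table ordering: index is the symbolic priority. */
static const unsigned char prio_table[] = {
	0,	/* NOW */
	0,	/* SYNC_READ */
	0,	/* SYNC_WRITE */
	0,	/* LOG_WRITE */
	1,	/* CACHE_FILL */
	1,	/* AGG */
	4,	/* FREE */
	4,	/* ASYNC_WRITE */
	6,	/* ASYNC_READ */
	10,	/* RESILVER */
	20,	/* SCRUB */
};

typedef struct { const char *name; int prio; } io_t;

static int
by_deadline(const void *a, const void *b)
{
	/* Smaller table value = earlier effective deadline. */
	return (prio_table[((const io_t *)a)->prio] -
	    prio_table[((const io_t *)b)->prio]);
}

int
main(void)
{
	io_t ios[] = { { "scrub", 10 }, { "agg", 5 }, { "async_read", 8 } };

	qsort(ios, 3, sizeof (io_t), by_deadline);
	for (int i = 0; i < 3; i++)	/* prints: agg, async_read, scrub */
		printf("%s\n", ios[i].name);
	return (0);
}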
@@ -64,7 +65,9 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
  * ==========================================================================
  */
 char *zio_type_name[ZIO_TYPES] = {
-	"null", "read", "write", "free", "claim", "ioctl" };
+	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
+	"zio_ioctl"
+};
 
 #define	SYNC_PASS_DEFERRED_FREE	1	/* defer frees after this pass */
 #define	SYNC_PASS_DONT_COMPRESS	4	/* don't compress after this pass */
@@ -942,6 +945,7 @@ zio_write_bp_init(zio_t *zio)
 static void
 zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
 {
+	spa_t *spa = zio->io_spa;
 	zio_type_t t = zio->io_type;
 
 	/*
@@ -958,7 +962,15 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
 	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
 		t = ZIO_TYPE_NULL;
 
-	(void) taskq_dispatch_safe(zio->io_spa->spa_zio_taskq[t][q],
+	/*
+	 * If this is a high priority I/O, then use the high priority taskq.
+	 */
+	if (zio->io_priority == ZIO_PRIORITY_NOW &&
+	    spa->spa_zio_taskq[t][q + 1] != NULL)
+		q++;
+
+	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
+	(void) taskq_dispatch_safe(spa->spa_zio_taskq[t][q],
 	    (task_func_t *)zio_execute, zio, &zio->io_task);
 }
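The dispatch path now bumps a ZIO_PRIORITY_NOW I/O from the regular queue to its _high sibling when one exists; because regular and high-priority queue indices are adjacent in the enum, "use the high-priority taskq" is simply q + 1. A hedged sketch of that convention (names and data are illustrative):

#include <assert.h>
#include <stdio.h>

enum { Q_ISSUE, Q_ISSUE_HIGH, Q_INTR, Q_INTR_HIGH, Q_TYPES };

static const char *taskq[Q_TYPES] = {
	"write_issue", "write_issue_high", "write_intr", "write_intr_high",
};

#define	PRIORITY_NOW	0

/*
 * Mirror of the zio_taskq_dispatch() bump: callers only ever pass a
 * regular queue index, so q + 1 names its high-priority sibling.
 */
static const char *
dispatch(int priority, int q)
{
	if (priority == PRIORITY_NOW && taskq[q + 1] != NULL)
		q++;
	assert(q < Q_TYPES);
	return (taskq[q]);
}

int
main(void)
{
	printf("%s\n", dispatch(PRIORITY_NOW, Q_ISSUE));	/* write_issue_high */
	printf("%s\n", dispatch(4, Q_ISSUE));			/* write_issue */
	return (0);
}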