MFC r258633: MFV r255256: 3954 metaslabs continue to load even after
hitting zfs_mg_alloc_failure limit
This commit is contained in:
parent
26096ba436
commit
926bf5d727
@ -65,7 +65,8 @@ int zfs_condense_pct = 200;
|
|||||||
/*
|
/*
|
||||||
* This value defines the number of allowed allocation failures per vdev.
|
* This value defines the number of allowed allocation failures per vdev.
|
||||||
* If a device reaches this threshold in a given txg then we consider skipping
|
* If a device reaches this threshold in a given txg then we consider skipping
|
||||||
* allocations on that device.
|
* allocations on that device. The value of zfs_mg_alloc_failures is computed
|
||||||
|
* in zio_init() unless it has been overridden in /etc/system.
|
||||||
*/
|
*/
|
||||||
int zfs_mg_alloc_failures = 0;
|
int zfs_mg_alloc_failures = 0;
|
||||||
TUNABLE_INT("vfs.zfs.mg_alloc_failures", &zfs_mg_alloc_failures);
|
TUNABLE_INT("vfs.zfs.mg_alloc_failures", &zfs_mg_alloc_failures);
|
||||||
@ -73,6 +74,21 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_alloc_failures, CTLFLAG_RWTUN,
|
|||||||
&zfs_mg_alloc_failures, 0,
|
&zfs_mg_alloc_failures, 0,
|
||||||
"Number of allowed allocation failures per vdev");
|
"Number of allowed allocation failures per vdev");
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The zfs_mg_noalloc_threshold defines which metaslab groups should
|
||||||
|
* be eligible for allocation. The value is defined as a percentage of
|
||||||
|
* free space. Metaslab groups that have more free space than
|
||||||
|
* zfs_mg_noalloc_threshold are always eligible for allocations. Once
|
||||||
|
* a metaslab group's free space is less than or equal to the
|
||||||
|
* zfs_mg_noalloc_threshold the allocator will avoid allocating to that
|
||||||
|
* group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
|
||||||
|
* Once all groups in the pool reach zfs_mg_noalloc_threshold then all
|
||||||
|
* groups are allowed to accept allocations. Gang blocks are always
|
||||||
|
* eligible to allocate on any metaslab group. The default value of 0 means
|
||||||
|
* no metaslab group will be excluded based on this criterion.
|
||||||
|
*/
|
||||||
|
int zfs_mg_noalloc_threshold = 0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Metaslab debugging: when set, keeps all space maps in core to verify frees.
|
* Metaslab debugging: when set, keeps all space maps in core to verify frees.
|
||||||
*/
|
*/
|
||||||
@ -289,6 +305,53 @@ metaslab_compare(const void *x1, const void *x2)
|
|||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Update the allocatable flag and the metaslab group's capacity.
|
||||||
|
* The allocatable flag is set to true if the free capacity is above
|
||||||
|
* the zfs_mg_noalloc_threshold. If a metaslab group transitions
|
||||||
|
* from allocatable to non-allocatable or vice versa then the metaslab
|
||||||
|
* group's class is updated to reflect the transition.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
metaslab_group_alloc_update(metaslab_group_t *mg)
|
||||||
|
{
|
||||||
|
vdev_t *vd = mg->mg_vd;
|
||||||
|
metaslab_class_t *mc = mg->mg_class;
|
||||||
|
vdev_stat_t *vs = &vd->vdev_stat;
|
||||||
|
boolean_t was_allocatable;
|
||||||
|
|
||||||
|
ASSERT(vd == vd->vdev_top);
|
||||||
|
|
||||||
|
mutex_enter(&mg->mg_lock);
|
||||||
|
was_allocatable = mg->mg_allocatable;
|
||||||
|
|
||||||
|
mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
|
||||||
|
(vs->vs_space + 1);
|
||||||
|
|
||||||
|
mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The mc_alloc_groups maintains a count of the number of
|
||||||
|
* groups in this metaslab class that are still above the
|
||||||
|
* zfs_mg_noalloc_threshold. This is used by the allocating
|
||||||
|
* threads to determine if they should avoid allocations to
|
||||||
|
* a given group. The allocator will avoid allocations to a group
|
||||||
|
* if that group has reached or is below the zfs_mg_noalloc_threshold
|
||||||
|
* and there are still other groups that are above the threshold.
|
||||||
|
* When a group transitions from allocatable to non-allocatable or
|
||||||
|
* vice versa we update the metaslab class to reflect that change.
|
||||||
|
* When the mc_alloc_groups value drops to 0 that means that all
|
||||||
|
* groups have reached the zfs_mg_noalloc_threshold making all groups
|
||||||
|
* eligible for allocations. This effectively means that all devices
|
||||||
|
* are balanced again.
|
||||||
|
*/
|
||||||
|
if (was_allocatable && !mg->mg_allocatable)
|
||||||
|
mc->mc_alloc_groups--;
|
||||||
|
else if (!was_allocatable && mg->mg_allocatable)
|
||||||
|
mc->mc_alloc_groups++;
|
||||||
|
mutex_exit(&mg->mg_lock);
|
||||||
|
}
|
||||||
|
|
||||||
metaslab_group_t *
|
metaslab_group_t *
|
||||||
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
|
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
|
||||||
{
|
{
|
||||||
@ -339,6 +402,7 @@ metaslab_group_activate(metaslab_group_t *mg)
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
|
mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
|
||||||
|
metaslab_group_alloc_update(mg);
|
||||||
|
|
||||||
if ((mgprev = mc->mc_rotor) == NULL) {
|
if ((mgprev = mc->mc_rotor) == NULL) {
|
||||||
mg->mg_prev = mg;
|
mg->mg_prev = mg;
|
||||||
@ -425,6 +489,29 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
|
|||||||
mutex_exit(&mg->mg_lock);
|
mutex_exit(&mg->mg_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Determine if a given metaslab group should skip allocations. A metaslab
|
||||||
|
* group should avoid allocations if its free capacity is at or below the
|
||||||
|
* zfs_mg_noalloc_threshold and there is at least one metaslab group
|
||||||
|
* that can still handle allocations.
|
||||||
|
*/
|
||||||
|
static boolean_t
|
||||||
|
metaslab_group_allocatable(metaslab_group_t *mg)
|
||||||
|
{
|
||||||
|
vdev_t *vd = mg->mg_vd;
|
||||||
|
spa_t *spa = vd->vdev_spa;
|
||||||
|
metaslab_class_t *mc = mg->mg_class;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* A metaslab group is considered allocatable if its free capacity
|
||||||
|
* is greater than the set value of zfs_mg_noalloc_threshold, it's
|
||||||
|
* associated with a slog, or there are no other metaslab groups
|
||||||
|
* with free capacity greater than zfs_mg_noalloc_threshold.
|
||||||
|
*/
|
||||||
|
return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
|
||||||
|
mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* ==========================================================================
|
* ==========================================================================
|
||||||
* Common allocator routines
|
* Common allocator routines
|
||||||
@ -1374,6 +1461,8 @@ metaslab_sync_reassess(metaslab_group_t *mg)
|
|||||||
vdev_t *vd = mg->mg_vd;
|
vdev_t *vd = mg->mg_vd;
|
||||||
int64_t failures = mg->mg_alloc_failures;
|
int64_t failures = mg->mg_alloc_failures;
|
||||||
|
|
||||||
|
metaslab_group_alloc_update(mg);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Re-evaluate all metaslabs which have lower offsets than the
|
* Re-evaluate all metaslabs which have lower offsets than the
|
||||||
* bonus area.
|
* bonus area.
|
||||||
@ -1475,6 +1564,8 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
|
|||||||
if (msp == NULL)
|
if (msp == NULL)
|
||||||
return (-1ULL);
|
return (-1ULL);
|
||||||
|
|
||||||
|
mutex_enter(&msp->ms_lock);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If we've already reached the allowable number of failed
|
* If we've already reached the allowable number of failed
|
||||||
* allocation attempts on this metaslab group then we
|
* allocation attempts on this metaslab group then we
|
||||||
@ -1491,11 +1582,10 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
|
|||||||
"asize %llu, failures %llu", spa_name(spa),
|
"asize %llu, failures %llu", spa_name(spa),
|
||||||
mg->mg_vd->vdev_id, txg, mg, psize, asize,
|
mg->mg_vd->vdev_id, txg, mg, psize, asize,
|
||||||
mg->mg_alloc_failures);
|
mg->mg_alloc_failures);
|
||||||
|
mutex_exit(&msp->ms_lock);
|
||||||
return (-1ULL);
|
return (-1ULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
mutex_enter(&msp->ms_lock);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Ensure that the metaslab we have selected is still
|
* Ensure that the metaslab we have selected is still
|
||||||
* capable of handling our request. It's possible that
|
* capable of handling our request. It's possible that
|
||||||
@ -1648,6 +1738,21 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
|
|||||||
} else {
|
} else {
|
||||||
allocatable = vdev_allocatable(vd);
|
allocatable = vdev_allocatable(vd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Determine if the selected metaslab group is eligible
|
||||||
|
* for allocations. If we're ganging or have requested
|
||||||
|
* an allocation for the smallest gang block size
|
||||||
|
* then we don't want to avoid allocating to this
|
||||||
|
* metaslab group. If we're in this condition we should
|
||||||
|
* try to allocate from any device possible so that we
|
||||||
|
* don't inadvertently return ENOSPC and suspend the pool
|
||||||
|
* even though space is still available.
|
||||||
|
*/
|
||||||
|
if (allocatable && CAN_FASTGANG(flags) &&
|
||||||
|
psize > SPA_GANGBLOCKSIZE)
|
||||||
|
allocatable = metaslab_group_allocatable(mg);
|
||||||
|
|
||||||
if (!allocatable)
|
if (!allocatable)
|
||||||
goto next;
|
goto next;
|
||||||
|
|
||||||
|
@ -24,7 +24,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2012 by Delphix. All rights reserved.
|
* Copyright (c) 2013 by Delphix. All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef _SYS_METASLAB_IMPL_H
|
#ifndef _SYS_METASLAB_IMPL_H
|
||||||
@ -45,6 +45,7 @@ struct metaslab_class {
|
|||||||
metaslab_group_t *mc_rotor;
|
metaslab_group_t *mc_rotor;
|
||||||
space_map_ops_t *mc_ops;
|
space_map_ops_t *mc_ops;
|
||||||
uint64_t mc_aliquot;
|
uint64_t mc_aliquot;
|
||||||
|
uint64_t mc_alloc_groups; /* # of allocatable groups */
|
||||||
uint64_t mc_alloc; /* total allocated space */
|
uint64_t mc_alloc; /* total allocated space */
|
||||||
uint64_t mc_deferred; /* total deferred frees */
|
uint64_t mc_deferred; /* total deferred frees */
|
||||||
uint64_t mc_space; /* total space (alloc + free) */
|
uint64_t mc_space; /* total space (alloc + free) */
|
||||||
@ -58,6 +59,8 @@ struct metaslab_group {
|
|||||||
uint64_t mg_aliquot;
|
uint64_t mg_aliquot;
|
||||||
uint64_t mg_bonus_area;
|
uint64_t mg_bonus_area;
|
||||||
uint64_t mg_alloc_failures;
|
uint64_t mg_alloc_failures;
|
||||||
|
boolean_t mg_allocatable; /* can we allocate? */
|
||||||
|
uint64_t mg_free_capacity; /* percentage free */
|
||||||
int64_t mg_bias;
|
int64_t mg_bias;
|
||||||
int64_t mg_activation_count;
|
int64_t mg_activation_count;
|
||||||
metaslab_class_t *mg_class;
|
metaslab_class_t *mg_class;
|
||||||
|
@ -2458,7 +2458,7 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
|
|||||||
if (error) {
|
if (error) {
|
||||||
error = metaslab_alloc(spa, spa_normal_class(spa), size,
|
error = metaslab_alloc(spa, spa_normal_class(spa), size,
|
||||||
new_bp, 1, txg, old_bp,
|
new_bp, 1, txg, old_bp,
|
||||||
METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
|
METASLAB_HINTBP_AVOID);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (error == 0) {
|
if (error == 0) {
|
||||||
|
Loading…
Reference in New Issue
Block a user