Fix arc_adjust_meta() behavior

The goal of this function is to evict enough meta data buffers from the
ARC in order to enforce the arc_meta_limit.  Achieving this is slightly
more complicated than it appears because it is common for data buffers
to have holds on meta data buffers.  In addition, dnode meta data buffers
will be held by the dnodes in the block preventing them from being freed.
This means we can't simply traverse the ARC and expect to always find
enough unheld meta data buffer to release.

Therefore, this function has been updated to make alternating passes
over the ARC releasing data buffers and then newly unheld meta data
buffers.  This ensures forward progress is maintained and arc_meta_used
will decrease.  Normally this is sufficient, but if required the ARC
will call the registered prune callbacks causing dentries and inodes to
be dropped from the VFS cache.  This will make dnode meta data buffers
available for reclaim.  The number of total restarts is limited by
zfs_arc_meta_adjust_restarts to prevent spinning in the rare case
where all meta data is pinned.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Pavel Snajdr <snajpa@snajpa.net>
Issue #3160
This commit is contained in:
Brian Behlendorf 2015-03-17 15:08:22 -07:00
parent 2cbb06b561
commit bc88866657
2 changed files with 80 additions and 23 deletions

View File

@ -411,6 +411,20 @@ pruning the inode and dentry caches.
Default value: \fB10,000\fR. Default value: \fB10,000\fR.
.RE .RE
.sp
.ne 2
.na
\fBzfs_arc_meta_adjust_restarts\fR (ulong)
.ad
.RS 12n
The number of restart passes to make while scanning the ARC attempting
to free buffers in order to stay below the \fBzfs_arc_meta_limit\fR.
This value should not need to be tuned but is available to facilitate
performance analysis.
.sp
Default value: \fB4096\fR.
.RE
.sp .sp
.ne 2 .ne 2
.na .na

View File

@ -221,6 +221,11 @@ unsigned long zfs_arc_max = 0;
unsigned long zfs_arc_min = 0; unsigned long zfs_arc_min = 0;
unsigned long zfs_arc_meta_limit = 0; unsigned long zfs_arc_meta_limit = 0;
/*
* Limit the number of restarts in arc_adjust_meta()
*/
unsigned long zfs_arc_meta_adjust_restarts = 4096;
/* The 6 states: */ /* The 6 states: */
static arc_state_t ARC_anon; static arc_state_t ARC_anon;
static arc_state_t ARC_mru; static arc_state_t ARC_mru;
@ -2195,15 +2200,30 @@ arc_do_user_evicts(void)
} }
/* /*
* Evict only meta data objects from the cache leaving the data objects. * The goal of this function is to evict enough meta data buffers from the
* This is only used to enforce the tunable arc_meta_limit, if we are * ARC in order to enforce the arc_meta_limit. Achieving this is slightly
* unable to evict enough buffers notify the user via the prune callback. * more complicated than it appears because it is common for data buffers
* to have holds on meta data buffers. In addition, dnode meta data buffers
* will be held by the dnodes in the block preventing them from being freed.
* This means we can't simply traverse the ARC and expect to always find
* enough unheld meta data buffer to release.
*
* Therefore, this function has been updated to make alternating passes
* over the ARC releasing data buffers and then newly unheld meta data
* buffers. This ensures forward progress is maintained and arc_meta_used
* will decrease. Normally this is sufficient, but if required the ARC
* will call the registered prune callbacks causing dentry and inodes to
* be dropped from the VFS cache. This will make dnode meta data buffers
* available for reclaim.
*/ */
static void static void
arc_adjust_meta(void) arc_adjust_meta(void)
{ {
int64_t adjustmnt, delta; int64_t adjustmnt, delta, prune = 0;
arc_buf_contents_t type = ARC_BUFC_DATA;
unsigned long restarts = zfs_arc_meta_adjust_restarts;
restart:
/* /*
* This slightly differs than the way we evict from the mru in * This slightly differs than the way we evict from the mru in
* arc_adjust because we don't have a "target" value (i.e. no * arc_adjust because we don't have a "target" value (i.e. no
@ -2214,9 +2234,9 @@ arc_adjust_meta(void)
*/ */
adjustmnt = arc_meta_used - arc_meta_limit; adjustmnt = arc_meta_used - arc_meta_limit;
if (adjustmnt > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) {
delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustmnt); delta = MIN(arc_mru->arcs_lsize[type], adjustmnt);
arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA); arc_evict(arc_mru, 0, delta, FALSE, type);
adjustmnt -= delta; adjustmnt -= delta;
} }
@ -2230,31 +2250,50 @@ arc_adjust_meta(void)
* simply decrement the amount of data evicted from the MRU. * simply decrement the amount of data evicted from the MRU.
*/ */
if (adjustmnt > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) {
delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustmnt); delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt);
arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA); arc_evict(arc_mfu, 0, delta, FALSE, type);
} }
adjustmnt = arc_mru->arcs_lsize[ARC_BUFC_METADATA] + adjustmnt = arc_meta_used - arc_meta_limit;
arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit;
if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) { if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
delta = MIN(adjustmnt, delta = MIN(adjustmnt,
arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA]); arc_mru_ghost->arcs_lsize[type]);
arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_METADATA); arc_evict_ghost(arc_mru_ghost, 0, delta, type);
adjustmnt -= delta;
} }
adjustmnt = arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] + if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) {
arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit;
if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) {
delta = MIN(adjustmnt, delta = MIN(adjustmnt,
arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]); arc_mfu_ghost->arcs_lsize[type]);
arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_METADATA); arc_evict_ghost(arc_mfu_ghost, 0, delta, type);
} }
if (arc_meta_used > arc_meta_limit) /*
arc_do_user_prune(zfs_arc_meta_prune); * If after attempting to make the requested adjustment to the ARC
* the meta limit is still being exceeded then request that the
* higher layers drop some cached objects which have holds on ARC
* meta buffers. Requests to the upper layers will be made with
* increasingly large scan sizes until the ARC is below the limit.
*/
if (arc_meta_used > arc_meta_limit) {
if (type == ARC_BUFC_DATA) {
type = ARC_BUFC_METADATA;
} else {
type = ARC_BUFC_DATA;
if (zfs_arc_meta_prune) {
prune += zfs_arc_meta_prune;
arc_do_user_prune(prune);
}
}
if (restarts > 0) {
restarts--;
goto restart;
}
}
} }
/* /*
@ -5609,6 +5648,10 @@ MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
module_param(zfs_arc_meta_prune, int, 0644); module_param(zfs_arc_meta_prune, int, 0644);
MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune"); MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune");
module_param(zfs_arc_meta_adjust_restarts, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts,
"Limit number of restarts in arc_adjust_meta");
module_param(zfs_arc_grow_retry, int, 0644); module_param(zfs_arc_grow_retry, int, 0644);
MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size"); MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");