Enable balanced arc pruning

Taken from:
ommit f6046738365571bd647f804958dfdff8a32fbde4
Author: Brian Behlendorf <behlendorf1@llnl.gov>
Date:   Sat May 30 09:57:53 2015 -0500

    Make arc_prune() asynchronous

    As described in the comment above arc_adapt_thread() it is critical
    that the arc_adapt_thread() function never sleep while holding a hash
    lock.  This behavior was possible in the Linux implementation because
    the arc_prune() logic was implemented to be synchronous.  Under
    illumos the analogous dnlc_reduce_cache() function is asynchronous.

    To address this the arc_do_user_prune() function is has been reworked
    in to two new functions as follows:

    * arc_prune_async() is an asynchronous implementation which dispatches
    the prune callback to be run by the system taskq.  This makes it
    suitable to use in the context of the arc_adapt_thread().

    * arc_prune() is a synchronous implementation which depends on the
    arc_prune_async() implementation but blocks until the outstanding
    callbacks complete.  This is used in arc_kmem_reap_now() where it
    is safe, and expected, that memory will be freed.

    This patch additionally adds the zfs_arc_meta_strategy module option
    while allows the meta reclaim strategy to be configured.  It defaults
    to a balanced strategy which has been proved to work well under Linux
    but the illumos meta-only strategy can be enabled.

    Signed-off-by: Tim Chase <tim@chase2k.com>
    Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Matt Macy 2018-08-11 22:01:52 +00:00
parent 37310a98a8
commit 9f3a171221

View File

@ -525,6 +525,14 @@ typedef struct arc_state {
refcount_t arcs_size;
} arc_state_t;
/*
* Percentage that can be consumed by dnodes of ARC meta buffers.
*/
int zfs_arc_meta_prune = 10000;
unsigned long zfs_arc_dnode_limit_percent = 10;
int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
int zfs_arc_meta_adjust_restarts = 4096;
/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
@ -4075,12 +4083,115 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
return (0);
}
/*
* The goal of this function is to evict enough meta data buffers from the
* ARC in order to enforce the arc_meta_limit. Achieving this is slightly
* more complicated than it appears because it is common for data buffers
* to have holds on meta data buffers. In addition, dnode meta data buffers
* will be held by the dnodes in the block preventing them from being freed.
* This means we can't simply traverse the ARC and expect to always find
* enough unheld meta data buffer to release.
*
* Therefore, this function has been updated to make alternating passes
* over the ARC releasing data buffers and then newly unheld meta data
* buffers. This ensures forward progress is maintained and meta_used
* will decrease. Normally this is sufficient, but if required the ARC
* will call the registered prune callbacks causing dentry and inodes to
* be dropped from the VFS cache. This will make dnode meta data buffers
* available for reclaim.
*/
static uint64_t
arc_adjust_meta_balanced(uint64_t meta_used)
{
int64_t delta, prune = 0, adjustmnt;
uint64_t total_evicted = 0;
arc_buf_contents_t type = ARC_BUFC_DATA;
int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
restart:
/*
* This slightly differs than the way we evict from the mru in
* arc_adjust because we don't have a "target" value (i.e. no
* "meta" arc_p). As a result, I think we can completely
* cannibalize the metadata in the MRU before we evict the
* metadata from the MFU. I think we probably need to implement a
* "metadata arc_p" value to do this properly.
*/
adjustmnt = meta_used - arc_meta_limit;
if (adjustmnt > 0 && refcount_count(&arc_mru->arcs_esize[type]) > 0) {
delta = MIN(refcount_count(&arc_mru->arcs_esize[type]),
adjustmnt);
total_evicted += arc_adjust_impl(arc_mru, 0, delta, type);
adjustmnt -= delta;
}
/*
* We can't afford to recalculate adjustmnt here. If we do,
* new metadata buffers can sneak into the MRU or ANON lists,
* thus penalize the MFU metadata. Although the fudge factor is
* small, it has been empirically shown to be significant for
* certain workloads (e.g. creating many empty directories). As
* such, we use the original calculation for adjustmnt, and
* simply decrement the amount of data evicted from the MRU.
*/
if (adjustmnt > 0 && refcount_count(&arc_mfu->arcs_esize[type]) > 0) {
delta = MIN(refcount_count(&arc_mfu->arcs_esize[type]),
adjustmnt);
total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
}
adjustmnt = meta_used - arc_meta_limit;
if (adjustmnt > 0 &&
refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) {
delta = MIN(adjustmnt,
refcount_count(&arc_mru_ghost->arcs_esize[type]));
total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type);
adjustmnt -= delta;
}
if (adjustmnt > 0 &&
refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) {
delta = MIN(adjustmnt,
refcount_count(&arc_mfu_ghost->arcs_esize[type]));
total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type);
}
/*
* If after attempting to make the requested adjustment to the ARC
* the meta limit is still being exceeded then request that the
* higher layers drop some cached objects which have holds on ARC
* meta buffers. Requests to the upper layers will be made with
* increasingly large scan sizes until the ARC is below the limit.
*/
if (meta_used > arc_meta_limit) {
if (type == ARC_BUFC_DATA) {
type = ARC_BUFC_METADATA;
} else {
type = ARC_BUFC_DATA;
if (zfs_arc_meta_prune) {
prune += zfs_arc_meta_prune;
arc_prune_async(prune);
}
}
if (restarts > 0) {
restarts--;
goto restart;
}
}
return (total_evicted);
}
/*
* Evict metadata buffers from the cache, such that arc_meta_used is
* capped by the arc_meta_limit tunable.
*/
static uint64_t
arc_adjust_meta(uint64_t meta_used)
arc_adjust_meta_only(uint64_t meta_used)
{
uint64_t total_evicted = 0;
int64_t target;
@ -4112,6 +4223,15 @@ arc_adjust_meta(uint64_t meta_used)
return (total_evicted);
}
static uint64_t
arc_adjust_meta(uint64_t meta_used)
{
if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
return (arc_adjust_meta_only(meta_used));
else
return (arc_adjust_meta_balanced(meta_used));
}
/*
* Return the type of the oldest buffer in the given arc state
*