From 9f3a1712216f8390f9a7dccce4a0da966e974bb8 Mon Sep 17 00:00:00 2001
From: Matt Macy
Date: Sat, 11 Aug 2018 22:01:52 +0000
Subject: [PATCH] Enable balanced arc pruning

Taken from:

commit f6046738365571bd647f804958dfdff8a32fbde4
Author: Brian Behlendorf
Date:   Sat May 30 09:57:53 2015 -0500

    Make arc_prune() asynchronous

    As described in the comment above arc_adapt_thread(), it is critical
    that the arc_adapt_thread() function never sleep while holding a hash
    lock.  This behavior was possible in the Linux implementation because
    the arc_prune() logic was implemented to be synchronous.  Under
    illumos the analogous dnlc_reduce_cache() function is asynchronous.

    To address this, the arc_do_user_prune() function has been reworked
    into two new functions as follows:

    * arc_prune_async() is an asynchronous implementation which
      dispatches the prune callback to be run by the system taskq.  This
      makes it suitable to use in the context of the arc_adapt_thread().

    * arc_prune() is a synchronous implementation which depends on the
      arc_prune_async() implementation but blocks until the outstanding
      callbacks complete.  This is used in arc_kmem_reap_now() where it
      is safe, and expected, that memory will be freed.

    This patch additionally adds the zfs_arc_meta_strategy module option,
    which allows the meta reclaim strategy to be configured.  It defaults
    to a balanced strategy which has been proven to work well under
    Linux, but the illumos meta-only strategy can be enabled instead.

    Signed-off-by: Tim Chase
    Signed-off-by: Brian Behlendorf
---
 .../opensolaris/uts/common/fs/zfs/arc.c      | 122 +++++++++++++++++-
 1 file changed, 121 insertions(+), 1 deletion(-)

diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index 97b0be4ab778..bb9b55cfee6e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -525,6 +525,14 @@ typedef struct arc_state {
 	refcount_t	arcs_size;
 } arc_state_t;
 
+/*
+ * Tunables controlling the balanced ARC metadata reclaim strategy.
+ */
+int zfs_arc_meta_prune = 10000;
+unsigned long zfs_arc_dnode_limit_percent = 10;
+int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
+int zfs_arc_meta_adjust_restarts = 4096;
+
 /* The 6 states: */
 static arc_state_t ARC_anon;
 static arc_state_t ARC_mru;
@@ -4075,12 +4083,115 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
 	return (0);
 }
 
+/*
+ * The goal of this function is to evict enough meta data buffers from the
+ * ARC in order to enforce the arc_meta_limit.  Achieving this is slightly
+ * more complicated than it appears because it is common for data buffers
+ * to have holds on meta data buffers.  In addition, dnode meta data
+ * buffers will be held by the dnodes in the block, preventing them from
+ * being freed.  This means we can't simply traverse the ARC and expect to
+ * always find enough unheld meta data buffers to release.
+ *
+ * Therefore, this function has been updated to make alternating passes
+ * over the ARC releasing data buffers and then newly unheld meta data
+ * buffers.  This ensures forward progress is maintained and meta_used
+ * will decrease.  Normally this is sufficient, but if required the ARC
+ * will call the registered prune callbacks, causing dentries and inodes
+ * to be dropped from the VFS cache.  This will make dnode meta data
+ * buffers available for reclaim.
+ */
+static uint64_t
+arc_adjust_meta_balanced(uint64_t meta_used)
+{
+	int64_t delta, prune = 0, adjustmnt;
+	uint64_t total_evicted = 0;
+	arc_buf_contents_t type = ARC_BUFC_DATA;
+	int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
+
+restart:
+	/*
+	 * This differs slightly from the way we evict from the mru in
+	 * arc_adjust because we don't have a "target" value (i.e. no
+	 * "meta" arc_p).  As a result, I think we can completely
+	 * cannibalize the metadata in the MRU before we evict the
+	 * metadata from the MFU.  I think we probably need to implement a
+	 * "metadata arc_p" value to do this properly.
+	 */
+	adjustmnt = meta_used - arc_meta_limit;
+
+	if (adjustmnt > 0 && refcount_count(&arc_mru->arcs_esize[type]) > 0) {
+		delta = MIN(refcount_count(&arc_mru->arcs_esize[type]),
+		    adjustmnt);
+		total_evicted += arc_adjust_impl(arc_mru, 0, delta, type);
+		adjustmnt -= delta;
+	}
+
+	/*
+	 * We can't afford to recalculate adjustmnt here.  If we do,
+	 * new metadata buffers can sneak into the MRU or ANON lists,
+	 * thus penalizing the MFU metadata.  Although the fudge factor is
+	 * small, it has been empirically shown to be significant for
+	 * certain workloads (e.g. creating many empty directories).  As
+	 * such, we use the original calculation for adjustmnt, and
+	 * simply decrement the amount of data evicted from the MRU.
+	 */
+
+	if (adjustmnt > 0 && refcount_count(&arc_mfu->arcs_esize[type]) > 0) {
+		delta = MIN(refcount_count(&arc_mfu->arcs_esize[type]),
+		    adjustmnt);
+		total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
+	}
+
+	adjustmnt = meta_used - arc_meta_limit;
+
+	if (adjustmnt > 0 &&
+	    refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) {
+		delta = MIN(adjustmnt,
+		    refcount_count(&arc_mru_ghost->arcs_esize[type]));
+		total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type);
+		adjustmnt -= delta;
+	}
+
+	if (adjustmnt > 0 &&
+	    refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) {
+		delta = MIN(adjustmnt,
+		    refcount_count(&arc_mfu_ghost->arcs_esize[type]));
+		total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type);
+	}
+
+	/*
+	 * If after attempting to make the requested adjustment to the ARC
+	 * the meta limit is still being exceeded then request that the
+	 * higher layers drop some cached objects which have holds on ARC
+	 * meta buffers.  Requests to the upper layers will be made with
+	 * increasingly large scan sizes until the ARC is below the limit.
+	 */
+	if (meta_used > arc_meta_limit) {
+		if (type == ARC_BUFC_DATA) {
+			type = ARC_BUFC_METADATA;
+		} else {
+			type = ARC_BUFC_DATA;
+
+			if (zfs_arc_meta_prune) {
+				prune += zfs_arc_meta_prune;
+				arc_prune_async(prune);
+			}
+		}
+
+		if (restarts > 0) {
+			restarts--;
+			goto restart;
+		}
+	}
+	return (total_evicted);
+}
+
 /*
  * Evict metadata buffers from the cache, such that arc_meta_used is
  * capped by the arc_meta_limit tunable.
  */
 static uint64_t
-arc_adjust_meta(uint64_t meta_used)
+arc_adjust_meta_only(uint64_t meta_used)
 {
 	uint64_t total_evicted = 0;
 	int64_t target;
@@ -4112,6 +4223,15 @@ arc_adjust_meta(uint64_t meta_used)
 	return (total_evicted);
 }
 
+static uint64_t
+arc_adjust_meta(uint64_t meta_used)
+{
+	if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
+		return (arc_adjust_meta_only(meta_used));
+	else
+		return (arc_adjust_meta_balanced(meta_used));
+}
+
 /*
  * Return the type of the oldest buffer in the given arc state
  *
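
The diff above calls arc_prune_async(), but its definition is not part of
the quoted hunks.  For orientation, the following is a minimal sketch of
the dispatch pattern the commit message describes, assuming the
arc_prune_list, arc_prune_mtx, and arc_prune_taskq infrastructure of the
Linux port; those names and the arc_prune_t fields follow that port and
are not taken from this diff.

/*
 * Sketch only: dispatch every registered prune callback to a taskq so
 * the caller never sleeps in a callback while holding a hash lock.
 */
static void
arc_prune_task(void *arg)
{
	arc_prune_t *ap = arg;

	if (ap->p_pfunc != NULL)
		ap->p_pfunc(ap->p_adjust, ap->p_private);

	/* Drop the hold taken when this task was dispatched. */
	refcount_remove(&ap->p_refcnt, ap->p_pfunc);
}

static void
arc_prune_async(int64_t adjust)
{
	arc_prune_t *ap;

	mutex_enter(&arc_prune_mtx);
	for (ap = list_head(&arc_prune_list); ap != NULL;
	    ap = list_next(&arc_prune_list, ap)) {
		/* Skip callbacks which already have a task in flight. */
		if (refcount_count(&ap->p_refcnt) >= 2)
			continue;

		refcount_add(&ap->p_refcnt, ap->p_pfunc);
		ap->p_adjust = adjust;
		(void) taskq_dispatch(arc_prune_taskq, arc_prune_task,
		    ap, TQ_SLEEP);
	}
	mutex_exit(&arc_prune_mtx);
}

/*
 * Synchronous variant: dispatch as above, then block until every
 * outstanding prune task has completed.  Only safe where sleeping is
 * permitted, e.g. arc_kmem_reap_now().
 */
static void
arc_prune(int64_t adjust)
{
	arc_prune_async(adjust);
	taskq_wait(arc_prune_taskq);
}

The design point is the split itself: arc_adjust_meta_balanced() only
ever uses the asynchronous form, so the adapt thread cannot block in a
callback, while contexts that are allowed to sleep and want memory back
immediately use the blocking wrapper.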
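
The "registered prune callbacks" referenced in the comment above
arc_adjust_meta_balanced() are installed by higher layers through the
existing arc_add_prune_callback()/arc_remove_prune_callback() hooks.  A
hypothetical consumer might look like the following; zfs_prune_cb,
zfs_register_prune, and zfs_unregister_prune are illustrative names, not
part of this patch.

/*
 * Hypothetical consumer: release cached objects holding ARC meta
 * buffers so the dnodes they pin become evictable.
 */
static void
zfs_prune_cb(int64_t bytes, void *priv)
{
	/*
	 * Shed up to "bytes" worth of cached vnodes/dentries here; the
	 * actual mechanism is platform specific (e.g. the superblock
	 * shrinker on Linux).
	 */
}

static arc_prune_t *zfs_prune_handle;

void
zfs_register_prune(void *priv)
{
	zfs_prune_handle = arc_add_prune_callback(zfs_prune_cb, priv);
}

void
zfs_unregister_prune(void)
{
	arc_remove_prune_callback(zfs_prune_handle);
}

Each call to arc_prune_async(prune) passes the accumulated scan size to
these callbacks, which is why the restart loop grows "prune" by
zfs_arc_meta_prune on every metadata pass: the upper layers are asked to
drop progressively more objects until meta_used falls below the limit.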