diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 4b3dc3666db0..fe31e292a792 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -411,6 +411,20 @@ pruning the inode and dentry caches. Default value: \fB10,000\fR. .RE +.sp +.ne 2 +.na +\fBzfs_arc_meta_adjust_restarts\fR (ulong) +.ad +.RS 12n +The number of restart passes to make while scanning the ARC attempting +the free buffers in order to stay below the \fBzfs_arc_meta_limit\fR. +This value should not need to be tuned but is available to facilitate +performance analysis. +.sp +Default value: \fB4096\fR. +.RE + .sp .ne 2 .na diff --git a/module/zfs/arc.c b/module/zfs/arc.c index f9f0008c00b6..bdf116c35c52 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -221,6 +221,11 @@ unsigned long zfs_arc_max = 0; unsigned long zfs_arc_min = 0; unsigned long zfs_arc_meta_limit = 0; +/* + * Limit the number of restarts in arc_adjust_meta() + */ +unsigned long zfs_arc_meta_adjust_restarts = 4096; + /* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; @@ -2195,15 +2200,30 @@ arc_do_user_evicts(void) } /* - * Evict only meta data objects from the cache leaving the data objects. - * This is only used to enforce the tunable arc_meta_limit, if we are - * unable to evict enough buffers notify the user via the prune callback. + * The goal of this function is to evict enough meta data buffers from the + * ARC in order to enforce the arc_meta_limit. Achieving this is slightly + * more complicated than it appears because it is common for data buffers + * to have holds on meta data buffers. In addition, dnode meta data buffers + * will be held by the dnodes in the block preventing them from being freed. + * This means we can't simply traverse the ARC and expect to always find + * enough unheld meta data buffer to release. + * + * Therefore, this function has been updated to make alternating passes + * over the ARC releasing data buffers and then newly unheld meta data + * buffers. This ensures forward progress is maintained and arc_meta_used + * will decrease. Normally this is sufficient, but if required the ARC + * will call the registered prune callbacks causing dentry and inodes to + * be dropped from the VFS cache. This will make dnode meta data buffers + * available for reclaim. */ static void arc_adjust_meta(void) { - int64_t adjustmnt, delta; + int64_t adjustmnt, delta, prune = 0; + arc_buf_contents_t type = ARC_BUFC_DATA; + unsigned long restarts = zfs_arc_meta_adjust_restarts; +restart: /* * This slightly differs than the way we evict from the mru in * arc_adjust because we don't have a "target" value (i.e. no @@ -2214,9 +2234,9 @@ arc_adjust_meta(void) */ adjustmnt = arc_meta_used - arc_meta_limit; - if (adjustmnt > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { - delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustmnt); - arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA); + if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) { + delta = MIN(arc_mru->arcs_lsize[type], adjustmnt); + arc_evict(arc_mru, 0, delta, FALSE, type); adjustmnt -= delta; } @@ -2230,31 +2250,50 @@ arc_adjust_meta(void) * simply decrement the amount of data evicted from the MRU. */ - if (adjustmnt > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { - delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustmnt); - arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA); + if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) { + delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt); + arc_evict(arc_mfu, 0, delta, FALSE, type); } - adjustmnt = arc_mru->arcs_lsize[ARC_BUFC_METADATA] + - arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit; + adjustmnt = arc_meta_used - arc_meta_limit; - if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) { + if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { delta = MIN(adjustmnt, - arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA]); - arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_METADATA); + arc_mru_ghost->arcs_lsize[type]); + arc_evict_ghost(arc_mru_ghost, 0, delta, type); + adjustmnt -= delta; } - adjustmnt = arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] + - arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit; - - if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) { + if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) { delta = MIN(adjustmnt, - arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]); - arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_METADATA); + arc_mfu_ghost->arcs_lsize[type]); + arc_evict_ghost(arc_mfu_ghost, 0, delta, type); } - if (arc_meta_used > arc_meta_limit) - arc_do_user_prune(zfs_arc_meta_prune); + /* + * If after attempting to make the requested adjustment to the ARC + * the meta limit is still being exceeded then request that the + * higher layers drop some cached objects which have holds on ARC + * meta buffers. Requests to the upper layers will be made with + * increasingly large scan sizes until the ARC is below the limit. + */ + if (arc_meta_used > arc_meta_limit) { + if (type == ARC_BUFC_DATA) { + type = ARC_BUFC_METADATA; + } else { + type = ARC_BUFC_DATA; + + if (zfs_arc_meta_prune) { + prune += zfs_arc_meta_prune; + arc_do_user_prune(prune); + } + } + + if (restarts > 0) { + restarts--; + goto restart; + } + } } /* @@ -5609,6 +5648,10 @@ MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size"); module_param(zfs_arc_meta_prune, int, 0644); MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune"); +module_param(zfs_arc_meta_adjust_restarts, ulong, 0644); +MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts, + "Limit number of restarts in arc_adjust_meta"); + module_param(zfs_arc_grow_retry, int, 0644); MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");