Fix arc_adjust_meta() behavior

The goal of this function is to evict enough meta data buffers from the ARC in order to enforce the arc_meta_limit. Achieving this is slightly more complicated than it appears because it is common for data buffers to have holds on meta data buffers. In addition, dnode meta data buffers will be held by the dnodes in the block preventing them from being freed. This means we can't simply traverse the ARC and expect to always find enough unheld meta data buffers to release. Therefore, this function has been updated to make alternating passes over the ARC releasing data buffers and then newly unheld meta data buffers. This ensures forward progress is maintained and arc_meta_used will decrease. Normally this is sufficient, but if required the ARC will call the registered prune callbacks causing dentries and inodes to be dropped from the VFS cache. This will make dnode meta data buffers available for reclaim. The total number of restarts is limited by zfs_arc_meta_adjust_restarts to prevent spinning in the rare case where all meta data is pinned. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Pavel Snajdr <snajpa@snajpa.net> Issue #3160
2015-03-17 15:08:22 -07:00 · 2015-03-17 15:08:22 -07:00 · bc88866657
commit bc88866657
parent 2cbb06b561
2 changed files with 80 additions and 23 deletions
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@ -411,6 +411,20 @@ pruning the inode and dentry caches.
 Default value: \fB10,000\fR.
 .RE
 .sp
 .ne 2
 .na
 \fBzfs_arc_meta_adjust_restarts\fR (ulong)
 .ad
 .RS 12n
 The number of restart passes to make while scanning the ARC attempting
 to free buffers in order to stay below the \fBzfs_arc_meta_limit\fR.
 This value should not need to be tuned but is available to facilitate
 performance analysis.
 .sp
 Default value: \fB4096\fR.
 .RE
 .sp
 .ne 2
 .na
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@ -221,6 +221,11 @@ unsigned long zfs_arc_max = 0;
 unsigned long zfs_arc_min = 0;
 unsigned long zfs_arc_meta_limit = 0;
 /*
 * Limit the number of restarts in arc_adjust_meta()
 */
 unsigned long zfs_arc_meta_adjust_restarts = 4096;
 /* The 6 states: */
 static arc_state_t ARC_anon;
 static arc_state_t ARC_mru;
@ -2195,15 +2200,30 @@ arc_do_user_evicts(void)
 }
 /*
- * Evict only meta data objects from the cache leaving the data objects.
+ * The goal of this function is to evict enough meta data buffers from the
- * This is only used to enforce the tunable arc_meta_limit, if we are
+ * ARC in order to enforce the arc_meta_limit.  Achieving this is slightly
- * unable to evict enough buffers notify the user via the prune callback.
+ * more complicated than it appears because it is common for data buffers
 * to have holds on meta data buffers.  In addition, dnode meta data buffers
 * will be held by the dnodes in the block preventing them from being freed.
 * This means we can't simply traverse the ARC and expect to always find
 * enough unheld meta data buffers to release.
 *
 * Therefore, this function has been updated to make alternating passes
 * over the ARC releasing data buffers and then newly unheld meta data
 * buffers.  This ensures forward progress is maintained and arc_meta_used
 * will decrease.  Normally this is sufficient, but if required the ARC
 * will call the registered prune callbacks causing dentries and inodes to
 * be dropped from the VFS cache.  This will make dnode meta data buffers
 * available for reclaim.
 */
 static void
 arc_adjust_meta(void)
 {
-	int64_t adjustmnt, delta;
+	int64_t adjustmnt, delta, prune = 0;
 	arc_buf_contents_t type = ARC_BUFC_DATA;
 	unsigned long restarts = zfs_arc_meta_adjust_restarts;
 restart:
 	/*
 	 * This differs slightly from the way we evict from the mru in
 	 * arc_adjust because we don't have a "target" value (i.e. no
@ -2214,9 +2234,9 @@ arc_adjust_meta(void)
 	 */
 	adjustmnt = arc_meta_used - arc_meta_limit;
-	if (adjustmnt > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+	if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) {
-		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustmnt);
+		delta = MIN(arc_mru->arcs_lsize[type], adjustmnt);
-		arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
+		arc_evict(arc_mru, 0, delta, FALSE, type);
 		adjustmnt -= delta;
 	}
@ -2230,31 +2250,50 @@ arc_adjust_meta(void)
 	 * simply decrement the amount of data evicted from the MRU.
 	 */
-	if (adjustmnt > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+	if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) {
-		delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustmnt);
+		delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt);
-		arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
+		arc_evict(arc_mfu, 0, delta, FALSE, type);
 	}
-	adjustmnt = arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
+	adjustmnt = arc_meta_used - arc_meta_limit;
 	    arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit;
-	if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+	if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
 		delta = MIN(adjustmnt,
-		    arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA]);
+		    arc_mru_ghost->arcs_lsize[type]);
-		arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_METADATA);
+		arc_evict_ghost(arc_mru_ghost, 0, delta, type);
 		adjustmnt -= delta;
 	}
-	adjustmnt = arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
+	if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) {
 	    arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit;
 	if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) {
 		delta = MIN(adjustmnt,
-		    arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]);
+		    arc_mfu_ghost->arcs_lsize[type]);
-		arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_METADATA);
+		arc_evict_ghost(arc_mfu_ghost, 0, delta, type);
 	}
-	if (arc_meta_used > arc_meta_limit)
+	/*
-		arc_do_user_prune(zfs_arc_meta_prune);
+	 * If after attempting to make the requested adjustment to the ARC
 	 * the meta limit is still being exceeded then request that the
 	 * higher layers drop some cached objects which have holds on ARC
 	 * meta buffers.  Requests to the upper layers will be made with
 	 * increasingly large scan sizes until the ARC is below the limit.
 	 */
 	if (arc_meta_used > arc_meta_limit) {
 		if (type == ARC_BUFC_DATA) {
 			type = ARC_BUFC_METADATA;
 		} else {
 			type = ARC_BUFC_DATA;
 			if (zfs_arc_meta_prune) {
 				prune += zfs_arc_meta_prune;
 				arc_do_user_prune(prune);
 			}
 		}
 		if (restarts > 0) {
 			restarts--;
 			goto restart;
 		}
 	}
 }
 /*
@ -5609,6 +5648,10 @@ MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
 module_param(zfs_arc_meta_prune, int, 0644);
 MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune");
 module_param(zfs_arc_meta_adjust_restarts, ulong, 0644);
 MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts,
 	"Limit number of restarts in arc_adjust_meta");
 module_param(zfs_arc_grow_retry, int, 0644);
 MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");