System-wide speculative prefetch limit.

With some pathological access patterns it is possible to make ZFS accumulate almost unlimited amount of speculative prefetch ZIOs. Combined with linear ABD allocations in RAIDZ code, it appears to be possible to exhaust system KVA, triggering kernel panic. Address this by introducing a system-wide counter of active prefetch requests and blocking prefetch distance doubling per stream hits if the number of active requests is higher that ~6% of ARC size. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #14516
2023-03-01 18:27:40 -05:00 · 2023-03-01 18:27:40 -05:00 · 5f42d1dbf2
commit 5f42d1dbf2
parent cd560c4474
2 changed files with 25 additions and 5 deletions
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@ -30,6 +30,7 @@
 #define	_SYS_ARC_IMPL_H

 #include <sys/arc.h>
+#include <sys/multilist.h>
 #include <sys/zio_crypt.h>
 #include <sys/zthr.h>
 #include <sys/aggsum.h>
--- a/module/zfs/dmu_zfetch.c
+++ b/module/zfs/dmu_zfetch.c
@ -28,6 +28,7 @@
 */

 #include <sys/zfs_context.h>
+#include <sys/arc_impl.h>
 #include <sys/dnode.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_zfetch.h>
@ -65,13 +66,15 @@ typedef struct zfetch_stats {
 	kstat_named_t zfetchstat_misses;
 	kstat_named_t zfetchstat_max_streams;
 	kstat_named_t zfetchstat_io_issued;
+	kstat_named_t zfetchstat_io_active;
 } zfetch_stats_t;

 static zfetch_stats_t zfetch_stats = {
 	{ "hits",			KSTAT_DATA_UINT64 },
 	{ "misses",			KSTAT_DATA_UINT64 },
 	{ "max_streams",		KSTAT_DATA_UINT64 },
-	{ "io_issued",		KSTAT_DATA_UINT64 },
+	{ "io_issued",			KSTAT_DATA_UINT64 },
+	{ "io_active",			KSTAT_DATA_UINT64 },
 };

 struct {
@ -79,6 +82,7 @@ struct {
 	wmsum_t zfetchstat_misses;
 	wmsum_t zfetchstat_max_streams;
 	wmsum_t zfetchstat_io_issued;
+	aggsum_t zfetchstat_io_active;
 } zfetch_sums;

 #define	ZFETCHSTAT_BUMP(stat)					\
@ -104,6 +108,8 @@ zfetch_kstats_update(kstat_t *ksp, int rw)
 	    wmsum_value(&zfetch_sums.zfetchstat_max_streams);
 	zs->zfetchstat_io_issued.value.ui64 =
 	    wmsum_value(&zfetch_sums.zfetchstat_io_issued);
+	zs->zfetchstat_io_active.value.ui64 =
+	    aggsum_value(&zfetch_sums.zfetchstat_io_active);
 	return (0);
 }

@ -114,6 +120,7 @@ zfetch_init(void)
 	wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
 	wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
 	wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);
+	aggsum_init(&zfetch_sums.zfetchstat_io_active, 0);

 	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
 	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
@ -138,6 +145,8 @@ zfetch_fini(void)
 	wmsum_fini(&zfetch_sums.zfetchstat_misses);
 	wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
 	wmsum_fini(&zfetch_sums.zfetchstat_io_issued);
+	ASSERT0(aggsum_value(&zfetch_sums.zfetchstat_io_active));
+	aggsum_fini(&zfetch_sums.zfetchstat_io_active);
 }

 /*
@ -294,6 +303,7 @@ dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
 		zs->zs_more = B_TRUE;
 	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
 		dmu_zfetch_stream_fini(zs);
+	aggsum_add(&zfetch_sums.zfetchstat_io_active, -1);
 }

 /*
@ -407,20 +417,28 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
 	 * Start prefetch from the demand access size (nblks).  Double the
 	 * distance every access up to zfetch_min_distance.  After that only
 	 * if needed increase the distance by 1/8 up to zfetch_max_distance.
+	 *
+	 * Don't double the distance beyond single block if we have more
+	 * than ~6% of ARC held by active prefetches.  It should help with
+	 * getting out of RAM on some badly mispredicted read patterns.
 	 */
-	unsigned int nbytes = nblks << zf->zf_dnode->dn_datablkshift;
+	unsigned int dbs = zf->zf_dnode->dn_datablkshift;
+	unsigned int nbytes = nblks << dbs;
 	unsigned int pf_nblks;
 	if (fetch_data) {
 		if (unlikely(zs->zs_pf_dist < nbytes))
 			zs->zs_pf_dist = nbytes;
-		else if (zs->zs_pf_dist < zfetch_min_distance)
+		else if (zs->zs_pf_dist < zfetch_min_distance &&
+		    (zs->zs_pf_dist < (1 << dbs) ||
+		    aggsum_compare(&zfetch_sums.zfetchstat_io_active,
+		    arc_c_max >> (4 + dbs)) < 0))
 			zs->zs_pf_dist *= 2;
 		else if (zs->zs_more)
 			zs->zs_pf_dist += zs->zs_pf_dist / 8;
 		zs->zs_more = B_FALSE;
 		if (zs->zs_pf_dist > zfetch_max_distance)
 			zs->zs_pf_dist = zfetch_max_distance;
-		pf_nblks = zs->zs_pf_dist >> zf->zf_dnode->dn_datablkshift;
+		pf_nblks = zs->zs_pf_dist >> dbs;
 	} else {
 		pf_nblks = 0;
 	}
@ -439,7 +457,7 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
 		zs->zs_ipf_dist *= 2;
 	if (zs->zs_ipf_dist > zfetch_max_idistance)
 		zs->zs_ipf_dist = zfetch_max_idistance;
-	pf_nblks = zs->zs_ipf_dist >> zf->zf_dnode->dn_datablkshift;
+	pf_nblks = zs->zs_ipf_dist >> dbs;
 	if (zs->zs_ipf_start < zs->zs_pf_end)
 		zs->zs_ipf_start = zs->zs_pf_end;
 	if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
@ -510,6 +528,7 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
 			dmu_zfetch_stream_fini(zs);
 		return;
 	}
+	aggsum_add(&zfetch_sums.zfetchstat_io_active, issued);

 	if (!have_lock)
 		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);