vfs: distribute freevnodes counter per-cpu

The per-CPU counts get rolled up into the global counter when deferred
requeueing is performed. A dedicated read routine makes sure the value it
returns is only off by a bounded amount.

This relieves a global serialisation point hit by all 0<->1 hold count transitions.
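
As an illustration of the scheme, the sketch below shows the pattern in a
minimal, single-threaded userspace form; it is not the kernel code from the
diff. NCPU, pcpu_freevnodes, list_lock and the function names are invented
stand-ins for the DPCPU vdbatch machinery, the vnode list mutex and the
vhold/vdrop paths, and the kernel additionally guards the per-CPU updates
with critical sections.

#include <pthread.h>

#define	NCPU			8
#define	FREEVNODES_SLOP		128

static long freevnodes;			/* global count, under list_lock */
static long freevnodes_old;		/* snapshot from the last CPU walk */
static long pcpu_freevnodes[NCPU];	/* per-CPU deltas, may go negative */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* A vnode on this CPU dropped its last hold and became "free". */
static void
drop_to_free(int cpu)
{

	pcpu_freevnodes[cpu]++;
}

/* A free vnode on this CPU regained a hold (the 0->1 transition). */
static void
hold_from_free(int cpu)
{

	pcpu_freevnodes[cpu]--;
}

/* Deferred requeueing: fold this CPU's delta into the global counter. */
static void
batch_process(int cpu)
{

	pthread_mutex_lock(&list_lock);
	freevnodes += pcpu_freevnodes[cpu];
	pcpu_freevnodes[cpu] = 0;
	pthread_mutex_unlock(&list_lock);
}

/* Walk the CPUs only when the global has drifted past the allowed slop. */
static long
read_freevnodes(void)
{
	long slop, ret;
	int cpu;

	pthread_mutex_lock(&list_lock);
	if (freevnodes > freevnodes_old)
		slop = freevnodes - freevnodes_old;
	else
		slop = freevnodes_old - freevnodes;
	if (slop < FREEVNODES_SLOP) {
		ret = freevnodes;
	} else {
		freevnodes_old = freevnodes;
		for (cpu = 0; cpu < NCPU; cpu++)
			freevnodes_old += pcpu_freevnodes[cpu];
		ret = freevnodes_old;
	}
	pthread_mutex_unlock(&list_lock);
	return (ret >= 0 ? ret : 0);
}

int
main(void)
{

	drop_to_free(3);	/* last hold dropped on CPU 3 */
	hold_from_free(3);	/* and reacquired; net delta is 0 */
	drop_to_free(3);	/* dropped again */
	batch_process(3);	/* rollup during deferred requeueing */
	return (read_freevnodes() == 1 ? 0 : 1);
}

The point of the slop check is that the common read path can return the
slightly stale global without touching the other CPUs' cache lines; a full
walk only happens once the global has drifted by FREEVNODES_SLOP or more
since the last walk.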

Reviewed by:	jeff
Differential Revision:	https://reviews.freebsd.org/D23235
Author:	Mateusz Guzik
Date:	2020-01-18 01:29:02 +00:00
Parent:	490ebb8f35
Commit:	2d0c620272
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=356859

@@ -191,10 +191,11 @@ static struct vnode *vnode_list_reclaim_marker;
* E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
* whenever vnlru_proc() becomes active.
*/
static u_long wantfreevnodes;
static u_long __exclusive_cache_line freevnodes;
static long wantfreevnodes;
static long __exclusive_cache_line freevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
&freevnodes, 0, "Number of \"free\" vnodes");
static long freevnodes_old;
static counter_u64_t recycles_count;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
@@ -299,6 +300,7 @@ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
#define VDBATCH_SIZE 8
struct vdbatch {
u_int index;
long freevnodes;
struct mtx lock;
struct vnode *tab[VDBATCH_SIZE];
};
@@ -323,6 +325,8 @@ static u_long vlowat; /* minimal extras before expansion */
static u_long vstir; /* nonzero to stir non-free vnodes */
static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */
static u_long vnlru_read_freevnodes(void);
/*
* Note that no attempt is made to sanitize these parameters.
*/
@@ -1205,15 +1209,17 @@ SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
/*
* Attempt to reduce the free list by the requested amount.
*/
static void
static int
vnlru_free_locked(int count, struct vfsops *mnt_op)
{
struct vnode *vp, *mvp;
struct mount *mp;
int ocount;
mtx_assert(&vnode_list_mtx, MA_OWNED);
if (count > max_vnlru_free)
count = max_vnlru_free;
ocount = count;
mvp = vnode_list_free_marker;
restart:
vp = mvp;
@@ -1254,6 +1260,7 @@ vnlru_free_locked(int count, struct vfsops *mnt_op)
mtx_lock(&vnode_list_mtx);
goto restart;
}
return (ocount - count);
}
void
@@ -1283,11 +1290,60 @@ vnlru_recalc(void)
static struct proc *vnlruproc;
static int vnlruproc_sig;
/*
* The main freevnodes counter is only updated when threads requeue their vnode
* batches. CPUs are conditionally walked to compute a more accurate total.
*
* Limit how much slop we are willing to tolerate. Note: the actual value
* at any given moment can still exceed the slop, but it should not be by a
* significant margin in practice.
*/
#define VNLRU_FREEVNODES_SLOP 128
static u_long
vnlru_read_freevnodes(void)
{
struct vdbatch *vd;
long slop;
int cpu;
mtx_assert(&vnode_list_mtx, MA_OWNED);
if (freevnodes > freevnodes_old)
slop = freevnodes - freevnodes_old;
else
slop = freevnodes_old - freevnodes;
if (slop < VNLRU_FREEVNODES_SLOP)
return (freevnodes >= 0 ? freevnodes : 0);
freevnodes_old = freevnodes;
CPU_FOREACH(cpu) {
vd = DPCPU_ID_PTR((cpu), vd);
freevnodes_old += vd->freevnodes;
}
return (freevnodes_old >= 0 ? freevnodes_old : 0);
}
static bool
vnlru_under(u_long rnumvnodes, u_long limit)
{
u_long rfreevnodes, space;
if (__predict_false(rnumvnodes > desiredvnodes))
return (true);
space = desiredvnodes - rnumvnodes;
if (space < limit) {
rfreevnodes = vnlru_read_freevnodes();
if (rfreevnodes > wantfreevnodes)
space += rfreevnodes - wantfreevnodes;
}
return (space < limit);
}
static bool
vnlru_under_unlocked(u_long rnumvnodes, u_long limit)
{
long rfreevnodes, space;
if (__predict_false(rnumvnodes > desiredvnodes))
return (true);
@@ -1317,16 +1373,23 @@ vnlru_proc(void)
u_long rnumvnodes, rfreevnodes, target;
unsigned long onumvnodes;
int done, force, trigger, usevnodes;
bool reclaim_nc_src;
bool reclaim_nc_src, want_reread;
EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
SHUTDOWN_PRI_FIRST);
force = 0;
want_reread = false;
for (;;) {
kproc_suspend_check(vnlruproc);
mtx_lock(&vnode_list_mtx);
rnumvnodes = atomic_load_long(&numvnodes);
if (want_reread) {
force = vnlru_under(numvnodes, vhiwat) ? 1 : 0;
want_reread = false;
}
/*
* If numvnodes is too large (due to desiredvnodes being
* adjusted using its sysctl, or emergency growth), first
@@ -1354,7 +1417,7 @@ vnlru_proc(void)
PVFS|PDROP, "vlruwt", hz);
continue;
}
rfreevnodes = atomic_load_long(&freevnodes);
rfreevnodes = vnlru_read_freevnodes();
onumvnodes = rnumvnodes;
/*
@@ -1397,16 +1460,14 @@ vnlru_proc(void)
force = 3;
continue;
}
want_reread = true;
force = 0;
vnlru_nowhere++;
tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
} else
} else {
want_reread = true;
kern_yield(PRI_USER);
/*
* After becoming active to expand above low water, keep
* active until above high water.
*/
force = vnlru_under(numvnodes, vhiwat) ? 1 : 0;
}
}
}
@@ -1510,7 +1571,7 @@ vn_alloc_hard(struct mount *mp)
vn_alloc_cyclecount = 0;
goto alloc;
}
rfreevnodes = atomic_load_long(&freevnodes);
rfreevnodes = vnlru_read_freevnodes();
if (vn_alloc_cyclecount++ >= rfreevnodes) {
vn_alloc_cyclecount = 0;
vstir = 1;
@@ -1525,10 +1586,8 @@ vn_alloc_hard(struct mount *mp)
* should be chosen so that we never wait or even reclaim from
* the free list to below its target minimum.
*/
if (rfreevnodes > 0) {
vnlru_free_locked(1, NULL);
if (vnlru_free_locked(1, NULL) > 0)
goto alloc;
}
if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
/*
* Wait for space for a new vnode.
@@ -1536,7 +1595,7 @@ vn_alloc_hard(struct mount *mp)
vnlru_kick();
msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz);
if (atomic_load_long(&numvnodes) + 1 > desiredvnodes &&
atomic_load_long(&freevnodes) > 1)
vnlru_read_freevnodes() > 1)
vnlru_free_locked(1, NULL);
}
alloc:
@@ -1555,7 +1614,7 @@ vn_alloc(struct mount *mp)
if (__predict_false(vn_alloc_cyclecount != 0))
return (vn_alloc_hard(mp));
rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
if (__predict_false(vnlru_under(rnumvnodes, vlowat))) {
if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) {
atomic_subtract_long(&numvnodes, 1);
return (vn_alloc_hard(mp));
}
@@ -3177,13 +3236,17 @@ vunref(struct vnode *vp)
static void
vhold_activate(struct vnode *vp)
{
struct vdbatch *vd;
ASSERT_VI_LOCKED(vp, __func__);
VNASSERT(vp->v_holdcnt == 0, vp,
("%s: wrong hold count", __func__));
VNASSERT(vp->v_op != NULL, vp,
("%s: vnode already reclaimed.", __func__));
atomic_subtract_long(&freevnodes, 1);
critical_enter();
vd = DPCPU_PTR(vd);
vd->freevnodes--;
critical_exit();
refcount_acquire(&vp->v_holdcnt);
}
@@ -3233,9 +3296,12 @@ vdbatch_process(struct vdbatch *vd)
int i;
mtx_assert(&vd->lock, MA_OWNED);
MPASS(curthread->td_pinned > 0);
MPASS(vd->index == VDBATCH_SIZE);
mtx_lock(&vnode_list_mtx);
critical_enter();
freevnodes += vd->freevnodes;
for (i = 0; i < VDBATCH_SIZE; i++) {
vp = vd->tab[i];
TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
@@ -3244,6 +3310,8 @@ vdbatch_process(struct vdbatch *vd)
vp->v_dbatchcpu = NOCPU;
}
mtx_unlock(&vnode_list_mtx);
critical_exit();
vd->freevnodes = 0;
bzero(vd->tab, sizeof(vd->tab));
vd->index = 0;
}
@@ -3257,20 +3325,24 @@ vdbatch_enqueue(struct vnode *vp)
VNASSERT(!VN_IS_DOOMED(vp), vp,
("%s: deferring requeue of a doomed vnode", __func__));
critical_enter();
vd = DPCPU_PTR(vd);
vd->freevnodes++;
if (vp->v_dbatchcpu != NOCPU) {
VI_UNLOCK(vp);
critical_exit();
return;
}
/*
* A hack: pin us to the current CPU so that we know what to put in
* ->v_dbatchcpu.
*/
sched_pin();
vd = DPCPU_PTR(vd);
critical_exit();
mtx_lock(&vd->lock);
MPASS(vd->index < VDBATCH_SIZE);
MPASS(vd->tab[vd->index] == NULL);
/*
* A hack: we depend on being pinned so that we know what to put in
* ->v_dbatchcpu.
*/
vp->v_dbatchcpu = curcpu;
vd->tab[vd->index] = vp;
vd->index++;
@@ -3355,7 +3427,6 @@ vdrop_deactivate(struct vnode *vp)
mp->mnt_lazyvnodelistsize--;
mtx_unlock(&mp->mnt_listmtx);
}
atomic_add_long(&freevnodes, 1);
vdbatch_enqueue(vp);
}