vfs: protect vnodes with smr

vget_prep_smr and vhold_smr can be used to ref a vnode while within vfs_smr section, allowing consumers to get away without locking. See vhold_smr and vdropl for comments explaining caveats. Reviewed by: kib Testec by: pho Differential Revision: https://reviews.freebsd.org/D23913
2020-07-01 05:56:29 +00:00 · 2020-07-01 05:56:29 +00:00 · 4074673a81
commit 4074673a81
parent f6cf06718b
2 changed files with 108 additions and 12 deletions
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@ -76,6 +76,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
 #include <sys/smr.h>
 #include <sys/smp.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
@ -238,6 +239,8 @@ static uma_zone_t buf_trie_zone;
 static uma_zone_t vnode_zone;
 static uma_zone_t vnodepoll_zone;
 __read_frequently smr_t vfs_smr;
 /*
 * The workitem queue.
 *
@ -661,7 +664,8 @@ vntblinit(void *dummy __unused)
 	vnode_list_reclaim_marker = vn_alloc_marker(NULL);
 	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
-	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
+	    vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR);
 	vfs_smr = uma_zone_get_smr(vnode_zone);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	/*
@ -1603,7 +1607,7 @@ vn_alloc_hard(struct mount *mp)
 	if (vnlru_under(rnumvnodes, vlowat))
 		vnlru_kick();
 	mtx_unlock(&vnode_list_mtx);
-	return (uma_zalloc(vnode_zone, M_WAITOK));
+	return (uma_zalloc_smr(vnode_zone, M_WAITOK));
 }
 static struct vnode *
@ -1619,7 +1623,7 @@ vn_alloc(struct mount *mp)
 		return (vn_alloc_hard(mp));
 	}
-	return (uma_zalloc(vnode_zone, M_WAITOK));
+	return (uma_zalloc_smr(vnode_zone, M_WAITOK));
 }
 static void
@ -1627,7 +1631,7 @@ vn_free(struct vnode *vp)
 {
 	atomic_subtract_long(&numvnodes, 1);
-	uma_zfree(vnode_zone, vp);
+	uma_zfree_smr(vnode_zone, vp);
 }
 /*
@ -1758,7 +1762,7 @@ freevnode(struct vnode *vp)
 	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
 	bo = &vp->v_bufobj;
 	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
-	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
+	VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp);
 	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
 	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
 	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
@ -2848,7 +2852,29 @@ v_decr_devcount(struct vnode *vp)
 *
 * holdcnt can be manipulated using atomics without holding any locks,
 * except when transitioning 1<->0, in which case the interlock is held.
 *
 * Consumers which don't guarantee liveness of the vnode can use SMR to
 * try to get a reference. Note this operation can fail since the vnode
 * may be awaiting getting freed by the time they get to it.
 */
 enum vgetstate
 vget_prep_smr(struct vnode *vp)
 {
 	enum vgetstate vs;
 	VFS_SMR_ASSERT_ENTERED();
 	if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
 		vs = VGET_USECOUNT;
 	} else {
 		if (vhold_smr(vp))
 			vs = VGET_HOLDCNT;
 		else
 			vs = VGET_NONE;
 	}
 	return (vs);
 }
 enum vgetstate
 vget_prep(struct vnode *vp)
 {
@ -2919,6 +2945,7 @@ vget_finish(struct vnode *vp, int flags, enum vgetstate vs)
 		ASSERT_VI_LOCKED(vp, __func__);
 	else
 		ASSERT_VI_UNLOCKED(vp, __func__);
 	VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
 	VNPASS(vp->v_holdcnt > 0, vp);
 	VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
@ -3380,7 +3407,8 @@ vhold(struct vnode *vp)
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
-	VNASSERT(old >= 0, vp, ("%s: wrong hold count %d", __func__, old));
+	VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
 	    ("%s: wrong hold count %d", __func__, old));
 	if (old != 0)
 		return;
 	critical_enter();
@ -3405,12 +3433,40 @@ vholdnz(struct vnode *vp)
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 #ifdef INVARIANTS
 	int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
-	VNASSERT(old > 0, vp, ("%s: wrong hold count %d", __func__, old));
+	VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
 	    ("%s: wrong hold count %d", __func__, old));
 #else
 	atomic_add_int(&vp->v_holdcnt, 1);
 #endif
 }
 /*
 * Grab a hold count as long as the vnode is not getting freed.
 *
 * Only use this routine if vfs smr is the only protection you have against
 * freeing the vnode.
 */
 bool
 vhold_smr(struct vnode *vp)
 {
 	int count;
 	VFS_SMR_ASSERT_ENTERED();
 	count = atomic_load_int(&vp->v_holdcnt);
 	for (;;) {
 		if (count & VHOLD_NO_SMR) {
 			VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
 			    ("non-zero hold count with flags %d\n", count));
 			return (false);
 		}
 		VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
 		if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1))
 			return (true);
 	}
 }
 static void __noinline
 vdbatch_process(struct vdbatch *vd)
 {
@ -3581,11 +3637,25 @@ vdropl(struct vnode *vp)
 		VI_UNLOCK(vp);
 		return;
 	}
-	if (VN_IS_DOOMED(vp)) {
+	if (!VN_IS_DOOMED(vp)) {
-		freevnode(vp);
+		vdrop_deactivate(vp);
 		return;
 	}
-	vdrop_deactivate(vp);
+	/*
 	 * We may be racing against vhold_smr.
 	 *
 	 * If they win we can just pretend we never got this far, they will
 	 * vdrop later.
 	 */
 	if (!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR)) {
 		/*
 		 * We lost the aforementioned race. Note that any subsequent
 		 * access is invalid as they might have managed to vdropl on
 		 * their own.
 		 */
 		return;
 	}
 	freevnode(vp);
 }
 /*
@ -4041,20 +4111,25 @@ static const char * const typename[] =
 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
 "VMARKER"};
 _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0,
    "new hold count flag not added to vn_printf");
 void
 vn_printf(struct vnode *vp, const char *fmt, ...)
 {
 	va_list ap;
 	char buf[256], buf2[16];
 	u_long flags;
 	u_int holdcnt;
 	va_start(ap, fmt);
 	vprintf(fmt, ap);
 	va_end(ap);
 	printf("%p: ", (void *)vp);
 	printf("type %s\n", typename[vp->v_type]);
 	holdcnt = atomic_load_int(&vp->v_holdcnt);
 	printf("    usecount %d, writecount %d, refcount %d",
-	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
+	    vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS);
 	switch (vp->v_type) {
 	case VDIR:
 		printf(" mountedhere %p\n", vp->v_mountedhere);
@ -4072,6 +4147,12 @@ vn_printf(struct vnode *vp, const char *fmt, ...)
 		printf("\n");
 		break;
 	}
 	buf[0] = '\0';
 	buf[1] = '\0';
 	if (holdcnt & VHOLD_NO_SMR)
 		strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf));
 	printf("    hold count flags (%s)\n", buf + 1);
 	buf[0] = '\0';
 	buf[1] = '\0';
 	if (vp->v_irflag & VIRF_DOOMED)
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@ -58,7 +58,7 @@
 enum vtype	{ VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD,
 		  VMARKER };
-enum vgetstate	{ VGET_HOLDCNT, VGET_USECOUNT };
+enum vgetstate	{ VGET_NONE, VGET_HOLDCNT, VGET_USECOUNT };
 /*
 * Each underlying filesystem allocates its own private area and hangs
 * it from v_data.  If non-null, this area is freed in getnewvnode().
@ -236,6 +236,9 @@ struct xvnode {
 *	VIRF_DOOMED is doubly protected by the interlock and vnode lock.  Both
 *	are required for writing but the status may be checked with either.
 */
 #define	VHOLD_NO_SMR	(1<<29)	/* Disable vhold_smr */
 #define VHOLD_ALL_FLAGS (VHOLD_NO_SMR)
 #define	VIRF_DOOMED	0x0001	/* This vnode is being recycled */
 #define	VI_TEXT_REF	0x0001	/* Text ref grabbed use ref */
@ -657,12 +660,14 @@ void	vdrop(struct vnode *);
 void	vdropl(struct vnode *);
 int	vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
 int	vget(struct vnode *vp, int flags, struct thread *td);
 enum vgetstate	vget_prep_smr(struct vnode *vp);
 enum vgetstate	vget_prep(struct vnode *vp);
 int	vget_finish(struct vnode *vp, int flags, enum vgetstate vs);
 void	vgone(struct vnode *vp);
 void	vhold(struct vnode *);
 void	vholdl(struct vnode *);
 void	vholdnz(struct vnode *);
 bool	vhold_smr(struct vnode *);
 void	vinactive(struct vnode *vp);
 int	vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo);
 int	vtruncbuf(struct vnode *vp, off_t length, int blksize);
@ -974,6 +979,16 @@ int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp);
 	SYSINIT(vfs_vector_##vnodeops##_f, SI_SUB_VFS, SI_ORDER_ANY, \
 	    vfs_vector_op_register, &vnodeops)
 #define VFS_SMR_DECLARE				\
 	extern smr_t vfs_smr
 #define VFS_SMR()	vfs_smr
 #define vfs_smr_enter()	smr_enter(VFS_SMR())
 #define vfs_smr_exit()	smr_exit(VFS_SMR())
 #define VFS_SMR_ASSERT_ENTERED()	SMR_ASSERT_ENTERED(VFS_SMR())
 #define VFS_SMR_ASSERT_NOT_ENTERED()	SMR_ASSERT_NOT_ENTERED(VFS_SMR())
 #define VFS_SMR_ZONE_SET(zone)	uma_zone_set_smr((zone), VFS_SMR())
 #endif /* _KERNEL */
 #endif /* !_SYS_VNODE_H_ */