vfs: protect vnodes with smr
vget_prep_smr and vhold_smr can be used to ref a vnode while within a
vfs_smr section, allowing consumers to get away without locking.

See vhold_smr and vdropl for comments explaining caveats.

Reviewed by:	kib
Tested by:	pho
Differential Revision:	https://reviews.freebsd.org/D23913
commit 4074673a81 (parent f6cf06718b)
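As a usage sketch: a consumer enters the vfs_smr section, attempts the
reference, and leaves the section before doing anything that can sleep.
The fragment below only assumes the interfaces added by this diff
(vfs_smr_enter/vfs_smr_exit, vget_prep_smr, vget_finish); smr_lookup()
is a hypothetical stand-in for whatever lockless lookup produced the
vnode pointer:

	struct vnode *vp;
	enum vgetstate vs;
	int error;

	vfs_smr_enter();
	vp = smr_lookup(key);		/* hypothetical SMR-safe lookup */
	vs = vget_prep_smr(vp);		/* may return VGET_NONE */
	vfs_smr_exit();
	if (vs == VGET_NONE)
		return (EAGAIN);	/* vnode was being freed; redo lookup */
	error = vget_finish(vp, LK_SHARED, vs);

Note vget_prep_smr() must run inside the section (it asserts as much),
while vget_finish() locks the vnode and may sleep, so it has to run after
vfs_smr_exit().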
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -76,6 +76,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
+#include <sys/smr.h>
 #include <sys/smp.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
@@ -238,6 +239,8 @@ static uma_zone_t buf_trie_zone;
 static uma_zone_t vnode_zone;
 static uma_zone_t vnodepoll_zone;
 
+__read_frequently smr_t vfs_smr;
+
 /*
  * The workitem queue.
  *
@@ -661,7 +664,8 @@ vntblinit(void *dummy __unused)
 	vnode_list_reclaim_marker = vn_alloc_marker(NULL);
 	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
-	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
+	    vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR);
+	vfs_smr = uma_zone_get_smr(vnode_zone);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	/*
@@ -1603,7 +1607,7 @@ vn_alloc_hard(struct mount *mp)
 	if (vnlru_under(rnumvnodes, vlowat))
 		vnlru_kick();
 	mtx_unlock(&vnode_list_mtx);
-	return (uma_zalloc(vnode_zone, M_WAITOK));
+	return (uma_zalloc_smr(vnode_zone, M_WAITOK));
 }
 
 static struct vnode *
@@ -1619,7 +1623,7 @@ vn_alloc(struct mount *mp)
 		return (vn_alloc_hard(mp));
 	}
 
-	return (uma_zalloc(vnode_zone, M_WAITOK));
+	return (uma_zalloc_smr(vnode_zone, M_WAITOK));
 }
 
 static void
@@ -1627,7 +1631,7 @@ vn_free(struct vnode *vp)
 {
 
 	atomic_subtract_long(&numvnodes, 1);
-	uma_zfree(vnode_zone, vp);
+	uma_zfree_smr(vnode_zone, vp);
 }
 
 /*
@@ -1758,7 +1762,7 @@ freevnode(struct vnode *vp)
 	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
 	bo = &vp->v_bufobj;
 	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
-	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
+	VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp);
 	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
 	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
 	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
@@ -2848,7 +2852,29 @@ v_decr_devcount(struct vnode *vp)
  *
  * holdcnt can be manipulated using atomics without holding any locks,
  * except when transitioning 1<->0, in which case the interlock is held.
+ *
+ * Consumers which don't guarantee liveness of the vnode can use SMR to
+ * try to get a reference. Note this operation can fail since the vnode
+ * may be awaiting getting freed by the time they get to it.
  */
+enum vgetstate
+vget_prep_smr(struct vnode *vp)
+{
+	enum vgetstate vs;
+
+	VFS_SMR_ASSERT_ENTERED();
+
+	if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
+		vs = VGET_USECOUNT;
+	} else {
+		if (vhold_smr(vp))
+			vs = VGET_HOLDCNT;
+		else
+			vs = VGET_NONE;
+	}
+	return (vs);
+}
+
 enum vgetstate
 vget_prep(struct vnode *vp)
 {
@@ -2919,6 +2945,7 @@ vget_finish(struct vnode *vp, int flags, enum vgetstate vs)
 		ASSERT_VI_LOCKED(vp, __func__);
 	else
 		ASSERT_VI_UNLOCKED(vp, __func__);
+	VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
 	VNPASS(vp->v_holdcnt > 0, vp);
 	VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
 
@@ -3380,7 +3407,8 @@ vhold(struct vnode *vp)
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
-	VNASSERT(old >= 0, vp, ("%s: wrong hold count %d", __func__, old));
+	VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
+	    ("%s: wrong hold count %d", __func__, old));
 	if (old != 0)
 		return;
 	critical_enter();
@@ -3405,12 +3433,40 @@ vholdnz(struct vnode *vp)
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 #ifdef INVARIANTS
 	int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
-	VNASSERT(old > 0, vp, ("%s: wrong hold count %d", __func__, old));
+	VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
+	    ("%s: wrong hold count %d", __func__, old));
 #else
 	atomic_add_int(&vp->v_holdcnt, 1);
 #endif
 }
 
+/*
+ * Grab a hold count as long as the vnode is not getting freed.
+ *
+ * Only use this routine if vfs smr is the only protection you have against
+ * freeing the vnode.
+ */
+bool
+vhold_smr(struct vnode *vp)
+{
+	int count;
+
+	VFS_SMR_ASSERT_ENTERED();
+
+	count = atomic_load_int(&vp->v_holdcnt);
+	for (;;) {
+		if (count & VHOLD_NO_SMR) {
+			VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
+			    ("non-zero hold count with flags %d\n", count));
+			return (false);
+		}
+
+		VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
+		if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1))
+			return (true);
+	}
+}
+
 static void __noinline
 vdbatch_process(struct vdbatch *vd)
 {
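The loop above is the usual "increment unless a sentinel bit is set"
pattern: atomic_fcmpset_int() reloads the observed value on failure, so
VHOLD_NO_SMR is rechecked on every retry. A minimal standalone C11 model
of the acquire side, assuming nothing beyond <stdatomic.h> (the flag
value mirrors the kernel, the function itself is illustrative only):

#include <stdatomic.h>
#include <stdbool.h>

#define	VHOLD_NO_SMR	(1 << 29)	/* mirrors the kernel flag */

/* Model of vhold_smr(): take a ref unless the object is being freed. */
static bool
hold_smr(_Atomic int *holdcnt)
{
	int count;

	count = atomic_load(holdcnt);
	for (;;) {
		if (count & VHOLD_NO_SMR)
			return (false);	/* the freeing side already won */
		/* On failure this reloads count, so the flag is rechecked. */
		if (atomic_compare_exchange_weak(holdcnt, &count, count + 1))
			return (true);
	}
}

Once the freeing side has set the flag, this acquire can never succeed
again, which is what makes the freevnode() call below safe.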
@@ -3581,11 +3637,25 @@ vdropl(struct vnode *vp)
 		VI_UNLOCK(vp);
 		return;
 	}
-	if (VN_IS_DOOMED(vp)) {
-		freevnode(vp);
+	if (!VN_IS_DOOMED(vp)) {
+		vdrop_deactivate(vp);
 		return;
 	}
-	vdrop_deactivate(vp);
+	/*
+	 * We may be racing against vhold_smr.
+	 *
+	 * If they win we can just pretend we never got this far, they will
+	 * vdrop later.
+	 */
+	if (!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR)) {
+		/*
+		 * We lost the aforementioned race. Note that any subsequent
+		 * access is invalid as they might have managed to vdropl on
+		 * their own.
+		 */
+		return;
+	}
+	freevnode(vp);
 }
 
 /*
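This is the other half of the handshake: the freeing side only proceeds
if it can move the count from 0 to VHOLD_NO_SMR in a single atomic step.
Continuing the illustrative C11 model from above (try_finish_free() is
invented for the sketch):

/*
 * Model of the tail of vdropl(): claim the right to free the object by
 * flipping the count 0 -> VHOLD_NO_SMR atomically. Returns true if the
 * caller may free, false if a concurrent hold_smr() revived the object
 * (that thread then owns the eventual drop).
 */
static bool
try_finish_free(_Atomic int *holdcnt)
{
	int zero = 0;

	return (atomic_compare_exchange_strong(holdcnt, &zero, VHOLD_NO_SMR));
}

If the cmpset fails, a concurrent vhold_smr() already took a reference
and its eventual vdrop comes back through this path; if it succeeds, no
new SMR reference can ever be taken, and uma_zfree_smr() keeps the memory
from being reused until in-flight readers have left their sections.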
@@ -4041,20 +4111,25 @@ static const char * const typename[] =
 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
     "VMARKER"};
 
+_Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0,
+    "new hold count flag not added to vn_printf");
+
 void
 vn_printf(struct vnode *vp, const char *fmt, ...)
 {
 	va_list ap;
 	char buf[256], buf2[16];
 	u_long flags;
+	u_int holdcnt;
 
 	va_start(ap, fmt);
 	vprintf(fmt, ap);
 	va_end(ap);
 	printf("%p: ", (void *)vp);
 	printf("type %s\n", typename[vp->v_type]);
+	holdcnt = atomic_load_int(&vp->v_holdcnt);
 	printf("    usecount %d, writecount %d, refcount %d",
-	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
+	    vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS);
 	switch (vp->v_type) {
 	case VDIR:
 		printf(" mountedhere %p\n", vp->v_mountedhere);
@@ -4072,6 +4147,12 @@ vn_printf(struct vnode *vp, const char *fmt, ...)
 		printf("\n");
 		break;
 	}
+	buf[0] = '\0';
+	buf[1] = '\0';
+	if (holdcnt & VHOLD_NO_SMR)
+		strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf));
+	printf("    hold count flags (%s)\n", buf + 1);
+
 	buf[0] = '\0';
 	buf[1] = '\0';
 	if (vp->v_irflag & VIRF_DOOMED)
|
@ -58,7 +58,7 @@
|
|||||||
enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD,
|
enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD,
|
||||||
VMARKER };
|
VMARKER };
|
||||||
|
|
||||||
enum vgetstate { VGET_HOLDCNT, VGET_USECOUNT };
|
enum vgetstate { VGET_NONE, VGET_HOLDCNT, VGET_USECOUNT };
|
||||||
/*
|
/*
|
||||||
* Each underlying filesystem allocates its own private area and hangs
|
* Each underlying filesystem allocates its own private area and hangs
|
||||||
* it from v_data. If non-null, this area is freed in getnewvnode().
|
* it from v_data. If non-null, this area is freed in getnewvnode().
|
||||||
@@ -236,6 +236,9 @@ struct xvnode {
  * VIRF_DOOMED is doubly protected by the interlock and vnode lock. Both
  * are required for writing but the status may be checked with either.
  */
+#define	VHOLD_NO_SMR	(1<<29)	/* Disable vhold_smr */
+#define	VHOLD_ALL_FLAGS	(VHOLD_NO_SMR)
+
 #define	VIRF_DOOMED	0x0001	/* This vnode is being recycled */
 
 #define	VI_TEXT_REF	0x0001	/* Text ref grabbed use ref */
@@ -657,12 +660,14 @@ void	vdrop(struct vnode *);
 void	vdropl(struct vnode *);
 int	vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
 int	vget(struct vnode *vp, int flags, struct thread *td);
+enum vgetstate	vget_prep_smr(struct vnode *vp);
 enum vgetstate	vget_prep(struct vnode *vp);
 int	vget_finish(struct vnode *vp, int flags, enum vgetstate vs);
 void	vgone(struct vnode *vp);
 void	vhold(struct vnode *);
 void	vholdl(struct vnode *);
 void	vholdnz(struct vnode *);
+bool	vhold_smr(struct vnode *);
 void	vinactive(struct vnode *vp);
 int	vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo);
 int	vtruncbuf(struct vnode *vp, off_t length, int blksize);
@@ -974,6 +979,16 @@ int	vn_dir_check_exec(struct vnode *vp, struct componentname *cnp);
 	SYSINIT(vfs_vector_##vnodeops##_f, SI_SUB_VFS, SI_ORDER_ANY, \
 	    vfs_vector_op_register, &vnodeops)
 
+#define	VFS_SMR_DECLARE				\
+	extern smr_t vfs_smr
+
+#define	VFS_SMR()	vfs_smr
+#define	vfs_smr_enter()	smr_enter(VFS_SMR())
+#define	vfs_smr_exit()	smr_exit(VFS_SMR())
+#define	VFS_SMR_ASSERT_ENTERED()	SMR_ASSERT_ENTERED(VFS_SMR())
+#define	VFS_SMR_ASSERT_NOT_ENTERED()	SMR_ASSERT_NOT_ENTERED(VFS_SMR())
+#define	VFS_SMR_ZONE_SET(zone)	uma_zone_set_smr((zone), VFS_SMR())
+
 #endif /* _KERNEL */
 
 #endif /* !_SYS_VNODE_H_ */
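These macros exist so other VFS allocations can share the same SMR
domain. A hypothetical sketch of a filesystem wiring its own node zone
into vfs_smr (the zone name, struct, and init function are invented;
VFS_SMR_ZONE_SET and the UMA calls come from this change):

VFS_SMR_DECLARE;

static uma_zone_t myfs_node_zone;	/* hypothetical per-fs node zone */

static void
myfs_init(void)
{
	myfs_node_zone = uma_zcreate("myfs_node", sizeof(struct myfs_node),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/* Share vfs_smr so one grace period covers vnodes and fs nodes. */
	VFS_SMR_ZONE_SET(myfs_node_zone);
}

Allocations and frees from such a zone must then go through
uma_zalloc_smr()/uma_zfree_smr(), matching what this commit does for
vnode_zone.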