vfs: rework vnode list management

The current notion of an active vnode is eliminated.

Vnodes transition between hold counts of 0 and 1 all the time, and the
associated requeueing between different lists induces significant
scalability problems in certain workloads.

Introduce a global list containing all allocated vnodes. They are
unlinked from it only when UMA reclaims their memory, and are requeued
only when the hold count drops to 0.
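
A minimal user-space sketch of the scheme follows. It is simplified and
single-threaded: the *_sketch names, the lack of interlocks and of the
vnode_list mutex are illustrative assumptions, not the committed code.

	/*
	 * Sketch only: one global tail queue holds every vnode; a
	 * persistent marker records where the free scan left off.
	 */
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/queue.h>

	struct vnode_sketch {
		int holdcnt;
		int is_marker;
		TAILQ_ENTRY(vnode_sketch) v_vnodelist;
	};

	static TAILQ_HEAD(, vnode_sketch) vnode_list =
	    TAILQ_HEAD_INITIALIZER(vnode_list);
	static struct vnode_sketch free_marker = { .is_marker = 1 };

	/* On the 1->0 transition, requeue to the tail; no list change. */
	static void
	vdrop_sketch(struct vnode_sketch *vp)
	{
		if (--vp->holdcnt > 0)
			return;
		TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
		TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
	}

	/* Scan forward from the marker, reclaim up to count free vnodes. */
	static int
	vnlru_free_sketch(int count)
	{
		struct vnode_sketch *vp;
		int done = 0;

		vp = &free_marker;
		while (done < count) {
			vp = TAILQ_NEXT(vp, v_vnodelist);
			if (vp == NULL) {
				/* End of list: park the marker at the tail. */
				TAILQ_REMOVE(&vnode_list, &free_marker, v_vnodelist);
				TAILQ_INSERT_TAIL(&vnode_list, &free_marker, v_vnodelist);
				break;
			}
			if (vp->is_marker || vp->holdcnt > 0)
				continue;
			/* Remember the position: move the marker past the victim. */
			TAILQ_REMOVE(&vnode_list, &free_marker, v_vnodelist);
			TAILQ_INSERT_AFTER(&vnode_list, vp, &free_marker, v_vnodelist);
			TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
			free(vp);
			done++;
			vp = &free_marker;
		}
		return (done);
	}

	int
	main(void)
	{
		struct vnode_sketch *vp;
		int i;

		TAILQ_INSERT_HEAD(&vnode_list, &free_marker, v_vnodelist);
		for (i = 0; i < 4; i++) {
			vp = calloc(1, sizeof(*vp));
			vp->holdcnt = 1;
			TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
			vdrop_sketch(vp);	/* hold count hits 0: requeue */
		}
		printf("reclaimed %d vnodes\n", vnlru_free_sketch(2));
		return (0);
	}

Requeueing to the tail on the 0 transition keeps reclaim candidates
grouped behind the marker, so the scan resumes where it stopped instead
of rewinding past vnodes it already rejected.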

Sample result from an incremental make -s -j 104 bzImage on tmpfs:
stock:   118.55s user 3649.73s system 7479% cpu 50.382 total
patched: 122.38s user 1780.45s system 6242% cpu 30.480 total

Reviewed by:	jeff
Tested by:	pho (in a larger patch, previous version)
Differential Revision:	https://reviews.freebsd.org/D22997
Author:	Mateusz Guzik
Date:	2020-01-13 02:37:25 +00:00
Commit:	cc3593fbd9 (parent: 80663cadb8)
Notes:	svn2git 2020-12-20 02:59:44 +00:00; svn path=/head/; revision=356672
9 changed files with 88 additions and 450 deletions

sys/fs/msdosfs/msdosfs_vfsops.c

@@ -792,7 +792,7 @@ msdosfs_unmount(struct mount *mp, int mntflags)
 		vn_printf(vp,
 		    "msdosfs_umount(): just before calling VOP_CLOSE()\n");
 		printf("freef %p, freeb %p, mount %p\n",
-		    TAILQ_NEXT(vp, v_actfreelist), vp->v_actfreelist.tqe_prev,
+		    TAILQ_NEXT(vp, v_vnodelist), vp->v_vnodelist.tqe_prev,
 		    vp->v_mount);
 		printf("cleanblkhd %p, dirtyblkhd %p, numoutput %ld, type %d\n",
 		    TAILQ_FIRST(&vp->v_bufobj.bo_clean.bv_hd),

sys/fs/nfsserver/nfs_nfsdport.c

@@ -3317,7 +3317,6 @@ nfsd_mntinit(void)
 	inited = 1;
 	nfsv4root_mnt.mnt_flag = (MNT_RDONLY | MNT_EXPORTED);
 	TAILQ_INIT(&nfsv4root_mnt.mnt_nvnodelist);
-	TAILQ_INIT(&nfsv4root_mnt.mnt_activevnodelist);
 	TAILQ_INIT(&nfsv4root_mnt.mnt_lazyvnodelist);
 	nfsv4root_mnt.mnt_export = NULL;
 	TAILQ_INIT(&nfsv4root_opt);
@@ -3325,7 +3324,6 @@ nfsd_mntinit(void)
 	nfsv4root_mnt.mnt_opt = &nfsv4root_opt;
 	nfsv4root_mnt.mnt_optnew = &nfsv4root_newopt;
 	nfsv4root_mnt.mnt_nvnodelistsize = 0;
-	nfsv4root_mnt.mnt_activevnodelistsize = 0;
 	nfsv4root_mnt.mnt_lazyvnodelistsize = 0;
 }

sys/fs/tmpfs/tmpfs_vfsops.c

@@ -101,7 +101,7 @@ static const char *tmpfs_updateopts[] = {
 /*
  * Handle updates of time from writes to mmaped regions. Use
- * MNT_VNODE_FOREACH_ALL instead of MNT_VNODE_FOREACH_ACTIVE, since
+ * MNT_VNODE_FOREACH_ALL instead of MNT_VNODE_FOREACH_LAZY, since
  * unmap of the tmpfs-backed vnode does not call vinactive(), due to
  * vm object type is OBJT_SWAP.
  * If lazy, only handle delayed update of mtime due to the writes to

sys/fs/unionfs/union_vnops.c

@@ -1706,7 +1706,7 @@ unionfs_getwritemount(struct vop_getwritemount_args *ap)
 		error = VOP_GETWRITEMOUNT(uvp, ap->a_mpp);
 	else {
 		VI_LOCK(vp);
-		if (vp->v_iflag & VI_FREE)
+		if (vp->v_holdcnt == 0)
 			error = EOPNOTSUPP;
 		else
 			error = EACCES;

sys/kern/vfs_mount.c

@@ -502,10 +502,6 @@ vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
 	    __rangeof(struct mount, mnt_startzero, mnt_endzero));
 	TAILQ_INIT(&mp->mnt_nvnodelist);
 	mp->mnt_nvnodelistsize = 0;
-	TAILQ_INIT(&mp->mnt_activevnodelist);
-	mp->mnt_activevnodelistsize = 0;
-	TAILQ_INIT(&mp->mnt_tmpfreevnodelist);
-	mp->mnt_tmpfreevnodelistsize = 0;
 	TAILQ_INIT(&mp->mnt_lazyvnodelist);
 	mp->mnt_lazyvnodelistsize = 0;
 	if (mp->mnt_ref != 0 || mp->mnt_lockref != 0 ||
@@ -573,10 +569,6 @@ vfs_mount_destroy(struct mount *mp)
 	KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers"));
 	if (mp->mnt_nvnodelistsize != 0)
 		panic("vfs_mount_destroy: nonzero nvnodelistsize");
-	if (mp->mnt_activevnodelistsize != 0)
-		panic("vfs_mount_destroy: nonzero activevnodelistsize");
-	if (mp->mnt_tmpfreevnodelistsize != 0)
-		panic("vfs_mount_destroy: nonzero tmpfreevnodelistsize");
 	if (mp->mnt_lazyvnodelistsize != 0)
 		panic("vfs_mount_destroy: nonzero lazyvnodelistsize");
 	if (mp->mnt_lockref != 0)

sys/kern/vfs_subr.c

@@ -114,7 +114,6 @@ static void vfs_knllock(void *arg);
 static void vfs_knlunlock(void *arg);
 static void vfs_knl_assert_locked(void *arg);
 static void vfs_knl_assert_unlocked(void *arg);
-static void vnlru_return_batches(struct vfsops *mnt_op);
 static void destroy_vpollinfo(struct vpollinfo *vi);
 static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
     daddr_t startlbn, daddr_t endlbn);
@@ -149,10 +148,6 @@ static counter_u64_t vnodes_created;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
     "Number of vnodes created by getnewvnode");
 
-static u_long mnt_free_list_batch = 128;
-SYSCTL_ULONG(_vfs, OID_AUTO, mnt_free_list_batch, CTLFLAG_RW,
-    &mnt_free_list_batch, 0, "Limit of vnodes held on mnt's free list");
-
 /*
  * Conversion tables for conversion from vnode types to inode formats
  * and back.
@@ -167,9 +162,10 @@ int vttoif_tab[10] = {
 };
 
 /*
- * List of vnodes that are ready for recycling.
+ * List of allocated vnodes in the system.
  */
-static TAILQ_HEAD(freelst, vnode) vnode_free_list;
+static TAILQ_HEAD(freelst, vnode) vnode_list;
+static struct vnode *vnode_list_free_marker;
 
 /*
  * "Free" vnode target.  Free vnodes are rarely completely free, but are
@@ -195,7 +191,7 @@ static TAILQ_HEAD(freelst, vnode) vnode_free_list;
  * whenever vnlru_proc() becomes active.
  */
 static u_long wantfreevnodes;
-static u_long freevnodes;
+static u_long __exclusive_cache_line freevnodes;
 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
     &freevnodes, 0, "Number of \"free\" vnodes");
@@ -225,11 +221,11 @@ static struct mtx mntid_mtx;
 /*
  * Lock for any access to the following:
- *	vnode_free_list
+ *	vnode_list
  *	numvnodes
  *	freevnodes
  */
-static struct mtx __exclusive_cache_line vnode_free_list_mtx;
+static struct mtx __exclusive_cache_line vnode_list_mtx;
 
 /* Publicly exported FS */
 struct nfs_public nfs_pub;
@@ -332,11 +328,11 @@ sysctl_maxvnodes(SYSCTL_HANDLER_ARGS)
 	if (val == desiredvnodes)
 		return (0);
-	mtx_lock(&vnode_free_list_mtx);
+	mtx_lock(&vnode_list_mtx);
 	desiredvnodes = val;
 	wantfreevnodes = desiredvnodes / 4;
 	vnlru_recalc();
-	mtx_unlock(&vnode_free_list_mtx);
+	mtx_unlock(&vnode_list_mtx);
 	/*
 	 * XXX There is no protection against multiple threads changing
 	 * desiredvnodes at the same time. Locking above only helps vnlru and
@@ -364,10 +360,10 @@ sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS)
 	if (val == wantfreevnodes)
 		return (0);
-	mtx_lock(&vnode_free_list_mtx);
+	mtx_lock(&vnode_list_mtx);
 	wantfreevnodes = val;
 	vnlru_recalc();
-	mtx_unlock(&vnode_free_list_mtx);
+	mtx_unlock(&vnode_list_mtx);
 	return (0);
 }
@@ -555,6 +551,10 @@ vnode_init(void *mem, int size, int flags)
 	/*
 	 * Initialize rangelocks.
 	 */
 	rangelock_init(&vp->v_rl);
+
+	mtx_lock(&vnode_list_mtx);
+	TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
+	mtx_unlock(&vnode_list_mtx);
 	return (0);
 }
@@ -568,6 +568,9 @@ vnode_fini(void *mem, int size)
 	struct bufobj *bo;
 
 	vp = mem;
+	mtx_lock(&vnode_list_mtx);
+	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
+	mtx_unlock(&vnode_list_mtx);
 	rangelock_destroy(&vp->v_rl);
 	lockdestroy(vp->v_vnlock);
 	mtx_destroy(&vp->v_interlock);
@@ -626,14 +629,16 @@ vntblinit(void *dummy __unused)
 	}
 	wantfreevnodes = desiredvnodes / 4;
 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
-	TAILQ_INIT(&vnode_free_list);
-	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
+	TAILQ_INIT(&vnode_list);
+	mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF);
 	/*
 	 * The lock is taken to appease WITNESS.
 	 */
-	mtx_lock(&vnode_free_list_mtx);
+	mtx_lock(&vnode_list_mtx);
 	vnlru_recalc();
-	mtx_unlock(&vnode_free_list_mtx);
+	mtx_unlock(&vnode_list_mtx);
+	vnode_list_free_marker = vn_alloc_marker(NULL);
+	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist);
 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
 	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
@@ -1102,7 +1107,7 @@ vlrureclaim(struct mount *mp, bool reclaim_nc_src, int trigger)
 		 */
 		if (vp->v_usecount ||
 		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
-		    ((vp->v_iflag & VI_FREE) != 0) ||
+		    vp->v_holdcnt == 0 ||
 		    VN_IS_DOOMED(vp) || (vp->v_object != NULL &&
 		    vp->v_object->resident_page_count > trigger)) {
 			VI_UNLOCK(vp);
@@ -1171,37 +1176,24 @@ SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
 static void
 vnlru_free_locked(int count, struct vfsops *mnt_op)
 {
-	struct vnode *vp;
+	struct vnode *vp, *mvp;
 	struct mount *mp;
-	bool tried_batches;
 
-	tried_batches = false;
-	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
+	mtx_assert(&vnode_list_mtx, MA_OWNED);
 	if (count > max_vnlru_free)
 		count = max_vnlru_free;
-	for (; count > 0; count--) {
-		vp = TAILQ_FIRST(&vnode_free_list);
-		/*
-		 * The list can be modified while the free_list_mtx
-		 * has been dropped and vp could be NULL here.
-		 */
-		if (vp == NULL) {
-			if (tried_batches)
-				break;
-			mtx_unlock(&vnode_free_list_mtx);
-			vnlru_return_batches(mnt_op);
-			tried_batches = true;
-			mtx_lock(&vnode_free_list_mtx);
-			continue;
+	mvp = vnode_list_free_marker;
+restart:
+	vp = mvp;
+	while (count > 0) {
+		vp = TAILQ_NEXT(vp, v_vnodelist);
+		if (__predict_false(vp == NULL)) {
+			TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
+			TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist);
+			break;
 		}
-		VNASSERT(vp->v_op != NULL, vp,
-		    ("vnlru_free: vnode already reclaimed."));
-		KASSERT((vp->v_iflag & VI_FREE) != 0,
-		    ("Removing vnode not on freelist"));
-		KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
-		    ("Mangling active vnode"));
-		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
+		if (__predict_false(vp->v_type == VMARKER))
+			continue;
+
 		/*
 		 * Don't recycle if our vnode is from different type
@@ -1211,37 +1203,24 @@ vnlru_free_locked(int count, struct vfsops *mnt_op)
 		 * Don't recycle if we can't get the interlock without
 		 * blocking.
 		 */
-		if ((mnt_op != NULL && (mp = vp->v_mount) != NULL &&
+		if (vp->v_holdcnt > 0 || (mnt_op != NULL && (mp = vp->v_mount) != NULL &&
 		    mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) {
-			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
 			continue;
 		}
-		VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
-		    vp, ("vp inconsistent on freelist"));
-
-		/*
-		 * The clear of VI_FREE prevents activation of the
-		 * vnode.  There is no sense in putting the vnode on
-		 * the mount point active list, only to remove it
-		 * later during recycling.  Inline the relevant part
-		 * of vholdl(), to avoid triggering assertions or
-		 * activating.
-		 */
-		freevnodes--;
-		vp->v_iflag &= ~VI_FREE;
-		VNODE_REFCOUNT_FENCE_REL();
-		refcount_acquire(&vp->v_holdcnt);
-		mtx_unlock(&vnode_free_list_mtx);
+		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
+		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
+		if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
+			VI_UNLOCK(vp);
+			continue;
+		}
+		vholdl(vp);
+		count--;
+		mtx_unlock(&vnode_list_mtx);
 		VI_UNLOCK(vp);
 		vtryrecycle(vp);
-		/*
-		 * If the recycled succeeded this vdrop will actually free
-		 * the vnode.  If not it will simply place it back on
-		 * the free list.
-		 */
 		vdrop(vp);
-		mtx_lock(&vnode_free_list_mtx);
+		mtx_lock(&vnode_list_mtx);
+		goto restart;
 	}
 }
@@ -1249,16 +1228,16 @@ void
 vnlru_free(int count, struct vfsops *mnt_op)
 {
 
-	mtx_lock(&vnode_free_list_mtx);
+	mtx_lock(&vnode_list_mtx);
 	vnlru_free_locked(count, mnt_op);
-	mtx_unlock(&vnode_free_list_mtx);
+	mtx_unlock(&vnode_list_mtx);
 }
 
 static void
 vnlru_recalc(void)
 {
 
-	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
+	mtx_assert(&vnode_list_mtx, MA_OWNED);
 	gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
 	vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
 	vlowat = vhiwat / 2;
@@ -1281,63 +1260,6 @@ vspace(void)
 	return (space);
 }
 
-static void
-vnlru_return_batch_locked(struct mount *mp)
-{
-	struct vnode *vp;
-
-	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
-
-	if (mp->mnt_tmpfreevnodelistsize == 0)
-		return;
-
-	TAILQ_FOREACH(vp, &mp->mnt_tmpfreevnodelist, v_actfreelist) {
-		VNASSERT((vp->v_mflag & VMP_TMPMNTFREELIST) != 0, vp,
-		    ("vnode without VMP_TMPMNTFREELIST on mnt_tmpfreevnodelist"));
-		vp->v_mflag &= ~VMP_TMPMNTFREELIST;
-	}
-	mtx_lock(&vnode_free_list_mtx);
-	TAILQ_CONCAT(&vnode_free_list, &mp->mnt_tmpfreevnodelist, v_actfreelist);
-	freevnodes += mp->mnt_tmpfreevnodelistsize;
-	mtx_unlock(&vnode_free_list_mtx);
-	mp->mnt_tmpfreevnodelistsize = 0;
-}
-
-static void
-vnlru_return_batch(struct mount *mp)
-{
-
-	mtx_lock(&mp->mnt_listmtx);
-	vnlru_return_batch_locked(mp);
-	mtx_unlock(&mp->mnt_listmtx);
-}
-
-static void
-vnlru_return_batches(struct vfsops *mnt_op)
-{
-	struct mount *mp, *nmp;
-	bool need_unbusy;
-
-	mtx_lock(&mountlist_mtx);
-	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
-		need_unbusy = false;
-		if (mnt_op != NULL && mp->mnt_op != mnt_op)
-			goto next;
-		if (mp->mnt_tmpfreevnodelistsize == 0)
-			goto next;
-		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) == 0) {
-			vnlru_return_batch(mp);
-			need_unbusy = true;
-			mtx_lock(&mountlist_mtx);
-		}
-next:
-		nmp = TAILQ_NEXT(mp, mnt_list);
-		if (need_unbusy)
-			vfs_unbusy(mp);
-	}
-	mtx_unlock(&mountlist_mtx);
-}
-
 /*
  * Attempt to recycle vnodes in a context that is always safe to block.
  * Calling vlrurecycle() from the bowels of filesystem code has some
@@ -1361,7 +1283,7 @@ vnlru_proc(void)
 	force = 0;
 	for (;;) {
 		kproc_suspend_check(vnlruproc);
-		mtx_lock(&vnode_free_list_mtx);
+		mtx_lock(&vnode_list_mtx);
 		rnumvnodes = atomic_load_long(&numvnodes);
 		/*
 		 * If numvnodes is too large (due to desiredvnodes being
@@ -1385,11 +1307,11 @@ vnlru_proc(void)
 		if (vsp >= vlowat && force == 0) {
 			vnlruproc_sig = 0;
 			wakeup(&vnlruproc_sig);
-			msleep(vnlruproc, &vnode_free_list_mtx,
+			msleep(vnlruproc, &vnode_list_mtx,
 			    PVFS|PDROP, "vlruwt", hz);
 			continue;
 		}
-		mtx_unlock(&vnode_free_list_mtx);
+		mtx_unlock(&vnode_list_mtx);
 		done = 0;
 		rnumvnodes = atomic_load_long(&numvnodes);
 		rfreevnodes = atomic_load_long(&freevnodes);
@@ -1548,7 +1470,7 @@ static int
 vn_alloc_wait(int suspended)
 {
 
-	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
+	mtx_assert(&vnode_list_mtx, MA_OWNED);
 	if (numvnodes >= desiredvnodes) {
 		if (suspended) {
 			/*
@@ -1562,7 +1484,7 @@ vn_alloc_wait(int suspended)
 			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
 			wakeup(vnlruproc);
 		}
-		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
+		msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS,
 		    "vlruwk", hz);
 	}
 	/* Post-adjust like the pre-adjust in getnewvnode(). */
@@ -1577,7 +1499,7 @@ vn_alloc(struct mount *mp)
 	static int cyclecount;
 	int error __unused;
 
-	mtx_lock(&vnode_free_list_mtx);
+	mtx_lock(&vnode_list_mtx);
 	if (numvnodes < desiredvnodes)
 		cyclecount = 0;
 	else if (cyclecount++ >= freevnodes) {
@@ -1603,14 +1525,14 @@ vn_alloc(struct mount *mp)
 		    MNTK_SUSPEND));
 #if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
 		if (error != 0) {
-			mtx_unlock(&vnode_free_list_mtx);
+			mtx_unlock(&vnode_list_mtx);
 			return (error);
 		}
 #endif
 	}
 	vcheckspace();
 	atomic_add_long(&numvnodes, 1);
-	mtx_unlock(&vnode_free_list_mtx);
+	mtx_unlock(&vnode_list_mtx);
 	return (uma_zalloc(vnode_zone, M_WAITOK));
 }
@@ -1745,8 +1667,6 @@ freevnode(struct vnode *vp)
 	 */
 	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
 	bo = &vp->v_bufobj;
-	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
-	    ("cleaned vnode still on the free list."));
 	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
 	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
 	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
@@ -1800,16 +1720,6 @@ delmntque(struct vnode *vp)
 		return;
 	MNT_ILOCK(mp);
 	VI_LOCK(vp);
-	KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
-	    ("Active vnode list size %d > Vnode list size %d",
-	    mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
-	if (vp->v_iflag & VI_ACTIVE) {
-		vp->v_iflag &= ~VI_ACTIVE;
-		mtx_lock(&mp->mnt_listmtx);
-		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
-		mp->mnt_activevnodelistsize--;
-		mtx_unlock(&mp->mnt_listmtx);
-	}
 	if (vp->v_mflag & VMP_LAZYLIST) {
 		mtx_lock(&mp->mnt_listmtx);
 		if (vp->v_mflag & VMP_LAZYLIST) {
@@ -1879,13 +1789,6 @@ insmntque1(struct vnode *vp, struct mount *mp,
 	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
 	    ("neg mount point vnode list size"));
 	mp->mnt_nvnodelistsize++;
-	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
-	    ("Activating already active vnode"));
-	vp->v_iflag |= VI_ACTIVE;
-	mtx_lock(&mp->mnt_listmtx);
-	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
-	mp->mnt_activevnodelistsize++;
-	mtx_unlock(&mp->mnt_listmtx);
 	VI_UNLOCK(vp);
 	MNT_IUNLOCK(mp);
 	return (0);
@@ -3247,38 +3150,13 @@ vunref(struct vnode *vp)
 static void
 vhold_activate(struct vnode *vp)
 {
-	struct mount *mp;
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNASSERT(vp->v_holdcnt == 0, vp,
 	    ("%s: wrong hold count", __func__));
 	VNASSERT(vp->v_op != NULL, vp,
 	    ("%s: vnode already reclaimed.", __func__));
-	/*
-	 * Remove a vnode from the free list, mark it as in use,
-	 * and put it on the active list.
-	 */
-	VNASSERT(vp->v_mount != NULL, vp,
-	    ("_vhold: vnode not on per mount vnode list"));
-	mp = vp->v_mount;
-	mtx_lock(&mp->mnt_listmtx);
-	if ((vp->v_mflag & VMP_TMPMNTFREELIST) != 0) {
-		TAILQ_REMOVE(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist);
-		mp->mnt_tmpfreevnodelistsize--;
-		vp->v_mflag &= ~VMP_TMPMNTFREELIST;
-	} else {
-		mtx_lock(&vnode_free_list_mtx);
-		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
-		freevnodes--;
-		mtx_unlock(&vnode_free_list_mtx);
-	}
-	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
-	    ("Activating already active vnode"));
-	vp->v_iflag &= ~VI_FREE;
-	vp->v_iflag |= VI_ACTIVE;
-	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
-	mp->mnt_activevnodelistsize++;
-	mtx_unlock(&mp->mnt_listmtx);
+	atomic_subtract_long(&freevnodes, 1);
 	refcount_acquire(&vp->v_holdcnt);
 }
@@ -3288,12 +3166,8 @@ vhold(struct vnode *vp)
 	ASSERT_VI_UNLOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
-	if (refcount_acquire_if_not_zero(&vp->v_holdcnt)) {
-		VNODE_REFCOUNT_FENCE_ACQ();
-		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
-		    ("vhold: vnode with holdcnt is free"));
+	if (refcount_acquire_if_not_zero(&vp->v_holdcnt))
 		return;
-	}
 	VI_LOCK(vp);
 	vholdl(vp);
 	VI_UNLOCK(vp);
@@ -3305,7 +3179,7 @@ vholdl(struct vnode *vp)
 	ASSERT_VI_LOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
-	if ((vp->v_iflag & VI_FREE) == 0) {
+	if (vp->v_holdcnt > 0) {
 		refcount_acquire(&vp->v_holdcnt);
 		return;
 	}
@@ -3348,34 +3222,26 @@ vdrop_deactivate(struct vnode *vp)
 	    ("vdrop: returning doomed vnode"));
 	VNASSERT(vp->v_op != NULL, vp,
 	    ("vdrop: vnode already reclaimed."));
-	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
-	    ("vnode already free"));
-	VNASSERT(vp->v_holdcnt == 0, vp,
-	    ("vdrop: freeing when we shouldn't"));
 	VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
 	    ("vnode with VI_OWEINACT set"));
 	VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp,
 	    ("vnode with VI_DEFINACT set"));
+	VNASSERT(vp->v_holdcnt == 0, vp,
+	    ("vdrop: freeing when we shouldn't"));
-	mp = vp->v_mount;
-	mtx_lock(&mp->mnt_listmtx);
 	if (vp->v_mflag & VMP_LAZYLIST) {
+		mp = vp->v_mount;
+		mtx_lock(&mp->mnt_listmtx);
 		vp->v_mflag &= ~VMP_LAZYLIST;
 		TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
 		mp->mnt_lazyvnodelistsize--;
+		mtx_unlock(&mp->mnt_listmtx);
 	}
-	if (vp->v_iflag & VI_ACTIVE) {
-		vp->v_iflag &= ~VI_ACTIVE;
-		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
-		mp->mnt_activevnodelistsize--;
-	}
-	TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist);
-	mp->mnt_tmpfreevnodelistsize++;
-	vp->v_iflag |= VI_FREE;
-	vp->v_mflag |= VMP_TMPMNTFREELIST;
+	mtx_lock(&vnode_list_mtx);
+	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
+	TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
+	mtx_unlock(&vnode_list_mtx);
+	atomic_add_long(&freevnodes, 1);
 	VI_UNLOCK(vp);
-	if (mp->mnt_tmpfreevnodelistsize >= mnt_free_list_batch)
-		vnlru_return_batch_locked(mp);
-	mtx_unlock(&mp->mnt_listmtx);
 }
 
 void
@@ -3922,27 +3788,21 @@ vn_printf(struct vnode *vp, const char *fmt, ...)
 		strlcat(buf, "|VI_TEXT_REF", sizeof(buf));
 	if (vp->v_iflag & VI_MOUNT)
 		strlcat(buf, "|VI_MOUNT", sizeof(buf));
-	if (vp->v_iflag & VI_FREE)
-		strlcat(buf, "|VI_FREE", sizeof(buf));
-	if (vp->v_iflag & VI_ACTIVE)
-		strlcat(buf, "|VI_ACTIVE", sizeof(buf));
 	if (vp->v_iflag & VI_DOINGINACT)
 		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
 	if (vp->v_iflag & VI_OWEINACT)
 		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
 	if (vp->v_iflag & VI_DEFINACT)
 		strlcat(buf, "|VI_DEFINACT", sizeof(buf));
-	flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_FREE | VI_ACTIVE |
-	    VI_DOINGINACT | VI_OWEINACT | VI_DEFINACT);
+	flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_DOINGINACT |
+	    VI_OWEINACT | VI_DEFINACT);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
 	}
-	if (vp->v_mflag & VMP_TMPMNTFREELIST)
-		strlcat(buf, "|VMP_TMPMNTFREELIST", sizeof(buf));
 	if (vp->v_mflag & VMP_LAZYLIST)
 		strlcat(buf, "|VMP_LAZYLIST", sizeof(buf));
-	flags = vp->v_mflag & ~(VMP_TMPMNTFREELIST | VMP_LAZYLIST);
+	flags = vp->v_mflag & ~(VMP_LAZYLIST);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
@@ -4160,8 +4020,6 @@ DB_SHOW_COMMAND(mount, db_show_mount)
 	    vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref);
 	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
 	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
-	db_printf("    mnt_activevnodelistsize = %d\n",
-	    mp->mnt_activevnodelistsize);
 	db_printf("    mnt_lazyvnodelistsize = %d\n",
 	    mp->mnt_lazyvnodelistsize);
 	db_printf("    mnt_writeopcount = %d (with %d in the struct)\n",
@@ -4179,8 +4037,8 @@ DB_SHOW_COMMAND(mount, db_show_mount)
 	db_printf("    mnt_vfs_ops = %d\n", mp->mnt_vfs_ops);
 
 	db_printf("\n\nList of active vnodes\n");
-	TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
-		if (vp->v_type != VMARKER) {
+	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
+		if (vp->v_type != VMARKER && vp->v_holdcnt > 0) {
 			vn_printf(vp, "vnode ");
 			if (db_pager_quit)
 				break;
@@ -4188,7 +4046,7 @@ DB_SHOW_COMMAND(mount, db_show_mount)
 	}
 	db_printf("\n\nList of inactive vnodes\n");
 	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
-		if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
+		if (vp->v_type != VMARKER && vp->v_holdcnt == 0) {
 			vn_printf(vp, "vnode ");
 			if (db_pager_quit)
 				break;
@@ -4838,7 +4696,6 @@ sync_fsync(struct vop_fsync_args *ap)
 	 * The filesystem at hand may be idle with free vnodes stored in the
 	 * batch.  Return them instead of letting them stay there indefinitely.
 	 */
-	vnlru_return_batch(mp);
 	vfs_periodic(mp, MNT_NOWAIT);
 	error = VFS_SYNC(mp, MNT_LAZY);
 	curthread_pflags_restore(save);
@@ -6112,191 +5969,6 @@ __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
 	*mvp = NULL;
 }
 
-/*
- * These are helper functions for filesystems to traverse their
- * active vnodes.  See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
- */
-static void
-mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
-{
-
-	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
-
-	MNT_ILOCK(mp);
-	MNT_REL(mp);
-	MNT_IUNLOCK(mp);
-	vn_free_marker(*mvp);
-	*mvp = NULL;
-}
-
-/*
- * Relock the mp mount vnode list lock with the vp vnode interlock in the
- * conventional lock order during mnt_vnode_next_active iteration.
- *
- * On entry, the mount vnode list lock is held and the vnode interlock is not.
- * The list lock is dropped and reacquired.  On success, both locks are held.
- * On failure, the mount vnode list lock is held but the vnode interlock is
- * not, and the procedure may have yielded.
- */
-static bool
-mnt_vnode_next_active_relock(struct vnode *mvp, struct mount *mp,
-    struct vnode *vp)
-{
-	const struct vnode *tmp;
-	bool held, ret;
-
-	VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
-	    TAILQ_NEXT(mvp, v_actfreelist) != NULL, mvp,
-	    ("%s: bad marker", __func__));
-	VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
-	    ("%s: inappropriate vnode", __func__));
-	ASSERT_VI_UNLOCKED(vp, __func__);
-	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
-
-	ret = false;
-
-	TAILQ_REMOVE(&mp->mnt_activevnodelist, mvp, v_actfreelist);
-	TAILQ_INSERT_BEFORE(vp, mvp, v_actfreelist);
-
-	/*
-	 * Use a hold to prevent vp from disappearing while the mount vnode
-	 * list lock is dropped and reacquired.  Normally a hold would be
-	 * acquired with vhold(), but that might try to acquire the vnode
-	 * interlock, which would be a LOR with the mount vnode list lock.
-	 */
-	held = refcount_acquire_if_not_zero(&vp->v_holdcnt);
-	mtx_unlock(&mp->mnt_listmtx);
-	if (!held)
-		goto abort;
-	VI_LOCK(vp);
-	if (!refcount_release_if_not_last(&vp->v_holdcnt)) {
-		vdropl(vp);
-		goto abort;
-	}
-	mtx_lock(&mp->mnt_listmtx);
-
-	/*
-	 * Determine whether the vnode is still the next one after the marker,
-	 * excepting any other markers.  If the vnode has not been doomed by
-	 * vgone() then the hold should have ensured that it remained on the
-	 * active list.  If it has been doomed but is still on the active list,
-	 * don't abort, but rather skip over it (avoid spinning on doomed
-	 * vnodes).
-	 */
-	tmp = mvp;
-	do {
-		tmp = TAILQ_NEXT(tmp, v_actfreelist);
-	} while (tmp != NULL && tmp->v_type == VMARKER);
-	if (tmp != vp) {
-		mtx_unlock(&mp->mnt_listmtx);
-		VI_UNLOCK(vp);
-		goto abort;
-	}
-
-	ret = true;
-	goto out;
-abort:
-	maybe_yield();
-	mtx_lock(&mp->mnt_listmtx);
-out:
-	if (ret)
-		ASSERT_VI_LOCKED(vp, __func__);
-	else
-		ASSERT_VI_UNLOCKED(vp, __func__);
-	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
-	return (ret);
-}
-
-static struct vnode *
-mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
-{
-	struct vnode *vp, *nvp;
-
-	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
-	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
-restart:
-	vp = TAILQ_NEXT(*mvp, v_actfreelist);
-	while (vp != NULL) {
-		if (vp->v_type == VMARKER) {
-			vp = TAILQ_NEXT(vp, v_actfreelist);
-			continue;
-		}
-		/*
-		 * Try-lock because this is the wrong lock order.  If that does
-		 * not succeed, drop the mount vnode list lock and try to
-		 * reacquire it and the vnode interlock in the right order.
-		 */
-		if (!VI_TRYLOCK(vp) &&
-		    !mnt_vnode_next_active_relock(*mvp, mp, vp))
-			goto restart;
-		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
-		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
-		    ("alien vnode on the active list %p %p", vp, mp));
-		if (vp->v_mount == mp && !VN_IS_DOOMED(vp))
-			break;
-		nvp = TAILQ_NEXT(vp, v_actfreelist);
-		VI_UNLOCK(vp);
-		vp = nvp;
-	}
-	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
-
-	/* Check if we are done */
-	if (vp == NULL) {
-		mtx_unlock(&mp->mnt_listmtx);
-		mnt_vnode_markerfree_active(mvp, mp);
-		return (NULL);
-	}
-	TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
-	mtx_unlock(&mp->mnt_listmtx);
-	ASSERT_VI_LOCKED(vp, "active iter");
-	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
-	return (vp);
-}
-
-struct vnode *
-__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
-{
-
-	if (should_yield())
-		kern_yield(PRI_USER);
-	mtx_lock(&mp->mnt_listmtx);
-	return (mnt_vnode_next_active(mvp, mp));
-}
-
-struct vnode *
-__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
-{
-	struct vnode *vp;
-
-	*mvp = vn_alloc_marker(mp);
-	MNT_ILOCK(mp);
-	MNT_REF(mp);
-	MNT_IUNLOCK(mp);
-
-	mtx_lock(&mp->mnt_listmtx);
-	vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
-	if (vp == NULL) {
-		mtx_unlock(&mp->mnt_listmtx);
-		mnt_vnode_markerfree_active(mvp, mp);
-		return (NULL);
-	}
-	TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
-	return (mnt_vnode_next_active(mvp, mp));
-}
-
-void
-__mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
-{
-
-	if (*mvp == NULL)
-		return;
-
-	mtx_lock(&mp->mnt_listmtx);
-	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
-	mtx_unlock(&mp->mnt_listmtx);
-	mnt_vnode_markerfree_active(mvp, mp);
-}
-
 /*
  * These are helper functions for filesystems to traverse their
  * lazy vnodes.  See MNT_VNODE_FOREACH_LAZY() in sys/mount.h
@@ -6310,7 +5982,7 @@ mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
 	MNT_ILOCK(mp);
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
-	free(*mvp, M_VNODE_MARKER);
+	vn_free_marker(*mvp);
 	*mvp = NULL;
 }
@@ -6454,7 +6126,6 @@ mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
 	TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist);
 	mtx_unlock(&mp->mnt_listmtx);
 	ASSERT_VI_LOCKED(vp, "lazy iter");
-	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
 	return (vp);
 }
@@ -6475,12 +6146,10 @@ __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
 {
 	struct vnode *vp;
 
-	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
+	*mvp = vn_alloc_marker(mp);
 	MNT_ILOCK(mp);
 	MNT_REF(mp);
 	MNT_IUNLOCK(mp);
-	(*mvp)->v_type = VMARKER;
-	(*mvp)->v_mount = mp;
 
 	mtx_lock(&mp->mnt_listmtx);
 	vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist);
sys/sys/mount.h

@@ -219,10 +219,6 @@ struct mount {
#define	mnt_endzero	mnt_gjprovider
 	char		*mnt_gjprovider;	/* gjournal provider name */
 	struct mtx	mnt_listmtx;
-	struct vnodelst	mnt_activevnodelist;	/* (l) list of active vnodes */
-	int		mnt_activevnodelistsize;/* (l) # of active vnodes */
-	struct vnodelst	mnt_tmpfreevnodelist;	/* (l) list of free vnodes */
-	int		mnt_tmpfreevnodelistsize;/* (l) # of free vnodes */
 	struct vnodelst	mnt_lazyvnodelist;	/* (l) list of lazy vnodes */
 	int		mnt_lazyvnodelistsize;	/* (l) # of lazy vnodes */
 	struct lock	mnt_explock;		/* vfs_export walkers lock */
@@ -255,20 +251,6 @@ void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp);
 	mtx_assert(MNT_MTX(mp), MA_NOTOWNED);	\
} while (0)
 
-/*
- * Definitions for MNT_VNODE_FOREACH_ACTIVE.
- */
-struct vnode *__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp);
-struct vnode *__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp);
-void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *);
-
-#define MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) \
-	for (vp = __mnt_vnode_first_active(&(mvp), (mp)); \
-		(vp) != NULL; vp = __mnt_vnode_next_active(&(mvp), (mp)))
-
-#define MNT_VNODE_FOREACH_ACTIVE_ABORT(mp, mvp) \
-	__mnt_vnode_markerfree_active(&(mvp), (mp))
-
 /*
  * Definitions for MNT_VNODE_FOREACH_LAZY.
  */

sys/sys/vnode.h

@@ -147,7 +147,7 @@ struct vnode {
 	/*
 	 * The machinery of being a vnode
 	 */
-	TAILQ_ENTRY(vnode) v_actfreelist;	/* l vnode active/free lists */
+	TAILQ_ENTRY(vnode) v_vnodelist;		/* l vnode lists */
 	TAILQ_ENTRY(vnode) v_lazylist;		/* l vnode lazy list */
 	struct bufobj	v_bufobj;		/* * Buffer cache object */
@@ -239,8 +239,6 @@ struct xvnode {
#define	VI_TEXT_REF	0x0001	/* Text ref grabbed use ref */
#define	VI_MOUNT	0x0020	/* Mount in progress */
-#define	VI_FREE		0x0100	/* This vnode is on the freelist */
-#define	VI_ACTIVE	0x0200	/* This vnode is on the active list */
#define	VI_DOINGINACT	0x0800	/* VOP_INACTIVE is in progress */
#define	VI_OWEINACT	0x1000	/* Need to call inactive */
#define	VI_DEFINACT	0x2000	/* deferred inactive */
@@ -260,8 +258,7 @@ struct xvnode {
#define	VV_FORCEINSMQ	0x1000	/* force the insmntque to succeed */
#define	VV_READLINK	0x2000	/* fdescfs linux vnode */
 
-#define	VMP_TMPMNTFREELIST	0x0001	/* Vnode is on mnt's tmp free list */
-#define	VMP_LAZYLIST		0x0002	/* Vnode is on mnt's lazy list */
+#define	VMP_LAZYLIST	0x0001	/* Vnode is on mnt's lazy list */
 
 /*
  * Vnode attributes.  A field value of VNOVAL represents a field whose value

sys/ufs/ufs/ufs_quota.c

@@ -1083,7 +1083,7 @@ qsync(struct mount *mp)
 	 * synchronizing any modified dquot structures.
 	 */
again:
-	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
+	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		if (vp->v_type == VNON) {
 			VI_UNLOCK(vp);
 			continue;
@@ -1091,7 +1091,7 @@ qsync(struct mount *mp)
 		error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td);
 		if (error) {
 			if (error == ENOENT) {
-				MNT_VNODE_FOREACH_ACTIVE_ABORT(mp, mvp);
+				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 				goto again;
 			}
 			continue;