vfs: per-cpu batched requeuing of free vnodes

Constant requeuing of free vnodes on the global vnode list adds significant
lock contention in certain workloads. Lessen the problem by batching requeue
operations per CPU.

Per-cpu areas are locked in order to synchronize against UMA freeing memory:
vnode_fini() dequeues the vnode from its batch before the memory is released.

The vnode's v_mflag is converted to u_short so that the new v_dbatchcpu field
can be added without growing the struct.

Sample result from an incremental make -s -j 104 bzImage on tmpfs:
stock:   122.38s user 1780.45s system 6242% cpu 30.480 total
patched: 144.84s user 985.90s system 4856% cpu 23.282 total

Reviewed by:	jeff
Tested by:	pho (in a larger patch, previous version)
Differential Revision:	https://reviews.freebsd.org/D22998
Author:	Mateusz Guzik
Date:	2020-01-13 02:39:41 +00:00
Parent:	cc3593fbd9
Commit:	0c236d3d52
Notes:	svn2git 2020-12-20 02:59:44 +00:00
	svn path=/head/; revision=356673

2 changed files with 116 additions and 7 deletions
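
In outline, the change replaces a per-vnode acquisition of the global list
lock with a small per-CPU staging array that is flushed in a single critical
section once it fills. The sketch below is an editorial illustration of that
pattern only, not code from this commit: it is plain userspace C, pthread
mutexes stand in for mtx(9), a singly-linked list stands in for the vnode
LRU, it only appends to the tail instead of removing and re-inserting, and
every name in it (struct item, struct batch, batch_enqueue(), and so on) is
hypothetical.

#include <pthread.h>
#include <stddef.h>

#define	BATCH_SIZE	8

struct item {
	struct item *next;
};

/* Global list; without batching, every requeue takes this lock. */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *list_head, *list_tail;

/* One of these per CPU; lock must be set up with pthread_mutex_init(). */
struct batch {
	pthread_mutex_t lock;
	int index;
	struct item *tab[BATCH_SIZE];
};

/* Flush a full batch: one list_lock acquisition pays for BATCH_SIZE items. */
static void
batch_process(struct batch *bp)
{
	struct item *ip;
	int i;

	pthread_mutex_lock(&list_lock);
	for (i = 0; i < BATCH_SIZE; i++) {
		ip = bp->tab[i];
		bp->tab[i] = NULL;
		ip->next = NULL;
		if (list_tail != NULL)
			list_tail->next = ip;
		else
			list_head = ip;
		list_tail = ip;
	}
	bp->index = 0;
	pthread_mutex_unlock(&list_lock);
}

/* Defer an item into the caller's batch; flush only when it fills up. */
static void
batch_enqueue(struct batch *bp, struct item *ip)
{
	pthread_mutex_lock(&bp->lock);
	bp->tab[bp->index++] = ip;
	if (bp->index == BATCH_SIZE)
		batch_process(bp);
	pthread_mutex_unlock(&bp->lock);
}

In the kernel code below the per-CPU batch is likewise protected by its own
mutex, which is what allows vdbatch_dequeue() to pull a vnode out of another
CPU's batch when UMA frees it, without racing a concurrent flush.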


@@ -295,6 +295,16 @@ static int stat_rush_requests; /* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
    "Number of times I/O speeded up (rush requests)");

#define	VDBATCH_SIZE 8
struct vdbatch {
	u_int index;
	struct mtx lock;
	struct vnode *tab[VDBATCH_SIZE];
};
DPCPU_DEFINE_STATIC(struct vdbatch, vd);

static void	vdbatch_dequeue(struct vnode *vp);

/*
 * When shutting down the syncer, run it at four times normal speed.
 */
@@ -552,6 +562,8 @@ vnode_init(void *mem, int size, int flags)
	 */
	rangelock_init(&vp->v_rl);

	vp->v_dbatchcpu = NOCPU;
	mtx_lock(&vnode_list_mtx);
	TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
@@ -568,6 +580,7 @@ vnode_fini(void *mem, int size)
	struct bufobj *bo;

	vp = mem;
	vdbatch_dequeue(vp);
	mtx_lock(&vnode_list_mtx);
	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
@@ -602,8 +615,9 @@ vnode_fini(void *mem, int size)
static void
vntblinit(void *dummy __unused)
{
	struct vdbatch *vd;
	int cpu, physvnodes, virtvnodes;
	u_int i;
	int physvnodes, virtvnodes;

	/*
	 * Desiredvnodes is a function of the physical memory size and the
@@ -669,6 +683,12 @@ vntblinit(void *dummy __unused)
	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
		vnsz2log++;
	vnsz2log--;

	CPU_FOREACH(cpu) {
		vd = DPCPU_ID_PTR((cpu), vd);
		bzero(vd, sizeof(*vd));
		mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
	}
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
@@ -3199,6 +3219,98 @@ vholdnz(struct vnode *vp)
#endif
}
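
/*
 * Requeue a full batch of deferred vnodes: move each one to the tail of the
 * global vnode_list under a single vnode_list_mtx acquisition and clear its
 * v_dbatchcpu marker. Called with the per-CPU batch lock held.
 */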
static void __noinline
vdbatch_process(struct vdbatch *vd)
{
	struct vnode *vp;
	int i;

	mtx_assert(&vd->lock, MA_OWNED);
	MPASS(vd->index == VDBATCH_SIZE);

	mtx_lock(&vnode_list_mtx);
	for (i = 0; i < VDBATCH_SIZE; i++) {
		vp = vd->tab[i];
		TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
		TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
		MPASS(vp->v_dbatchcpu != NOCPU);
		vp->v_dbatchcpu = NOCPU;
	}
	bzero(vd->tab, sizeof(vd->tab));
	vd->index = 0;
	mtx_unlock(&vnode_list_mtx);
}
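
/*
 * Defer requeueing a free vnode: record it in the current CPU's batch and
 * take the global vnode_list_mtx (via vdbatch_process()) only once the
 * batch fills up. Called with the vnode interlock held; it is dropped here.
 */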
static void
vdbatch_enqueue(struct vnode *vp)
{
	struct vdbatch *vd;

	ASSERT_VI_LOCKED(vp, __func__);
	VNASSERT(!VN_IS_DOOMED(vp), vp,
	    ("%s: deferring requeue of a doomed vnode", __func__));

	if (vp->v_dbatchcpu != NOCPU) {
		VI_UNLOCK(vp);
		return;
	}

	/*
	 * A hack: pin us to the current CPU so that we know what to put in
	 * ->v_dbatchcpu.
	 */
	sched_pin();
	vd = DPCPU_PTR(vd);
	mtx_lock(&vd->lock);
	MPASS(vd->index < VDBATCH_SIZE);
	MPASS(vd->tab[vd->index] == NULL);
	vp->v_dbatchcpu = curcpu;
	vd->tab[vd->index] = vp;
	vd->index++;
	VI_UNLOCK(vp);
	if (vd->index == VDBATCH_SIZE)
		vdbatch_process(vd);
	mtx_unlock(&vd->lock);
	sched_unpin();
}

/*
 * This routine must only be called for vnodes which are about to be
 * deallocated. Supporting dequeue for arbitrary vnodes would require
 * validating that the locked batch matches.
 */
static void
vdbatch_dequeue(struct vnode *vp)
{
	struct vdbatch *vd;
	int i;
	short cpu;

	VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp,
	    ("%s: called for a used vnode\n", __func__));

	cpu = vp->v_dbatchcpu;
	if (cpu == NOCPU)
		return;

	vd = DPCPU_ID_PTR(cpu, vd);
	mtx_lock(&vd->lock);
	for (i = 0; i < vd->index; i++) {
		if (vd->tab[i] != vp)
			continue;
		vp->v_dbatchcpu = NOCPU;
		vd->index--;
		vd->tab[i] = vd->tab[vd->index];
		vd->tab[vd->index] = NULL;
		break;
	}
	mtx_unlock(&vd->lock);
	/*
	 * Either we dequeued the vnode above or the target CPU beat us to it.
	 */
	MPASS(vp->v_dbatchcpu == NOCPU);
}

/*
 * Drop the hold count of the vnode. If this is the last reference to
 * the vnode we place it on the free list unless it has been vgone'd
@@ -3236,12 +3348,8 @@ vdrop_deactivate(struct vnode *vp)
		mp->mnt_lazyvnodelistsize--;
		mtx_unlock(&mp->mnt_listmtx);
	}
	mtx_lock(&vnode_list_mtx);
	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
	TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	atomic_add_long(&freevnodes, 1);
	VI_UNLOCK(vp);
	vdbatch_enqueue(vp);
}
void


@@ -171,7 +171,8 @@ struct vnode {
	u_int	v_usecount;	/* I ref count of users */
	u_int	v_iflag;	/* i vnode flags (see below) */
	u_int	v_vflag;	/* v vnode flags */
	u_int	v_mflag;	/* l mnt-specific vnode flags */
	u_short	v_mflag;	/* l mnt-specific vnode flags */
	short	v_dbatchcpu;	/* i LRU requeue deferral batch */
	int	v_writecount;	/* I ref count of writers or
				   (negative) text users */
	u_int	v_hash;