As the kernel allocates and frees vnodes, it fully initializes them
on every allocation and fully releases them on every free.  These
are not trivial costs: each allocation starts by zeroing a large
structure, then initializes a mutex, a lock manager lock, an rw lock,
four lists, and six pointers.  And as the vfs.vnodes_created counter
shows, these operations are performed millions of times an hour on a
busy machine.

As a performance optimization, this change uses the uma_init and
uma_fini routines to perform these initializations and cleanups only
as vnodes enter and leave the vnode_zone.  With this change the
initializations are done only kern.maxvnodes times at system startup
and then only rarely again.  The frees are done only if the vnode_zone
shrinks, which never happens in practice.  For those curious about the
avoided work, see the vnode_init() and vnode_fini() functions in
kern/vfs_subr.c in the diff below for the code that has been removed
from the main vnode allocation/free path.
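
For readers unfamiliar with UMA's zone-lifetime hooks, here is a
minimal sketch of the pattern, assuming a hypothetical "widget"
object that is not part of this commit.  Only uma_zcreate(), the
uma_init/uma_fini callback signatures, SYSINIT(), and the mutex/queue
primitives are real kernel interfaces; everything named widget_* is
invented for illustration.

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/kernel.h>
	#include <sys/lock.h>
	#include <sys/mutex.h>
	#include <sys/queue.h>
	#include <vm/uma.h>

	struct widget {
		struct mtx	w_mtx;		/* survives uma_zfree() */
		TAILQ_HEAD(, widget) w_children;
		int		w_flags;	/* per-use; reset before free */
	};

	static uma_zone_t widget_zone;

	/* Runs as memory enters the zone, not on every uma_zalloc(). */
	static int
	widget_init(void *mem, int size, int flags)
	{
		struct widget *w;

		w = mem;
		bzero(w, size);
		mtx_init(&w->w_mtx, "widget", NULL, MTX_DEF);
		TAILQ_INIT(&w->w_children);
		return (0);
	}

	/* Runs only when the zone releases memory back to the VM system. */
	static void
	widget_fini(void *mem, int size)
	{
		struct widget *w;

		w = mem;
		mtx_destroy(&w->w_mtx);
	}

	static void
	widget_zinit(void *dummy __unused)
	{

		/* No per-allocation ctor/dtor; init/fini carry the cost. */
		widget_zone = uma_zcreate("WIDGET", sizeof(struct widget),
		    NULL, NULL, widget_init, widget_fini, UMA_ALIGN_PTR, 0);
	}
	SYSINIT(widgetzone, SI_SUB_LOCK, SI_ORDER_ANY, widget_zinit, NULL);

The price of the pattern shows up in the diff below: uma_zalloc() can
no longer be called with M_ZERO, so any state dirtied while an object
is in use must be cleared (or asserted clean) before it goes back to
the zone, which is what the new _vdrop() and getnewvnode() code does
for vnodes.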

Reviewed by: kib
Tested by:   Peter Holm
Kirk McKusick 2015-11-29 21:42:26 +00:00
parent 43a993bb7d
commit 41d4f10391

@@ -346,6 +346,66 @@ PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
 #ifndef MAXVNODES_MAX
 #define MAXVNODES_MAX	(512 * 1024 * 1024 / 64)	/* 8M */
 #endif
+
+/*
+ * Initialize a vnode as it first enters the zone.
+ */
+static int
+vnode_init(void *mem, int size, int flags)
+{
+	struct vnode *vp;
+	struct bufobj *bo;
+
+	vp = mem;
+	bzero(vp, size);
+	/*
+	 * Setup locks.
+	 */
+	vp->v_vnlock = &vp->v_lock;
+	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
+	/*
+	 * By default, don't allow shared locks unless filesystems opt-in.
+	 */
+	lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
+	    LK_NOSHARE | LK_IS_VNODE);
+	/*
+	 * Initialize bufobj.
+	 */
+	bo = &vp->v_bufobj;
+	bo->__bo_vnode = vp;
+	rw_init(BO_LOCKPTR(bo), "bufobj interlock");
+	bo->bo_private = vp;
+	TAILQ_INIT(&bo->bo_clean.bv_hd);
+	TAILQ_INIT(&bo->bo_dirty.bv_hd);
+	/*
+	 * Initialize namecache.
+	 */
+	LIST_INIT(&vp->v_cache_src);
+	TAILQ_INIT(&vp->v_cache_dst);
+	/*
+	 * Initialize rangelocks.
+	 */
+	rangelock_init(&vp->v_rl);
+	return (0);
+}
+
+/*
+ * Free a vnode when it is cleared from the zone.
+ */
+static void
+vnode_fini(void *mem, int size)
+{
+	struct vnode *vp;
+	struct bufobj *bo;
+
+	vp = mem;
+	rangelock_destroy(&vp->v_rl);
+	lockdestroy(vp->v_vnlock);
+	mtx_destroy(&vp->v_interlock);
+	bo = &vp->v_bufobj;
+	rw_destroy(BO_LOCKPTR(bo));
+}
+
 static void
 vntblinit(void *dummy __unused)
 {
@@ -379,7 +439,7 @@ vntblinit(void *dummy __unused)
 	TAILQ_INIT(&vnode_free_list);
 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
-	    NULL, NULL, UMA_ALIGN_PTR, 0);
+	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	/*
@@ -1223,8 +1283,8 @@ getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
     struct vnode **vpp)
 {
 	struct vnode *vp;
-	struct bufobj *bo;
 	struct thread *td;
+	struct lock_object *lo;
 	static int cyclecount;
 	int error;
@@ -1271,40 +1331,42 @@ getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
 	mtx_unlock(&vnode_free_list_mtx);
 alloc:
 	atomic_add_long(&vnodes_created, 1);
-	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
+	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
 	/*
-	 * Setup locks.
+	 * Locks are given the generic name "vnode" when created.
+	 * Follow the historic practice of using the filesystem
+	 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
+	 *
+	 * Locks live in a witness group keyed on their name. Thus,
+	 * when a lock is renamed, it must also move from the witness
+	 * group of its old name to the witness group of its new name.
+	 *
+	 * The change only needs to be made when the vnode moves
+	 * from one filesystem type to another. We ensure that each
+	 * filesystem uses a single static name pointer for its tag so
+	 * that we can compare pointers rather than doing a strcmp().
 	 */
-	vp->v_vnlock = &vp->v_lock;
-	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
+	lo = &vp->v_vnlock->lock_object;
+	if (lo->lo_name != tag) {
+		lo->lo_name = tag;
+		WITNESS_DESTROY(lo);
+		WITNESS_INIT(lo, tag);
+	}
 	/*
-	 * By default, don't allow shared locks unless filesystems
-	 * opt-in.
+	 * By default, don't allow shared locks unless filesystems opt-in.
 	 */
-	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE);
-	/*
-	 * Initialize bufobj.
-	 */
-	bo = &vp->v_bufobj;
-	bo->__bo_vnode = vp;
-	rw_init(BO_LOCKPTR(bo), "bufobj interlock");
-	bo->bo_ops = &buf_ops_bio;
-	bo->bo_private = vp;
-	TAILQ_INIT(&bo->bo_clean.bv_hd);
-	TAILQ_INIT(&bo->bo_dirty.bv_hd);
-	/*
-	 * Initialize namecache.
-	 */
-	LIST_INIT(&vp->v_cache_src);
-	TAILQ_INIT(&vp->v_cache_dst);
+	vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
 	/*
 	 * Finalize various vnode identity bits.
 	 */
+	KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
+	KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
+	KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
 	vp->v_type = VNON;
 	vp->v_tag = tag;
 	vp->v_op = vops;
 	v_init_counters(vp);
 	vp->v_data = NULL;
+	vp->v_bufobj.bo_ops = &buf_ops_bio;
 #ifdef MAC
 	mac_vnode_init(vp);
 	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
@@ -1313,11 +1375,10 @@ alloc:
 		printf("NULL mp in getnewvnode()\n");
 #endif
 	if (mp != NULL) {
-		bo->bo_bsize = mp->mnt_stat.f_iosize;
+		vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
 			vp->v_vflag |= VV_NOKNOTE;
 	}
-	rangelock_init(&vp->v_rl);
 
 	/*
 	 * For the filesystems which do not use vfs_hash_insert(),
@@ -2683,6 +2744,12 @@ _vdrop(struct vnode *vp, bool locked)
 	}
 	/*
 	 * The vnode has been marked for destruction, so free it.
+	 *
+	 * The vnode will be returned to the zone where it will
+	 * normally remain until it is needed for another vnode. We
+	 * need to clean up (or verify that the cleanup has already
+	 * been done) any residual data left from its current use
+	 * so as not to contaminate the freshly allocated vnode.
 	 */
 	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
 	atomic_subtract_long(&numvnodes, 1);
@@ -2707,16 +2774,17 @@ _vdrop(struct vnode *vp, bool locked)
 #ifdef MAC
 	mac_vnode_destroy(vp);
 #endif
-	if (vp->v_pollinfo != NULL)
+	if (vp->v_pollinfo != NULL) {
 		destroy_vpollinfo(vp->v_pollinfo);
+		vp->v_pollinfo = NULL;
+	}
 #ifdef INVARIANTS
 	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
 	vp->v_op = NULL;
 #endif
-	rangelock_destroy(&vp->v_rl);
-	lockdestroy(vp->v_vnlock);
-	mtx_destroy(&vp->v_interlock);
-	rw_destroy(BO_LOCKPTR(bo));
+	vp->v_iflag = 0;
+	vp->v_vflag = 0;
+	bo->bo_flag = 0;
 	uma_zfree(vnode_zone, vp);
 }
@@ -3081,6 +3149,7 @@ vgonel(struct vnode *vp)
 	 * Clear the advisory locks and wake up waiting threads.
 	 */
 	(void)VOP_ADVLOCKPURGE(vp);
+	vp->v_lockf = NULL;
 	/*
 	 * Delete from old mount point vnode list.
 	 */