From 245df27cee60dfce76e5f6f5e133e16e70279af8 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Fri, 26 Oct 2001 00:08:05 +0000
Subject: [PATCH] Implement kern.maxvnodes. Adjusting kern.maxvnodes now actually has a real effect.

Optimize vfs_msync(). Avoid having to continually drop and re-obtain
mutexes when scanning the vnode list. Improves looping case by 500%.

Optimize ffs_sync(). Avoid having to continually drop and re-obtain
mutexes when scanning the vnode list. This makes a couple of assumptions,
which I believe are ok, in regard to vnode stability when the mount
list mutex is held. Improves looping case by 500%.

(more optimization work is needed on top of these fixes)

MFC after:	1 week
---
 sys/kern/vfs_subr.c      | 112 ++++++++++++++++++++++++++-------------
 sys/sys/vnode.h          |   5 ++
 sys/ufs/ffs/ffs_vfsops.c |  38 +++++++------
 sys/vm/vm_fault.c        |   3 +-
 sys/vm/vm_object.c       |  41 +++++++++++++-
 sys/vm/vm_object.h       |   1 +
 sys/vm/vm_page.c         |   2 +-
 7 files changed, 145 insertions(+), 57 deletions(-)

diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index fde8ce2a2699..d690ab356fee 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -518,6 +518,49 @@ vattr_null(vap)
 	vap->va_vaflags = 0;
 }
 
+/*
+ * This routine is called when we have too many vnodes. It attempts
+ * to free vnodes and will potentially free vnodes that still
+ * have VM backing store (VM backing store is typically the cause
+ * of a vnode blowout so we want to do this). Therefore, this operation
+ * is not considered cheap.
+ *
+ * A number of conditions may prevent a vnode from being reclaimed.
+ * The buffer cache may have references on the vnode, a directory
+ * vnode may still have references due to the namei cache representing
+ * underlying files, or the vnode may be in active use. It is not
+ * desirable to reuse such vnodes. These conditions may cause the
+ * number of vnodes to reach some minimum value regardless of what
+ * you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
+ */
+static void
+vlrureclaim(struct mount *mp, int count)
+{
+	struct vnode *vp;
+
+	mtx_lock(&mntvnode_mtx);
+	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
+		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+
+		if (vp->v_type != VNON &&
+		    vp->v_type != VBAD &&
+		    VMIGHTFREE(vp) &&		/* critical path opt */
+		    mtx_trylock(&vp->v_interlock)
+		) {
+			mtx_unlock(&mntvnode_mtx);
+			if (VMIGHTFREE(vp)) {
+				vgonel(vp, curthread);
+			} else {
+				mtx_unlock(&vp->v_interlock);
+			}
+			mtx_lock(&mntvnode_mtx);
+		}
+		--count;
+	}
+	mtx_unlock(&mntvnode_mtx);
+}
+
 /*
  * Routines having to do with the management of the vnode table.
  */
@@ -532,25 +575,33 @@ getnewvnode(tag, mp, vops, vpp)
 	vop_t **vops;
 	struct vnode **vpp;
 {
-	int s, count;
+	int s;
 	struct thread *td = curthread;	/* XXX */
 	struct vnode *vp = NULL;
 	struct mount *vnmp;
 	vm_object_t object;
 
+	s = splbio();
 	/*
-	 * We take the least recently used vnode from the freelist
-	 * if we can get it and it has no cached pages, and no
-	 * namecache entries are relative to it.
-	 * Otherwise we allocate a new vnode
+	 * Try to reuse vnodes if we hit the max. This situation only
+	 * occurs in certain large-memory (2G+) situations. For the
+	 * algorithm to be stable we have to try to reuse at least 2.
+	 * No hysteresis should be necessary.
+	 */
+	if (numvnodes - freevnodes > desiredvnodes)
+		vlrureclaim(mp, 2);
+
+	/*
+	 * Attempt to reuse a vnode already on the free list, allocating
+	 * a new vnode if we can't find one or if we have not reached a
+	 * good minimum for good LRU performance.
 	 */
-	s = splbio();
 	mtx_lock(&vnode_free_list_mtx);
-	if (freevnodes < wantfreevnodes) {
-		vp = NULL;
-	} else if (numvnodes >= minvnodes) {
+	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
+		int count;
+
 		for (count = 0; count < freevnodes; count++) {
 			vp = TAILQ_FIRST(&vnode_free_list);
 			if (vp == NULL || vp->v_usecount)
@@ -2408,22 +2459,20 @@ vfs_msync(struct mount *mp, int flags)
 {
 	struct vnode *vp, *nvp;
 	struct vm_object *obj;
-	int anyio, tries;
+	int tries;
 
 	GIANT_REQUIRED;
 
 	tries = 5;
-loop:
-	anyio = 0;
 	mtx_lock(&mntvnode_mtx);
+loop:
 	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
-
-		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
-
 		if (vp->v_mount != mp) {
-			mtx_unlock(&mntvnode_mtx);
-			goto loop;
+			if (--tries > 0)
+				goto loop;
+			break;
 		}
+		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
 		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
 			continue;
 
@@ -2431,36 +2480,27 @@ vfs_msync(struct mount *mp, int flags)
 		if (vp->v_flag & VNOSYNC)	/* unlinked, skip it */
 			continue;
 
-		if (flags != MNT_WAIT) {
-			if (VOP_GETVOBJECT(vp, &obj) != 0 ||
-			    (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
-				continue;
-			if (VOP_ISLOCKED(vp, NULL))
-				continue;
-		}
-
-		mtx_unlock(&mntvnode_mtx);
-		mtx_lock(&vp->v_interlock);
-		if (VOP_GETVOBJECT(vp, &obj) == 0 &&
-		    (obj->flags & OBJ_MIGHTBEDIRTY)) {
+		if ((vp->v_flag & VOBJDIRTY) &&
+		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
+			mtx_unlock(&mntvnode_mtx);
 			if (!vget(vp,
-    LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curthread)) {
+			    LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curthread)) {
 				if (VOP_GETVOBJECT(vp, &obj) == 0) {
 					vm_object_page_clean(obj, 0, 0,
 					    flags == MNT_WAIT ?
 					    OBJPC_SYNC : OBJPC_NOSYNC);
-					anyio = 1;
 				}
 				vput(vp);
 			}
-		} else {
-			mtx_unlock(&vp->v_interlock);
+			mtx_lock(&mntvnode_mtx);
+			if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
+				if (--tries > 0)
+					goto loop;
+				break;
+			}
 		}
-		mtx_lock(&mntvnode_mtx);
 	}
 	mtx_unlock(&mntvnode_mtx);
-	if (anyio && (--tries > 0))
-		goto loop;
 }
 
 /*
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 5986579a7875..0d78fcbab6f8 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -175,6 +175,7 @@ struct vnode {
 /*	open for business	0x100000 */
 #define	VONWORKLST	0x200000	/* On syncer work-list */
 #define	VMOUNT		0x400000	/* Mount in progress */
+#define	VOBJDIRTY	0x800000	/* object might be dirty */
 
 /*
  * Vnode attributes. A field value of VNOVAL represents a field whose value
@@ -311,6 +312,10 @@ extern void	(*lease_updatetime) __P((int deltat));
 	(!(vp)->v_object || \
 	 !((vp)->v_object->ref_count || (vp)->v_object->resident_page_count)))
 
+#define VMIGHTFREE(vp) \
+	(!((vp)->v_flag & (VFREE|VDOOMED)) && \
+	 !(vp)->v_holdcnt && !(vp)->v_usecount)
+
 #define VSHOULDBUSY(vp)	\
 	(((vp)->v_flag & VFREE) && \
 	 ((vp)->v_holdcnt || (vp)->v_usecount))
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index ad4c24d2f9cd..d080fbb99a63 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -1001,10 +1001,10 @@ ffs_sync(mp, waitfor, cred, td)
 	 * Write back each (modified) inode.
 	 */
 	wait = 0;
-	lockreq = LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK;
+	lockreq = LK_EXCLUSIVE | LK_NOWAIT;
 	if (waitfor == MNT_WAIT) {
 		wait = 1;
-		lockreq = LK_EXCLUSIVE | LK_INTERLOCK;
+		lockreq = LK_EXCLUSIVE;
 	}
 	mtx_lock(&mntvnode_mtx);
 loop:
@@ -1015,34 +1015,40 @@ ffs_sync(mp, waitfor, cred, td)
 		 */
 		if (vp->v_mount != mp)
 			goto loop;
-		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
-		mtx_unlock(&mntvnode_mtx);
-		mtx_lock(&vp->v_interlock);
+		/*
+		 * Depend on the mntvnode_slock to keep things stable enough
+		 * for a quick test. Since there might be hundreds of
+		 * thousands of vnodes, we cannot afford even a subroutine
+		 * call unless there's a good chance that we have work to do.
+		 */
+		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
 		ip = VTOI(vp);
 		if (vp->v_type == VNON || ((ip->i_flag &
-		(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
-		TAILQ_EMPTY(&vp->v_dirtyblkhd))) {
-			mtx_unlock(&vp->v_interlock);
-			mtx_lock(&mntvnode_mtx);
+		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
+		    TAILQ_EMPTY(&vp->v_dirtyblkhd))) {
 			continue;
 		}
 		if (vp->v_type != VCHR) {
+			mtx_unlock(&mntvnode_mtx);
 			if ((error = vget(vp, lockreq, td)) != 0) {
 				mtx_lock(&mntvnode_mtx);
 				if (error == ENOENT)
 					goto loop;
-				continue;
+			} else {
+				if ((error = VOP_FSYNC(vp, cred, waitfor, td)) != 0)
+					allerror = error;
+				VOP_UNLOCK(vp, 0, td);
+				vrele(vp);
+				mtx_lock(&mntvnode_mtx);
 			}
-			if ((error = VOP_FSYNC(vp, cred, waitfor, td)) != 0)
-				allerror = error;
-			VOP_UNLOCK(vp, 0, td);
-			vrele(vp);
 		} else {
-			mtx_unlock(&vp->v_interlock);
+			mtx_unlock(&mntvnode_mtx);
 			UFS_UPDATE(vp, wait);
+			mtx_lock(&mntvnode_mtx);
 		}
-		mtx_lock(&mntvnode_mtx);
+		if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp)
+			goto loop;
 	}
 	mtx_unlock(&mntvnode_mtx);
 	/*
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 350735387142..8814ae5ae2a9 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -806,8 +806,7 @@ RetryFault:;
 
 	if (prot & VM_PROT_WRITE) {
 		vm_page_flag_set(fs.m, PG_WRITEABLE);
-		vm_object_set_flag(fs.m->object,
-		    OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
+		vm_object_set_writeable_dirty(fs.m->object);
 
 		/*
 		 * If the fault is a write, we know that this page is being
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index 1d5a9892fbc2..b7613ebadd88 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -321,8 +321,11 @@ vm_object_reference(vm_object_t object)
 	if (object == NULL)
 		return;
 
+#if 0
+	/* object can be re-referenced during final cleaning */
 	KASSERT(!(object->flags & OBJ_DEAD),
 	    ("vm_object_reference: attempting to reference dead obj"));
+#endif
 
 	object->ref_count++;
 	if (object->type == OBJT_VNODE) {
@@ -454,8 +457,13 @@ vm_object_deallocate(vm_object_t object)
 				temp->generation++;
 				object->backing_object = NULL;
 			}
-			vm_object_terminate(object);
-			/* unlocks and deallocates object */
+			/*
+			 * Don't double-terminate, we could be in a termination
+			 * recursion due to the terminate having to sync data
+			 * to disk.
+			 */
+			if ((object->flags & OBJ_DEAD) == 0)
+				vm_object_terminate(object);
 			object = temp;
 		}
 	}
@@ -627,7 +635,17 @@ vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int
 	}
 
 	if (clearobjflags && (tstart == 0) && (tend == object->size)) {
+		struct vnode *vp;
+
 		vm_object_clear_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
+		if (object->type == OBJT_VNODE &&
+		    (vp = (struct vnode *)object->handle) != NULL) {
+			if (vp->v_flag & VOBJDIRTY) {
+				mtx_lock(&vp->v_interlock);
+				vp->v_flag &= ~VOBJDIRTY;
+				mtx_unlock(&vp->v_interlock);
+			}
+		}
 	}
 
 rescan:
@@ -1357,6 +1375,8 @@ vm_object_collapse(vm_object_t object)
 			 * and no object references within it, all that is
 			 * necessary is to dispose of it.
 			 */
+			KASSERT(backing_object->ref_count == 1, ("backing_object %p was somehow re-referenced during collapse!", backing_object));
+			KASSERT(TAILQ_FIRST(&backing_object->memq) == NULL, ("backing_object %p somehow has left over pages during collapse!", backing_object));
 
 			TAILQ_REMOVE(
 			    &vm_object_list,
@@ -1684,6 +1704,23 @@ vm_object_in_map(vm_object_t object)
 	return 0;
 }
 
+void
+vm_object_set_writeable_dirty(vm_object_t object)
+{
+	struct vnode *vp;
+
+	vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
+	if (object->type == OBJT_VNODE &&
+	    (vp = (struct vnode *)object->handle) != NULL) {
+		if ((vp->v_flag & VOBJDIRTY) == 0) {
+			mtx_lock(&vp->v_interlock);
+			vp->v_flag |= VOBJDIRTY;
+			mtx_unlock(&vp->v_interlock);
+		}
+	}
+}
+
+
 DB_SHOW_COMMAND(vmochk, vm_object_check)
 {
 	vm_object_t object;
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index c126cb6ecd42..1256e850c063 100644
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -184,6 +184,7 @@ void vm_object_collapse (vm_object_t);
 void vm_object_deallocate (vm_object_t);
 void vm_object_terminate (vm_object_t);
 void vm_object_vndeallocate (vm_object_t);
+void vm_object_set_writeable_dirty (vm_object_t);
 void vm_object_init (void);
 void vm_object_page_clean (vm_object_t, vm_pindex_t, vm_pindex_t, boolean_t);
 void vm_object_page_remove (vm_object_t, vm_pindex_t, vm_pindex_t, boolean_t);
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 61d821ff3d63..0eb06fc06cef 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -609,7 +609,7 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
 	 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
 	 */
 	if (m->flags & PG_WRITEABLE)
-		vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
+		vm_object_set_writeable_dirty(object);
 }
 
 /*
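
Note on the locking pattern used above: the vfs_msync() and ffs_sync() rewrites keep mntvnode_mtx held across the whole scan, record the successor of the current vnode, drop the mutex only when there is real work to do, and after re-locking compare TAILQ_NEXT() against the recorded successor, restarting the scan a bounded number of times if the list changed underneath. The standalone userland sketch below models that pattern with pthreads and sys/queue.h; it is illustrative only, and the names (node, nodelist, list_mtx, process_node, scan_list) are invented rather than taken from the kernel sources.

/*
 * scan_revalidate.c -- illustrative sketch of the bounded-retry list scan
 * used by this patch in vfs_msync()/ffs_sync(), modeled in userland.
 * Build: cc -o scan_revalidate scan_revalidate.c -lpthread
 */
#include <sys/queue.h>
#include <pthread.h>
#include <stdio.h>

struct node {
	int id;
	int dirty;
	TAILQ_ENTRY(node) entries;
};

static TAILQ_HEAD(, node) nodelist = TAILQ_HEAD_INITIALIZER(nodelist);
static pthread_mutex_t list_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Expensive per-node work, done without the list lock held. */
static void
process_node(struct node *np)
{
	printf("flushing node %d\n", np->id);
	np->dirty = 0;
}

static void
scan_list(void)
{
	struct node *np, *nnp;
	int tries = 5;

	pthread_mutex_lock(&list_mtx);
loop:
	for (np = TAILQ_FIRST(&nodelist); np != NULL; np = nnp) {
		nnp = TAILQ_NEXT(np, entries);	/* remember the successor */
		if (!np->dirty)
			continue;		/* cheap test under the lock */
		pthread_mutex_unlock(&list_mtx);
		process_node(np);
		pthread_mutex_lock(&list_mtx);
		/*
		 * If the successor changed while the lock was dropped,
		 * the list was modified under us; restart a bounded
		 * number of times rather than chase a stale pointer.
		 */
		if (TAILQ_NEXT(np, entries) != nnp) {
			if (--tries > 0)
				goto loop;
			break;
		}
	}
	pthread_mutex_unlock(&list_mtx);
}

int
main(void)
{
	struct node nodes[3] = { { 0, 1 }, { 1, 0 }, { 2, 1 } };
	int i;

	for (i = 0; i < 3; i++)
		TAILQ_INSERT_TAIL(&nodelist, &nodes[i], entries);
	scan_list();
	return (0);
}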
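
Note on tuning: with this change kern.maxvnodes actually bounds the number of vnodes, subject to the floor described in the vlrureclaim() comment. The knob can be read or set with sysctl(8), e.g. "sysctl kern.maxvnodes=100000" (the value is only an example), or programmatically via sysctlbyname(3). The sketch below is not part of the patch; the file name and command-line interface are made up for illustration.

/*
 * maxvnodes_tune.c -- illustrative userland sketch: read kern.maxvnodes
 * and optionally set a new value (setting requires root).
 * Build: cc -o maxvnodes_tune maxvnodes_tune.c
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(int argc, char **argv)
{
	int maxvnodes;
	size_t len = sizeof(maxvnodes);

	/* Read the current limit. */
	if (sysctlbyname("kern.maxvnodes", &maxvnodes, &len, NULL, 0) == -1) {
		perror("sysctlbyname(kern.maxvnodes)");
		return (1);
	}
	printf("kern.maxvnodes = %d\n", maxvnodes);

	/* Optionally set a new limit given as the first argument. */
	if (argc > 1) {
		int newmax = atoi(argv[1]);

		if (sysctlbyname("kern.maxvnodes", NULL, NULL,
		    &newmax, sizeof(newmax)) == -1) {
			perror("sysctlbyname(set kern.maxvnodes)");
			return (1);
		}
		printf("kern.maxvnodes set to %d\n", newmax);
	}
	return (0);
}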