Improve VFS locking:

- Implement real draining for vfs consumers by not relying on the
  mnt_lock and instead using a refcount in order to keep track of lock
  requesters (a condensed sketch of the handshake follows this list).
- As a consequence of the change above, remove the mnt_lock lockmgr,
  which is now useless.
- As a consequence of the change above, vfs_busy() is no longer tied to
  a lockmgr.  Change its KPI by removing the interlock argument and
  defining 2 new flags for it: MBF_NOWAIT, which basically replaces the
  LK_NOWAIT of the old version (and was already decoupled from the
  lockmgr), and MBF_MNTLSTLOCK, which provides the ability to drop the
  mountlist_mtx once the mnt interlock is held (an ability still
  desired by most consumers; see the conversion sketch below).
- The change above makes the stub in vfs_mount_destroy(), which allowed
  the mnt_ref wait to be overridden after 3 seconds, totally useless;
  remove it, as it was designed to work with older versions.  If a
  problem of a "refcount held never going away" should appear, it will
  need to be fixed properly rather than by trusting such a hackish
  solution.
- Fix a bug where returning from dounmount() with an error still left
  the MNTK_MWAIT flag set even though the waiters had actually been
  woken up.  A single instance is left in vfs_mount_destroy() because
  the structure is going to be recycled there in any case, so it does
  not matter.
- Remove the markercnt refcount as it is useless.
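
A condensed sketch of the new drain handshake, extracted from the
dounmount() and vfs_unbusy() hunks below (not a complete listing):

	/* dounmount(), with the mount interlock held: */
	error = 0;
	if (mp->mnt_lockref) {
		mp->mnt_kern_flag |= MNTK_DRAINING;
		error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS,
		    "mount drain", 0);
	}

	/* vfs_unbusy(): the last lock requester wakes up the drainer. */
	mp->mnt_lockref--;
	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
		mp->mnt_kern_flag &= ~MNTK_DRAINING;
		wakeup(&mp->mnt_lockref);
	}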

This patch modifies the VFS ABI and breaks the KPI for vfs_busy(), so
manpages and __FreeBSD_version will be updated accordingly.
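
For consumers, the conversion is mostly mechanical.  A minimal sketch
of the common mountlist-walk pattern under the new KPI, mirroring the
sync() and vnlru_proc() hunks below:

	struct mount *mp, *nmp;

	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
			/* Failed; mountlist_mtx is still held. */
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		/* Succeeded; vfs_busy() dropped mountlist_mtx. */
		/* ... work on the busied mp here ... */
		mtx_lock(&mountlist_mtx);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	mtx_unlock(&mountlist_mtx);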

Discussed with:	kib
Tested by:	pho
commit 83b3bdbc8a (parent 0952268ecd)
Author:	Attilio Rao
Date:	2008-11-02 10:15:42 +00:00

12 changed files with 95 additions and 73 deletions


@@ -2879,7 +2879,7 @@ g_journal_do_switch(struct g_class *classp)
 		desc = g_journal_find_desc(mp->mnt_stat.f_fstypename);
 		if (desc == NULL)
 			continue;
-		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx))
+		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
 			continue;
 		/* mtx_unlock(&mountlist_mtx) was done inside vfs_busy() */
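
The comment above captures the new locking contract: with
MBF_MNTLSTLOCK, vfs_busy() consumes the mountlist_mtx on success, while
a failure returns with it still held.  A minimal caller-side sketch of
that contract (compare the sync_fsync() hunk further below):

	mtx_lock(&mountlist_mtx);
	if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
		/* Failure: we still own mountlist_mtx. */
		mtx_unlock(&mountlist_mtx);
		return (0);
	}
	/* Success: mountlist_mtx was dropped inside vfs_busy(). */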


@@ -171,7 +171,7 @@ ext2_mount(mp, td)
 			flags = WRITECLOSE;
 			if (mp->mnt_flag & MNT_FORCE)
 				flags |= FORCECLOSE;
-			if (vfs_busy(mp, LK_NOWAIT, 0))
+			if (vfs_busy(mp, MBF_NOWAIT))
 				return (EBUSY);
 			error = ext2_flushfiles(mp, flags, td);
 			vfs_unbusy(mp);


@@ -684,7 +684,7 @@ unionlookup:
 		 */
 		while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
 		    (cnp->cn_flags & NOCROSSMOUNT) == 0) {
-			if (vfs_busy(mp, 0, 0))
+			if (vfs_busy(mp, 0))
 				continue;
 			vput(dp);
 			VFS_UNLOCK_GIANT(vfslocked);


@@ -450,7 +450,6 @@ mount_init(void *mem, int size, int flags)
 	mp = (struct mount *)mem;
 	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
-	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
 	lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
 	return (0);
 }
@@ -462,7 +461,6 @@ mount_fini(void *mem, int size)
 	mp = (struct mount *)mem;
 	lockdestroy(&mp->mnt_explock);
-	lockdestroy(&mp->mnt_lock);
 	mtx_destroy(&mp->mnt_mtx);
 }
@@ -481,7 +479,7 @@ vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
 	TAILQ_INIT(&mp->mnt_nvnodelist);
 	mp->mnt_nvnodelistsize = 0;
 	mp->mnt_ref = 0;
-	(void) vfs_busy(mp, LK_NOWAIT, 0);
+	(void) vfs_busy(mp, MBF_NOWAIT);
 	mp->mnt_op = vfsp->vfc_vfsops;
 	mp->mnt_vfc = vfsp;
 	vfsp->vfc_refcount++;	/* XXX Unlocked */
@@ -507,19 +505,10 @@ vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
 void
 vfs_mount_destroy(struct mount *mp)
 {
-	int i;
 
 	MNT_ILOCK(mp);
-	for (i = 0; mp->mnt_ref && i < 3; i++)
-		msleep(mp, MNT_MTX(mp), PVFS, "mntref", hz);
-	/*
-	 * This will always cause a 3 second delay in rebooting due to
-	 * refs on the root mountpoint that never go away. Most of these
-	 * are held by init which never exits.
-	 */
-	if (i == 3 && (!rebooting || bootverbose))
-		printf("Mount point %s had %d dangling refs\n",
-		    mp->mnt_stat.f_mntonname, mp->mnt_ref);
+	while (mp->mnt_ref)
+		msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0);
 	if (mp->mnt_holdcnt != 0) {
 		printf("Waiting for mount point to be unheld\n");
 		while (mp->mnt_holdcnt != 0) {
@@ -928,7 +917,7 @@ vfs_domount(
 			vput(vp);
 			return (error);
 		}
-		if (vfs_busy(mp, LK_NOWAIT, 0)) {
+		if (vfs_busy(mp, MBF_NOWAIT)) {
 			vput(vp);
 			return (EBUSY);
 		}
@@ -1245,19 +1234,31 @@ dounmount(mp, flags, td)
 	/* Allow filesystems to detect that a forced unmount is in progress. */
 	if (flags & MNT_FORCE)
 		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
-	error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK |
-	    ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), MNT_MTX(mp));
-	if (error) {
-		MNT_ILOCK(mp);
-		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_NOINSMNTQ |
-		    MNTK_UNMOUNTF);
-		if (mp->mnt_kern_flag & MNTK_MWAIT)
-			wakeup(mp);
-		MNT_IUNLOCK(mp);
-		if (coveredvp)
-			VOP_UNLOCK(coveredvp, 0);
-		return (error);
+	error = 0;
+	if (mp->mnt_lockref) {
+		if ((flags & MNT_FORCE) == 0) {
+			mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_NOINSMNTQ |
+			    MNTK_UNMOUNTF);
+			if (mp->mnt_kern_flag & MNTK_MWAIT) {
+				mp->mnt_kern_flag &= ~MNTK_MWAIT;
+				wakeup(mp);
+			}
+			MNT_IUNLOCK(mp);
+			if (coveredvp)
+				VOP_UNLOCK(coveredvp, 0);
+			return (EBUSY);
+		}
+		mp->mnt_kern_flag |= MNTK_DRAINING;
+		error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS,
+		    "mount drain", 0);
 	}
+	MNT_IUNLOCK(mp);
+	KASSERT(mp->mnt_lockref == 0,
+	    ("%s: invalid lock refcount in the drain path @ %s:%d",
+	    __func__, __FILE__, __LINE__));
+	KASSERT(error == 0,
+	    ("%s: invalid return value for msleep in the drain path @ %s:%d",
+	    __func__, __FILE__, __LINE__));
 	vn_start_write(NULL, &mp, V_WAIT);
 
 	if (mp->mnt_flag & MNT_EXPUBLIC)
@@ -1321,9 +1322,10 @@ dounmount(mp, flags, td)
 		mp->mnt_flag |= async_flag;
 		if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
 			mp->mnt_kern_flag |= MNTK_ASYNC;
-		lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
-		if (mp->mnt_kern_flag & MNTK_MWAIT)
+		if (mp->mnt_kern_flag & MNTK_MWAIT) {
+			mp->mnt_kern_flag &= ~MNTK_MWAIT;
 			wakeup(mp);
+		}
 		MNT_IUNLOCK(mp);
 		if (coveredvp)
 			VOP_UNLOCK(coveredvp, 0);
@@ -1337,7 +1339,6 @@ dounmount(mp, flags, td)
 		vput(coveredvp);
 	}
 	vfs_event_signal(NULL, VQ_UNMOUNT, 0);
-	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
 	vfs_mount_destroy(mp);
 	return (0);
 }
@@ -2070,7 +2071,6 @@ __mnt_vnode_first(struct vnode **mvp, struct mount *mp)
 			wakeup(&mp->mnt_holdcnt);
 		return (NULL);
 	}
-	mp->mnt_markercnt++;
 	(*mvp)->v_mount = mp;
 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
 	return (vp);
@@ -2093,7 +2093,6 @@ __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp)
 	MNT_ILOCK(mp);
 	*mvp = NULL;
-	mp->mnt_markercnt--;
 	mp->mnt_holdcnt--;
 	if (mp->mnt_holdcnt == 0 && mp->mnt_holdcntwaiters != 0)
 		wakeup(&mp->mnt_holdcnt);


@@ -335,42 +335,36 @@ SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
 /*
  * Mark a mount point as busy. Used to synchronize access and to delay
- * unmounting. Interlock is not released on failure.
+ * unmounting. Eventually, mountlist_mtx is not released on failure.
  */
 int
-vfs_busy(struct mount *mp, int flags, struct mtx *interlkp)
+vfs_busy(struct mount *mp, int flags)
 {
-	int lkflags;
+
+	MPASS((flags & ~MBF_MASK) == 0);
 
 	MNT_ILOCK(mp);
 	MNT_REF(mp);
 	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
-		if (flags & LK_NOWAIT) {
+		if (flags & MBF_NOWAIT) {
 			MNT_REL(mp);
 			MNT_IUNLOCK(mp);
 			return (ENOENT);
 		}
-		if (interlkp)
-			mtx_unlock(interlkp);
+		if (flags & MBF_MNTLSTLOCK)
+			mtx_unlock(&mountlist_mtx);
 		mp->mnt_kern_flag |= MNTK_MWAIT;
-		/*
-		 * Since all busy locks are shared except the exclusive
-		 * lock granted when unmounting, the only place that a
-		 * wakeup needs to be done is at the release of the
-		 * exclusive lock at the end of dounmount.
-		 */
 		msleep(mp, MNT_MTX(mp), PVFS, "vfs_busy", 0);
 		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
-		if (interlkp)
-			mtx_lock(interlkp);
+		if (flags & MBF_MNTLSTLOCK)
+			mtx_lock(&mountlist_mtx);
 		return (ENOENT);
 	}
-	if (interlkp)
-		mtx_unlock(interlkp);
-	lkflags = LK_SHARED | LK_INTERLOCK | LK_NOWAIT;
-	if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp)))
-		panic("vfs_busy: unexpected lock failure");
+	if (flags & MBF_MNTLSTLOCK)
+		mtx_unlock(&mountlist_mtx);
+	mp->mnt_lockref++;
+	MNT_IUNLOCK(mp);
 	return (0);
 }
@@ -381,8 +375,15 @@ void
 vfs_unbusy(struct mount *mp)
 {
 
-	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
-	vfs_rel(mp);
+	MNT_ILOCK(mp);
+	MNT_REL(mp);
+	mp->mnt_lockref--;
+	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
+		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
+		mp->mnt_kern_flag &= ~MNTK_DRAINING;
+		wakeup(&mp->mnt_lockref);
+	}
+	MNT_IUNLOCK(mp);
 }
 
 /*
@@ -747,7 +748,7 @@ vnlru_proc(void)
 		mtx_lock(&mountlist_mtx);
 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 			int vfsunlocked;
-			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx)) {
+			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				continue;
 			}
@@ -2837,7 +2838,6 @@ DB_SHOW_COMMAND(mount, db_show_mount)
 	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
 	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
 	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
-	db_printf("    mnt_markercnt = %d\n", mp->mnt_markercnt);
 	db_printf("    mnt_holdcnt = %d\n", mp->mnt_holdcnt);
 	db_printf("    mnt_holdcntwaiters = %d\n", mp->mnt_holdcntwaiters);
 	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
@@ -2999,7 +2999,7 @@ sysctl_vnode(SYSCTL_HANDLER_ARGS)
 	n = 0;
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
-		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx))
+		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
 			continue;
 		MNT_ILOCK(mp);
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
@@ -3361,7 +3361,7 @@ sync_fsync(struct vop_fsync_args *ap)
 	 * not already on the sync list.
 	 */
 	mtx_lock(&mountlist_mtx);
-	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx) != 0) {
+	if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
 		mtx_unlock(&mountlist_mtx);
 		return (0);
 	}


@@ -124,7 +124,7 @@ sync(td, uap)
 	mtx_lock(&mountlist_mtx);
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
-		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx)) {
+		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
@@ -197,7 +197,7 @@ quotactl(td, uap)
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	mp = nd.ni_vp->v_mount;
-	if ((error = vfs_busy(mp, 0, NULL))) {
+	if ((error = vfs_busy(mp, 0))) {
 		vrele(nd.ni_vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
@@ -479,7 +479,7 @@ kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
 			continue;
 		}
 #endif
-		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx)) {
+		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
@@ -741,7 +741,7 @@ fchdir(td, uap)
 	error = change_dir(vp, td);
 	while (!error && (mp = vp->v_mountedhere) != NULL) {
 		int tvfslocked;
-		if (vfs_busy(mp, 0, 0))
+		if (vfs_busy(mp, 0))
 			continue;
 		tvfslocked = VFS_LOCK_GIANT(mp);
 		error = VFS_ROOT(mp, LK_EXCLUSIVE, &tdp, td);


@@ -957,9 +957,18 @@ vn_start_write(vp, mpp, flags)
 	}
 	if ((mp = *mpp) == NULL)
 		return (0);
+
+	/*
+	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
+	 * a vfs_ref().
+	 * As long as a vnode is not provided we need to acquire a
+	 * refcount for the provided mountpoint too, in order to
+	 * emulate a vfs_ref().
+	 */
 	MNT_ILOCK(mp);
+	if (vp == NULL)
+		MNT_REF(mp);
 	/*
 	 * Check on status of suspension.
 	 */
@@ -1021,6 +1030,14 @@ vn_start_secondary_write(vp, mpp, flags)
 	 */
 	if ((mp = *mpp) == NULL)
 		return (0);
+
+	/*
+	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
+	 * a vfs_ref().
+	 * As long as a vnode is not provided we need to acquire a
+	 * refcount for the provided mountpoint too, in order to
+	 * emulate a vfs_ref().
+	 */
 	MNT_ILOCK(mp);
+	if (vp == NULL)
+		MNT_REF(mp);
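
The comment added in both functions pairs with the dounmount() change
above: during an unmount no vnode is available, so the refcount taken
here stands in for the vfs_ref() that VOP_GETWRITEMOUNT() would
otherwise have provided, as in this caller from the dounmount() hunk:

	/* No vnode to pass during unmount; vn_start_write() takes
	 * the mount reference itself. */
	vn_start_write(NULL, &mp, V_WAIT);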


@@ -1429,8 +1429,8 @@ nfs_timer(void *arg)
 		/*
 		 * Terminate request if force-unmount in progress.
 		 * Note that NFS could have vfs_busy'ed the mount,
-		 * causing the unmount to wait for the mnt_lock, making
-		 * this bit of logic necessary.
+		 * causing the unmount to wait and making this bit
+		 * of logic necessary.
 		 */
 		if (rep->r_nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF) {
 			nfs_softterm(rep);


@@ -258,7 +258,7 @@ nfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td)
 #ifndef nolint
 	sfp = NULL;
 #endif
-	error = vfs_busy(mp, LK_NOWAIT, NULL);
+	error = vfs_busy(mp, MBF_NOWAIT);
 	if (error)
 		return (error);
 	error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np, LK_EXCLUSIVE);


@@ -142,13 +142,11 @@ struct vfsopt;
  * Lock reference:
  *	m - mountlist_mtx
  *	i - interlock
- *	l - mnt_lock
  *
  * Unmarked fields are considered stable as long as a ref is held.
  *
  */
 struct mount {
-	struct lock	mnt_lock;	/* mount structure lock */
 	struct mtx	mnt_mtx;	/* mount structure interlock */
 	int		mnt_gen;	/* struct mount generation */
 #define	mnt_startzero	mnt_list
@@ -175,7 +173,7 @@ struct mount {
 	struct netexport *mnt_export;	/* export list */
 	struct label	*mnt_label;	/* MAC label for the fs */
 	u_int		mnt_hashseed;	/* Random seed for vfs_hash */
-	int		mnt_markercnt;	/* marker vnodes in use */
+	int		mnt_lockref;	/* (i) Lock reference count */
 	int		mnt_holdcnt;	/* hold count */
 	int		mnt_holdcntwaiters;	/* waits on hold count */
 	int		mnt_secondary_writes;	/* (i) # of secondary writes */
@@ -319,6 +317,7 @@ void __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp);
 #define	MNTK_ASYNC	0x00000002	/* filtered async flag */
 #define	MNTK_SOFTDEP	0x00000004	/* async disabled by softdep */
 #define	MNTK_NOINSMNTQ	0x00000008	/* insmntque is not allowed */
+#define	MNTK_DRAINING	0x00000010	/* lock draining is happening */
 #define	MNTK_UNMOUNT	0x01000000	/* unmount in progress */
 #define	MNTK_MWAIT	0x02000000	/* waiting for unmount to finish */
 #define	MNTK_SUSPEND	0x08000000	/* request write suspension */
@@ -506,6 +505,13 @@ struct uio;
 #ifdef _KERNEL
 
+/*
+ * vfs_busy specific flags and mask.
+ */
+#define	MBF_NOWAIT	0x01
+#define	MBF_MNTLSTLOCK	0x02
+#define	MBF_MASK	(MBF_NOWAIT | MBF_MNTLSTLOCK)
+
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_MOUNT);
 #endif
@@ -686,7 +692,7 @@ int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...);
 int	vfs_setpublicfs			    /* set publicly exported fs */
 	    (struct mount *, struct netexport *, struct export_args *);
 void	vfs_msync(struct mount *, int);
-int	vfs_busy(struct mount *, int, struct mtx *);
+int	vfs_busy(struct mount *, int);
 int	vfs_export			 /* process mount export info */
 	    (struct mount *, struct export_args *);
 int	vfs_allocate_syncvnode(struct mount *);


@@ -745,7 +745,7 @@ softdep_flush(void)
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			if ((mp->mnt_flag & MNT_SOFTDEP) == 0)
 				continue;
-			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx))
+			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
 				continue;
 			vfslocked = VFS_LOCK_GIANT(mp);
 			softdep_process_worklist(mp, 0);


@@ -118,7 +118,7 @@ ufs_quotactl(mp, cmds, id, arg, td)
 	if ((u_int)type >= MAXQUOTAS)
 		return (EINVAL);
-	if (vfs_busy(mp, LK_NOWAIT, 0))
+	if (vfs_busy(mp, MBF_NOWAIT))
 		return (0);
 	switch (cmd) {