Below is slightly edited description of the LOR by Tor Egge:

--------------------------
[Deadlock] is caused by a lock order reversal in vfs_lookup(), where
[some] process is trying to lock a directory vnode, that is the parent
directory of covered vnode) while holding an exclusive vnode lock on
covering vnode.

A simplified scenario:

root fs					var fs
/    		A			/    (/var)	D
/var		B			/log (/var/log) E
vfs lock	C			vfs lock	F

Within each file system, the lock order is clear: C->A->B and F->D->E

When traversing across mounts, the system can choose between two lock orders,
but everything must then follow that lock order:

      L1: C->A->B
		|
	        +->F->D->E

      L2: F->D->E
	     |
             +->C->A->B

The lookup() process for namei("/var") mixes those two lock orders:

    VOP_LOOKUP() obtains B while A is held
    vfs_busy() obtains a shared lock on F while A and B are held (follows L1,
    violates L2)
    vput() releases lock on B
    VOP_UNLOCK() releases lock on A
    VFS_ROOT() obtains lock on D while shared lock on F is held
    vfs_unbusy() releases shared lock on F
    vn_lock() obtains lock on A while D is held (violates L1, follows L2)

dounmount() follows L1 (B is locked while F is drained).

Without unmount activity, vfs_busy() will always succeed without blocking
and the deadlock isn't triggered (the system behaves as if L2 is followed).

With unmount, you can get 4 processes in a deadlock:

     p1: holds D, want A (in lookup())
     p2: holds shared lock on F, want D (in VFS_ROOT())
     p3: holds B, want drain lock on F (in dounmount())
     p4: holds A, want B (in VOP_LOOKUP())

You can have more than one instance of p2.

The reversal was introduced in revision 1.81 of src/sys/kern/vfs_lookup.c and
MFCed to revision 1.80.2.1, probably to avoid a cascade of vnode locks when nfs
servers are dead (VFS_ROOT() just hangs) spreading to the root fs root vnode.

- Tor Egge

To fix the LOR, ups@ noted that when crossing the mount point, ni_dvp
is actually not used by the callers of namei. Thus, placeholder deadfs
vnode vp_crossmp is introduced that is filled into ni_dvp.

Idea by:	ups
Reviewed by:	tegge, ups, jeff, rwatson (mac interaction)
Tested by:	Peter Holm
MFC after:	2 weeks
This commit is contained in:
Konstantin Belousov 2007-01-22 11:25:22 +00:00
parent d5f2a6f556
commit 7f92c4ee02
2 changed files with 41 additions and 5 deletions

View File

@ -49,6 +49,7 @@ static vop_poll_t dead_poll;
static vop_read_t dead_read;
static vop_write_t dead_write;
static vop_getwritemount_t dead_getwritemount;
static vop_rename_t dead_rename;
struct vop_vector dead_vnodeops = {
.vop_default = &default_vnodeops,
@ -73,7 +74,7 @@ struct vop_vector dead_vnodeops = {
.vop_readlink = VOP_EBADF,
.vop_reclaim = VOP_NULL,
.vop_remove = VOP_PANIC,
.vop_rename = VOP_PANIC,
.vop_rename = dead_rename,
.vop_rmdir = VOP_PANIC,
.vop_setattr = VOP_EBADF,
.vop_symlink = VOP_PANIC,
@ -211,3 +212,25 @@ dead_poll(ap)
{
return (POLLHUP);
}
static int
dead_rename(ap)
struct vop_rename_args /* {
struct vnode *a_fdvp;
struct vnode *a_fvp;
struct componentname *a_fcnp;
struct vnode *a_tdvp;
struct vnode *a_tvp;
struct componentname *a_tcnp;
} */ *ap;
{
if (ap->a_tvp)
vput(ap->a_tvp);
if (ap->a_tdvp == ap->a_tvp)
vrele(ap->a_tdvp);
else
vput(ap->a_tdvp);
vrele(ap->a_fdvp);
vrele(ap->a_fvp);
return (EXDEV);
}

View File

@ -69,13 +69,18 @@ __FBSDID("$FreeBSD$");
* Allocation zone for namei
*/
uma_zone_t namei_zone;
/*
* Placeholder vnode for mp traversal
*/
static struct vnode *vp_crossmp;
static void
nameiinit(void *dummy __unused)
{
namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, 0);
getnewvnode("crossmp", NULL, &dead_vnodeops, &vp_crossmp);
vp_crossmp->v_vnlock->lk_flags &= ~LK_NOSHARE;
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL)
@ -559,7 +564,8 @@ unionlookup:
* If we have a shared lock we may need to upgrade the lock for the
* last operation.
*/
if (VOP_ISLOCKED(dp, td) == LK_SHARED &&
if (dp != vp_crossmp &&
VOP_ISLOCKED(dp, td) == LK_SHARED &&
(cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT))
vn_lock(dp, LK_UPGRADE|LK_RETRY, td);
/*
@ -658,10 +664,17 @@ unionlookup:
VFS_UNLOCK_GIANT(vfslocked);
vfslocked = VFS_LOCK_GIANT(mp);
if (dp != ndp->ni_dvp)
VOP_UNLOCK(ndp->ni_dvp, 0, td);
vput(ndp->ni_dvp);
else
vrele(ndp->ni_dvp);
VFS_UNLOCK_GIANT(dvfslocked);
dvfslocked = 0;
vref(vp_crossmp);
ndp->ni_dvp = vp_crossmp;
error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags), &tdp, td);
vfs_unbusy(mp, td);
vn_lock(ndp->ni_dvp, compute_cn_lkflags(mp, cnp->cn_lkflags | LK_RETRY), td);
if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT, td))
panic("vp_crossmp exclusively locked or reclaimed");
if (error) {
dpunlocked = 1;
goto bad2;