79752b63e1
--------------------------
[Deadlock] is caused by a lock order reversal in vfs_lookup(), where
[some] process is trying to lock a directory vnode (that is the parent
directory of the covered vnode) while holding an exclusive vnode lock on
the covering vnode.

A simplified scenario:

	root fs				var fs
	/		A		/ (/var)		D
	/var		B		/log (/var/log)		E
	vfs lock	C		vfs lock		F

Within each file system, the lock order is clear: C->A->B and F->D->E

When traversing across mounts, the system can choose between two lock
orders, but everything must then follow that lock order:

	L1: C->A->B
	          |
	          +->F->D->E

	L2: F->D->E
	          |
	          +->C->A->B

The lookup() process for namei("/var") mixes those two lock orders:

	VOP_LOOKUP() obtains B while A is held
	vfs_busy() obtains a shared lock on F while A and B are held
	    (follows L1, violates L2)
	vput() releases lock on B
	VOP_UNLOCK() releases lock on A
	VFS_ROOT() obtains lock on D while shared lock on F is held
	vfs_unbusy() releases shared lock on F
	vn_lock() obtains lock on A while D is held
	    (violates L1, follows L2)

dounmount() follows L1 (B is locked while F is drained).

Without unmount activity, vfs_busy() will always succeed without blocking
and the deadlock isn't triggered (the system behaves as if L2 is followed).

With unmount, you can get 4 processes in a deadlock:

	p1: holds D, want A (in lookup())
	p2: holds shared lock on F, want D (in VFS_ROOT())
	p3: holds B, want drain lock on F (in dounmount())
	p4: holds A, want B (in VOP_LOOKUP())

You can have more than one instance of p2.

The reversal was introduced in revision 1.81 of src/sys/kern/vfs_lookup.c
and MFCed to revision 1.80.2.1, probably to avoid a cascade of vnode locks
when nfs servers are dead (VFS_ROOT() just hangs) spreading to the root fs
root vnode.

- Tor Egge

To fix the LOR, ups@ noted that when crossing the mount point, ni_dvp is
actually not used by the callers of namei. Thus, a placeholder deadfs
vnode, vp_crossmp, is introduced and filled into ni_dvp.
Idea by: ups Reviewed by: tegge, ups, jeff, rwatson (mac interaction) Tested by: Peter Holm MFC after: 2 weeks
237 lines
5.3 KiB
C
237 lines
5.3 KiB
C
/*-
|
|
* Copyright (c) 1989, 1993
|
|
* The Regents of the University of California. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* 4. Neither the name of the University nor the names of its contributors
|
|
* may be used to endorse or promote products derived from this software
|
|
* without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*
|
|
* @(#)dead_vnops.c 8.1 (Berkeley) 6/10/93
|
|
* $FreeBSD$
|
|
*/
|
|
|
|
#include <sys/param.h>
|
|
#include <sys/systm.h>
|
|
#include <sys/kernel.h>
|
|
#include <sys/lock.h>
|
|
#include <sys/mutex.h>
|
|
#include <sys/poll.h>
|
|
#include <sys/vnode.h>
|
|
|
|
/*
|
|
* Prototypes for dead operations on vnodes.
|
|
*/
|
|
static vop_bmap_t dead_bmap;
|
|
static vop_ioctl_t dead_ioctl;
|
|
static vop_lookup_t dead_lookup;
|
|
static vop_open_t dead_open;
|
|
static vop_poll_t dead_poll;
|
|
static vop_read_t dead_read;
|
|
static vop_write_t dead_write;
|
|
static vop_getwritemount_t dead_getwritemount;
|
|
static vop_rename_t dead_rename;
|
|
|
|
struct vop_vector dead_vnodeops = {
|
|
.vop_default = &default_vnodeops,
|
|
|
|
.vop_access = VOP_EBADF,
|
|
.vop_advlock = VOP_EBADF,
|
|
.vop_bmap = dead_bmap,
|
|
.vop_create = VOP_PANIC,
|
|
.vop_getattr = VOP_EBADF,
|
|
.vop_getwritemount = dead_getwritemount,
|
|
.vop_inactive = VOP_NULL,
|
|
.vop_ioctl = dead_ioctl,
|
|
.vop_link = VOP_PANIC,
|
|
.vop_lookup = dead_lookup,
|
|
.vop_mkdir = VOP_PANIC,
|
|
.vop_mknod = VOP_PANIC,
|
|
.vop_open = dead_open,
|
|
.vop_pathconf = VOP_EBADF, /* per pathconf(2) */
|
|
.vop_poll = dead_poll,
|
|
.vop_read = dead_read,
|
|
.vop_readdir = VOP_EBADF,
|
|
.vop_readlink = VOP_EBADF,
|
|
.vop_reclaim = VOP_NULL,
|
|
.vop_remove = VOP_PANIC,
|
|
.vop_rename = dead_rename,
|
|
.vop_rmdir = VOP_PANIC,
|
|
.vop_setattr = VOP_EBADF,
|
|
.vop_symlink = VOP_PANIC,
|
|
.vop_write = dead_write,
|
|
};
|
|
|
|
/* ARGSUSED */
|
|
static int
|
|
dead_getwritemount(ap)
|
|
struct vop_getwritemount_args /* {
|
|
struct vnode *a_vp;
|
|
struct mount **a_mpp;
|
|
} */ *ap;
|
|
{
|
|
*(ap->a_mpp) = NULL;
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* Trivial lookup routine that always fails.
|
|
*/
|
|
/* ARGSUSED */
|
|
static int
|
|
dead_lookup(ap)
|
|
struct vop_lookup_args /* {
|
|
struct vnode * a_dvp;
|
|
struct vnode ** a_vpp;
|
|
struct componentname * a_cnp;
|
|
} */ *ap;
|
|
{
|
|
|
|
*ap->a_vpp = NULL;
|
|
return (ENOTDIR);
|
|
}
|
|
|
|
/*
 * Open always fails as if the device did not exist.
 *
 * The argument structure is not examined.  Returns ENXIO unconditionally.
 */
static int
dead_open(struct vop_open_args *ap)
{

	return (ENXIO);
}
|
|
|
|
/*
|
|
* Vnode op for read
|
|
*/
|
|
/* ARGSUSED */
|
|
static int
|
|
dead_read(ap)
|
|
struct vop_read_args /* {
|
|
struct vnode *a_vp;
|
|
struct uio *a_uio;
|
|
int a_ioflag;
|
|
struct ucred *a_cred;
|
|
} */ *ap;
|
|
{
|
|
/*
|
|
* Return EOF for tty devices, EIO for others
|
|
*/
|
|
if ((ap->a_vp->v_vflag & VV_ISTTY) == 0)
|
|
return (EIO);
|
|
return (0);
|
|
}
|
|
|
|
/*
 * Vnode op for write.
 *
 * The argument structure is not examined.  Writing through a dead vnode
 * always fails with EIO.
 */
static int
dead_write(struct vop_write_args *ap)
{

	return (EIO);
}
|
|
|
|
/*
 * Device ioctl operation.
 *
 * This handler used to re-issue the request with VOP_IOCTL_AP(ap).
 * dead_vnodeops installs this very function as its vop_ioctl entry, so for
 * a vnode that is still dead the re-issued call would come straight back
 * here and recurse without bound.  Fail the request with EBADF instead,
 * which is the same error the other rejected operations in dead_vnodeops
 * are wired to (VOP_EBADF).
 *
 * The argument structure is not examined.  Returns EBADF unconditionally.
 */
static int
dead_ioctl(struct vop_ioctl_args *ap)
{

	return (EBADF);
}
|
|
|
|
/*
 * Block mapping operation.
 *
 * This handler used to call VOP_BMAP() again on the same vnode.
 * dead_vnodeops installs this very function as its vop_bmap entry, so for
 * a vnode that is still dead the repeated call would come straight back
 * here and recurse without bound.  There is nothing to map on a dead
 * vnode, so fail with EBADF, the same error the other rejected operations
 * in dead_vnodeops are wired to (VOP_EBADF).
 *
 * The argument structure is not examined and none of the output fields
 * (a_bop, a_bnp, a_runp, a_runb) are written.  Returns EBADF
 * unconditionally.
 */
static int
dead_bmap(struct vop_bmap_args *ap)
{

	return (EBADF);
}
|
|
|
|
/*
 * Trivial poll routine that always returns POLLHUP.
 * This is necessary so that a process which is polling a file
 * gets notified when that file is revoke()d.
 *
 * The argument structure is not examined.
 */
static int
dead_poll(struct vop_poll_args *ap)
{

	return (POLLHUP);
}
|
|
|
|
static int
|
|
dead_rename(ap)
|
|
struct vop_rename_args /* {
|
|
struct vnode *a_fdvp;
|
|
struct vnode *a_fvp;
|
|
struct componentname *a_fcnp;
|
|
struct vnode *a_tdvp;
|
|
struct vnode *a_tvp;
|
|
struct componentname *a_tcnp;
|
|
} */ *ap;
|
|
{
|
|
if (ap->a_tvp)
|
|
vput(ap->a_tvp);
|
|
if (ap->a_tdvp == ap->a_tvp)
|
|
vrele(ap->a_tdvp);
|
|
else
|
|
vput(ap->a_tdvp);
|
|
vrele(ap->a_fdvp);
|
|
vrele(ap->a_fvp);
|
|
return (EXDEV);
|
|
}
|