2005-01-06 18:10:42 +00:00
|
|
|
/*-
|
1994-05-24 10:09:53 +00:00
|
|
|
* Copyright (c) 1989, 1993
|
|
|
|
* The Regents of the University of California. All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 4. Neither the name of the University nor the names of its contributors
|
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
|
|
|
* @(#)dead_vnops.c 8.1 (Berkeley) 6/10/93
|
1999-08-28 01:08:13 +00:00
|
|
|
* $FreeBSD$
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/systm.h>
|
1994-09-21 03:47:43 +00:00
|
|
|
#include <sys/kernel.h>
|
1997-12-05 19:55:52 +00:00
|
|
|
#include <sys/lock.h>
|
2001-05-01 08:13:21 +00:00
|
|
|
#include <sys/mutex.h>
|
1997-12-15 03:09:59 +00:00
|
|
|
#include <sys/poll.h>
|
2001-05-01 08:13:21 +00:00
|
|
|
#include <sys/vnode.h>
|
2000-10-04 01:29:17 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Prototypes for dead operations on vnodes.
|
|
|
|
*/
|
2004-12-01 12:24:41 +00:00
|
|
|
static vop_bmap_t dead_bmap;
|
|
|
|
static vop_ioctl_t dead_ioctl;
|
|
|
|
static vop_lookup_t dead_lookup;
|
|
|
|
static vop_open_t dead_open;
|
|
|
|
static vop_poll_t dead_poll;
|
|
|
|
static vop_read_t dead_read;
|
|
|
|
static vop_write_t dead_write;
|
2006-02-22 06:11:59 +00:00
|
|
|
static vop_getwritemount_t dead_getwritemount;
|
Below is slightly edited description of the LOR by Tor Egge:
--------------------------
[Deadlock] is caused by a lock order reversal in vfs_lookup(), where
[some] process is trying to lock a directory vnode, that is the parent
directory of covered vnode) while holding an exclusive vnode lock on
covering vnode.
A simplified scenario:
root fs var fs
/ A / (/var) D
/var B /log (/var/log) E
vfs lock C vfs lock F
Within each file system, the lock order is clear: C->A->B and F->D->E
When traversing across mounts, the system can choose between two lock orders,
but everything must then follow that lock order:
L1: C->A->B
|
+->F->D->E
L2: F->D->E
|
+->C->A->B
The lookup() process for namei("/var") mixes those two lock orders:
VOP_LOOKUP() obtains B while A is held
vfs_busy() obtains a shared lock on F while A and B are held (follows L1,
violates L2)
vput() releases lock on B
VOP_UNLOCK() releases lock on A
VFS_ROOT() obtains lock on D while shared lock on F is held
vfs_unbusy() releases shared lock on F
vn_lock() obtains lock on A while D is held (violates L1, follows L2)
dounmount() follows L1 (B is locked while F is drained).
Without unmount activity, vfs_busy() will always succeed without blocking
and the deadlock isn't triggered (the system behaves as if L2 is followed).
With unmount, you can get 4 processes in a deadlock:
p1: holds D, want A (in lookup())
p2: holds shared lock on F, want D (in VFS_ROOT())
p3: holds B, want drain lock on F (in dounmount())
p4: holds A, want B (in VOP_LOOKUP())
You can have more than one instance of p2.
The reversal was introduced in revision 1.81 of src/sys/kern/vfs_lookup.c and
MFCed to revision 1.80.2.1, probably to avoid a cascade of vnode locks when nfs
servers are dead (VFS_ROOT() just hangs) spreading to the root fs root vnode.
- Tor Egge
To fix the LOR, ups@ noted that when crossing the mount point, ni_dvp
is actually not used by the callers of namei. Thus, placeholder deadfs
vnode vp_crossmp is introduced that is filled into ni_dvp.
Idea by: ups
Reviewed by: tegge, ups, jeff, rwatson (mac interaction)
Tested by: Peter Holm
MFC after: 2 weeks
2007-01-22 11:25:22 +00:00
|
|
|
static vop_rename_t dead_rename;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2004-12-01 23:16:38 +00:00
|
|
|
struct vop_vector dead_vnodeops = {
|
|
|
|
.vop_default = &default_vnodeops,
|
2005-01-13 18:59:48 +00:00
|
|
|
|
2004-12-01 23:16:38 +00:00
|
|
|
.vop_access = VOP_EBADF,
|
|
|
|
.vop_advlock = VOP_EBADF,
|
|
|
|
.vop_bmap = dead_bmap,
|
|
|
|
.vop_create = VOP_PANIC,
|
|
|
|
.vop_getattr = VOP_EBADF,
|
2006-02-22 06:11:59 +00:00
|
|
|
.vop_getwritemount = dead_getwritemount,
|
2004-12-01 23:16:38 +00:00
|
|
|
.vop_inactive = VOP_NULL,
|
|
|
|
.vop_ioctl = dead_ioctl,
|
|
|
|
.vop_link = VOP_PANIC,
|
|
|
|
.vop_lookup = dead_lookup,
|
|
|
|
.vop_mkdir = VOP_PANIC,
|
|
|
|
.vop_mknod = VOP_PANIC,
|
|
|
|
.vop_open = dead_open,
|
|
|
|
.vop_pathconf = VOP_EBADF, /* per pathconf(2) */
|
|
|
|
.vop_poll = dead_poll,
|
|
|
|
.vop_read = dead_read,
|
|
|
|
.vop_readdir = VOP_EBADF,
|
|
|
|
.vop_readlink = VOP_EBADF,
|
|
|
|
.vop_reclaim = VOP_NULL,
|
|
|
|
.vop_remove = VOP_PANIC,
|
Below is slightly edited description of the LOR by Tor Egge:
--------------------------
[Deadlock] is caused by a lock order reversal in vfs_lookup(), where
[some] process is trying to lock a directory vnode, that is the parent
directory of covered vnode) while holding an exclusive vnode lock on
covering vnode.
A simplified scenario:
root fs var fs
/ A / (/var) D
/var B /log (/var/log) E
vfs lock C vfs lock F
Within each file system, the lock order is clear: C->A->B and F->D->E
When traversing across mounts, the system can choose between two lock orders,
but everything must then follow that lock order:
L1: C->A->B
|
+->F->D->E
L2: F->D->E
|
+->C->A->B
The lookup() process for namei("/var") mixes those two lock orders:
VOP_LOOKUP() obtains B while A is held
vfs_busy() obtains a shared lock on F while A and B are held (follows L1,
violates L2)
vput() releases lock on B
VOP_UNLOCK() releases lock on A
VFS_ROOT() obtains lock on D while shared lock on F is held
vfs_unbusy() releases shared lock on F
vn_lock() obtains lock on A while D is held (violates L1, follows L2)
dounmount() follows L1 (B is locked while F is drained).
Without unmount activity, vfs_busy() will always succeed without blocking
and the deadlock isn't triggered (the system behaves as if L2 is followed).
With unmount, you can get 4 processes in a deadlock:
p1: holds D, want A (in lookup())
p2: holds shared lock on F, want D (in VFS_ROOT())
p3: holds B, want drain lock on F (in dounmount())
p4: holds A, want B (in VOP_LOOKUP())
You can have more than one instance of p2.
The reversal was introduced in revision 1.81 of src/sys/kern/vfs_lookup.c and
MFCed to revision 1.80.2.1, probably to avoid a cascade of vnode locks when nfs
servers are dead (VFS_ROOT() just hangs) spreading to the root fs root vnode.
- Tor Egge
To fix the LOR, ups@ noted that when crossing the mount point, ni_dvp
is actually not used by the callers of namei. Thus, placeholder deadfs
vnode vp_crossmp is introduced that is filled into ni_dvp.
Idea by: ups
Reviewed by: tegge, ups, jeff, rwatson (mac interaction)
Tested by: Peter Holm
MFC after: 2 weeks
2007-01-22 11:25:22 +00:00
|
|
|
.vop_rename = dead_rename,
|
2004-12-01 23:16:38 +00:00
|
|
|
.vop_rmdir = VOP_PANIC,
|
|
|
|
.vop_setattr = VOP_EBADF,
|
|
|
|
.vop_symlink = VOP_PANIC,
|
2008-12-12 00:59:36 +00:00
|
|
|
.vop_vptocnp = VOP_EBADF,
|
2004-12-01 23:16:38 +00:00
|
|
|
.vop_write = dead_write,
|
1994-05-24 10:09:53 +00:00
|
|
|
};
|
1994-09-21 03:47:43 +00:00
|
|
|
|
2006-02-22 06:11:59 +00:00
|
|
|
/* ARGSUSED */
|
|
|
|
static int
|
|
|
|
dead_getwritemount(ap)
|
|
|
|
struct vop_getwritemount_args /* {
|
|
|
|
struct vnode *a_vp;
|
|
|
|
struct mount **a_mpp;
|
|
|
|
} */ *ap;
|
|
|
|
{
|
|
|
|
*(ap->a_mpp) = NULL;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Trivial lookup routine that always fails.
|
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
1995-12-02 17:14:34 +00:00
|
|
|
static int
|
1994-05-24 10:09:53 +00:00
|
|
|
dead_lookup(ap)
|
|
|
|
struct vop_lookup_args /* {
|
|
|
|
struct vnode * a_dvp;
|
|
|
|
struct vnode ** a_vpp;
|
|
|
|
struct componentname * a_cnp;
|
|
|
|
} */ *ap;
|
|
|
|
{
|
|
|
|
|
|
|
|
*ap->a_vpp = NULL;
|
|
|
|
return (ENOTDIR);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Open handler for dead vnodes.
 *
 * Every open attempt is refused with ENXIO, as if the device did not
 * exist.  None of the arguments are examined.
 */
/* ARGSUSED */
static int
dead_open(struct vop_open_args *ap)
{
	return (ENXIO);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Vnode op for read
|
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
1995-12-02 17:14:34 +00:00
|
|
|
static int
|
1994-05-24 10:09:53 +00:00
|
|
|
dead_read(ap)
|
|
|
|
struct vop_read_args /* {
|
|
|
|
struct vnode *a_vp;
|
|
|
|
struct uio *a_uio;
|
|
|
|
int a_ioflag;
|
|
|
|
struct ucred *a_cred;
|
|
|
|
} */ *ap;
|
|
|
|
{
|
1997-02-10 02:22:35 +00:00
|
|
|
/*
|
|
|
|
* Return EOF for tty devices, EIO for others
|
|
|
|
*/
|
2002-08-04 10:29:36 +00:00
|
|
|
if ((ap->a_vp->v_vflag & VV_ISTTY) == 0)
|
1997-02-10 02:22:35 +00:00
|
|
|
return (EIO);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Write handler for dead vnodes.
 *
 * Every write is rejected with EIO.  None of the arguments are examined.
 */
/* ARGSUSED */
static int
dead_write(struct vop_write_args *ap)
{
	return (EIO);
}
|
|
|
|
|
|
|
|
/*
 * Device ioctl operation for dead vnodes.
 *
 * The previous body re-issued the request with VOP_IOCTL_AP(ap).  The
 * vnode in ap is unchanged and dead_vnodeops routes vop_ioctl to this
 * function, so that dispatch came straight back here and never
 * terminated.  Fail the request instead, using EBADF to match the other
 * unsupported operations in dead_vnodeops.
 *
 * The argument structure (a_vp, a_command, a_data, a_fflag, a_cred, ...)
 * is not examined.
 *
 * Returns EBADF unconditionally.
 */
/* ARGSUSED */
static int
dead_ioctl(struct vop_ioctl_args *ap)
{
	return (EBADF);
}
|
|
|
|
|
|
|
|
/*
 * Block-map operation for dead vnodes.
 *
 * The previous body called VOP_BMAP() again on ap->a_vp.  That vnode is
 * unchanged and dead_vnodeops routes vop_bmap to this function, so the
 * call re-entered dead_bmap() without end.  (The old header comment,
 * "Wait until the vnode has finished changing state", no longer
 * described the code: nothing here waits.)  Fail the request instead,
 * using EBADF to match the other unsupported operations in
 * dead_vnodeops.
 *
 * The argument structure (a_vp, a_bn, a_bop, a_bnp, a_runp, a_runb) is
 * not examined and none of the output pointers are written.
 *
 * Returns EBADF unconditionally.
 */
/* ARGSUSED */
static int
dead_bmap(struct vop_bmap_args *ap)
{
	return (EBADF);
}
|
|
|
|
|
1997-12-15 03:09:59 +00:00
|
|
|
/*
 * Poll handler for dead vnodes.
 *
 * Always reports POLLHUP, whatever was asked for.  This is what lets a
 * process that is polling a file find out that the file has been
 * revoke()d.  The arguments are not examined.
 */
static int
dead_poll(struct vop_poll_args *ap)
{
	return (POLLHUP);
}
|
Below is slightly edited description of the LOR by Tor Egge:
--------------------------
[Deadlock] is caused by a lock order reversal in vfs_lookup(), where
[some] process is trying to lock a directory vnode (that is, the parent
directory of the covered vnode) while holding an exclusive vnode lock on
the covering vnode.
A simplified scenario:
root fs var fs
/ A / (/var) D
/var B /log (/var/log) E
vfs lock C vfs lock F
Within each file system, the lock order is clear: C->A->B and F->D->E
When traversing across mounts, the system can choose between two lock orders,
but everything must then follow that lock order:
L1: C->A->B
|
+->F->D->E
L2: F->D->E
|
+->C->A->B
The lookup() process for namei("/var") mixes those two lock orders:
VOP_LOOKUP() obtains B while A is held
vfs_busy() obtains a shared lock on F while A and B are held (follows L1,
violates L2)
vput() releases lock on B
VOP_UNLOCK() releases lock on A
VFS_ROOT() obtains lock on D while shared lock on F is held
vfs_unbusy() releases shared lock on F
vn_lock() obtains lock on A while D is held (violates L1, follows L2)
dounmount() follows L1 (B is locked while F is drained).
Without unmount activity, vfs_busy() will always succeed without blocking
and the deadlock isn't triggered (the system behaves as if L2 is followed).
With unmount, you can get 4 processes in a deadlock:
p1: holds D, want A (in lookup())
p2: holds shared lock on F, want D (in VFS_ROOT())
p3: holds B, want drain lock on F (in dounmount())
p4: holds A, want B (in VOP_LOOKUP())
You can have more than one instance of p2.
The reversal was introduced in revision 1.81 of src/sys/kern/vfs_lookup.c and
MFCed to revision 1.80.2.1, probably to avoid a cascade of vnode locks when nfs
servers are dead (VFS_ROOT() just hangs) spreading to the root fs root vnode.
- Tor Egge
To fix the LOR, ups@ noted that when crossing the mount point, ni_dvp
is actually not used by the callers of namei. Thus, a placeholder deadfs
vnode, vp_crossmp, is introduced and filled into ni_dvp.
Idea by: ups
Reviewed by: tegge, ups, jeff, rwatson (mac interaction)
Tested by: Peter Holm
MFC after: 2 weeks
2007-01-22 11:25:22 +00:00
|
|
|
|
|
|
|
static int
|
|
|
|
dead_rename(ap)
|
|
|
|
struct vop_rename_args /* {
|
|
|
|
struct vnode *a_fdvp;
|
|
|
|
struct vnode *a_fvp;
|
|
|
|
struct componentname *a_fcnp;
|
|
|
|
struct vnode *a_tdvp;
|
|
|
|
struct vnode *a_tvp;
|
|
|
|
struct componentname *a_tcnp;
|
|
|
|
} */ *ap;
|
|
|
|
{
|
|
|
|
if (ap->a_tvp)
|
|
|
|
vput(ap->a_tvp);
|
|
|
|
if (ap->a_tdvp == ap->a_tvp)
|
|
|
|
vrele(ap->a_tdvp);
|
|
|
|
else
|
|
|
|
vput(ap->a_tdvp);
|
|
|
|
vrele(ap->a_fdvp);
|
|
|
|
vrele(ap->a_fvp);
|
|
|
|
return (EXDEV);
|
|
|
|
}
|