From 7d7d9e22427335af6d0ff1fe811ac085fb8bf204 Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Wed, 13 Sep 2006 18:39:09 +0000 Subject: [PATCH] Fixes up the handling of shared vnode lock lookups in the NFS client, adds a FS type specific flag indicating that the FS supports shared vnode lock lookups, adds some logic in vfs_lookup.c to test this flag and set lock flags appropriately. - amd on 6.x is a non-starter (without this change). Using amd under heavy load results in a deadlock (with cascading vnode locks all the way to the root) very quickly. - This change should also fix the more general problem of cascading vnode deadlocks when an NFS server goes down. Ideally, we wouldn't need these changes, as enabling shared vnode lock lookups globally would work. Unfortunately, UFS, for example isn't ready for shared vnode lock lookups, crashing pretty quickly. This change is the result of discussions with Stephan Uphoff (ups@). Reviewed by: ups@ --- sys/kern/vfs_lookup.c | 30 +++++++++++++++++++++++------- sys/nfs4client/nfs4_vfsops.c | 4 ++-- sys/nfs4client/nfs4_vnops.c | 12 ++++++------ sys/nfsclient/nfs_node.c | 6 +++--- sys/nfsclient/nfs_subs.c | 2 +- sys/nfsclient/nfs_vfsops.c | 8 ++++---- sys/nfsclient/nfs_vnops.c | 10 +++++----- sys/nfsclient/nfsnode.h | 2 +- sys/sys/mount.h | 1 + 9 files changed, 46 insertions(+), 29 deletions(-) diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c index 137e36826bf7..a1497961cc78 100644 --- a/sys/kern/vfs_lookup.c +++ b/sys/kern/vfs_lookup.c @@ -303,6 +303,16 @@ namei(struct nameidata *ndp) return (error); } +static int +compute_cn_lkflags(struct mount *mp, int lkflags) +{ + if (mp == NULL || ((lkflags & LK_SHARED) && !(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED))) { /* mount unknown, or FS has not set MNTK_LOOKUP_SHARED: fall back to an exclusive lock */ + lkflags &= ~LK_SHARED; + lkflags |= LK_EXCLUSIVE; + } + return (lkflags); +} + /* * Search a pathname. * This is a very central and rather complicated routine. 
@@ -359,7 +369,8 @@ lookup(struct nameidata *ndp) int vfslocked; /* VFS Giant state for child */ int dvfslocked; /* VFS Giant state for parent */ int tvfslocked; - + int lkflags_save; + /* * Setup: break out flag bits into variables. */ @@ -387,7 +398,7 @@ lookup(struct nameidata *ndp) cnp->cn_lkflags = LK_EXCLUSIVE; dp = ndp->ni_startdir; ndp->ni_startdir = NULLVP; - vn_lock(dp, cnp->cn_lkflags | LK_RETRY, td); + vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td); dirloop: /* @@ -524,7 +535,7 @@ lookup(struct nameidata *ndp) VREF(dp); vput(tdp); VFS_UNLOCK_GIANT(tvfslocked); - vn_lock(dp, cnp->cn_lkflags | LK_RETRY, td); + vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td); } } @@ -560,7 +571,10 @@ lookup(struct nameidata *ndp) #ifdef NAMEI_DIAGNOSTIC vprint("lookup in", dp); #endif + lkflags_save = cnp->cn_lkflags; + cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags); if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) { + cnp->cn_lkflags = lkflags_save; KASSERT(ndp->ni_vp == NULL, ("leaf should be empty")); #ifdef NAMEI_DIAGNOSTIC printf("not found\n"); @@ -575,7 +589,7 @@ lookup(struct nameidata *ndp) VREF(dp); vput(tdp); VFS_UNLOCK_GIANT(tvfslocked); - vn_lock(dp, cnp->cn_lkflags | LK_RETRY, td); + vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td); goto unionlookup; } @@ -612,7 +626,8 @@ lookup(struct nameidata *ndp) VREF(ndp->ni_startdir); } goto success; - } + } else + cnp->cn_lkflags = lkflags_save; #ifdef NAMEI_DIAGNOSTIC printf("found\n"); #endif @@ -643,9 +658,9 @@ lookup(struct nameidata *ndp) vfslocked = VFS_LOCK_GIANT(mp); if (dp != ndp->ni_dvp) VOP_UNLOCK(ndp->ni_dvp, 0, td); - error = VFS_ROOT(mp, cnp->cn_lkflags, &tdp, td); + error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags), &tdp, td); vfs_unbusy(mp, td); - vn_lock(ndp->ni_dvp, cnp->cn_lkflags | LK_RETRY, td); + vn_lock(ndp->ni_dvp, compute_cn_lkflags(mp, cnp->cn_lkflags | LK_RETRY), td); if 
(error) { dpunlocked = 1; goto bad2; @@ -859,6 +874,7 @@ relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) */ return (0); } + dp = *vpp; /* diff --git a/sys/nfs4client/nfs4_vfsops.c b/sys/nfs4client/nfs4_vfsops.c index 0eb113bcfeaa..6bb1b9cbacd4 100644 --- a/sys/nfs4client/nfs4_vfsops.c +++ b/sys/nfs4client/nfs4_vfsops.c @@ -200,7 +200,7 @@ nfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td) #ifndef nolint sfp = NULL; #endif - error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np); + error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np, LK_EXCLUSIVE); if (error) return (error); vp = NFSTOV(np); @@ -724,7 +724,7 @@ nfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td) int error; nmp = VFSTONFS(mp); - error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np); + error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np, LK_EXCLUSIVE); if (error) return (error); vp = NFSTOV(np); diff --git a/sys/nfs4client/nfs4_vnops.c b/sys/nfs4client/nfs4_vnops.c index f0e920ed268d..5867471234bf 100644 --- a/sys/nfs4client/nfs4_vnops.c +++ b/sys/nfs4client/nfs4_vnops.c @@ -497,7 +497,7 @@ nfs4_openrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, if (vp == NULL) { /* New file */ error = nfs_nget(dvp->v_mount, &getfh.fh_val, - getfh.fh_len, &np); + getfh.fh_len, &np, LK_EXCLUSIVE); if (error != 0) goto nfsmout; @@ -1031,7 +1031,7 @@ nfs4_lookup(struct vop_lookup_args *ap) if (NFS_CMPFH(np, fhp, fhsize)) return (EISDIR); - error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); + error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, LK_EXCLUSIVE); if (error) return (error); @@ -1047,7 +1047,7 @@ nfs4_lookup(struct vop_lookup_args *ap) if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, td); - error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); + error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, LK_EXCLUSIVE); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); if (error) return (error); @@ -1058,7 
+1058,7 @@ nfs4_lookup(struct vop_lookup_args *ap) VREF(dvp); newvp = dvp; } else { - error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); + error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, LK_EXCLUSIVE); if (error) return (error); newvp = NFSTOV(np); @@ -1431,7 +1431,7 @@ nfs4_createrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, nfsm_v4dissect_getattr(&cp, &ga); nfsm_v4dissect_getfh(&cp, &gfh); - error = nfs_nget(dvp->v_mount, &gfh.fh_val, gfh.fh_len, &np); + error = nfs_nget(dvp->v_mount, &gfh.fh_val, gfh.fh_len, &np, LK_EXCLUSIVE); if (error != 0) goto nfsmout; @@ -2336,7 +2336,7 @@ nfs4_lookitup(struct vnode *dvp, const char *name, int len, struct ucred *cred, VREF(dvp); newvp = dvp; } else { - error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np); + error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np, LK_EXCLUSIVE); if (error) { m_freem(mrep); return (error); diff --git a/sys/nfsclient/nfs_node.c b/sys/nfsclient/nfs_node.c index 7edf2dd8985c..ebf453f8765a 100644 --- a/sys/nfsclient/nfs_node.c +++ b/sys/nfsclient/nfs_node.c @@ -99,7 +99,7 @@ nfs_vncmpf(struct vnode *vp, void *arg) * nfsnode structure is returned. 
*/ int -nfs_nget(struct mount *mntp, nfsfh_t *fhp, int fhsize, struct nfsnode **npp) +nfs_nget(struct mount *mntp, nfsfh_t *fhp, int fhsize, struct nfsnode **npp, int flags) { struct thread *td = curthread; /* XXX */ struct nfsnode *np; @@ -117,7 +117,7 @@ nfs_nget(struct mount *mntp, nfsfh_t *fhp, int fhsize, struct nfsnode **npp) ncmp.fhsize = fhsize; ncmp.fh = fhp; - error = vfs_hash_get(mntp, hash, LK_EXCLUSIVE, + error = vfs_hash_get(mntp, hash, flags, td, &nvp, nfs_vncmpf, &ncmp); if (error) return (error); @@ -153,7 +153,7 @@ nfs_nget(struct mount *mntp, nfsfh_t *fhp, int fhsize, struct nfsnode **npp) */ vp->v_vnlock->lk_flags |= LK_CANRECURSE; vp->v_vnlock->lk_flags &= ~LK_NOSHARE; - error = vfs_hash_insert(vp, hash, LK_EXCLUSIVE, + error = vfs_hash_insert(vp, hash, flags, td, &nvp, nfs_vncmpf, &ncmp); if (error) return (error); diff --git a/sys/nfsclient/nfs_subs.c b/sys/nfsclient/nfs_subs.c index 8556b3c902ba..92dd32e272ad 100644 --- a/sys/nfsclient/nfs_subs.c +++ b/sys/nfsclient/nfs_subs.c @@ -930,7 +930,7 @@ nfsm_mtofh_xx(struct vnode *d, struct vnode **v, int v3, int *f, t1 = nfsm_getfh_xx(&ttfhp, &ttfhsize, (v3), md, dpos); if (t1 != 0) return t1; - t1 = nfs_nget(d->v_mount, ttfhp, ttfhsize, &ttnp); + t1 = nfs_nget(d->v_mount, ttfhp, ttfhsize, &ttnp, LK_EXCLUSIVE); if (t1 != 0) return t1; *v = NFSTOV(ttnp); diff --git a/sys/nfsclient/nfs_vfsops.c b/sys/nfsclient/nfs_vfsops.c index 6a29da811f1c..480a5d69ed9d 100644 --- a/sys/nfsclient/nfs_vfsops.c +++ b/sys/nfsclient/nfs_vfsops.c @@ -254,7 +254,7 @@ nfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td) error = vfs_busy(mp, LK_NOWAIT, NULL, td); if (error) return (error); - error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np); + error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np, LK_EXCLUSIVE); if (error) { vfs_unbusy(mp, td); return (error); @@ -785,7 +785,7 @@ nfs_mount(struct mount *mp, struct thread *td) error = mountnfs(&args, mp, nam, hst, &vp, td->td_ucred); 
out: if (!error) - mp->mnt_kern_flag |= MNTK_MPSAFE; + mp->mnt_kern_flag |= (MNTK_MPSAFE|MNTK_LOOKUP_SHARED); return (error); } @@ -913,7 +913,7 @@ mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam, * this problem, because one can identify root inodes by their * number == ROOTINO (2). */ - error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np); + error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np, LK_EXCLUSIVE); if (error) goto bad; *vpp = NFSTOV(np); @@ -995,7 +995,7 @@ nfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td) int error; nmp = VFSTONFS(mp); - error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np); + error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np, flags); if (error) return error; vp = NFSTOV(np); diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c index b6ad2713ee16..e073de9a3522 100644 --- a/sys/nfsclient/nfs_vnops.c +++ b/sys/nfsclient/nfs_vnops.c @@ -899,7 +899,7 @@ nfs_lookup(struct vop_lookup_args *ap) m_freem(mrep); return (EISDIR); } - error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); + error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, LK_EXCLUSIVE); if (error) { m_freem(mrep); return (error); @@ -918,7 +918,7 @@ nfs_lookup(struct vop_lookup_args *ap) if (flags & ISDOTDOT) { VOP_UNLOCK(dvp, 0, td); - error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); + error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, cnp->cn_lkflags); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); if (error) return (error); @@ -927,7 +927,7 @@ nfs_lookup(struct vop_lookup_args *ap) VREF(dvp); newvp = dvp; } else { - error = nfs_nget(dvp->v_mount, fhp, fhsize, &np); + error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, cnp->cn_lkflags); if (error) { m_freem(mrep); return (error); @@ -2410,7 +2410,7 @@ nfs_readdirplusrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred) np = dnp; } else { error = nfs_nget(vp->v_mount, fhp, - fhsize, &np); + fhsize, &np, LK_EXCLUSIVE); if 
(error) doit = 0; else @@ -2604,7 +2604,7 @@ nfs_lookitup(struct vnode *dvp, const char *name, int len, struct ucred *cred, VREF(dvp); newvp = dvp; } else { - error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np); + error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np, LK_EXCLUSIVE); if (error) { m_freem(mrep); return (error); diff --git a/sys/nfsclient/nfsnode.h b/sys/nfsclient/nfsnode.h index e03e97cd253a..2287235e1fd0 100644 --- a/sys/nfsclient/nfsnode.h +++ b/sys/nfsclient/nfsnode.h @@ -189,7 +189,7 @@ int nfs_reclaim(struct vop_reclaim_args *); /* other stuff */ int nfs_removeit(struct sillyrename *); int nfs4_removeit(struct sillyrename *); -int nfs_nget(struct mount *, nfsfh_t *, int, struct nfsnode **); +int nfs_nget(struct mount *, nfsfh_t *, int, struct nfsnode **, int flags); nfsuint64 *nfs_getcookie(struct nfsnode *, off_t, int); uint64_t *nfs4_getcookie(struct nfsnode *, off_t, int); void nfs_invaldir(struct vnode *); diff --git a/sys/sys/mount.h b/sys/sys/mount.h index 0bf0701e7454..66e3ea61eab0 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -310,6 +310,7 @@ void __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp); #define MNTK_SUSPENDED 0x10000000 /* write operations are suspended */ #define MNTK_MPSAFE 0x20000000 /* Filesystem is MPSAFE. */ #define MNTK_NOKNOTE 0x80000000 /* Don't send KNOTEs from VOP hooks */ +#define MNTK_LOOKUP_SHARED 0x40000000 /* FS supports shared lock lookups */ /* * Sysctl CTL_VFS definitions.