From 0fc6daa72d3513dbb7c5699a3eb9cee69ef4bd23 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Sat, 11 May 2013 11:17:44 +0000 Subject: [PATCH] - Fix nullfs vnode reference leak in nullfs_reclaim_lowervp(). The null_hashget() obtains the reference on the nullfs vnode, which must be dropped. - Fix a wart which existed from the introduction of the nullfs caching, do not unlock lower vnode in the nullfs_reclaim_lowervp(). It should be innocent, but now it is also formally safe. Inform the nullfs_reclaim() about this using the NULLV_NOUNLOCK flag set on nullfs inode. - Add a callback to the upper filesystems for the lower vnode unlinking. When inactivating a nullfs vnode, check if the lower vnode was unlinked, indicated by nullfs flag NULLV_DROP or VV_NOSYNC on the lower vnode, and reclaim upper vnode if so. This allows nullfs to purge cached vnodes for the unlinked lower vnode, avoiding excessive caching. Reported by: G??ran L??wkrantz Tested by: pho Sponsored by: The FreeBSD Foundation MFC after: 2 weeks --- sys/fs/nullfs/null.h | 4 ++++ sys/fs/nullfs/null_subr.c | 1 + sys/fs/nullfs/null_vfsops.c | 33 +++++++++++++++++++++++++++++++-- sys/fs/nullfs/null_vnops.c | 19 ++++++++++++++----- sys/kern/vfs_subr.c | 25 ++++++++++++++++++------- sys/kern/vfs_syscalls.c | 2 ++ sys/sys/mount.h | 17 +++++++++++++++-- 7 files changed, 85 insertions(+), 16 deletions(-) diff --git a/sys/fs/nullfs/null.h b/sys/fs/nullfs/null.h index 4f37020d6da5..0972dfc409c3 100644 --- a/sys/fs/nullfs/null.h +++ b/sys/fs/nullfs/null.h @@ -53,8 +53,12 @@ struct null_node { LIST_ENTRY(null_node) null_hash; /* Hash list */ struct vnode *null_lowervp; /* VREFed once */ struct vnode *null_vnode; /* Back pointer */ + u_int null_flags; }; +#define NULLV_NOUNLOCK 0x0001 +#define NULLV_DROP 0x0002 + #define MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data)) #define VTONULL(vp) ((struct null_node *)(vp)->v_data) #define NULLTOV(xp) ((xp)->null_vnode) diff --git a/sys/fs/nullfs/null_subr.c b/sys/fs/nullfs/null_subr.c index 90e56ea6f5f5..fa6c4af12577 100644 --- a/sys/fs/nullfs/null_subr.c +++ b/sys/fs/nullfs/null_subr.c @@ -247,6 +247,7 @@ null_nodeget(mp, lowervp, vpp) xp->null_vnode = vp; xp->null_lowervp = lowervp; + xp->null_flags = 0; vp->v_type = lowervp->v_type; vp->v_data = xp; vp->v_vnlock = lowervp->v_vnlock; diff --git a/sys/fs/nullfs/null_vfsops.c b/sys/fs/nullfs/null_vfsops.c index 02932bd932b2..ad0223664aec 100644 --- a/sys/fs/nullfs/null_vfsops.c +++ b/sys/fs/nullfs/null_vfsops.c @@ -65,7 +65,6 @@ static vfs_statfs_t nullfs_statfs; static vfs_unmount_t nullfs_unmount; static vfs_vget_t nullfs_vget; static vfs_extattrctl_t nullfs_extattrctl; -static vfs_reclaim_lowervp_t nullfs_reclaim_lowervp; /* * Mount null layer @@ -391,8 +390,37 @@ nullfs_reclaim_lowervp(struct mount *mp, struct vnode *lowervp) vp = null_hashget(mp, lowervp); if (vp == NULL) return; + VTONULL(vp)->null_flags |= NULLV_NOUNLOCK; vgone(vp); - vn_lock(lowervp, LK_EXCLUSIVE | LK_RETRY); + vput(vp); +} + +static void +nullfs_unlink_lowervp(struct mount *mp, struct vnode *lowervp) +{ + struct vnode *vp; + struct null_node *xp; + + vp = null_hashget(mp, lowervp); + if (vp == NULL) + return; + xp = VTONULL(vp); + xp->null_flags |= NULLV_DROP | NULLV_NOUNLOCK; + vhold(vp); + vunref(vp); + + /* + * If vunref() dropped the last use reference on the nullfs + * vnode, it must be reclaimed, and its lock was split from + * the lower vnode lock. Need to do extra unlock before + * allowing the final vdrop() to free the vnode. + */ + if (vp->v_usecount == 0) { + KASSERT((vp->v_iflag & VI_DOOMED) != 0, + ("not reclaimed %p", vp)); + VOP_UNLOCK(vp, 0); + } + vdrop(vp); } static struct vfsops null_vfsops = { @@ -408,6 +436,7 @@ static struct vfsops null_vfsops = { .vfs_unmount = nullfs_unmount, .vfs_vget = nullfs_vget, .vfs_reclaim_lowervp = nullfs_reclaim_lowervp, + .vfs_unlink_lowervp = nullfs_unlink_lowervp, }; VFS_SET(null_vfsops, nullfs, VFCF_LOOPBACK | VFCF_JAIL); diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c index f59865f9680f..6ff15ee9f1d1 100644 --- a/sys/fs/nullfs/null_vnops.c +++ b/sys/fs/nullfs/null_vnops.c @@ -692,18 +692,24 @@ null_unlock(struct vop_unlock_args *ap) static int null_inactive(struct vop_inactive_args *ap __unused) { - struct vnode *vp; + struct vnode *vp, *lvp; + struct null_node *xp; struct mount *mp; struct null_mount *xmp; vp = ap->a_vp; + xp = VTONULL(vp); + lvp = NULLVPTOLOWERVP(vp); mp = vp->v_mount; xmp = MOUNTTONULLMOUNT(mp); - if ((xmp->nullm_flags & NULLM_CACHE) == 0) { + if ((xmp->nullm_flags & NULLM_CACHE) == 0 || + (xp->null_flags & NULLV_DROP) != 0 || + (lvp->v_vflag & VV_NOSYNC) != 0) { /* * If this is the last reference and caching of the - * nullfs vnodes is not enabled, then free up the - * vnode so as not to tie up the lower vnodes. + * nullfs vnodes is not enabled, or the lower vnode is + * deleted, then free up the vnode so as not to tie up + * the lower vnodes. */ vp->v_object = NULL; vrecycle(vp); @@ -748,7 +754,10 @@ null_reclaim(struct vop_reclaim_args *ap) */ if (vp->v_writecount > 0) VOP_ADD_WRITECOUNT(lowervp, -1); - vput(lowervp); + if ((xp->null_flags & NULLV_NOUNLOCK) != 0) + vunref(lowervp); + else + vput(lowervp); free(xp, M_NULLFSNODE); return (0); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index d9cbde2fbabc..7f72ed15ae30 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -2700,19 +2700,20 @@ vgone(struct vnode *vp) } static void -vgonel_reclaim_lowervp_vfs(struct mount *mp __unused, +notify_lowervp_vfs_dummy(struct mount *mp __unused, struct vnode *lowervp __unused) { } /* - * Notify upper mounts about reclaimed vnode. + * Notify upper mounts about reclaimed or unlinked vnode. */ -static void -vgonel_reclaim_lowervp(struct vnode *vp) +void +vfs_notify_upper(struct vnode *vp, int event) { static struct vfsops vgonel_vfsops = { - .vfs_reclaim_lowervp = vgonel_reclaim_lowervp_vfs + .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, + .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, }; struct mount *mp, *ump, *mmp; @@ -2736,7 +2737,17 @@ vgonel_reclaim_lowervp(struct vnode *vp) } TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); MNT_IUNLOCK(mp); - VFS_RECLAIM_LOWERVP(ump, vp); + switch (event) { + case VFS_NOTIFY_UPPER_RECLAIM: + VFS_RECLAIM_LOWERVP(ump, vp); + break; + case VFS_NOTIFY_UPPER_UNLINK: + VFS_UNLINK_LOWERVP(ump, vp); + break; + default: + KASSERT(0, ("invalid event %d", event)); + break; + } MNT_ILOCK(mp); ump = TAILQ_NEXT(mmp, mnt_upper_link); TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); @@ -2783,7 +2794,7 @@ vgonel(struct vnode *vp) active = vp->v_usecount; oweinact = (vp->v_iflag & VI_OWEINACT); VI_UNLOCK(vp); - vgonel_reclaim_lowervp(vp); + vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); /* * Clean out any buffers associated with the vnode. diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 29cb7bdca801..a004ea0d51ed 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1846,6 +1846,7 @@ kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg, if (error) goto out; #endif + vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); #ifdef MAC out: @@ -3825,6 +3826,7 @@ kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg) return (error); goto restart; } + vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: diff --git a/sys/sys/mount.h b/sys/sys/mount.h index a9c86f67fb92..a953dae420cd 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -608,7 +608,7 @@ typedef int vfs_mount_t(struct mount *mp); typedef int vfs_sysctl_t(struct mount *mp, fsctlop_t op, struct sysctl_req *req); typedef void vfs_susp_clean_t(struct mount *mp); -typedef void vfs_reclaim_lowervp_t(struct mount *mp, struct vnode *lowervp); +typedef void vfs_notify_lowervp_t(struct mount *mp, struct vnode *lowervp); struct vfsops { vfs_mount_t *vfs_mount; @@ -626,7 +626,8 @@ struct vfsops { vfs_extattrctl_t *vfs_extattrctl; vfs_sysctl_t *vfs_sysctl; vfs_susp_clean_t *vfs_susp_clean; - vfs_reclaim_lowervp_t *vfs_reclaim_lowervp; + vfs_notify_lowervp_t *vfs_reclaim_lowervp; + vfs_notify_lowervp_t *vfs_unlink_lowervp; }; vfs_statfs_t __vfs_statfs; @@ -747,6 +748,14 @@ vfs_statfs_t __vfs_statfs; } \ } while (0) +#define VFS_UNLINK_LOWERVP(MP, VP) do { \ + if (*(MP)->mnt_op->vfs_unlink_lowervp != NULL) { \ + VFS_PROLOGUE(MP); \ + (*(MP)->mnt_op->vfs_unlink_lowervp)((MP), (VP)); \ + VFS_EPILOGUE(MP); \ + } \ +} while (0) + #define VFS_KNOTE_LOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) == 0) \ @@ -759,6 +768,9 @@ vfs_statfs_t __vfs_statfs; VN_KNOTE((vp), (hint), 0); \ } while (0) +#define VFS_NOTIFY_UPPER_RECLAIM 1 +#define VFS_NOTIFY_UPPER_UNLINK 2 + #include /* @@ -840,6 +852,7 @@ int vfs_modevent(module_t, int, void *); void vfs_mount_error(struct mount *, const char *, ...); void vfs_mountroot(void); /* mount our root filesystem */ void vfs_mountedfrom(struct mount *, const char *from); +void vfs_notify_upper(struct vnode *, int); void vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp); void vfs_ref(struct mount *);