Add a new "mntfs" pseudo file system which provides private device vnodes for
file systems to safely access their disk devices, and adapt FFS to use it.

Also add a new BO_NOBUFS flag to allow enforcing that file systems using
mntfs vnodes do not accidentally use the original devfs vnode to create
buffers.

Reviewed by: kib, mckusick
Approved by: imp (mentor)
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D23787
svn path=/head/; revision=358714
2020-03-06 18:41:37 +00:00 · 2020-03-06 18:41:37 +00:00 · f15ccf8836 · 2020-12-20 02:59:44 +00:00
commit f15ccf8836
parent 6c37d6032e
8 changed files with 144 additions and 13 deletions
--- a/sys/conf/files
+++ b/sys/conf/files
@ -3479,6 +3479,7 @@ fs/fuse/fuse_main.c		optional fusefs
 fs/fuse/fuse_node.c		optional fusefs
 fs/fuse/fuse_vfsops.c		optional fusefs
 fs/fuse/fuse_vnops.c		optional fusefs
+fs/mntfs/mntfs_vnops.c		standard
 fs/msdosfs/msdosfs_conv.c	optional msdosfs
 fs/msdosfs/msdosfs_denode.c	optional msdosfs
 fs/msdosfs/msdosfs_fat.c	optional msdosfs
--- a/sys/fs/mntfs/mntfs_vnops.c
+++ b/sys/fs/mntfs/mntfs_vnops.c
@ -0,0 +1,95 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Netflix, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+
+/*
+ * The "mntfs" VCHR vnodes implemented here provide a safe way for file systems
+ * to access their disk devices.  Using the normal devfs vnode has the problem
+ * that if the device disappears, the devfs vnode is vgone'd as part of
+ * removing it from the application-visible namespace, and some file systems
+ * (notably FFS with softdep) get very unhappy if their dirty buffers are
+ * invalidated out from under them.  By using a separate, private vnode,
+ * file systems are able to clean up their buffer state in a controlled fashion
+ * when the underlying device disappears.
+ */
+
+static int	/* always returns 0 */
+mntfs_reclaim(struct vop_reclaim_args *ap)	/* the .vop_reclaim entry of mntfs_vnodeops */
+{
+	struct vnode *vp = ap->a_vp;	/* the mntfs vnode being reclaimed */
+
+	dev_rel(vp->v_rdev);	/* drop the cdev ref; pairs with dev_ref() in mntfs_allocvp() */
+	return (0);
+}
+
+struct vop_vector mntfs_vnodeops = {	/* vnode operations for the private mntfs VCHR vnodes */
+	.vop_default =		&default_vnodeops,	/* fallback vector for ops not set below */
+
+	.vop_fsync =		vop_stdfsync,
+	.vop_strategy = 	VOP_PANIC,	/* not expected to be called; FFS installs its own bufobj ops (ffs_ops) on the vnode */
+	.vop_reclaim =		mntfs_reclaim,	/* releases the cdev reference held in v_rdev */
+};
+VFS_VOP_VECTOR_REGISTER(mntfs_vnodeops);
+
+/*
+ * Allocate a private VCHR vnode for use by a mounted fs.
+ * The underlying device will be the same as for the given vnode.
+ * This mntfs vnode must be freed with mntfs_freevp() rather than just
+ * releasing the reference.
+ *
+ * Arguments:
+ *   mp  - mount passed to getnewvnode() for the new vnode.
+ *   ovp - existing device vnode whose v_rdev is shared with the new vnode;
+ *         asserted to be exclusively locked on entry.
+ *
+ * Returns the new vnode, which uses mntfs_vnodeops, has v_type VCHR,
+ * a NULL v_data, and v_rdev pointing at ovp's cdev.  An extra reference
+ * is taken on that cdev with dev_ref(); mntfs_reclaim() is the matching
+ * dev_rel().
+ *
+ * NOTE(review): the return value of getnewvnode() is ignored here, so vp
+ * is used unconditionally -- confirm that getnewvnode() cannot fail.
+ */
+struct vnode *
+mntfs_allocvp(struct mount *mp, struct vnode *ovp)
+{
+	struct vnode *vp;
+	struct cdev *dev;
+
+	ASSERT_VOP_ELOCKED(ovp, __func__);
+
+	dev = ovp->v_rdev;	/* read while ovp is asserted exclusively locked */
+
+	getnewvnode("mntfs", mp, &mntfs_vnodeops, &vp);
+	vp->v_type = VCHR;
+	vp->v_data = NULL;	/* no per-vnode private data is attached */
+	dev_ref(dev);	/* reference owned by the new vnode; dropped in mntfs_reclaim() */
+	vp->v_rdev = dev;
+
+	return (vp);
+}
+
+void	/* counterpart of mntfs_allocvp(): dispose of a private mntfs vnode */
+mntfs_freevp(struct vnode *vp)
+{
+
+	vgone(vp);	/* NOTE(review): no lock is taken or asserted on vp before vgone() -- confirm the locking contract */
+	vrele(vp);	/* then drop the reference on vp */
+}
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@ -2289,6 +2289,8 @@ buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
 	int error;

 	ASSERT_BO_WLOCKED(bo);
+	KASSERT((bo->bo_flag & BO_NOBUFS) == 0,
+	    ("buf_vlist_add: bo %p does not allow bufs", bo));
 	KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
 	    ("dead bo %p", bo));
 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
--- a/sys/sys/bufobj.h
+++ b/sys/sys/bufobj.h
@ -117,6 +117,7 @@ struct bufobj {
 #define	BO_ONWORKLST	(1 << 0)	/* On syncer work-list */
 #define	BO_WWAIT	(1 << 1)	/* Wait for output to complete */
 #define	BO_DEAD		(1 << 2)	/* Dead; only with INVARIANTS */
+#define	BO_NOBUFS	(1 << 3)	/* No bufs allowed */

 #define	BO_LOCKPTR(bo)		(&(bo)->bo_lock)
 #define	BO_LOCK(bo)		rw_wlock(BO_LOCKPTR((bo)))
--- a/sys/sys/mount.h
+++ b/sys/sys/mount.h
@ -940,6 +940,8 @@ extern	struct sx vfsconf_sx;
 #define	vfsconf_unlock()	sx_xunlock(&vfsconf_sx)
 #define	vfsconf_slock()		sx_slock(&vfsconf_sx)
 #define	vfsconf_sunlock()	sx_sunlock(&vfsconf_sx)
+struct vnode *mntfs_allocvp(struct mount *, struct vnode *);	/* allocate a private VCHR vnode sharing the given vnode's device */
+void   mntfs_freevp(struct vnode *);	/* free a vnode obtained from mntfs_allocvp() */

 /*
 * Declarations for these vfs default operations are located in
--- a/sys/ufs/ffs/ffs_alloc.c
+++ b/sys/ufs/ffs/ffs_alloc.c
@ -3594,6 +3594,7 @@ buffered_write(fp, uio, active_cred, flags, td)
 	struct inode *ip;
 	struct buf *bp;
 	struct fs *fs;
+	struct ufsmount *ump;
 	struct filedesc *fdp;
 	int error;
 	daddr_t lbn;
@ -3622,10 +3623,12 @@ buffered_write(fp, uio, active_cred, flags, td)
 		return (EINVAL);
 	}
 	ip = VTOI(vp);
-	if (ITODEVVP(ip) != devvp) {
+	ump = ip->i_ump;
+	if (ump->um_odevvp != devvp) {
 		vput(vp);
 		return (EINVAL);
 	}
+	devvp = ump->um_devvp;
 	fs = ITOFS(ip);
 	vput(vp);
 	foffset_lock_uio(fp, uio, flags);
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@ -151,7 +151,7 @@ static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr",
 static int
 ffs_mount(struct mount *mp)
 {
-	struct vnode *devvp;
+	struct vnode *devvp, *odevvp;
 	struct thread *td;
 	struct ufsmount *ump = NULL;
 	struct fs *fs;
@ -246,6 +246,7 @@ ffs_mount(struct mount *mp)
 	if (mp->mnt_flag & MNT_UPDATE) {
 		ump = VFSTOUFS(mp);
 		fs = ump->um_fs;
+		odevvp = ump->um_odevvp;
 		devvp = ump->um_devvp;
 		if (fsckpid == -1 && ump->um_fsckpid > 0) {
 			if ((error = ffs_flushfiles(mp, WRITECLOSE, td)) != 0 ||
@ -337,16 +338,15 @@ ffs_mount(struct mount *mp)
 			 * If upgrade to read-write by non-root, then verify
 			 * that user has necessary permissions on the device.
 			 */
-			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
-			error = VOP_ACCESS(devvp, VREAD | VWRITE,
+			vn_lock(odevvp, LK_EXCLUSIVE | LK_RETRY);
+			error = VOP_ACCESS(odevvp, VREAD | VWRITE,
 			    td->td_ucred, td);
 			if (error)
 				error = priv_check(td, PRIV_VFS_MOUNT_PERM);
+			VOP_UNLOCK(odevvp);
 			if (error) {
-				VOP_UNLOCK(devvp);
 				return (error);
 			}
-			VOP_UNLOCK(devvp);
 			fs->fs_flags &= ~FS_UNCLEAN;
 			if (fs->fs_clean == 0) {
 				fs->fs_flags |= FS_UNCLEAN;
@ -782,8 +782,8 @@ ffs_reload(struct mount *mp, struct thread *td, int flags)
 * Common code for mount and mountroot
 */
 static int
-ffs_mountfs(devvp, mp, td)
-	struct vnode *devvp;
+ffs_mountfs(odevvp, mp, td)
+	struct vnode *odevvp;
 	struct mount *mp;
 	struct thread *td;
 {
@ -794,6 +794,7 @@ ffs_mountfs(devvp, mp, td)
 	struct ucred *cred;
 	struct g_consumer *cp;
 	struct mount *nmp;
+	struct vnode *devvp;
 	int candelete, canspeedup;
 	off_t loc;

@ -802,11 +803,13 @@ ffs_mountfs(devvp, mp, td)
 	cred = td ? td->td_ucred : NOCRED;
 	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;

+	devvp = mntfs_allocvp(mp, odevvp);
+	VOP_UNLOCK(odevvp);
 	KASSERT(devvp->v_type == VCHR, ("reclaimed devvp"));
 	dev = devvp->v_rdev;
 	if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0,
 	    (uintptr_t)mp) == 0) {
-		VOP_UNLOCK(devvp);
+		mntfs_freevp(devvp);
 		return (EBUSY);
 	}
 	g_topology_lock();
@ -814,12 +817,14 @@ ffs_mountfs(devvp, mp, td)
 	g_topology_unlock();
 	if (error != 0) {
 		atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
-		VOP_UNLOCK(devvp);
+		mntfs_freevp(devvp);
 		return (error);
 	}
 	dev_ref(dev);
 	devvp->v_bufobj.bo_ops = &ffs_ops;
-	VOP_UNLOCK(devvp);
+	BO_LOCK(&odevvp->v_bufobj);
+	odevvp->v_bufobj.bo_flag |= BO_NOBUFS;
+	BO_UNLOCK(&odevvp->v_bufobj);
 	if (dev->si_iosize_max != 0)
 		mp->mnt_iosize_max = dev->si_iosize_max;
 	if (mp->mnt_iosize_max > MAXPHYS)
@ -1020,6 +1025,7 @@ ffs_mountfs(devvp, mp, td)
 	ump->um_mountp = mp;
 	ump->um_dev = dev;
 	ump->um_devvp = devvp;
+	ump->um_odevvp = odevvp;
 	ump->um_nindir = fs->fs_nindir;
 	ump->um_bptrtodb = fs->fs_fsbtodb;
 	ump->um_seqinc = fs->fs_frag;
@ -1099,7 +1105,11 @@ ffs_mountfs(devvp, mp, td)
 		free(ump, M_UFSMNT);
 		mp->mnt_data = NULL;
 	}
+	BO_LOCK(&odevvp->v_bufobj);
+	odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS;
+	BO_UNLOCK(&odevvp->v_bufobj);
 	atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
+	mntfs_freevp(devvp);
 	dev_rel(dev);
 	return (error);
 }
@ -1304,8 +1314,12 @@ ffs_unmount(mp, mntflags)
 	}
 	g_vfs_close(ump->um_cp);
 	g_topology_unlock();
+	BO_LOCK(&ump->um_odevvp->v_bufobj);
+	ump->um_odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS;
+	BO_UNLOCK(&ump->um_odevvp->v_bufobj);
 	atomic_store_rel_ptr((uintptr_t *)&ump->um_dev->si_mountpt, 0);
-	vrele(ump->um_devvp);
+	mntfs_freevp(ump->um_devvp);
+	vrele(ump->um_odevvp);
 	dev_rel(ump->um_dev);
 	mtx_destroy(UFS_MTX(ump));
 	if (mp->mnt_gjprovider != NULL) {
@ -2293,7 +2307,19 @@ ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
 	struct buf *tbp;
 	int error, nocopy;

+	/*
+	 * This is the bufobj strategy for the private VCHR vnodes
+	 * used by FFS to access the underlying storage device.
+	 * We override the default bufobj strategy and thus bypass
+	 * VOP_STRATEGY() for these vnodes.
+	 */
 	vp = bo2vnode(bo);
+	KASSERT(bp->b_vp == NULL || bp->b_vp->v_type != VCHR ||
+	    bp->b_vp->v_rdev == NULL ||
+	    bp->b_vp->v_rdev->si_mountpt == NULL ||
+	    VFSTOUFS(bp->b_vp->v_rdev->si_mountpt) == NULL ||
+	    vp == VFSTOUFS(bp->b_vp->v_rdev->si_mountpt)->um_devvp,
+	    ("ffs_geom_strategy() with wrong vp"));
 	if (bp->b_iocmd == BIO_WRITE) {
 		if ((bp->b_flags & B_VALIDSUSPWRT) == 0 &&
 		    bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
--- a/sys/ufs/ufs/ufsmount.h
+++ b/sys/ufs/ufs/ufsmount.h
@ -83,7 +83,8 @@ struct ufsmount {
 	struct	cdev *um_dev;			/* (r) device mounted */
 	struct	g_consumer *um_cp;		/* (r) GEOM access point */
 	struct	bufobj *um_bo;			/* (r) Buffer cache object */
-	struct	vnode *um_devvp;		/* (r) blk dev mounted vnode */
+	struct	vnode *um_odevvp;		/* (r) devfs dev vnode */
+	struct	vnode *um_devvp;		/* (r) mntfs private vnode */
 	u_long	um_fstype;			/* (c) type of filesystem */
 	struct	fs *um_fs;			/* (r) pointer to superblock */
 	struct	ufs_extattr_per_mount um_extattr; /* (c) extended attrs */