From 56dd3a6182bf4441a29614d0a6091cb6cf6717ad Mon Sep 17 00:00:00 2001
From: Poul-Henning Kamp <phk@FreeBSD.org>
Date: Mon, 8 Nov 2004 10:46:47 +0000
Subject: [PATCH] Add optional device vnode bypass to DEVFS.

The tunable vfs.devfs.fops controls this feature and defaults to off.

When enabled (vfs.devfs.fops=1 in loader), device vnodes opened
through a filedescriptor gets a special fops vector which instead
of the detour through the vnode layer goes directly to DEVFS.

Amongst other things this allows us to run Giant free read/write to
device drivers which have been weaned off D_NEEDGIANT.

Currently this means /dev/null, /dev/zero, disks, (and maybe the
random stuff ?)

On a 700MHz K7 machine this doubles the speed of
	dd if=/dev/zero of=/dev/null bs=1 count=1000000

This roughly translates to shaving 2usec of each read/write syscall.

The poll/kqfilter paths need more work before they are giant free,
this work is ongoing in p4::phk_bufwork

Please test this and report any problems, LORs etc.
---
 sys/fs/devfs/devfs_vnops.c | 287 +++++++++++++++++++++++++++++++++++++
 1 file changed, 287 insertions(+)

diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c
index f3b95faddead..d3581aa38525 100644
--- a/sys/fs/devfs/devfs_vnops.c
+++ b/sys/fs/devfs/devfs_vnops.c
@@ -49,6 +49,9 @@
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mac.h>
@@ -59,11 +62,33 @@
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/time.h>
+#include <sys/ttycom.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 #include <fs/devfs/devfs.h>
 
+static int devfs_fops = 0;
+
+static fo_rdwr_t	devfs_read_f;
+static fo_rdwr_t	devfs_write_f;
+static fo_ioctl_t	devfs_ioctl_f;
+static fo_poll_t	devfs_poll_f;
+static fo_kqfilter_t	devfs_kqfilter_f;
+static fo_stat_t	devfs_stat_f;
+static fo_close_t	devfs_close_f;
+
+struct fileops devfs_ops_f = {
+	.fo_read =	devfs_read_f,
+	.fo_write =	devfs_write_f,
+	.fo_ioctl =	devfs_ioctl_f,
+	.fo_poll =	devfs_poll_f,
+	.fo_kqfilter =	devfs_kqfilter_f,
+	.fo_stat =	devfs_stat_f,
+	.fo_close =	devfs_close_f,
+	.fo_flags =	DFLAG_PASSABLE | DFLAG_SEEKABLE
+};
+
 static int	devfs_access(struct vop_access_args *ap);
 static int	devfs_advlock(struct vop_advlock_args *ap);
 static int	devfs_close(struct vop_close_args *ap);
@@ -314,6 +339,18 @@ devfs_close(ap)
 	return (error);
 }
 
+static int
+devfs_close_f(struct file *fp, struct thread *td)
+{
+	struct cdev *dev;
+
+	dev = fp->f_data;
+#if 0
+	printf("devfs_close_f(%s)\n", devtoname(dev));
+#endif
+	return (vnops.fo_close(fp, td));
+}
+
 /*
  * Synch buffers associated with a block device
  */
@@ -439,6 +476,67 @@ devfs_ioctl(ap)
 	return (error);
 }
 
+static int
+devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td)
+{
+	struct cdev *dev;
+	struct cdevsw *dsw;
+	struct vnode *vp = fp->f_vnode;
+	struct vnode *vpold;
+	int error;
+
+	dev = fp->f_data;
+#if 0
+	printf("devfs_ioctl_f(%s)\n", devtoname(dev));
+#endif
+	KASSERT(dev->si_refcount > 0,
+	    ("devfs_ioctl() on un-referenced struct cdev *(%s)",
+	    devtoname(dev)));
+	dsw = dev_refthread(dev);
+	if (dsw == NULL)
+		return (ENXIO);
+
+	if (com == FIODTYPE) {
+		*(int *)data = dsw->d_flags & D_TYPEMASK;
+		dev_relthread(dev);
+		return (0);
+	}
+	if (dsw->d_flags & D_NEEDGIANT)
+		mtx_lock(&Giant);
+	error = dsw->d_ioctl(dev, com, data, fp->f_flag, td);
+	if (dsw->d_flags & D_NEEDGIANT)
+		mtx_unlock(&Giant);
+	dev_relthread(dev);
+	if (error == ENOIOCTL)
+		error = ENOTTY;
+	if (error == 0 && com == TIOCSCTTY) {
+
+		/* Do nothing if reassigning same control tty */
+		sx_slock(&proctree_lock);
+		if (td->td_proc->p_session->s_ttyvp == vp) {
+			sx_sunlock(&proctree_lock);
+			return (0);
+		}
+
+		mtx_lock(&Giant);
+
+		vpold = td->td_proc->p_session->s_ttyvp;
+		VREF(vp);
+		SESS_LOCK(td->td_proc->p_session);
+		td->td_proc->p_session->s_ttyvp = vp;
+		SESS_UNLOCK(td->td_proc->p_session);
+
+		sx_sunlock(&proctree_lock);
+
+		/* Get rid of reference to old control tty */
+		if (vpold)
+			vrele(vpold);
+		mtx_unlock(&Giant);
+	}
+	return (error);
+}
+
+
 /* ARGSUSED */
 static int
 devfs_kqfilter(ap)
@@ -467,6 +565,32 @@ devfs_kqfilter(ap)
 	return (error);
 }
 
+static int
+devfs_kqfilter_f(struct file *fp, struct knote *kn)
+{
+	struct cdev *dev;
+	struct cdevsw *dsw;
+	int error;
+
+	dev = fp->f_data;
+#if 0
+	printf("devfs_kqfilter_f(%s)\n", devtoname(dev));
+#endif
+	KASSERT(dev->si_refcount > 0,
+	    ("devfs_kqfilter() on un-referenced struct cdev *(%s)",
+	    devtoname(dev)));
+	dsw = dev_refthread(dev);
+	if (dsw == NULL)
+		return(0);
+	if (dsw->d_flags & D_NEEDGIANT)
+		mtx_lock(&Giant);
+	error = dsw->d_kqfilter(dev, kn);
+	if (dsw->d_flags & D_NEEDGIANT)
+		mtx_unlock(&Giant);
+	dev_relthread(dev);
+	return (error);
+}
+
 static int
 devfs_lookupx(ap)
 	struct vop_lookup_args /* {
@@ -697,8 +821,17 @@ devfs_open(ap)
 	struct thread *td = ap->a_td;
 	struct vnode *vp = ap->a_vp;
 	struct cdev *dev = vp->v_rdev;
+	struct file *fp;
 	int error;
 	struct cdevsw *dsw;
+	static int once;
+
+	if (!once) {
+		TUNABLE_INT_FETCH("vfs.devfs.fops", &devfs_fops);
+		if (devfs_fops)
+			printf("WARNING: DEVFS uses fops\n");
+		once = 1;
+	}
 
 	if (vp->v_type == VBLK)
 		return (ENXIO);
@@ -751,6 +884,23 @@ devfs_open(ap)
 	if (error)
 		return (error);
 
+	if (devfs_fops && ap->a_fdidx >= 0) {
+		/*
+		 * This is a pretty disgustingly long chain, but I am not
+		 * sure there is any better way.  Passing the fdidx into
+		 * VOP_OPEN() offers us more information than just passing
+		 * the file *.
+		 */
+		fp = ap->a_td->td_proc->p_fd->fd_ofiles[ap->a_fdidx];
+		if (fp->f_ops == &badfileops) {
+#if 0
+			printf("devfs_open(%s)\n", devtoname(dev));
+#endif
+			fp->f_ops = &devfs_ops_f;
+			fp->f_data = dev;
+		}
+	}
+
 	return (error);
 }
 
@@ -817,6 +967,32 @@ devfs_poll(ap)
 	return(error);
 }
 
+static int
+devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td)
+{
+	struct cdev *dev;
+	struct cdevsw *dsw;
+	int error;
+
+	dev = fp->f_data;
+#if 0
+	printf("devfs_poll_f(%s)\n", devtoname(dev));
+#endif
+	dsw = dev_refthread(dev);
+	if (dsw == NULL)
+		return (0);
+	KASSERT(dev->si_refcount > 0,
+	    ("devfs_poll() on un-referenced struct cdev *(%s)",
+	    devtoname(dev)));
+	if (dsw->d_flags & D_NEEDGIANT)
+		mtx_lock(&Giant);
+	error = dsw->d_poll(dev, events, td);
+	if (dsw->d_flags & D_NEEDGIANT)
+		mtx_unlock(&Giant);
+	dev_relthread(dev);
+	return(error);
+}
+
 /*
  * Print out the contents of a special device vnode.
  */
@@ -879,6 +1055,55 @@ devfs_read(ap)
 	return (error);
 }
 
+static int
+devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)
+{
+	struct cdev *dev;
+	int ioflag, error, resid;
+	struct cdevsw *dsw;
+	struct vnode *vp;
+
+	dev = fp->f_data;
+#if 0
+	/*
+	 * Enabling this one is dangerous, syslog will log once for each
+	 * read from /dev/klog so...
+	 */
+	printf("devfs_read_f(%s)\n", devtoname(dev));
+#endif
+	KASSERT(dev->si_refcount > 0,
+	    ("specread() on un-referenced struct cdev *(%s)", devtoname(dev)));
+	dsw = dev_refthread(dev);
+	if (dsw == NULL)
+		return (ENXIO);
+
+	vp = fp->f_vnode;
+	resid = uio->uio_resid;
+
+	ioflag = 0;
+	if (fp->f_flag & FNONBLOCK)
+		ioflag |= IO_NDELAY;
+	if (fp->f_flag & O_DIRECT)
+		ioflag |= IO_DIRECT;
+
+	if ((flags & FOF_OFFSET) == 0)
+		uio->uio_offset = fp->f_offset;
+
+	if (dsw->d_flags & D_NEEDGIANT)
+		mtx_lock(&Giant);
+	error = dsw->d_read(dev, uio, ioflag);
+	if (dsw->d_flags & D_NEEDGIANT)
+		mtx_unlock(&Giant);
+	dev_relthread(dev);
+	if (uio->uio_resid != resid || (error == 0 && resid != 0))
+		vfs_timestamp(&dev->si_atime);
+
+	if ((flags & FOF_OFFSET) == 0)
+		fp->f_offset = uio->uio_offset;
+	fp->f_nextoff = uio->uio_offset;
+	return (error);
+}
+
 static int
 devfs_readdir(ap)
 	struct vop_readdir_args /* {
@@ -1198,6 +1423,18 @@ devfs_setlabel(ap)
 }
 #endif
 
+static int
+devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td)
+{
+	struct cdev *dev;
+
+	dev = fp->f_data;
+#if 0
+	printf("devfs_stat_f(%s)\n", devtoname(dev));
+#endif
+	return (vnops.fo_stat(fp, sb, cred, td));
+}
+
 static int
 devfs_symlink(ap)
 	struct vop_symlink_args /* {
@@ -1288,6 +1525,56 @@ devfs_write(ap)
 	return (error);
 }
 
+static int
+devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)
+{
+	struct cdev *dev;
+	struct vnode *vp;
+	int error, ioflag, resid;
+	struct cdevsw *dsw;
+
+	dev = fp->f_data;
+#if 0
+	printf("devfs_write_f(%s)\n", devtoname(dev));
+#endif
+	KASSERT(dev->si_refcount > 0,
+	    ("devfs_write() on un-referenced struct cdev *(%s)",
+	    devtoname(dev)));
+	dsw = dev_refthread(dev);
+	if (dsw == NULL)
+		return (ENXIO);
+
+	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td));
+	vp = fp->f_vnode;
+	ioflag = IO_UNIT;
+	if (fp->f_flag & FNONBLOCK)
+		ioflag |= IO_NDELAY;
+	if (fp->f_flag & O_DIRECT)
+		ioflag |= IO_DIRECT;
+	if ((fp->f_flag & O_FSYNC) ||
+	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
+		ioflag |= IO_SYNC;
+	if ((flags & FOF_OFFSET) == 0)
+		uio->uio_offset = fp->f_offset;
+
+	resid = uio->uio_resid;
+
+	if (dsw->d_flags & D_NEEDGIANT)
+		mtx_lock(&Giant);
+	error = dsw->d_write(dev, uio, ioflag);
+	if (dsw->d_flags & D_NEEDGIANT)
+		mtx_unlock(&Giant);
+	dev_relthread(dev);
+	if (uio->uio_resid != resid || (error == 0 && resid != 0)) {
+		vfs_timestamp(&dev->si_ctime);
+		dev->si_mtime = dev->si_ctime;
+	}
+
+	if ((flags & FOF_OFFSET) == 0)
+		fp->f_offset = uio->uio_offset;
+	fp->f_nextoff = uio->uio_offset;
+	return (error);
+}
 
 static struct vnodeopv_entry_desc devfs_vnodeop_entries[] = {
 	{ &vop_default_desc,		(vop_t *) vop_defaultop },