From 56dd3a6182bf4441a29614d0a6091cb6cf6717ad Mon Sep 17 00:00:00 2001 From: Poul-Henning Kamp Date: Mon, 8 Nov 2004 10:46:47 +0000 Subject: [PATCH] Add optional device vnode bypass to DEVFS. The tunable vfs.devfs.fops controls this feature and defaults to off. When enabled (vfs.devfs.fops=1 in loader), device vnodes opened through a filedescriptor gets a special fops vector which instead of the detour through the vnode layer goes directly to DEVFS. Amongst other things this allows us to run Giant free read/write to device drivers which have been weaned off D_NEEDGIANT. Currently this means /dev/null, /dev/zero, disks, (and maybe the random stuff ?) On a 700MHz K7 machine this doubles the speed of dd if=/dev/zero of=/dev/null bs=1 count=1000000 This roughly translates to shaving 2usec of each read/write syscall. The poll/kqfilter paths need more work before they are giant free, this work is ongoing in p4::phk_bufwork Please test this and report any problems, LORs etc. --- sys/fs/devfs/devfs_vnops.c | 287 +++++++++++++++++++++++++++++++++++++ 1 file changed, 287 insertions(+) diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c index f3b95faddead..d3581aa38525 100644 --- a/sys/fs/devfs/devfs_vnops.c +++ b/sys/fs/devfs/devfs_vnops.c @@ -49,6 +49,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -59,11 +62,33 @@ #include #include #include +#include #include #include #include +static int devfs_fops = 0; + +static fo_rdwr_t devfs_read_f; +static fo_rdwr_t devfs_write_f; +static fo_ioctl_t devfs_ioctl_f; +static fo_poll_t devfs_poll_f; +static fo_kqfilter_t devfs_kqfilter_f; +static fo_stat_t devfs_stat_f; +static fo_close_t devfs_close_f; + +struct fileops devfs_ops_f = { + .fo_read = devfs_read_f, + .fo_write = devfs_write_f, + .fo_ioctl = devfs_ioctl_f, + .fo_poll = devfs_poll_f, + .fo_kqfilter = devfs_kqfilter_f, + .fo_stat = devfs_stat_f, + .fo_close = devfs_close_f, + .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE +}; + static int devfs_access(struct vop_access_args *ap); static int devfs_advlock(struct vop_advlock_args *ap); static int devfs_close(struct vop_close_args *ap); @@ -314,6 +339,18 @@ devfs_close(ap) return (error); } +static int +devfs_close_f(struct file *fp, struct thread *td) +{ + struct cdev *dev; + + dev = fp->f_data; +#if 0 + printf("devfs_close_f(%s)\n", devtoname(dev)); +#endif + return (vnops.fo_close(fp, td)); +} + /* * Synch buffers associated with a block device */ @@ -439,6 +476,67 @@ devfs_ioctl(ap) return (error); } +static int +devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td) +{ + struct cdev *dev; + struct cdevsw *dsw; + struct vnode *vp = fp->f_vnode; + struct vnode *vpold; + int error; + + dev = fp->f_data; +#if 0 + printf("devfs_ioctl_f(%s)\n", devtoname(dev)); +#endif + KASSERT(dev->si_refcount > 0, + ("devfs_ioctl() on un-referenced struct cdev *(%s)", + devtoname(dev))); + dsw = dev_refthread(dev); + if (dsw == NULL) + return (ENXIO); + + if (com == FIODTYPE) { + *(int *)data = dsw->d_flags & D_TYPEMASK; + dev_relthread(dev); + return (0); + } + if (dsw->d_flags & D_NEEDGIANT) + mtx_lock(&Giant); + error = dsw->d_ioctl(dev, com, data, fp->f_flag, td); + if (dsw->d_flags & D_NEEDGIANT) + mtx_unlock(&Giant); + dev_relthread(dev); + if (error == ENOIOCTL) + error = ENOTTY; + if (error == 0 && com == TIOCSCTTY) { + + /* Do nothing if reassigning same control tty */ + sx_slock(&proctree_lock); + if (td->td_proc->p_session->s_ttyvp == vp) { + sx_sunlock(&proctree_lock); + return (0); + } + + mtx_lock(&Giant); + + vpold = td->td_proc->p_session->s_ttyvp; + VREF(vp); + SESS_LOCK(td->td_proc->p_session); + td->td_proc->p_session->s_ttyvp = vp; + SESS_UNLOCK(td->td_proc->p_session); + + sx_sunlock(&proctree_lock); + + /* Get rid of reference to old control tty */ + if (vpold) + vrele(vpold); + mtx_unlock(&Giant); + } + return (error); +} + + /* ARGSUSED */ static int devfs_kqfilter(ap) @@ -467,6 +565,32 @@ devfs_kqfilter(ap) return (error); } +static int +devfs_kqfilter_f(struct file *fp, struct knote *kn) +{ + struct cdev *dev; + struct cdevsw *dsw; + int error; + + dev = fp->f_data; +#if 0 + printf("devfs_kqfilter_f(%s)\n", devtoname(dev)); +#endif + KASSERT(dev->si_refcount > 0, + ("devfs_kqfilter() on un-referenced struct cdev *(%s)", + devtoname(dev))); + dsw = dev_refthread(dev); + if (dsw == NULL) + return(0); + if (dsw->d_flags & D_NEEDGIANT) + mtx_lock(&Giant); + error = dsw->d_kqfilter(dev, kn); + if (dsw->d_flags & D_NEEDGIANT) + mtx_unlock(&Giant); + dev_relthread(dev); + return (error); +} + static int devfs_lookupx(ap) struct vop_lookup_args /* { @@ -697,8 +821,17 @@ devfs_open(ap) struct thread *td = ap->a_td; struct vnode *vp = ap->a_vp; struct cdev *dev = vp->v_rdev; + struct file *fp; int error; struct cdevsw *dsw; + static int once; + + if (!once) { + TUNABLE_INT_FETCH("vfs.devfs.fops", &devfs_fops); + if (devfs_fops) + printf("WARNING: DEVFS uses fops\n"); + once = 1; + } if (vp->v_type == VBLK) return (ENXIO); @@ -751,6 +884,23 @@ devfs_open(ap) if (error) return (error); + if (devfs_fops && ap->a_fdidx >= 0) { + /* + * This is a pretty disgustingly long chain, but I am not + * sure there is any better way. Passing the fdidx into + * VOP_OPEN() offers us more information than just passing + * the file *. + */ + fp = ap->a_td->td_proc->p_fd->fd_ofiles[ap->a_fdidx]; + if (fp->f_ops == &badfileops) { +#if 0 + printf("devfs_open(%s)\n", devtoname(dev)); +#endif + fp->f_ops = &devfs_ops_f; + fp->f_data = dev; + } + } + return (error); } @@ -817,6 +967,32 @@ devfs_poll(ap) return(error); } +static int +devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td) +{ + struct cdev *dev; + struct cdevsw *dsw; + int error; + + dev = fp->f_data; +#if 0 + printf("devfs_poll_f(%s)\n", devtoname(dev)); +#endif + dsw = dev_refthread(dev); + if (dsw == NULL) + return (0); + KASSERT(dev->si_refcount > 0, + ("devfs_poll() on un-referenced struct cdev *(%s)", + devtoname(dev))); + if (dsw->d_flags & D_NEEDGIANT) + mtx_lock(&Giant); + error = dsw->d_poll(dev, events, td); + if (dsw->d_flags & D_NEEDGIANT) + mtx_unlock(&Giant); + dev_relthread(dev); + return(error); +} + /* * Print out the contents of a special device vnode. */ @@ -879,6 +1055,55 @@ devfs_read(ap) return (error); } +static int +devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) +{ + struct cdev *dev; + int ioflag, error, resid; + struct cdevsw *dsw; + struct vnode *vp; + + dev = fp->f_data; +#if 0 + /* + * Enabling this one is dangerous, syslog will log once for each + * read from /dev/klog so... + */ + printf("devfs_read_f(%s)\n", devtoname(dev)); +#endif + KASSERT(dev->si_refcount > 0, + ("specread() on un-referenced struct cdev *(%s)", devtoname(dev))); + dsw = dev_refthread(dev); + if (dsw == NULL) + return (ENXIO); + + vp = fp->f_vnode; + resid = uio->uio_resid; + + ioflag = 0; + if (fp->f_flag & FNONBLOCK) + ioflag |= IO_NDELAY; + if (fp->f_flag & O_DIRECT) + ioflag |= IO_DIRECT; + + if ((flags & FOF_OFFSET) == 0) + uio->uio_offset = fp->f_offset; + + if (dsw->d_flags & D_NEEDGIANT) + mtx_lock(&Giant); + error = dsw->d_read(dev, uio, ioflag); + if (dsw->d_flags & D_NEEDGIANT) + mtx_unlock(&Giant); + dev_relthread(dev); + if (uio->uio_resid != resid || (error == 0 && resid != 0)) + vfs_timestamp(&dev->si_atime); + + if ((flags & FOF_OFFSET) == 0) + fp->f_offset = uio->uio_offset; + fp->f_nextoff = uio->uio_offset; + return (error); +} + static int devfs_readdir(ap) struct vop_readdir_args /* { @@ -1198,6 +1423,18 @@ devfs_setlabel(ap) } #endif +static int +devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td) +{ + struct cdev *dev; + + dev = fp->f_data; +#if 0 + printf("devfs_stat_f(%s)\n", devtoname(dev)); +#endif + return (vnops.fo_stat(fp, sb, cred, td)); +} + static int devfs_symlink(ap) struct vop_symlink_args /* { @@ -1288,6 +1525,56 @@ devfs_write(ap) return (error); } +static int +devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) +{ + struct cdev *dev; + struct vnode *vp; + int error, ioflag, resid; + struct cdevsw *dsw; + + dev = fp->f_data; +#if 0 + printf("devfs_write_f(%s)\n", devtoname(dev)); +#endif + KASSERT(dev->si_refcount > 0, + ("devfs_write() on un-referenced struct cdev *(%s)", + devtoname(dev))); + dsw = dev_refthread(dev); + if (dsw == NULL) + return (ENXIO); + + KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); + vp = fp->f_vnode; + ioflag = IO_UNIT; + if (fp->f_flag & FNONBLOCK) + ioflag |= IO_NDELAY; + if (fp->f_flag & O_DIRECT) + ioflag |= IO_DIRECT; + if ((fp->f_flag & O_FSYNC) || + (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) + ioflag |= IO_SYNC; + if ((flags & FOF_OFFSET) == 0) + uio->uio_offset = fp->f_offset; + + resid = uio->uio_resid; + + if (dsw->d_flags & D_NEEDGIANT) + mtx_lock(&Giant); + error = dsw->d_write(dev, uio, ioflag); + if (dsw->d_flags & D_NEEDGIANT) + mtx_unlock(&Giant); + dev_relthread(dev); + if (uio->uio_resid != resid || (error == 0 && resid != 0)) { + vfs_timestamp(&dev->si_ctime); + dev->si_mtime = dev->si_ctime; + } + + if ((flags & FOF_OFFSET) == 0) + fp->f_offset = uio->uio_offset; + fp->f_nextoff = uio->uio_offset; + return (error); +} static struct vnodeopv_entry_desc devfs_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop },