Add optional device vnode bypass to DEVFS.

The tunable vfs.devfs.fops controls this feature and defaults to off.

When enabled (vfs.devfs.fops=1 in loader), device vnodes opened
through a file descriptor get a special fops vector which, instead
of taking the detour through the vnode layer, goes directly to DEVFS.

Amongst other things this allows us to run Giant free read/write to
device drivers which have been weaned off D_NEEDGIANT.

Currently this means /dev/null, /dev/zero, disks, (and maybe the
random stuff ?)

On a 700MHz K7 machine this doubles the speed of
	dd if=/dev/zero of=/dev/null bs=1 count=1000000

This roughly translates to shaving 2usec off each read/write syscall.

The poll/kqfilter paths need more work before they are Giant-free;
this work is ongoing in p4::phk_bufwork.

Please test this and report any problems, LORs etc.
This commit is contained in:
Poul-Henning Kamp 2004-11-08 10:46:47 +00:00
parent ed35e0a562
commit 56dd3a6182

View File

@ -49,6 +49,9 @@
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mac.h>
@ -59,11 +62,33 @@
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/time.h>
#include <sys/ttycom.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <fs/devfs/devfs.h>
/*
 * Non-zero enables the fileops bypass.  The value is fetched from the
 * "vfs.devfs.fops" tunable in devfs_open(); it defaults to off.
 */
static int devfs_fops = 0;
/* fileops methods implemented in this file for bypassed device files. */
static fo_rdwr_t devfs_read_f;
static fo_rdwr_t devfs_write_f;
static fo_ioctl_t devfs_ioctl_f;
static fo_poll_t devfs_poll_f;
static fo_kqfilter_t devfs_kqfilter_f;
static fo_stat_t devfs_stat_f;
static fo_close_t devfs_close_f;
/*
 * fileops vector that devfs_open() installs on a file (together with
 * f_data pointing at the struct cdev) when devfs_fops is set.
 */
struct fileops devfs_ops_f = {
.fo_read = devfs_read_f,
.fo_write = devfs_write_f,
.fo_ioctl = devfs_ioctl_f,
.fo_poll = devfs_poll_f,
.fo_kqfilter = devfs_kqfilter_f,
.fo_stat = devfs_stat_f,
.fo_close = devfs_close_f,
.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};
static int devfs_access(struct vop_access_args *ap);
static int devfs_advlock(struct vop_advlock_args *ap);
static int devfs_close(struct vop_close_args *ap);
@ -314,6 +339,18 @@ devfs_close(ap)
return (error);
}
/*
 * fileops close method for bypassed device files.
 *
 * No device-specific work is done here; the call is handed straight to
 * the close method of the generic vnode fileops (vnops) and its result
 * is returned unchanged.
 */
static int
devfs_close_f(struct file *fp, struct thread *td)
{

#if 0
	printf("devfs_close_f(%s)\n", devtoname(fp->f_data));
#endif
	return (vnops.fo_close(fp, td));
}
/*
* Synch buffers associated with a block device
*/
@ -439,6 +476,67 @@ devfs_ioctl(ap)
return (error);
}
/*
 * fileops ioctl method for bypassed device files.
 *
 * FIODTYPE is answered locally from the driver's d_flags.  Every other
 * request is passed to the driver's d_ioctl entry, with Giant held
 * around the call when the driver sets D_NEEDGIANT.  A driver result of
 * ENOIOCTL is reported to the caller as ENOTTY.  After a successful
 * TIOCSCTTY the session's s_ttyvp is switched to this file's vnode.
 *
 * Returns ENXIO when dev_refthread() yields no cdevsw, otherwise the
 * (possibly translated) driver result.  The cred argument is not used.
 */
static int
devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td)
{
struct cdev *dev;
struct cdevsw *dsw;
struct vnode *vp = fp->f_vnode;
struct vnode *vpold;
int error;
dev = fp->f_data;
#if 0
printf("devfs_ioctl_f(%s)\n", devtoname(dev));
#endif
KASSERT(dev->si_refcount > 0,
("devfs_ioctl() on un-referenced struct cdev *(%s)",
devtoname(dev)));
dsw = dev_refthread(dev);
if (dsw == NULL)
return (ENXIO);
/* Device type query: answered here without calling the driver. */
if (com == FIODTYPE) {
*(int *)data = dsw->d_flags & D_TYPEMASK;
dev_relthread(dev);
return (0);
}
if (dsw->d_flags & D_NEEDGIANT)
mtx_lock(&Giant);
error = dsw->d_ioctl(dev, com, data, fp->f_flag, td);
if (dsw->d_flags & D_NEEDGIANT)
mtx_unlock(&Giant);
dev_relthread(dev);
if (error == ENOIOCTL)
error = ENOTTY;
if (error == 0 && com == TIOCSCTTY) {
/* Do nothing if reassigning same control tty */
sx_slock(&proctree_lock);
if (td->td_proc->p_session->s_ttyvp == vp) {
sx_sunlock(&proctree_lock);
return (0);
}
/*
 * NOTE(review): Giant is taken here while proctree_lock is held
 * shared; confirm this agrees with the lock order used elsewhere.
 */
mtx_lock(&Giant);
vpold = td->td_proc->p_session->s_ttyvp;
/* Reference the new vnode before storing it in the session. */
VREF(vp);
SESS_LOCK(td->td_proc->p_session);
td->td_proc->p_session->s_ttyvp = vp;
SESS_UNLOCK(td->td_proc->p_session);
sx_sunlock(&proctree_lock);
/* Get rid of reference to old control tty */
if (vpold)
vrele(vpold);
mtx_unlock(&Giant);
}
return (error);
}
/* ARGSUSED */
static int
devfs_kqfilter(ap)
@ -467,6 +565,32 @@ devfs_kqfilter(ap)
return (error);
}
/*
 * fileops kqfilter method for bypassed device files.
 *
 * Hands the knote to the driver's d_kqfilter entry, holding Giant around
 * the call when the driver sets D_NEEDGIANT, and returns the driver's
 * result.
 *
 * Returns ENXIO when dev_refthread() yields no cdevsw.  Returning 0 in
 * that case would report success even though the driver's d_kqfilter was
 * never called for this knote; ENXIO also matches what the read, write
 * and ioctl methods in this file return for the same condition.
 */
static int
devfs_kqfilter_f(struct file *fp, struct knote *kn)
{
	struct cdev *dev;
	struct cdevsw *dsw;
	int error;

	dev = fp->f_data;
#if 0
	printf("devfs_kqfilter_f(%s)\n", devtoname(dev));
#endif
	KASSERT(dev->si_refcount > 0,
	    ("devfs_kqfilter() on un-referenced struct cdev *(%s)",
	    devtoname(dev)));
	dsw = dev_refthread(dev);
	if (dsw == NULL)
		return (ENXIO);
	if (dsw->d_flags & D_NEEDGIANT)
		mtx_lock(&Giant);
	error = dsw->d_kqfilter(dev, kn);
	if (dsw->d_flags & D_NEEDGIANT)
		mtx_unlock(&Giant);
	dev_relthread(dev);
	return (error);
}
static int
devfs_lookupx(ap)
struct vop_lookup_args /* {
@ -697,8 +821,17 @@ devfs_open(ap)
struct thread *td = ap->a_td;
struct vnode *vp = ap->a_vp;
struct cdev *dev = vp->v_rdev;
struct file *fp;
int error;
struct cdevsw *dsw;
static int once;
if (!once) {
TUNABLE_INT_FETCH("vfs.devfs.fops", &devfs_fops);
if (devfs_fops)
printf("WARNING: DEVFS uses fops\n");
once = 1;
}
if (vp->v_type == VBLK)
return (ENXIO);
@ -751,6 +884,23 @@ devfs_open(ap)
if (error)
return (error);
if (devfs_fops && ap->a_fdidx >= 0) {
/*
* This is a pretty disgustingly long chain, but I am not
* sure there is any better way. Passing the fdidx into
* VOP_OPEN() offers us more information than just passing
* the file *.
*/
fp = ap->a_td->td_proc->p_fd->fd_ofiles[ap->a_fdidx];
if (fp->f_ops == &badfileops) {
#if 0
printf("devfs_open(%s)\n", devtoname(dev));
#endif
fp->f_ops = &devfs_ops_f;
fp->f_data = dev;
}
}
return (error);
}
@ -817,6 +967,32 @@ devfs_poll(ap)
return(error);
}
/*
 * fileops poll method for bypassed device files.
 *
 * Calls the driver's d_poll entry, with Giant held around the call when
 * the driver sets D_NEEDGIANT, and returns whatever the driver reports.
 * Returns 0 when dev_refthread() yields no cdevsw.  The cred argument is
 * not used.
 */
static int
devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td)
{
	struct cdev *cdp;
	struct cdevsw *csw;
	int need_giant, revents;

	cdp = fp->f_data;
#if 0
	printf("devfs_poll_f(%s)\n", devtoname(cdp));
#endif
	csw = dev_refthread(cdp);
	if (csw == NULL)
		return (0);
	KASSERT(cdp->si_refcount > 0,
	    ("devfs_poll() on un-referenced struct cdev *(%s)",
	    devtoname(cdp)));

	need_giant = (csw->d_flags & D_NEEDGIANT) != 0;
	if (need_giant)
		mtx_lock(&Giant);
	revents = csw->d_poll(cdp, events, td);
	if (need_giant)
		mtx_unlock(&Giant);
	dev_relthread(cdp);

	return(revents);
}
/*
* Print out the contents of a special device vnode.
*/
@ -879,6 +1055,55 @@ devfs_read(ap)
return (error);
}
/*
 * fileops read method for bypassed device files.
 *
 * Translates the file flags FNONBLOCK and O_DIRECT into IO_NDELAY and
 * IO_DIRECT, then calls the driver's d_read entry, with Giant held around
 * the call when the driver sets D_NEEDGIANT.  Unless FOF_OFFSET is set in
 * flags, the transfer starts at fp->f_offset and that offset is updated
 * afterwards; fp->f_nextoff is always updated.  The device's si_atime is
 * refreshed when the residual count changed, or when the call succeeded
 * with a non-zero request.
 *
 * Returns ENXIO when dev_refthread() yields no cdevsw, otherwise the
 * driver's result.  The cred and td arguments are not used.
 */
static int
devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)
{
	struct cdev *cdp;
	struct cdevsw *csw;
	int before, ioflag, need_giant, rv, use_fpoff;

	cdp = fp->f_data;
#if 0
	/*
	 * Enabling this one is dangerous, syslog will log once for each
	 * read from /dev/klog so...
	 */
	printf("devfs_read_f(%s)\n", devtoname(cdp));
#endif
	KASSERT(cdp->si_refcount > 0,
	    ("specread() on un-referenced struct cdev *(%s)", devtoname(cdp)));
	csw = dev_refthread(cdp);
	if (csw == NULL)
		return (ENXIO);

	before = uio->uio_resid;
	ioflag = ((fp->f_flag & FNONBLOCK) ? IO_NDELAY : 0) |
	    ((fp->f_flag & O_DIRECT) ? IO_DIRECT : 0);

	/* Without FOF_OFFSET the file's own offset drives the transfer. */
	use_fpoff = (flags & FOF_OFFSET) == 0;
	if (use_fpoff)
		uio->uio_offset = fp->f_offset;

	need_giant = (csw->d_flags & D_NEEDGIANT) != 0;
	if (need_giant)
		mtx_lock(&Giant);
	rv = csw->d_read(cdp, uio, ioflag);
	if (need_giant)
		mtx_unlock(&Giant);
	dev_relthread(cdp);

	if (uio->uio_resid != before || (rv == 0 && before != 0))
		vfs_timestamp(&cdp->si_atime);

	if (use_fpoff)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	return (rv);
}
static int
devfs_readdir(ap)
struct vop_readdir_args /* {
@ -1198,6 +1423,18 @@ devfs_setlabel(ap)
}
#endif
/*
 * fileops stat method for bypassed device files.
 *
 * No device-specific work is done here; the call is handed straight to
 * the stat method of the generic vnode fileops (vnops) and its result
 * is returned unchanged.
 */
static int
devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td)
{

#if 0
	printf("devfs_stat_f(%s)\n", devtoname(fp->f_data));
#endif
	return (vnops.fo_stat(fp, sb, cred, td));
}
static int
devfs_symlink(ap)
struct vop_symlink_args /* {
@ -1288,6 +1525,56 @@ devfs_write(ap)
return (error);
}
/*
 * fileops write method for bypassed device files.
 *
 * Builds the I/O flags (always IO_UNIT; IO_NDELAY for FNONBLOCK;
 * IO_DIRECT for O_DIRECT; IO_SYNC for O_FSYNC or when the vnode's mount
 * has MNT_SYNCHRONOUS set), then calls the driver's d_write entry, with
 * Giant held around the call when the driver sets D_NEEDGIANT.  Unless
 * FOF_OFFSET is set in flags, the transfer starts at fp->f_offset and
 * that offset is updated afterwards; fp->f_nextoff is always updated.
 * The device's si_ctime and si_mtime are refreshed when the residual
 * count changed, or when the call succeeded with a non-zero request.
 *
 * Returns ENXIO when dev_refthread() yields no cdevsw, otherwise the
 * driver's result.  The cred argument is not used.
 */
static int
devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)
{
	struct cdev *cdp;
	struct cdevsw *csw;
	struct vnode *vn;
	int before, ioflag, need_giant, rv, use_fpoff;

	cdp = fp->f_data;
#if 0
	printf("devfs_write_f(%s)\n", devtoname(cdp));
#endif
	KASSERT(cdp->si_refcount > 0,
	    ("devfs_write() on un-referenced struct cdev *(%s)",
	    devtoname(cdp)));
	csw = dev_refthread(cdp);
	if (csw == NULL)
		return (ENXIO);
	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td));

	vn = fp->f_vnode;
	ioflag = IO_UNIT;
	ioflag |= (fp->f_flag & FNONBLOCK) ? IO_NDELAY : 0;
	ioflag |= (fp->f_flag & O_DIRECT) ? IO_DIRECT : 0;
	/* The mount is consulted only when O_FSYNC is not already set. */
	if (fp->f_flag & O_FSYNC)
		ioflag |= IO_SYNC;
	else if (vn->v_mount != NULL &&
	    (vn->v_mount->mnt_flag & MNT_SYNCHRONOUS))
		ioflag |= IO_SYNC;

	/* Without FOF_OFFSET the file's own offset drives the transfer. */
	use_fpoff = (flags & FOF_OFFSET) == 0;
	if (use_fpoff)
		uio->uio_offset = fp->f_offset;
	before = uio->uio_resid;

	need_giant = (csw->d_flags & D_NEEDGIANT) != 0;
	if (need_giant)
		mtx_lock(&Giant);
	rv = csw->d_write(cdp, uio, ioflag);
	if (need_giant)
		mtx_unlock(&Giant);
	dev_relthread(cdp);

	if (uio->uio_resid != before || (rv == 0 && before != 0)) {
		vfs_timestamp(&cdp->si_ctime);
		cdp->si_mtime = cdp->si_ctime;
	}

	if (use_fpoff)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	return (rv);
}
static struct vnodeopv_entry_desc devfs_vnodeop_entries[] = {
{ &vop_default_desc, (vop_t *) vop_defaultop },