First cut of NFS direct IO support.
- NFS direct IO completely bypasses the buffer and page caches. If a file
  is open for direct IO, all caching is disabled.
- Direct IO for directories will be addressed later.
- Two new NFS direct-IO-related sysctls are added. One is a knob to disable
  NFS direct IO completely (direct IO is enabled by default). The other is
  to disallow mmapped IO on a file that has at least one O_DIRECT open (see
  the comment in nfs_vnops.c for more details). The default is to allow
  mmaps on a file that has O_DIRECT opens.

Submitted by: Mohan Srinivasan mohans at yahoo-inc dot com
Obtained from: Yahoo!
This commit is contained in:
parent
3241f61d0e
commit
892d9ab784
@ -124,6 +124,7 @@ MALLOC_DECLARE(M_NFSREQ);
|
||||
MALLOC_DECLARE(M_NFSDIROFF);
|
||||
MALLOC_DECLARE(M_NFSBIGFH);
|
||||
MALLOC_DECLARE(M_NFSHASH);
|
||||
MALLOC_DECLARE(M_NFSDIRECTIO);
|
||||
#endif
|
||||
|
||||
extern struct uma_zone *nfsmount_zone;
|
||||
@ -275,6 +276,7 @@ int nfs_readdirrpc(struct vnode *, struct uio *, struct ucred *);
|
||||
int nfs_nfsiodnew(void);
|
||||
int nfs_asyncio(struct nfsmount *, struct buf *, struct ucred *, struct thread *);
|
||||
int nfs_doio(struct vnode *, struct buf *, struct ucred *, struct thread *);
|
||||
void nfs_doio_directwrite (struct buf *);
|
||||
void nfs_up(struct nfsreq *, struct nfsmount *, struct thread *,
|
||||
const char *, int);
|
||||
void nfs_down(struct nfsreq *, struct nfsmount *, struct thread *,
|
||||
|
@ -66,7 +66,11 @@ __FBSDID("$FreeBSD$");
|
||||
|
||||
static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size,
|
||||
struct thread *td);
|
||||
static int nfs_directio_write(struct vnode *vp, struct uio *uiop,
|
||||
struct ucred *cred, int ioflag);
|
||||
|
||||
extern int nfs_directio_enable;
|
||||
extern int nfs_directio_allow_mmap;
|
||||
/*
|
||||
* Vnode op for VM getpages.
|
||||
*/
|
||||
@ -84,10 +88,12 @@ nfs_getpages(struct vop_getpages_args *ap)
|
||||
struct nfsmount *nmp;
|
||||
vm_object_t object;
|
||||
vm_page_t *pages;
|
||||
struct nfsnode *np;
|
||||
|
||||
GIANT_REQUIRED;
|
||||
|
||||
vp = ap->a_vp;
|
||||
np = VTONFS(vp);
|
||||
td = curthread; /* XXX */
|
||||
cred = curthread->td_ucred; /* XXX */
|
||||
nmp = VFSTONFS(vp->v_mount);
|
||||
@ -99,6 +105,12 @@ nfs_getpages(struct vop_getpages_args *ap)
|
||||
return VM_PAGER_ERROR;
|
||||
}
|
||||
|
||||
if (!nfs_directio_allow_mmap && (np->n_flag & NNONCACHE) &&
|
||||
(vp->v_type == VREG)) {
|
||||
printf("nfs_getpages: called on non-cacheable vnode??\n");
|
||||
return VM_PAGER_ERROR;
|
||||
}
|
||||
|
||||
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
|
||||
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
|
||||
/* We'll never get here for v4, because we always have fsinfo */
|
||||
@ -275,6 +287,10 @@ nfs_putpages(struct vop_putpages_args *ap)
|
||||
(void)nfs_fsinfo(nmp, vp, cred, td);
|
||||
}
|
||||
|
||||
if (!nfs_directio_allow_mmap && (np->n_flag & NNONCACHE) &&
|
||||
(vp->v_type == VREG))
|
||||
printf("nfs_putpages: called on noncache-able vnode??\n");
|
||||
|
||||
for (i = 0; i < npages; i++)
|
||||
rtvals[i] = VM_PAGER_AGAIN;
|
||||
|
||||
@ -365,6 +381,11 @@ nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
|
||||
if (vp->v_type != VDIR &&
|
||||
(uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
|
||||
return (EFBIG);
|
||||
|
||||
if (nfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG))
|
||||
/* No caching/ no readaheads. Just read data into the user buffer */
|
||||
return nfs_readrpc(vp, uio, cred);
|
||||
|
||||
biosize = vp->v_mount->mnt_stat.f_iosize;
|
||||
seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
|
||||
/*
|
||||
@ -683,6 +704,136 @@ nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
* The NFS write path cannot handle iovecs with len > 1. So we need to
|
||||
* break up iovecs accordingly (restricting them to wsize).
|
||||
* For the SYNC case, we can do this with 1 copy (user buffer -> mbuf).
|
||||
* For the ASYNC case, 2 copies are needed. The first a copy from the
|
||||
* user buffer to a staging buffer and then a second copy from the staging
|
||||
* buffer to mbufs. This can be optimized by copying from the user buffer
|
||||
* directly into mbufs and passing the chain down, but that requires a
|
||||
* fair amount of re-working of the relevant codepaths (and can be done
|
||||
* later).
|
||||
*/
|
||||
static int
|
||||
nfs_directio_write(vp, uiop, cred, ioflag)
|
||||
struct vnode *vp;
|
||||
struct uio *uiop;
|
||||
struct ucred *cred;
|
||||
int ioflag;
|
||||
{
|
||||
int error;
|
||||
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
|
||||
struct thread *td = uiop->uio_td;
|
||||
int size;
|
||||
|
||||
if (ioflag & IO_SYNC) {
|
||||
int iomode, must_commit;
|
||||
struct uio uio;
|
||||
struct iovec iov;
|
||||
do_sync:
|
||||
while (uiop->uio_resid > 0) {
|
||||
size = min(uiop->uio_resid, nmp->nm_wsize);
|
||||
size = min(uiop->uio_iov->iov_len, size);
|
||||
iov.iov_base = uiop->uio_iov->iov_base;
|
||||
iov.iov_len = size;
|
||||
uio.uio_iov = &iov;
|
||||
uio.uio_iovcnt = 1;
|
||||
uio.uio_offset = uiop->uio_offset;
|
||||
uio.uio_resid = size;
|
||||
uio.uio_segflg = UIO_USERSPACE;
|
||||
uio.uio_rw = UIO_WRITE;
|
||||
uio.uio_td = td;
|
||||
iomode = NFSV3WRITE_FILESYNC;
|
||||
error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred,
|
||||
&iomode, &must_commit);
|
||||
KASSERT((must_commit == 0),
|
||||
("nfs_directio_write: Did not commit write"));
|
||||
if (error)
|
||||
return (error);
|
||||
uiop->uio_offset += size;
|
||||
uiop->uio_resid -= size;
|
||||
if (uiop->uio_iov->iov_len <= size) {
|
||||
uiop->uio_iovcnt--;
|
||||
uiop->uio_iov++;
|
||||
} else {
|
||||
uiop->uio_iov->iov_base =
|
||||
(char *)uiop->uio_iov->iov_base + size;
|
||||
uiop->uio_iov->iov_len -= size;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
struct uio *t_uio;
|
||||
struct iovec *t_iov;
|
||||
struct buf *bp;
|
||||
|
||||
/*
|
||||
* Break up the write into blocksize chunks and hand these
|
||||
* over to nfsiod's for write back.
|
||||
* Unfortunately, this incurs a copy of the data. Since
|
||||
* the user could modify the buffer before the write is
|
||||
* initiated.
|
||||
*
|
||||
* The obvious optimization here is that one of the 2 copies
|
||||
* in the async write path can be eliminated by copying the
|
||||
* data here directly into mbufs and passing the mbuf chain
|
||||
* down. But that will require a fair amount of re-working
|
||||
* of the code and can be done if there's enough interest
|
||||
* in NFS directio access.
|
||||
*/
|
||||
while (uiop->uio_resid > 0) {
|
||||
size = min(uiop->uio_resid, nmp->nm_wsize);
|
||||
size = min(uiop->uio_iov->iov_len, size);
|
||||
bp = getpbuf(&nfs_pbuf_freecnt);
|
||||
t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK);
|
||||
t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK);
|
||||
t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK);
|
||||
t_iov->iov_len = size;
|
||||
t_uio->uio_iov = t_iov;
|
||||
t_uio->uio_iovcnt = 1;
|
||||
t_uio->uio_offset = uiop->uio_offset;
|
||||
t_uio->uio_resid = size;
|
||||
t_uio->uio_segflg = UIO_SYSSPACE;
|
||||
t_uio->uio_rw = UIO_WRITE;
|
||||
t_uio->uio_td = td;
|
||||
bcopy(uiop->uio_iov->iov_base, t_iov->iov_base, size);
|
||||
bp->b_flags |= B_DIRECT;
|
||||
bp->b_iocmd = BIO_WRITE;
|
||||
if (cred != NOCRED) {
|
||||
crhold(cred);
|
||||
bp->b_wcred = cred;
|
||||
} else
|
||||
bp->b_wcred = NOCRED;
|
||||
bp->b_caller1 = (void *)t_uio;
|
||||
bp->b_vp = vp;
|
||||
vhold(vp);
|
||||
error = nfs_asyncio(nmp, bp, NOCRED, td);
|
||||
if (error) {
|
||||
free(t_iov->iov_base, M_NFSDIRECTIO);
|
||||
free(t_iov, M_NFSDIRECTIO);
|
||||
free(t_uio, M_NFSDIRECTIO);
|
||||
vdrop(bp->b_vp);
|
||||
bp->b_vp = NULL;
|
||||
relpbuf(bp, &nfs_pbuf_freecnt);
|
||||
if (error == EINTR)
|
||||
return (error);
|
||||
goto do_sync;
|
||||
}
|
||||
uiop->uio_offset += size;
|
||||
uiop->uio_resid -= size;
|
||||
if (uiop->uio_iov->iov_len <= size) {
|
||||
uiop->uio_iovcnt--;
|
||||
uiop->uio_iov++;
|
||||
} else {
|
||||
uiop->uio_iov->iov_base =
|
||||
(char *)uiop->uio_iov->iov_base + size;
|
||||
uiop->uio_iov->iov_len -= size;
|
||||
}
|
||||
}
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Vnode op for write using bio
|
||||
*/
|
||||
@ -756,6 +907,9 @@ nfs_write(struct vop_write_args *ap)
|
||||
if (uio->uio_resid == 0)
|
||||
return (0);
|
||||
|
||||
if (nfs_directio_enable && (ioflag & IO_DIRECT) && vp->v_type == VREG)
|
||||
return nfs_directio_write(vp, uio, cred, ioflag);
|
||||
|
||||
/*
|
||||
* We need to obtain the rslock if we intend to modify np->n_size
|
||||
* in order to guarantee the append point with multiple contending
|
||||
@ -1261,6 +1415,26 @@ nfs_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thr
|
||||
return (EIO);
|
||||
}
|
||||
|
||||
/*
 * Perform one queued direct write from an nfsiod: issue the FILESYNC
 * write RPC described by the uio hung off bp->b_caller1, then release
 * every resource set up for this chunk by nfs_directio_write() (the
 * staging buffer, iovec and uio, the vnode hold, and the pbuf).
 *
 * NOTE(review): the RPC's return value is discarded here, so an async
 * direct-write failure is not reported back to the writer — presumably
 * accepted for this first cut; confirm against callers.
 */
void
nfs_doio_directwrite(struct buf *bp)
{
	int iomode, must_commit;
	struct uio *uiop = (struct uio *)bp->b_caller1;
	/* Saved before the RPC since uiop's iovec is advanced by the write. */
	char *iov_base = uiop->uio_iov->iov_base;
	struct nfsmount *nmp = VFSTONFS(bp->b_vp->v_mount);

	iomode = NFSV3WRITE_FILESYNC;
	uiop->uio_td = NULL; /* NULL since we're in nfsiod */
	(nmp->nm_rpcops->nr_writerpc)(bp->b_vp, uiop, bp->b_wcred, &iomode, &must_commit);
	/* FILESYNC writes must never need a follow-up COMMIT. */
	KASSERT((must_commit == 0), ("nfs_doio_directwrite: Did not commit write"));
	/* Tear down the per-chunk state allocated in nfs_directio_write(). */
	free(iov_base, M_NFSDIRECTIO);
	free(uiop->uio_iov, M_NFSDIRECTIO);
	free(uiop, M_NFSDIRECTIO);
	/* Drop the hold taken via vhold() when the write was queued. */
	vdrop(bp->b_vp);
	bp->b_vp = NULL;
	relpbuf(bp, &nfs_pbuf_freecnt);
}
|
||||
|
||||
/*
|
||||
* Do an I/O operation to/from a cache block. This may be called
|
||||
* synchronously or from an nfsiod.
|
||||
|
@ -243,10 +243,16 @@ nfssvc_iod(void *instance)
|
||||
nmp->nm_bufqwant = 0;
|
||||
wakeup(&nmp->nm_bufq);
|
||||
}
|
||||
if (bp->b_iocmd == BIO_READ)
|
||||
(void) nfs_doio(bp->b_vp, bp, bp->b_rcred, NULL);
|
||||
else
|
||||
(void) nfs_doio(bp->b_vp, bp, bp->b_wcred, NULL);
|
||||
if (bp->b_flags & B_DIRECT) {
|
||||
KASSERT((bp->b_iocmd == BIO_WRITE), ("nfscvs_iod: BIO_WRITE not set"));
|
||||
(void)nfs_doio_directwrite(bp);
|
||||
} else {
|
||||
if (bp->b_iocmd == BIO_READ)
|
||||
(void) nfs_doio(bp->b_vp, bp, bp->b_rcred, NULL);
|
||||
else
|
||||
(void) nfs_doio(bp->b_vp, bp, bp->b_wcred, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* If there are more than one iod on this mount, then defect
|
||||
* so that the iods can be shared out fairly between the mounts
|
||||
|
@ -78,6 +78,7 @@ MALLOC_DEFINE(M_NFSREQ, "NFS req", "NFS request header");
|
||||
MALLOC_DEFINE(M_NFSBIGFH, "NFSV3 bigfh", "NFS version 3 file handle");
|
||||
MALLOC_DEFINE(M_NFSDIROFF, "NFSV3 diroff", "NFS directory offset data");
|
||||
MALLOC_DEFINE(M_NFSHASH, "NFS hash", "NFS hash tables");
|
||||
MALLOC_DEFINE(M_NFSDIRECTIO, "NFS DirectIO", "NFS Direct IO async write state");
|
||||
|
||||
uma_zone_t nfsmount_zone;
|
||||
|
||||
|
@ -211,6 +211,24 @@ static int nfs_clean_pages_on_close = 1;
|
||||
SYSCTL_INT(_vfs_nfs, OID_AUTO, clean_pages_on_close, CTLFLAG_RW,
|
||||
&nfs_clean_pages_on_close, 0, "NFS clean dirty pages on close");
|
||||
|
||||
int nfs_directio_enable = 1;
|
||||
SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_enable, CTLFLAG_RW,
|
||||
&nfs_directio_enable, 0, "Enable NFS directio");
|
||||
|
||||
/*
|
||||
* This sysctl allows other processes to mmap a file that has been opened O_DIRECT
|
||||
* by a process. In general, having processes mmap the file while Direct IO is in
|
||||
* progress can lead to Data Inconsistencies. But, we allow this by default to
|
||||
* prevent DoS attacks - to prevent a malicious user from opening up files O_DIRECT
|
||||
* preventing other users from mmap'ing these files. "Protected" environments where
|
||||
* stricter consistency guarantees are required can disable this knob.
|
||||
* The process that opened the file O_DIRECT cannot mmap() the file, because
|
||||
* mmap'ed IO on an O_DIRECT open() is not meaningful.
|
||||
*/
|
||||
int nfs_directio_allow_mmap = 1;
|
||||
SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_allow_mmap, CTLFLAG_RW,
|
||||
&nfs_directio_allow_mmap, 0, "Enable mmaped IO on file with O_DIRECT opens");
|
||||
|
||||
#if 0
|
||||
SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_hits, CTLFLAG_RD,
|
||||
&nfsstats.accesscache_hits, 0, "NFS ACCESS cache hit count");
|
||||
@ -401,6 +419,7 @@ nfs_open(struct vop_open_args *ap)
|
||||
struct nfsnode *np = VTONFS(vp);
|
||||
struct vattr vattr;
|
||||
int error;
|
||||
int fmode = ap->a_mode;
|
||||
|
||||
if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK)
|
||||
return (EOPNOTSUPP);
|
||||
@ -434,6 +453,18 @@ nfs_open(struct vop_open_args *ap)
|
||||
np->n_mtime = vattr.va_mtime;
|
||||
}
|
||||
}
|
||||
/*
|
||||
* If the object has >= 1 O_DIRECT active opens, we disable caching.
|
||||
*/
|
||||
if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
|
||||
if (np->n_directio_opens == 0) {
|
||||
error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_td, 1);
|
||||
if (error)
|
||||
return (error);
|
||||
np->n_flag |= NNONCACHE;
|
||||
}
|
||||
np->n_directio_opens++;
|
||||
}
|
||||
np->ra_expect_lbn = 0;
|
||||
return (0);
|
||||
}
|
||||
@ -472,6 +503,7 @@ nfs_close(struct vop_close_args *ap)
|
||||
struct vnode *vp = ap->a_vp;
|
||||
struct nfsnode *np = VTONFS(vp);
|
||||
int error = 0;
|
||||
int fmode = ap->a_fflag;
|
||||
|
||||
if (vp->v_type == VREG) {
|
||||
/*
|
||||
@ -520,6 +552,13 @@ nfs_close(struct vop_close_args *ap)
|
||||
error = np->n_error;
|
||||
}
|
||||
}
|
||||
if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
|
||||
KASSERT((np->n_directio_opens > 0),
|
||||
("nfs_close: unexpectedly value (0) of n_directio_opens\n"));
|
||||
np->n_directio_opens--;
|
||||
if (np->n_directio_opens == 0)
|
||||
np->n_flag &= ~NNONCACHE;
|
||||
}
|
||||
return (error);
|
||||
}
|
||||
|
||||
|
@ -127,6 +127,7 @@ struct nfsnode {
|
||||
u_char *n_name; /* leaf name, for v4 OPEN op */
|
||||
uint32_t n_namelen;
|
||||
daddr_t ra_expect_lbn;
|
||||
int n_directio_opens;
|
||||
};
|
||||
|
||||
#define n_atim n_un1.nf_atim
|
||||
@ -149,6 +150,7 @@ struct nfsnode {
|
||||
#define NCREATED 0x0800 /* Opened by nfs_create() */
|
||||
#define NTRUNCATE 0x1000 /* Opened by nfs_setattr() */
|
||||
#define NSIZECHANGED 0x2000 /* File size has changed: need cache inval */
|
||||
#define NNONCACHE 0x4000 /* Node marked as noncacheable */
|
||||
|
||||
/*
|
||||
* Convert between nfsnode pointers and vnode pointers
|
||||
|
Loading…
Reference in New Issue
Block a user