freebsd-skq/sys/nfsclient/nfs_subs.c

1090 lines
27 KiB
C
Raw Normal View History

1994-05-24 10:09:53 +00:00
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Rick Macklem at The University of Guelph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95
1994-05-24 10:09:53 +00:00
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
1994-05-24 10:09:53 +00:00
/*
* These functions support the macros and help fiddle mbuf chains for
* the nfs op functions. They do things like create the rpc header and
* copy data between mbuf chains and uio lists.
*/
1994-05-24 10:09:53 +00:00
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/proc.h>
1994-05-24 10:09:53 +00:00
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/malloc.h>
1994-09-22 22:10:49 +00:00
#include <sys/sysent.h>
#include <sys/syscall.h>
#include <sys/sysproto.h>
1994-05-24 10:09:53 +00:00
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <rpc/rpcclnt.h>
1994-05-24 10:09:53 +00:00
#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfsclient/nfs.h>
#include <nfsclient/nfsnode.h>
1994-05-24 10:09:53 +00:00
#include <nfs/xdr_subs.h>
#include <nfsclient/nfsm_subs.h>
#include <nfsclient/nfsmount.h>
1994-05-24 10:09:53 +00:00
#include <netinet/in.h>
/*
* Data items converted to xdr at startup, since they are constant
* This is kinda hokey, but may save a little time doing byte swaps
*/
u_int32_t nfs_xdrneg1;
u_int32_t rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr,
rpc_mismatch, rpc_auth_unix, rpc_msgaccepted;
u_int32_t nfs_true, nfs_false;
1994-05-24 10:09:53 +00:00
/* And other global data */
static u_int32_t nfs_xid = 0;
1995-12-17 21:14:36 +00:00
static enum vtype nv2tov_type[8]= {
VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON, VNON
1995-12-17 21:14:36 +00:00
};
int nfs_ticks;
int nfs_pbuf_freecnt = -1; /* start out unlimited */
struct nfs_reqq nfs_reqq;
struct nfs_bufq nfs_bufq;
static int nfs_prev_nfsclnt_sy_narg;
static sy_call_t *nfs_prev_nfsclnt_sy_call;
/*
* and the reverse mapping from generic to Version 2 procedure numbers
*/
int nfsv2_procid[NFS_NPROCS] = {
NFSV2PROC_NULL,
NFSV2PROC_GETATTR,
NFSV2PROC_SETATTR,
NFSV2PROC_LOOKUP,
NFSV2PROC_NOOP,
NFSV2PROC_READLINK,
NFSV2PROC_READ,
NFSV2PROC_WRITE,
NFSV2PROC_CREATE,
NFSV2PROC_MKDIR,
NFSV2PROC_SYMLINK,
NFSV2PROC_CREATE,
NFSV2PROC_REMOVE,
NFSV2PROC_RMDIR,
NFSV2PROC_RENAME,
NFSV2PROC_LINK,
NFSV2PROC_READDIR,
NFSV2PROC_NOOP,
NFSV2PROC_STATFS,
NFSV2PROC_NOOP,
NFSV2PROC_NOOP,
NFSV2PROC_NOOP,
NFSV2PROC_NOOP,
};
LIST_HEAD(nfsnodehashhead, nfsnode);
1994-05-24 10:09:53 +00:00
/*
* Create the header for an rpc request packet
* The hsiz is the size of the rest of the nfs request header.
* (just used to decide if a cluster is a good idea)
*/
struct mbuf *
nfsm_reqhead(struct vnode *vp, u_long procid, int hsiz)
1994-05-24 10:09:53 +00:00
{
struct mbuf *mb;
1994-05-24 10:09:53 +00:00
MGET(mb, M_TRYWAIT, MT_DATA);
1994-05-24 10:09:53 +00:00
if (hsiz >= MINCLSIZE)
MCLGET(mb, M_TRYWAIT);
1994-05-24 10:09:53 +00:00
mb->m_len = 0;
return (mb);
}
/*
* Build the RPC header and fill in the authorization info.
* The authorization string argument is only used when the credentials
* come from outside of the kernel.
* Returns the head of the mbuf list.
*/
struct mbuf *
nfsm_rpchead(struct ucred *cr, int nmflag, int procid, int auth_type,
int auth_len, struct mbuf *mrest, int mrest_len, struct mbuf **mbp,
u_int32_t *xidp)
1994-05-24 10:09:53 +00:00
{
struct mbuf *mb;
u_int32_t *tl;
caddr_t bpos;
int i;
struct mbuf *mreq;
int grpsiz, authsiz;
1994-05-24 10:09:53 +00:00
authsiz = nfsm_rndup(auth_len);
MGETHDR(mb, M_TRYWAIT, MT_DATA);
if ((authsiz + 10 * NFSX_UNSIGNED) >= MINCLSIZE) {
MCLGET(mb, M_TRYWAIT);
} else if ((authsiz + 10 * NFSX_UNSIGNED) < MHLEN) {
MH_ALIGN(mb, authsiz + 10 * NFSX_UNSIGNED);
1994-05-24 10:09:53 +00:00
} else {
MH_ALIGN(mb, 8 * NFSX_UNSIGNED);
1994-05-24 10:09:53 +00:00
}
mb->m_len = 0;
mreq = mb;
bpos = mtod(mb, caddr_t);
/*
* First the RPC header.
*/
tl = nfsm_build(u_int32_t *, 8 * NFSX_UNSIGNED);
1998-04-06 11:41:07 +00:00
/* Get a pretty random xid to start with */
if (!nfs_xid)
1998-04-06 11:41:07 +00:00
nfs_xid = random();
/*
* Skip zero xid if it should ever happen.
*/
1994-05-24 10:09:53 +00:00
if (++nfs_xid == 0)
nfs_xid++;
1994-05-24 10:09:53 +00:00
*tl++ = *xidp = txdr_unsigned(nfs_xid);
*tl++ = rpc_call;
*tl++ = rpc_vers;
*tl++ = txdr_unsigned(NFS_PROG);
if (nmflag & NFSMNT_NFSV3) {
*tl++ = txdr_unsigned(NFS_VER3);
*tl++ = txdr_unsigned(procid);
} else {
*tl++ = txdr_unsigned(NFS_VER2);
*tl++ = txdr_unsigned(nfsv2_procid[procid]);
}
1994-05-24 10:09:53 +00:00
/*
* And then the authorization cred.
*/
*tl++ = txdr_unsigned(auth_type);
*tl = txdr_unsigned(authsiz);
switch (auth_type) {
case RPCAUTH_UNIX:
tl = nfsm_build(u_int32_t *, auth_len);
1994-05-24 10:09:53 +00:00
*tl++ = 0; /* stamp ?? */
*tl++ = 0; /* NULL hostname */
*tl++ = txdr_unsigned(cr->cr_uid);
*tl++ = txdr_unsigned(cr->cr_groups[0]);
grpsiz = (auth_len >> 2) - 5;
*tl++ = txdr_unsigned(grpsiz);
for (i = 1; i <= grpsiz; i++)
*tl++ = txdr_unsigned(cr->cr_groups[i]);
break;
}
/*
* And the verifier...
*/
tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = txdr_unsigned(RPCAUTH_NULL);
*tl = 0;
1994-05-24 10:09:53 +00:00
mb->m_next = mrest;
mreq->m_pkthdr.len = authsiz + 10 * NFSX_UNSIGNED + mrest_len;
mreq->m_pkthdr.rcvif = NULL;
1994-05-24 10:09:53 +00:00
*mbp = mb;
return (mreq);
}
/*
* copies a uio scatter/gather list to an mbuf chain.
* NOTE: can ony handle iovcnt == 1
1994-05-24 10:09:53 +00:00
*/
int
nfsm_uiotombuf(struct uio *uiop, struct mbuf **mq, int siz, caddr_t *bpos)
1994-05-24 10:09:53 +00:00
{
char *uiocp;
struct mbuf *mp, *mp2;
int xfer, left, mlen;
1994-05-24 10:09:53 +00:00
int uiosiz, clflg, rem;
char *cp;
#ifdef DIAGNOSTIC
if (uiop->uio_iovcnt != 1)
panic("nfsm_uiotombuf: iovcnt != 1");
#endif
1994-05-24 10:09:53 +00:00
if (siz > MLEN) /* or should it >= MCLBYTES ?? */
clflg = 1;
else
clflg = 0;
rem = nfsm_rndup(siz)-siz;
mp = mp2 = *mq;
while (siz > 0) {
left = uiop->uio_iov->iov_len;
uiocp = uiop->uio_iov->iov_base;
if (left > siz)
left = siz;
uiosiz = left;
while (left > 0) {
mlen = M_TRAILINGSPACE(mp);
if (mlen == 0) {
MGET(mp, M_TRYWAIT, MT_DATA);
1994-05-24 10:09:53 +00:00
if (clflg)
MCLGET(mp, M_TRYWAIT);
1994-05-24 10:09:53 +00:00
mp->m_len = 0;
mp2->m_next = mp;
mp2 = mp;
mlen = M_TRAILINGSPACE(mp);
}
xfer = (left > mlen) ? mlen : left;
#ifdef notdef
/* Not Yet.. */
if (uiop->uio_iov->iov_op != NULL)
(*(uiop->uio_iov->iov_op))
(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer);
else
#endif
if (uiop->uio_segflg == UIO_SYSSPACE)
bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer);
else
copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer);
mp->m_len += xfer;
left -= xfer;
uiocp += xfer;
uiop->uio_offset += xfer;
uiop->uio_resid -= xfer;
}
uiop->uio_iov->iov_base =
(char *)uiop->uio_iov->iov_base + uiosiz;
uiop->uio_iov->iov_len -= uiosiz;
1994-05-24 10:09:53 +00:00
siz -= uiosiz;
}
if (rem > 0) {
if (rem > M_TRAILINGSPACE(mp)) {
MGET(mp, M_TRYWAIT, MT_DATA);
1994-05-24 10:09:53 +00:00
mp->m_len = 0;
mp2->m_next = mp;
}
cp = mtod(mp, caddr_t)+mp->m_len;
for (left = 0; left < rem; left++)
*cp++ = '\0';
mp->m_len += rem;
*bpos = cp;
} else
*bpos = mtod(mp, caddr_t)+mp->m_len;
*mq = mp;
return (0);
}
/*
* Copy a string into mbufs for the hard cases...
*/
int
nfsm_strtmbuf(struct mbuf **mb, char **bpos, const char *cp, long siz)
1994-05-24 10:09:53 +00:00
{
struct mbuf *m1 = NULL, *m2;
1994-05-24 10:09:53 +00:00
long left, xfer, len, tlen;
u_int32_t *tl;
1994-05-24 10:09:53 +00:00
int putsize;
putsize = 1;
m2 = *mb;
left = M_TRAILINGSPACE(m2);
if (left > 0) {
tl = ((u_int32_t *)(*bpos));
1994-05-24 10:09:53 +00:00
*tl++ = txdr_unsigned(siz);
putsize = 0;
left -= NFSX_UNSIGNED;
m2->m_len += NFSX_UNSIGNED;
if (left > 0) {
bcopy(cp, (caddr_t) tl, left);
siz -= left;
cp += left;
m2->m_len += left;
left = 0;
}
}
/* Loop around adding mbufs */
while (siz > 0) {
MGET(m1, M_TRYWAIT, MT_DATA);
1994-05-24 10:09:53 +00:00
if (siz > MLEN)
MCLGET(m1, M_TRYWAIT);
1994-05-24 10:09:53 +00:00
m1->m_len = NFSMSIZ(m1);
m2->m_next = m1;
m2 = m1;
tl = mtod(m1, u_int32_t *);
1994-05-24 10:09:53 +00:00
tlen = 0;
if (putsize) {
*tl++ = txdr_unsigned(siz);
m1->m_len -= NFSX_UNSIGNED;
tlen = NFSX_UNSIGNED;
putsize = 0;
}
if (siz < m1->m_len) {
len = nfsm_rndup(siz);
xfer = siz;
if (xfer < len)
*(tl+(xfer>>2)) = 0;
} else {
xfer = len = m1->m_len;
}
bcopy(cp, (caddr_t) tl, xfer);
m1->m_len = len+tlen;
siz -= xfer;
cp += xfer;
}
*mb = m1;
*bpos = mtod(m1, caddr_t)+m1->m_len;
return (0);
}
/*
* Called once to initialize data structures...
*/
int
nfs_init(struct vfsconf *vfsp)
1994-05-24 10:09:53 +00:00
{
int i;
1994-05-24 10:09:53 +00:00
nfsmount_zone = uma_zcreate("NFSMOUNT", sizeof(struct nfsmount),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1994-05-24 10:09:53 +00:00
rpc_vers = txdr_unsigned(RPC_VER2);
rpc_call = txdr_unsigned(RPC_CALL);
rpc_reply = txdr_unsigned(RPC_REPLY);
rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED);
rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED);
rpc_mismatch = txdr_unsigned(RPC_MISMATCH);
rpc_autherr = txdr_unsigned(RPC_AUTHERR);
rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX);
nfs_true = txdr_unsigned(TRUE);
nfs_false = txdr_unsigned(FALSE);
nfs_xdrneg1 = txdr_unsigned(-1);
nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000;
if (nfs_ticks < 1)
nfs_ticks = 1;
1994-05-24 10:09:53 +00:00
/* Ensure async daemons disabled */
for (i = 0; i < NFS_MAXASYNCDAEMON; i++) {
nfs_iodwant[i] = NULL;
nfs_iodmount[i] = NULL;
}
1994-05-24 10:09:53 +00:00
nfs_nhinit(); /* Init the nfsnode table */
/*
* Initialize reply list and start timer
*/
TAILQ_INIT(&nfs_reqq);
callout_init(&nfs_callout, 0);
nfs_prev_nfsclnt_sy_narg = sysent[SYS_nfsclnt].sy_narg;
sysent[SYS_nfsclnt].sy_narg = 2;
nfs_prev_nfsclnt_sy_call = sysent[SYS_nfsclnt].sy_call;
sysent[SYS_nfsclnt].sy_call = (sy_call_t *)nfsclnt;
1994-09-22 22:10:49 +00:00
nfs_pbuf_freecnt = nswbuf / 2 + 1;
return (0);
1994-05-24 10:09:53 +00:00
}
int
nfs_uninit(struct vfsconf *vfsp)
{
int i;
callout_stop(&nfs_callout);
sysent[SYS_nfsclnt].sy_narg = nfs_prev_nfsclnt_sy_narg;
sysent[SYS_nfsclnt].sy_call = nfs_prev_nfsclnt_sy_call;
KASSERT(TAILQ_EMPTY(&nfs_reqq),
("nfs_uninit: request queue not empty"));
/*
* Tell all nfsiod processes to exit. Clear nfs_iodmax, and wakeup
* any sleeping nfsiods so they check nfs_iodmax and exit.
*/
nfs_iodmax = 0;
for (i = 0; i < nfs_numasync; i++)
if (nfs_iodwant[i])
wakeup(&nfs_iodwant[i]);
/* The last nfsiod to exit will wake us up when nfs_numasync hits 0 */
while (nfs_numasync)
tsleep(&nfs_numasync, PWAIT, "ioddie", 0);
nfs_nhuninit();
uma_zdestroy(nfsmount_zone);
return (0);
}
1994-05-24 10:09:53 +00:00
/*
* Attribute cache routines.
* nfs_loadattrcache() - loads or updates the cache contents from attributes
* that are on the mbuf list
* nfs_getattrcache() - returns valid attributes if found in cache, returns
* error otherwise
*/
/*
* Load the attribute cache (that lives in the nfsnode entry) with
* the values on the mbuf list and
* Iff vap not NULL
* copy the attributes to *vaper
*/
int
nfs_loadattrcache(struct vnode **vpp, struct mbuf **mdp, caddr_t *dposp,
struct vattr *vaper, int dontshrink)
1994-05-24 10:09:53 +00:00
{
struct vnode *vp = *vpp;
struct vattr *vap;
struct nfs_fattr *fp;
struct nfsnode *np;
int32_t t1;
caddr_t cp2;
int rdev;
1994-05-24 10:09:53 +00:00
struct mbuf *md;
enum vtype vtyp;
u_short vmode;
struct timespec mtime;
int v3 = NFS_ISV3(vp);
1994-05-24 10:09:53 +00:00
md = *mdp;
t1 = (mtod(md, caddr_t) + md->m_len) - *dposp;
cp2 = nfsm_disct(mdp, dposp, NFSX_FATTR(v3), t1);
if (cp2 == NULL)
return EBADRPC;
fp = (struct nfs_fattr *)cp2;
if (v3) {
vtyp = nfsv3tov_type(fp->fa_type);
vmode = fxdr_unsigned(u_short, fp->fa_mode);
rdev = makedev(fxdr_unsigned(int, fp->fa3_rdev.specdata1),
fxdr_unsigned(int, fp->fa3_rdev.specdata2));
fxdr_nfsv3time(&fp->fa3_mtime, &mtime);
1994-05-24 10:09:53 +00:00
} else {
vtyp = nfsv2tov_type(fp->fa_type);
vmode = fxdr_unsigned(u_short, fp->fa_mode);
/*
* XXX
*
* The duplicate information returned in fa_type and fa_mode
* is an ambiguity in the NFS version 2 protocol.
*
* VREG should be taken literally as a regular file. If a
* server intents to return some type information differently
* in the upper bits of the mode field (e.g. for sockets, or
* FIFOs), NFSv2 mandates fa_type to be VNON. Anyway, we
* leave the examination of the mode bits even in the VREG
* case to avoid breakage for bogus servers, but we make sure
* that there are actually type bits set in the upper part of
* fa_mode (and failing that, trust the va_type field).
*
* NFSv3 cleared the issue, and requires fa_mode to not
* contain any type information (while also introduing sockets
* and FIFOs for fa_type).
*/
if (vtyp == VNON || (vtyp == VREG && (vmode & S_IFMT) != 0))
vtyp = IFTOVT(vmode);
rdev = fxdr_unsigned(int32_t, fp->fa2_rdev);
fxdr_nfsv2time(&fp->fa2_mtime, &mtime);
/*
* Really ugly NFSv2 kludge.
*/
if (vtyp == VCHR && rdev == 0xffffffff)
vtyp = VFIFO;
1994-05-24 10:09:53 +00:00
}
1994-05-24 10:09:53 +00:00
/*
* If v_type == VNON it is a new node, so fill in the v_type,
1995-05-30 08:16:23 +00:00
* n_mtime fields. Check to see if it represents a special
1994-05-24 10:09:53 +00:00
* device, and if so, check for a possible alias. Once the
* correct vnode has been obtained, fill in the rest of the
* information.
*/
np = VTONFS(vp);
if (vp->v_type != vtyp) {
vp->v_type = vtyp;
if (vp->v_type == VFIFO)
vp->v_op = fifo_nfsnodeop_p;
else if (vp->v_type == VBLK)
vp->v_op = spec_nfsnodeop_p;
else if (vp->v_type == VCHR) {
vp->v_op = spec_nfsnodeop_p;
This patch corrects the first round of panics and hangs reported with the new snapshot code. Update addaliasu to correctly implement the semantics of the old checkalias function. When a device vnode first comes into existence, check to see if an anonymous vnode for the same device was created at boot time by bdevvp(). If so, adopt the bdevvp vnode rather than creating a new vnode for the device. This corrects a problem which caused the kernel to panic when taking a snapshot of the root filesystem. Change the calling convention of vn_write_suspend_wait() to be the same as vn_start_write(). Split out softdep_flushworklist() from softdep_flushfiles() so that it can be used to clear the work queue when suspending filesystem operations. Access to buffers becomes recursive so that snapshots can recursively traverse their indirect blocks using ffs_copyonwrite() when checking for the need for copy on write when flushing one of their own indirect blocks. This eliminates a deadlock between the syncer daemon and a process taking a snapshot. Ensure that softdep_process_worklist() can never block because of a snapshot being taken. This eliminates a problem with buffer starvation. Cleanup change in ffs_sync() which did not synchronously wait when MNT_WAIT was specified. The result was an unclean filesystem panic when doing forcible unmount with heavy filesystem I/O in progress. Return a zero'ed block when reading a block that was not in use at the time that a snapshot was taken. Normally, these blocks should never be read. However, the readahead code will occationally read them which can cause unexpected behavior. Clean up the debugging code that ensures that no blocks be written on a filesystem while it is suspended. Snapshots must explicitly label the blocks that they are writing during the suspension so that they do not cause a `write on suspended filesystem' panic. Reorganize ffs_copyonwrite() to eliminate a deadlock and also to prevent a race condition that would permit the same block to be copied twice. This change eliminates an unexpected soft updates inconsistency in fsck caused by the double allocation. Use bqrelse rather than brelse for buffers that will be needed soon again by the snapshot code. This improves snapshot performance.
2000-07-24 05:28:33 +00:00
vp = addaliasu(vp, rdev);
np->n_vnode = vp;
1994-05-24 10:09:53 +00:00
}
np->n_mtime = mtime.tv_sec;
1994-05-24 10:09:53 +00:00
}
vap = &np->n_vattr;
vap->va_type = vtyp;
vap->va_mode = (vmode & 07777);
Divorce "dev_t" from the "major|minor" bitmap, which is now called udev_t in the kernel but still called dev_t in userland. Provide functions to manipulate both types: major() umajor() minor() uminor() makedev() umakedev() dev2udev() udev2dev() For now they're functions, they will become in-line functions after one of the next two steps in this process. Return major/minor/makedev to macro-hood for userland. Register a name in cdevsw[] for the "filedescriptor" driver. In the kernel the udev_t appears in places where we have the major/minor number combination, (ie: a potential device: we may not have the driver nor the device), like in inodes, vattr, cdevsw registration and so on, whereas the dev_t appears where we carry around a reference to a actual device. In the future the cdevsw and the aliased-from vnode will be hung directly from the dev_t, along with up to two softc pointers for the device driver and a few houskeeping bits. This will essentially replace the current "alias" check code (same buck, bigger bang). A little stunt has been provided to try to catch places where the wrong type is being used (dev_t vs udev_t), if you see something not working, #undef DEVT_FASCIST in kern/kern_conf.c and see if it makes a difference. If it does, please try to track it down (many hands make light work) or at least try to reproduce it as simply as possible, and describe how to do that. Without DEVT_FASCIST I belive this patch is a no-op. Stylistic/posixoid comments about the userland view of the <sys/*.h> files welcome now, from userland they now contain the end result. Next planned step: make all dev_t's refer to the same devsw[] which means convert BLK's to CHR's at the perimeter of the vnodes and other places where they enter the game (bootdev, mknod, sysctl).
1999-05-11 19:55:07 +00:00
vap->va_rdev = rdev;
1994-05-24 10:09:53 +00:00
vap->va_mtime = mtime;
vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
if (v3) {
vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink);
vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid);
vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid);
vap->va_size = fxdr_hyper(&fp->fa3_size);
vap->va_blocksize = NFS_FABLKSIZE;
vap->va_bytes = fxdr_hyper(&fp->fa3_used);
vap->va_fileid = fxdr_unsigned(int32_t,
fp->fa3_fileid.nfsuquad[1]);
fxdr_nfsv3time(&fp->fa3_atime, &vap->va_atime);
fxdr_nfsv3time(&fp->fa3_ctime, &vap->va_ctime);
vap->va_flags = 0;
vap->va_filerev = 0;
1994-05-24 10:09:53 +00:00
} else {
vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink);
vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid);
vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid);
vap->va_size = fxdr_unsigned(u_int32_t, fp->fa2_size);
vap->va_blocksize = fxdr_unsigned(int32_t, fp->fa2_blocksize);
vap->va_bytes = (u_quad_t)fxdr_unsigned(int32_t, fp->fa2_blocks)
* NFS_FABLKSIZE;
vap->va_fileid = fxdr_unsigned(int32_t, fp->fa2_fileid);
fxdr_nfsv2time(&fp->fa2_atime, &vap->va_atime);
1994-05-24 10:09:53 +00:00
vap->va_flags = 0;
vap->va_ctime.tv_sec = fxdr_unsigned(u_int32_t,
fp->fa2_ctime.nfsv2_sec);
vap->va_ctime.tv_nsec = 0;
vap->va_gen = fxdr_unsigned(u_int32_t, fp->fa2_ctime.nfsv2_usec);
1994-05-24 10:09:53 +00:00
vap->va_filerev = 0;
}
np->n_attrstamp = time_second;
1994-05-24 10:09:53 +00:00
if (vap->va_size != np->n_size) {
if (vap->va_type == VREG) {
if (dontshrink && vap->va_size < np->n_size) {
/*
* We've been told not to shrink the file;
* zero np->n_attrstamp to indicate that
* the attributes are stale.
*/
vap->va_size = np->n_size;
np->n_attrstamp = 0;
} else if (np->n_flag & NMODIFIED) {
/*
* We've modified the file: Use the larger
* of our size, and the server's size.
*/
if (vap->va_size < np->n_size) {
1994-05-24 10:09:53 +00:00
vap->va_size = np->n_size;
} else {
1994-05-24 10:09:53 +00:00
np->n_size = vap->va_size;
np->n_flag |= NSIZECHANGED;
}
Synopsis of problem being fixed: Dan Nelson originally reported that blocks of zeros could wind up in a file written to over NFS by a client. The problem only occurs a few times per several gigabytes of data. This problem turned out to be bug #3 below. bug #1: B_CLUSTEROK must be cleared when an NFS buffer is reverted from stage 2 (ready for commit rpc) to stage 1 (ready for write). Reversions can occur when a dirty NFS buffer is redirtied with new data. Otherwise the VFS/BIO system may end up thinking that a stage 1 NFS buffer is clusterable. Stage 1 NFS buffers are not clusterable. bug #2: B_CLUSTEROK was inappropriately set for a 'short' NFS buffer (short buffers only occur near the EOF of the file). Change to only set when the buffer is a full biosize (usually 8K). This bug has no effect but should be fixed in -current anyway. It need not be backported. bug #3: B_NEEDCOMMIT was inappropriately set in nfs_flush() (which is typically only called by the update daemon). nfs_flush() does a multi-pass loop but due to the lack of vnode locking it is possible for new buffers to be added to the dirtyblkhd list while a flush operation is going on. This may result in nfs_flush() setting B_NEEDCOMMIT on a buffer which has *NOT* yet gone through its stage 1 write, causing only the commit rpc to be made and thus causing the contents of the buffer to be thrown away (never sent to the server). The patch also contains some cleanup, which only applies to the commit into -current. Reviewed by: dg, julian Originally Reported by: Dan Nelson <dnelson@emsphone.com>
1999-12-12 06:09:57 +00:00
} else {
1994-05-24 10:09:53 +00:00
np->n_size = vap->va_size;
np->n_flag |= NSIZECHANGED;
Synopsis of problem being fixed: Dan Nelson originally reported that blocks of zeros could wind up in a file written to over NFS by a client. The problem only occurs a few times per several gigabytes of data. This problem turned out to be bug #3 below. bug #1: B_CLUSTEROK must be cleared when an NFS buffer is reverted from stage 2 (ready for commit rpc) to stage 1 (ready for write). Reversions can occur when a dirty NFS buffer is redirtied with new data. Otherwise the VFS/BIO system may end up thinking that a stage 1 NFS buffer is clusterable. Stage 1 NFS buffers are not clusterable. bug #2: B_CLUSTEROK was inappropriately set for a 'short' NFS buffer (short buffers only occur near the EOF of the file). Change to only set when the buffer is a full biosize (usually 8K). This bug has no effect but should be fixed in -current anyway. It need not be backported. bug #3: B_NEEDCOMMIT was inappropriately set in nfs_flush() (which is typically only called by the update daemon). nfs_flush() does a multi-pass loop but due to the lack of vnode locking it is possible for new buffers to be added to the dirtyblkhd list while a flush operation is going on. This may result in nfs_flush() setting B_NEEDCOMMIT on a buffer which has *NOT* yet gone through its stage 1 write, causing only the commit rpc to be made and thus causing the contents of the buffer to be thrown away (never sent to the server). The patch also contains some cleanup, which only applies to the commit into -current. Reviewed by: dg, julian Originally Reported by: Dan Nelson <dnelson@emsphone.com>
1999-12-12 06:09:57 +00:00
}
vnode_pager_setsize(vp, np->n_size);
Synopsis of problem being fixed: Dan Nelson originally reported that blocks of zeros could wind up in a file written to over NFS by a client. The problem only occurs a few times per several gigabytes of data. This problem turned out to be bug #3 below. bug #1: B_CLUSTEROK must be cleared when an NFS buffer is reverted from stage 2 (ready for commit rpc) to stage 1 (ready for write). Reversions can occur when a dirty NFS buffer is redirtied with new data. Otherwise the VFS/BIO system may end up thinking that a stage 1 NFS buffer is clusterable. Stage 1 NFS buffers are not clusterable. bug #2: B_CLUSTEROK was inappropriately set for a 'short' NFS buffer (short buffers only occur near the EOF of the file). Change to only set when the buffer is a full biosize (usually 8K). This bug has no effect but should be fixed in -current anyway. It need not be backported. bug #3: B_NEEDCOMMIT was inappropriately set in nfs_flush() (which is typically only called by the update daemon). nfs_flush() does a multi-pass loop but due to the lack of vnode locking it is possible for new buffers to be added to the dirtyblkhd list while a flush operation is going on. This may result in nfs_flush() setting B_NEEDCOMMIT on a buffer which has *NOT* yet gone through its stage 1 write, causing only the commit rpc to be made and thus causing the contents of the buffer to be thrown away (never sent to the server). The patch also contains some cleanup, which only applies to the commit into -current. Reviewed by: dg, julian Originally Reported by: Dan Nelson <dnelson@emsphone.com>
1999-12-12 06:09:57 +00:00
} else {
1994-05-24 10:09:53 +00:00
np->n_size = vap->va_size;
Synopsis of problem being fixed: Dan Nelson originally reported that blocks of zeros could wind up in a file written to over NFS by a client. The problem only occurs a few times per several gigabytes of data. This problem turned out to be bug #3 below. bug #1: B_CLUSTEROK must be cleared when an NFS buffer is reverted from stage 2 (ready for commit rpc) to stage 1 (ready for write). Reversions can occur when a dirty NFS buffer is redirtied with new data. Otherwise the VFS/BIO system may end up thinking that a stage 1 NFS buffer is clusterable. Stage 1 NFS buffers are not clusterable. bug #2: B_CLUSTEROK was inappropriately set for a 'short' NFS buffer (short buffers only occur near the EOF of the file). Change to only set when the buffer is a full biosize (usually 8K). This bug has no effect but should be fixed in -current anyway. It need not be backported. bug #3: B_NEEDCOMMIT was inappropriately set in nfs_flush() (which is typically only called by the update daemon). nfs_flush() does a multi-pass loop but due to the lack of vnode locking it is possible for new buffers to be added to the dirtyblkhd list while a flush operation is going on. This may result in nfs_flush() setting B_NEEDCOMMIT on a buffer which has *NOT* yet gone through its stage 1 write, causing only the commit rpc to be made and thus causing the contents of the buffer to be thrown away (never sent to the server). The patch also contains some cleanup, which only applies to the commit into -current. Reviewed by: dg, julian Originally Reported by: Dan Nelson <dnelson@emsphone.com>
1999-12-12 06:09:57 +00:00
}
1994-05-24 10:09:53 +00:00
}
if (vaper != NULL) {
bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap));
if (np->n_flag & NCHG) {
if (np->n_flag & NACC)
vaper->va_atime = np->n_atim;
if (np->n_flag & NUPD)
vaper->va_mtime = np->n_mtim;
1994-05-24 10:09:53 +00:00
}
}
return (0);
}
#ifdef NFS_ACDEBUG
#include <sys/sysctl.h>
1999-02-17 13:59:29 +00:00
SYSCTL_DECL(_vfs_nfs);
static int nfs_acdebug;
SYSCTL_INT(_vfs_nfs, OID_AUTO, acdebug, CTLFLAG_RW, &nfs_acdebug, 0, "");
#endif
1994-05-24 10:09:53 +00:00
/*
* Check the time stamp
* If the cache is valid, copy contents to *vap and return 0
* otherwise return an error
*/
int
nfs_getattrcache(struct vnode *vp, struct vattr *vaper)
1994-05-24 10:09:53 +00:00
{
struct nfsnode *np;
struct vattr *vap;
struct nfsmount *nmp;
int timeo;
np = VTONFS(vp);
vap = &np->n_vattr;
nmp = VFSTONFS(vp->v_mount);
/* XXX n_mtime doesn't seem to be updated on a miss-and-reload */
timeo = (time_second - np->n_mtime) / 10;
#ifdef NFS_ACDEBUG
if (nfs_acdebug>1)
printf("nfs_getattrcache: initial timeo = %d\n", timeo);
#endif
if (vap->va_type == VDIR) {
if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acdirmin)
timeo = nmp->nm_acdirmin;
else if (timeo > nmp->nm_acdirmax)
timeo = nmp->nm_acdirmax;
} else {
if ((np->n_flag & NMODIFIED) || timeo < nmp->nm_acregmin)
timeo = nmp->nm_acregmin;
else if (timeo > nmp->nm_acregmax)
timeo = nmp->nm_acregmax;
}
1994-05-24 10:09:53 +00:00
#ifdef NFS_ACDEBUG
if (nfs_acdebug > 2)
printf("acregmin %d; acregmax %d; acdirmin %d; acdirmax %d\n",
nmp->nm_acregmin, nmp->nm_acregmax,
nmp->nm_acdirmin, nmp->nm_acdirmax);
if (nfs_acdebug)
printf("nfs_getattrcache: age = %d; final timeo = %d\n",
(time_second - np->n_attrstamp), timeo);
#endif
if ((time_second - np->n_attrstamp) >= timeo) {
1994-05-24 10:09:53 +00:00
nfsstats.attrcache_misses++;
return (ENOENT);
}
nfsstats.attrcache_hits++;
if (vap->va_size != np->n_size) {
if (vap->va_type == VREG) {
if (np->n_flag & NMODIFIED) {
if (vap->va_size < np->n_size)
vap->va_size = np->n_size;
else
np->n_size = vap->va_size;
Synopsis of problem being fixed: Dan Nelson originally reported that blocks of zeros could wind up in a file written to over NFS by a client. The problem only occurs a few times per several gigabytes of data. This problem turned out to be bug #3 below. bug #1: B_CLUSTEROK must be cleared when an NFS buffer is reverted from stage 2 (ready for commit rpc) to stage 1 (ready for write). Reversions can occur when a dirty NFS buffer is redirtied with new data. Otherwise the VFS/BIO system may end up thinking that a stage 1 NFS buffer is clusterable. Stage 1 NFS buffers are not clusterable. bug #2: B_CLUSTEROK was inappropriately set for a 'short' NFS buffer (short buffers only occur near the EOF of the file). Change to only set when the buffer is a full biosize (usually 8K). This bug has no effect but should be fixed in -current anyway. It need not be backported. bug #3: B_NEEDCOMMIT was inappropriately set in nfs_flush() (which is typically only called by the update daemon). nfs_flush() does a multi-pass loop but due to the lack of vnode locking it is possible for new buffers to be added to the dirtyblkhd list while a flush operation is going on. This may result in nfs_flush() setting B_NEEDCOMMIT on a buffer which has *NOT* yet gone through its stage 1 write, causing only the commit rpc to be made and thus causing the contents of the buffer to be thrown away (never sent to the server). The patch also contains some cleanup, which only applies to the commit into -current. Reviewed by: dg, julian Originally Reported by: Dan Nelson <dnelson@emsphone.com>
1999-12-12 06:09:57 +00:00
} else {
1994-05-24 10:09:53 +00:00
np->n_size = vap->va_size;
Synopsis of problem being fixed: Dan Nelson originally reported that blocks of zeros could wind up in a file written to over NFS by a client. The problem only occurs a few times per several gigabytes of data. This problem turned out to be bug #3 below. bug #1: B_CLUSTEROK must be cleared when an NFS buffer is reverted from stage 2 (ready for commit rpc) to stage 1 (ready for write). Reversions can occur when a dirty NFS buffer is redirtied with new data. Otherwise the VFS/BIO system may end up thinking that a stage 1 NFS buffer is clusterable. Stage 1 NFS buffers are not clusterable. bug #2: B_CLUSTEROK was inappropriately set for a 'short' NFS buffer (short buffers only occur near the EOF of the file). Change to only set when the buffer is a full biosize (usually 8K). This bug has no effect but should be fixed in -current anyway. It need not be backported. bug #3: B_NEEDCOMMIT was inappropriately set in nfs_flush() (which is typically only called by the update daemon). nfs_flush() does a multi-pass loop but due to the lack of vnode locking it is possible for new buffers to be added to the dirtyblkhd list while a flush operation is going on. This may result in nfs_flush() setting B_NEEDCOMMIT on a buffer which has *NOT* yet gone through its stage 1 write, causing only the commit rpc to be made and thus causing the contents of the buffer to be thrown away (never sent to the server). The patch also contains some cleanup, which only applies to the commit into -current. Reviewed by: dg, julian Originally Reported by: Dan Nelson <dnelson@emsphone.com>
1999-12-12 06:09:57 +00:00
}
vnode_pager_setsize(vp, np->n_size);
Synopsis of problem being fixed: Dan Nelson originally reported that blocks of zeros could wind up in a file written to over NFS by a client. The problem only occurs a few times per several gigabytes of data. This problem turned out to be bug #3 below. bug #1: B_CLUSTEROK must be cleared when an NFS buffer is reverted from stage 2 (ready for commit rpc) to stage 1 (ready for write). Reversions can occur when a dirty NFS buffer is redirtied with new data. Otherwise the VFS/BIO system may end up thinking that a stage 1 NFS buffer is clusterable. Stage 1 NFS buffers are not clusterable. bug #2: B_CLUSTEROK was inappropriately set for a 'short' NFS buffer (short buffers only occur near the EOF of the file). Change to only set when the buffer is a full biosize (usually 8K). This bug has no effect but should be fixed in -current anyway. It need not be backported. bug #3: B_NEEDCOMMIT was inappropriately set in nfs_flush() (which is typically only called by the update daemon). nfs_flush() does a multi-pass loop but due to the lack of vnode locking it is possible for new buffers to be added to the dirtyblkhd list while a flush operation is going on. This may result in nfs_flush() setting B_NEEDCOMMIT on a buffer which has *NOT* yet gone through its stage 1 write, causing only the commit rpc to be made and thus causing the contents of the buffer to be thrown away (never sent to the server). The patch also contains some cleanup, which only applies to the commit into -current. Reviewed by: dg, julian Originally Reported by: Dan Nelson <dnelson@emsphone.com>
1999-12-12 06:09:57 +00:00
} else {
1994-05-24 10:09:53 +00:00
np->n_size = vap->va_size;
Synopsis of problem being fixed: Dan Nelson originally reported that blocks of zeros could wind up in a file written to over NFS by a client. The problem only occurs a few times per several gigabytes of data. This problem turned out to be bug #3 below. bug #1: B_CLUSTEROK must be cleared when an NFS buffer is reverted from stage 2 (ready for commit rpc) to stage 1 (ready for write). Reversions can occur when a dirty NFS buffer is redirtied with new data. Otherwise the VFS/BIO system may end up thinking that a stage 1 NFS buffer is clusterable. Stage 1 NFS buffers are not clusterable. bug #2: B_CLUSTEROK was inappropriately set for a 'short' NFS buffer (short buffers only occur near the EOF of the file). Change to only set when the buffer is a full biosize (usually 8K). This bug has no effect but should be fixed in -current anyway. It need not be backported. bug #3: B_NEEDCOMMIT was inappropriately set in nfs_flush() (which is typically only called by the update daemon). nfs_flush() does a multi-pass loop but due to the lack of vnode locking it is possible for new buffers to be added to the dirtyblkhd list while a flush operation is going on. This may result in nfs_flush() setting B_NEEDCOMMIT on a buffer which has *NOT* yet gone through its stage 1 write, causing only the commit rpc to be made and thus causing the contents of the buffer to be thrown away (never sent to the server). The patch also contains some cleanup, which only applies to the commit into -current. Reviewed by: dg, julian Originally Reported by: Dan Nelson <dnelson@emsphone.com>
1999-12-12 06:09:57 +00:00
}
1994-05-24 10:09:53 +00:00
}
bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr));
if (np->n_flag & NCHG) {
if (np->n_flag & NACC)
vaper->va_atime = np->n_atim;
if (np->n_flag & NUPD)
vaper->va_mtime = np->n_mtim;
1994-05-24 10:09:53 +00:00
}
return (0);
}
static nfsuint64 nfs_nullcookie = { { 0, 0 } };
/*
* This function finds the directory cookie that corresponds to the
* logical byte offset given.
*/
nfsuint64 *
nfs_getcookie(struct nfsnode *np, off_t off, int add)
{
struct nfsdmap *dp, *dp2;
int pos;
pos = (uoff_t)off / NFS_DIRBLKSIZ;
if (pos == 0 || off < 0) {
#ifdef DIAGNOSTIC
if (add)
panic("nfs getcookie add at <= 0");
#endif
return (&nfs_nullcookie);
}
pos--;
dp = LIST_FIRST(&np->n_cookies);
if (!dp) {
if (add) {
MALLOC(dp, struct nfsdmap *, sizeof (struct nfsdmap),
M_NFSDIROFF, M_WAITOK);
dp->ndm_eocookie = 0;
LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list);
} else
return (NULL);
}
while (pos >= NFSNUMCOOKIES) {
pos -= NFSNUMCOOKIES;
if (LIST_NEXT(dp, ndm_list)) {
if (!add && dp->ndm_eocookie < NFSNUMCOOKIES &&
pos >= dp->ndm_eocookie)
return (NULL);
dp = LIST_NEXT(dp, ndm_list);
} else if (add) {
MALLOC(dp2, struct nfsdmap *, sizeof (struct nfsdmap),
M_NFSDIROFF, M_WAITOK);
dp2->ndm_eocookie = 0;
LIST_INSERT_AFTER(dp, dp2, ndm_list);
dp = dp2;
} else
return (NULL);
}
if (pos >= dp->ndm_eocookie) {
if (add)
dp->ndm_eocookie = pos + 1;
else
return (NULL);
}
return (&dp->ndm_cookies[pos]);
}
/*
* Invalidate cached directory information, except for the actual directory
* blocks (which are invalidated separately).
* Done mainly to avoid the use of stale offset cookies.
*/
void
nfs_invaldir(struct vnode *vp)
{
struct nfsnode *np = VTONFS(vp);
#ifdef DIAGNOSTIC
if (vp->v_type != VDIR)
panic("nfs: invaldir not dir");
#endif
np->n_direofoffset = 0;
np->n_cookieverf.nfsuquad[0] = 0;
np->n_cookieverf.nfsuquad[1] = 0;
if (LIST_FIRST(&np->n_cookies))
LIST_FIRST(&np->n_cookies)->ndm_eocookie = 0;
}
/*
* The write verifier has changed (probably due to a server reboot), so all
* B_NEEDCOMMIT blocks will have to be written again. Since they are on the
* dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT
Synopsis of problem being fixed: Dan Nelson originally reported that blocks of zeros could wind up in a file written to over NFS by a client. The problem only occurs a few times per several gigabytes of data. This problem turned out to be bug #3 below. bug #1: B_CLUSTEROK must be cleared when an NFS buffer is reverted from stage 2 (ready for commit rpc) to stage 1 (ready for write). Reversions can occur when a dirty NFS buffer is redirtied with new data. Otherwise the VFS/BIO system may end up thinking that a stage 1 NFS buffer is clusterable. Stage 1 NFS buffers are not clusterable. bug #2: B_CLUSTEROK was inappropriately set for a 'short' NFS buffer (short buffers only occur near the EOF of the file). Change to only set when the buffer is a full biosize (usually 8K). This bug has no effect but should be fixed in -current anyway. It need not be backported. bug #3: B_NEEDCOMMIT was inappropriately set in nfs_flush() (which is typically only called by the update daemon). nfs_flush() does a multi-pass loop but due to the lack of vnode locking it is possible for new buffers to be added to the dirtyblkhd list while a flush operation is going on. This may result in nfs_flush() setting B_NEEDCOMMIT on a buffer which has *NOT* yet gone through its stage 1 write, causing only the commit rpc to be made and thus causing the contents of the buffer to be thrown away (never sent to the server). The patch also contains some cleanup, which only applies to the commit into -current. Reviewed by: dg, julian Originally Reported by: Dan Nelson <dnelson@emsphone.com>
1999-12-12 06:09:57 +00:00
* and B_CLUSTEROK flags. Once done the new write verifier can be set for the
* mount point.
*
* B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data
Synopsis of problem being fixed: Dan Nelson originally reported that blocks of zeros could wind up in a file written to over NFS by a client. The problem only occurs a few times per several gigabytes of data. This problem turned out to be bug #3 below. bug #1: B_CLUSTEROK must be cleared when an NFS buffer is reverted from stage 2 (ready for commit rpc) to stage 1 (ready for write). Reversions can occur when a dirty NFS buffer is redirtied with new data. Otherwise the VFS/BIO system may end up thinking that a stage 1 NFS buffer is clusterable. Stage 1 NFS buffers are not clusterable. bug #2: B_CLUSTEROK was inappropriately set for a 'short' NFS buffer (short buffers only occur near the EOF of the file). Change to only set when the buffer is a full biosize (usually 8K). This bug has no effect but should be fixed in -current anyway. It need not be backported. bug #3: B_NEEDCOMMIT was inappropriately set in nfs_flush() (which is typically only called by the update daemon). nfs_flush() does a multi-pass loop but due to the lack of vnode locking it is possible for new buffers to be added to the dirtyblkhd list while a flush operation is going on. This may result in nfs_flush() setting B_NEEDCOMMIT on a buffer which has *NOT* yet gone through its stage 1 write, causing only the commit rpc to be made and thus causing the contents of the buffer to be thrown away (never sent to the server). The patch also contains some cleanup, which only applies to the commit into -current. Reviewed by: dg, julian Originally Reported by: Dan Nelson <dnelson@emsphone.com>
1999-12-12 06:09:57 +00:00
* writes are not clusterable.
*/
void
nfs_clearcommit(struct mount *mp)
{
struct vnode *vp, *nvp;
struct buf *bp, *nbp;
int s;
GIANT_REQUIRED;
s = splbio();
MNT_ILOCK(mp);
When we traverse the vnodes on a mountpoint we need to look out for our cached 'next vnode' being removed from this mountpoint. If we find that it was recycled, we restart our traversal from the start of the list. Code to do that is in all local disk filesystems (and a few other places) and looks roughly like this: MNT_ILOCK(mp); loop: for (vp = TAILQ_FIRST(&mp...); (vp = nvp) != NULL; nvp = TAILQ_NEXT(vp,...)) { if (vp->v_mount != mp) goto loop; MNT_IUNLOCK(mp); ... MNT_ILOCK(mp); } MNT_IUNLOCK(mp); The code which takes vnodes off a mountpoint looks like this: MNT_ILOCK(vp->v_mount); ... TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); ... MNT_IUNLOCK(vp->v_mount); ... vp->v_mount = something; (Take a moment and try to spot the locking error before you read on.) On a SMP system, one CPU could have removed nvp from our mountlist but not yet gotten to assign a new value to vp->v_mount while another CPU simultaneously get to the top of the traversal loop where it finds that (vp->v_mount != mp) is not true despite the fact that the vnode has indeed been removed from our mountpoint. Fix: Introduce the macro MNT_VNODE_FOREACH() to traverse the list of vnodes on a mountpoint while taking into account that vnodes may be removed from the list as we go. This saves approx 65 lines of duplicated code. Split the insmntque() which potentially moves a vnode from one mount point to another into delmntque() and insmntque() which does just what the names say. Fix delmntque() to set vp->v_mount to NULL while holding the mountpoint lock.
2004-07-04 08:52:35 +00:00
MNT_VNODE_FOREACH(vp, mp, nvp) {
VI_LOCK(vp);
if (vp->v_iflag & VI_XLOCK) {
VI_UNLOCK(vp);
continue;
}
MNT_IUNLOCK(mp);
for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
nbp = TAILQ_NEXT(bp, b_vnbufs);
if (BUF_REFCNT(bp) == 0 &&
(bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
== (B_DELWRI | B_NEEDCOMMIT))
Synopsis of problem being fixed: Dan Nelson originally reported that blocks of zeros could wind up in a file written to over NFS by a client. The problem only occurs a few times per several gigabytes of data. This problem turned out to be bug #3 below. bug #1: B_CLUSTEROK must be cleared when an NFS buffer is reverted from stage 2 (ready for commit rpc) to stage 1 (ready for write). Reversions can occur when a dirty NFS buffer is redirtied with new data. Otherwise the VFS/BIO system may end up thinking that a stage 1 NFS buffer is clusterable. Stage 1 NFS buffers are not clusterable. bug #2: B_CLUSTEROK was inappropriately set for a 'short' NFS buffer (short buffers only occur near the EOF of the file). Change to only set when the buffer is a full biosize (usually 8K). This bug has no effect but should be fixed in -current anyway. It need not be backported. bug #3: B_NEEDCOMMIT was inappropriately set in nfs_flush() (which is typically only called by the update daemon). nfs_flush() does a multi-pass loop but due to the lack of vnode locking it is possible for new buffers to be added to the dirtyblkhd list while a flush operation is going on. This may result in nfs_flush() setting B_NEEDCOMMIT on a buffer which has *NOT* yet gone through its stage 1 write, causing only the commit rpc to be made and thus causing the contents of the buffer to be thrown away (never sent to the server). The patch also contains some cleanup, which only applies to the commit into -current. Reviewed by: dg, julian Originally Reported by: Dan Nelson <dnelson@emsphone.com>
1999-12-12 06:09:57 +00:00
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
}
VI_UNLOCK(vp);
MNT_ILOCK(mp);
}
MNT_IUNLOCK(mp);
splx(s);
}
/*
* Helper functions for former macros. Some of these should be
* moved to their callers.
*/
int
nfsm_mtofh_xx(struct vnode *d, struct vnode **v, int v3, int *f,
struct mbuf **md, caddr_t *dpos)
{
struct nfsnode *ttnp;
struct vnode *ttvp;
nfsfh_t *ttfhp;
u_int32_t *tl;
int ttfhsize;
int t1;
if (v3) {
tl = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos);
if (tl == NULL)
return EBADRPC;
*f = fxdr_unsigned(int, *tl);
} else
*f = 1;
if (*f) {
t1 = nfsm_getfh_xx(&ttfhp, &ttfhsize, (v3), md, dpos);
if (t1 != 0)
return t1;
t1 = nfs_nget(d->v_mount, ttfhp, ttfhsize, &ttnp);
if (t1 != 0)
return t1;
*v = NFSTOV(ttnp);
}
if (v3) {
tl = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos);
if (tl == NULL)
return EBADRPC;
if (*f)
*f = fxdr_unsigned(int, *tl);
else if (fxdr_unsigned(int, *tl))
nfsm_adv_xx(NFSX_V3FATTR, md, dpos);
}
if (*f) {
ttvp = *v;
t1 = nfs_loadattrcache(&ttvp, md, dpos, NULL, 0);
if (t1)
return t1;
*v = ttvp;
}
return 0;
}
int
nfsm_getfh_xx(nfsfh_t **f, int *s, int v3, struct mbuf **md, caddr_t *dpos)
{
u_int32_t *tl;
if (v3) {
tl = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos);
if (tl == NULL)
return EBADRPC;
*s = fxdr_unsigned(int, *tl);
if (*s <= 0 || *s > NFSX_V3FHMAX)
return EBADRPC;
} else
*s = NFSX_V2FH;
*f = nfsm_dissect_xx(nfsm_rndup(*s), md, dpos);
if (*f == NULL)
return EBADRPC;
else
return 0;
}
int
nfsm_loadattr_xx(struct vnode **v, struct vattr *va, struct mbuf **md,
caddr_t *dpos)
{
int t1;
struct vnode *ttvp = *v;
t1 = nfs_loadattrcache(&ttvp, md, dpos, va, 0);
if (t1 != 0)
return t1;
*v = ttvp;
return 0;
}
int
nfsm_postop_attr_xx(struct vnode **v, int *f, struct mbuf **md,
caddr_t *dpos)
{
u_int32_t *tl;
int t1;
struct vnode *ttvp = *v;
tl = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos);
if (tl == NULL)
return EBADRPC;
*f = fxdr_unsigned(int, *tl);
if (*f != 0) {
t1 = nfs_loadattrcache(&ttvp, md, dpos, NULL, 1);
if (t1 != 0) {
*f = 0;
return t1;
}
*v = ttvp;
}
return 0;
}
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
int
nfsm_wcc_data_xx(struct vnode **v, int *f, struct mbuf **md, caddr_t *dpos)
{
u_int32_t *tl;
int ttattrf, ttretf = 0;
int t1;
tl = nfsm_dissect_xx(NFSX_UNSIGNED, md, dpos);
if (tl == NULL)
return EBADRPC;
if (*tl == nfs_true) {
tl = nfsm_dissect_xx(6 * NFSX_UNSIGNED, md, dpos);
if (tl == NULL)
return EBADRPC;
if (*f)
ttretf = (VTONFS(*v)->n_mtime ==
fxdr_unsigned(u_int32_t, *(tl + 2)));
}
t1 = nfsm_postop_attr_xx(v, &ttattrf, md, dpos);
if (t1)
return t1;
if (*f)
*f = ttretf;
else
*f = ttattrf;
return 0;
}
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
int
nfsm_strtom_xx(const char *a, int s, int m, struct mbuf **mb, caddr_t *bpos)
{
u_int32_t *tl;
int t1;
if (s > m)
return ENAMETOOLONG;
t1 = nfsm_rndup(s) + NFSX_UNSIGNED;
if (t1 <= M_TRAILINGSPACE(*mb)) {
tl = nfsm_build_xx(t1, mb, bpos);
*tl++ = txdr_unsigned(s);
*(tl + ((t1 >> 2) - 2)) = 0;
bcopy(a, tl, s);
} else {
t1 = nfsm_strtmbuf(mb, bpos, a, s);
if (t1 != 0)
return t1;
}
return 0;
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
}
int
nfsm_fhtom_xx(struct vnode *v, int v3, struct mbuf **mb, caddr_t *bpos)
{
u_int32_t *tl;
int t1;
caddr_t cp;
if (v3) {
t1 = nfsm_rndup(VTONFS(v)->n_fhsize) + NFSX_UNSIGNED;
if (t1 < M_TRAILINGSPACE(*mb)) {
tl = nfsm_build_xx(t1, mb, bpos);
*tl++ = txdr_unsigned(VTONFS(v)->n_fhsize);
*(tl + ((t1 >> 2) - 2)) = 0;
bcopy(VTONFS(v)->n_fhp, tl, VTONFS(v)->n_fhsize);
} else {
t1 = nfsm_strtmbuf(mb, bpos,
(const char *)VTONFS(v)->n_fhp,
VTONFS(v)->n_fhsize);
if (t1 != 0)
return t1;
}
} else {
cp = nfsm_build_xx(NFSX_V2FH, mb, bpos);
bcopy(VTONFS(v)->n_fhp, cp, NFSX_V2FH);
}
return 0;
}
void
nfsm_v3attrbuild_xx(struct vattr *va, int full, struct mbuf **mb,
caddr_t *bpos)
{
u_int32_t *tl;
if (va->va_mode != (mode_t)VNOVAL) {
tl = nfsm_build_xx(2 * NFSX_UNSIGNED, mb, bpos);
*tl++ = nfs_true;
*tl = txdr_unsigned(va->va_mode);
} else {
tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
*tl = nfs_false;
}
if (full && va->va_uid != (uid_t)VNOVAL) {
tl = nfsm_build_xx(2 * NFSX_UNSIGNED, mb, bpos);
*tl++ = nfs_true;
*tl = txdr_unsigned(va->va_uid);
} else {
tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
*tl = nfs_false;
}
if (full && va->va_gid != (gid_t)VNOVAL) {
tl = nfsm_build_xx(2 * NFSX_UNSIGNED, mb, bpos);
*tl++ = nfs_true;
*tl = txdr_unsigned(va->va_gid);
} else {
tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
*tl = nfs_false;
}
if (full && va->va_size != VNOVAL) {
tl = nfsm_build_xx(3 * NFSX_UNSIGNED, mb, bpos);
*tl++ = nfs_true;
txdr_hyper(va->va_size, tl);
} else {
tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
*tl = nfs_false;
}
if (va->va_atime.tv_sec != VNOVAL) {
if (va->va_atime.tv_sec != time_second) {
tl = nfsm_build_xx(3 * NFSX_UNSIGNED, mb, bpos);
*tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT);
txdr_nfsv3time(&va->va_atime, tl);
} else {
tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
*tl = txdr_unsigned(NFSV3SATTRTIME_TOSERVER);
}
} else {
tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
*tl = txdr_unsigned(NFSV3SATTRTIME_DONTCHANGE);
}
if (va->va_mtime.tv_sec != VNOVAL) {
if (va->va_mtime.tv_sec != time_second) {
tl = nfsm_build_xx(3 * NFSX_UNSIGNED, mb, bpos);
*tl++ = txdr_unsigned(NFSV3SATTRTIME_TOCLIENT);
txdr_nfsv3time(&va->va_mtime, tl);
} else {
tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
*tl = txdr_unsigned(NFSV3SATTRTIME_TOSERVER);
}
} else {
tl = nfsm_build_xx(NFSX_UNSIGNED, mb, bpos);
*tl = txdr_unsigned(NFSV3SATTRTIME_DONTCHANGE);
}
}