Fix a serious deadlock with the NFS client. Given a large enough

atomic write request, it can fill the buffer cache with the entirety
of that write in order to handle retries.  However, it never drops
the vnode lock — otherwise the write would not be atomic — so it
ends up waiting indefinitely, in an uncancellable state, for more
buffer memory that can never be freed because the request itself
already holds all of it.

To fix this, hibufspace is exported and scaled to a reasonable
fraction.  This is used as the limit of how much of an atomic write
request by the NFS client will be handled asynchronously.  If the
request is larger than this, it will be turned into a synchronous
request which won't deadlock the system.  This value may be far from
what some workloads require, so it will be made tunable as soon as
mount_nfs(8) learns of the new field.

The slowdown between an asynchronous and a synchronous write on NFS
appears to be on the order of 2x-4x.

General nod by:	gad
MFC after:	2 weeks
More testing:	wes
PR:		kern/79208
This commit is contained in:
Brian Feldman 2005-06-10 23:50:41 +00:00
parent c7c0d2e3e4
commit cc3149b1ea
7 changed files with 81 additions and 5 deletions

View File

@ -127,7 +127,7 @@ SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
static int lobufspace;
SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
"Minimum amount of buffers we want to have");
static int hibufspace;
int hibufspace;
SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
"Maximum allowed value of bufspace (excluding buf_daemon)");
static int bufreusecnt;

View File

@ -873,6 +873,14 @@ nfs_write(struct vop_write_args *ap)
*/
if (ioflag & (IO_APPEND | IO_SYNC)) {
if (np->n_flag & NMODIFIED) {
/*
* Require non-blocking, synchronous writes to
* dirty files to inform the program it needs
* to fsync(2) explicitly.
*/
if (ioflag & IO_NDELAY)
return (EAGAIN);
flush_and_restart:
np->n_attrstamp = 0;
error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
if (error)
@ -953,6 +961,63 @@ nfs_write(struct vop_write_args *ap)
}
biosize = vp->v_mount->mnt_stat.f_iosize;
/*
* Find all of this file's B_NEEDCOMMIT buffers. If our writes
* would exceed the local maximum per-file write commit size when
* combined with those, we must decide whether to flush,
* go synchronous, or return error. We don't bother checking
* IO_UNIT -- we just make all writes atomic anyway, as there's
* no point optimizing for something that really won't ever happen.
*/
if (!(ioflag & IO_SYNC)) {
int needrestart = 0;
if (nmp->nm_wcommitsize < uio->uio_resid) {
/*
* If this request could not possibly be completed
* without exceeding the maximum outstanding write
* commit size, see if we can convert it into a
* synchronous write operation.
*/
if (ioflag & IO_NDELAY)
return (EAGAIN);
ioflag |= IO_SYNC;
if (np->n_flag & NMODIFIED)
needrestart = 1;
} else if (np->n_flag & NMODIFIED) {
int wouldcommit = 0;
BO_LOCK(&vp->v_bufobj);
if (vp->v_bufobj.bo_dirty.bv_cnt != 0) {
TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd,
b_bobufs) {
if (bp->b_flags & B_NEEDCOMMIT)
wouldcommit += bp->b_bcount;
}
}
BO_UNLOCK(&vp->v_bufobj);
/*
* Since we're not operating synchronously and
* bypassing the buffer cache, we are in a commit
* and holding all of these buffers whether
* transmitted or not. If not limited, this
* will lead to the buffer cache deadlocking,
* as no one else can flush our uncommitted buffers.
*/
wouldcommit += uio->uio_resid;
/*
* If we would initially exceed the maximum
* outstanding write commit size, flush and restart.
*/
if (wouldcommit > nmp->nm_wcommitsize)
needrestart = 1;
}
if (needrestart) {
if (haverslock) {
nfs_rsunlock(np, td);
haverslock = 0;
}
goto flush_and_restart;
}
}
do {
nfsstats.biocache_writes++;

View File

@ -41,6 +41,8 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
@ -633,6 +635,12 @@ nfs_decode_args(struct mount *mp, struct nfsmount *nmp, struct nfs_args *argp)
else
nmp->nm_readahead = NFS_MAXRAHEAD;
}
if ((argp->flags & NFSMNT_WCOMMITSIZE) && argp->wcommitsize >= 0) {
if (argp->wcommitsize < nmp->nm_wsize)
nmp->nm_wcommitsize = nmp->nm_wsize;
else
nmp->nm_wcommitsize = argp->wcommitsize;
}
if ((argp->flags & NFSMNT_DEADTHRESH) && argp->deadthresh >= 0) {
if (argp->deadthresh <= NFS_MAXDEADTHRESH)
nmp->nm_deadthresh = argp->deadthresh;
@ -815,6 +823,7 @@ mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam,
nmp->nm_wsize = NFS_WSIZE;
nmp->nm_rsize = NFS_RSIZE;
}
nmp->nm_wcommitsize = hibufspace / (desiredvnodes / 1000);
nmp->nm_readdirsize = NFS_READDIRSIZE;
nmp->nm_numgrps = NFS_MAXGRPS;
nmp->nm_readahead = NFS_DEFRAHEAD;

View File

@ -56,7 +56,7 @@ struct nfs_args {
int retrans; /* times to retry send */
int maxgrouplist; /* Max. size of group list */
int readahead; /* # of blocks to readahead */
int __pad1; /* was "leaseterm" */
int wcommitsize; /* Max. write commit size in bytes */
int deadthresh; /* Retrans threshold */
char *hostname; /* server's name */
int acregmin; /* cache attrs for reg files min time */
@ -80,7 +80,7 @@ struct nfs_args {
#define NFSMNT_NFSV3 0x00000200 /* Use NFS Version 3 protocol */
/* 0x400 free, was NFSMNT_KERB */
#define NFSMNT_DUMBTIMR 0x00000800 /* Don't estimate rtt dynamically */
/* 0x1000 free, was NFSMNT_LEASETERM */
#define NFSMNT_WCOMMITSIZE 0x00001000 /* set max write commit size */
#define NFSMNT_READAHEAD 0x00002000 /* set read ahead */
#define NFSMNT_DEADTHRESH 0x00004000 /* set dead server retry thresh */
#define NFSMNT_RESVPORT 0x00008000 /* Allocate a reserved port */

View File

@ -74,6 +74,7 @@ struct nfsmount {
int nm_wsize; /* Max size of write rpc */
int nm_readdirsize; /* Size of a readdir rpc */
int nm_readahead; /* Num. of blocks to readahead */
int nm_wcommitsize; /* Max size of commit for write */
int nm_acdirmin; /* Directory attr cache min lifetime */
int nm_acdirmax; /* Directory attr cache max lifetime */
int nm_acregmin; /* Reg file attr cache min lifetime */

View File

@ -465,6 +465,7 @@ extern int nbuf; /* The number of buffer headers */
extern int maxswzone; /* Max KVA for swap structures */
extern int maxbcache; /* Max KVA for buffer cache */
extern int runningbufspace;
extern int hibufspace;
extern int buf_maxio; /* nominal maximum I/O for buffer */
extern struct buf *buf; /* The buffer headers. */
extern char *buffers; /* The buffer contents. */

View File

@ -109,13 +109,13 @@ struct bufobj {
#define BO_LOCK(bo) \
do { \
KASSERT (bo->bo_mtx != NULL, ("No lock in bufobj")); \
KASSERT((bo)->bo_mtx != NULL, ("No lock in bufobj")); \
mtx_lock((bo)->bo_mtx); \
} while (0)
#define BO_UNLOCK(bo) \
do { \
KASSERT (bo->bo_mtx != NULL, ("No lock in bufobj")); \
KASSERT((bo)->bo_mtx != NULL, ("No lock in bufobj")); \
mtx_unlock((bo)->bo_mtx); \
} while (0)