Fix a serious deadlock with the NFS client. Given a large enough
atomic write request, it can fill the buffer cache with the entirety of that write in order to handle retries. However, it never drops the vnode lock, or else it wouldn't be atomic, so it ends up waiting indefinitely for more buf memory that cannot be gotten as it has it all, and it waits in an uncancellable state. To fix this, hibufspace is exported and scaled to a reasonable fraction. This is used as the limit of how much of an atomic write request by the NFS client will be handled asynchronously. If the request is larger than this, it will be turned into a synchronous request which won't deadlock the system. It's possible this value is far off from what is required by some, so it shall be tunable as soon as mount_nfs(8) learns of the new field. The slowdown between an asynchronous and a synchronous write on NFS appears to be on the order of 2x-4x. General nod by: gad MFC after: 2 weeks More testing: wes PR: kern/79208
This commit is contained in:
parent
c7c0d2e3e4
commit
cc3149b1ea
@ -127,7 +127,7 @@ SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
|
||||
static int lobufspace;
|
||||
SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
|
||||
"Minimum amount of buffers we want to have");
|
||||
static int hibufspace;
|
||||
int hibufspace;
|
||||
SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
|
||||
"Maximum allowed value of bufspace (excluding buf_daemon)");
|
||||
static int bufreusecnt;
|
||||
|
@ -873,6 +873,14 @@ nfs_write(struct vop_write_args *ap)
|
||||
*/
|
||||
if (ioflag & (IO_APPEND | IO_SYNC)) {
|
||||
if (np->n_flag & NMODIFIED) {
|
||||
/*
|
||||
* Require non-blocking, synchronous writes to
|
||||
* dirty files to inform the program it needs
|
||||
* to fsync(2) explicitly.
|
||||
*/
|
||||
if (ioflag & IO_NDELAY)
|
||||
return (EAGAIN);
|
||||
flush_and_restart:
|
||||
np->n_attrstamp = 0;
|
||||
error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
|
||||
if (error)
|
||||
@ -953,6 +961,63 @@ nfs_write(struct vop_write_args *ap)
|
||||
}
|
||||
|
||||
biosize = vp->v_mount->mnt_stat.f_iosize;
|
||||
/*
|
||||
* Find all of this file's B_NEEDCOMMIT buffers. If our writes
|
||||
* would exceed the local maximum per-file write commit size when
|
||||
* combined with those, we must decide whether to flush,
|
||||
* go synchronous, or return error. We don't bother checking
|
||||
* IO_UNIT -- we just make all writes atomic anyway, as there's
|
||||
* no point optimizing for something that really won't ever happen.
|
||||
*/
|
||||
if (!(ioflag & IO_SYNC)) {
|
||||
int needrestart = 0;
|
||||
if (nmp->nm_wcommitsize < uio->uio_resid) {
|
||||
/*
|
||||
* If this request could not possibly be completed
|
||||
* without exceeding the maximum outstanding write
|
||||
* commit size, see if we can convert it into a
|
||||
* synchronous write operation.
|
||||
*/
|
||||
if (ioflag & IO_NDELAY)
|
||||
return (EAGAIN);
|
||||
ioflag |= IO_SYNC;
|
||||
if (np->n_flag & NMODIFIED)
|
||||
needrestart = 1;
|
||||
} else if (np->n_flag & NMODIFIED) {
|
||||
int wouldcommit = 0;
|
||||
BO_LOCK(&vp->v_bufobj);
|
||||
if (vp->v_bufobj.bo_dirty.bv_cnt != 0) {
|
||||
TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd,
|
||||
b_bobufs) {
|
||||
if (bp->b_flags & B_NEEDCOMMIT)
|
||||
wouldcommit += bp->b_bcount;
|
||||
}
|
||||
}
|
||||
BO_UNLOCK(&vp->v_bufobj);
|
||||
/*
|
||||
* Since we're not operating synchronously and
|
||||
* bypassing the buffer cache, we are in a commit
|
||||
* and holding all of these buffers whether
|
||||
* transmitted or not. If not limited, this
|
||||
* will lead to the buffer cache deadlocking,
|
||||
* as no one else can flush our uncommitted buffers.
|
||||
*/
|
||||
wouldcommit += uio->uio_resid;
|
||||
/*
|
||||
* If we would initially exceed the maximum
|
||||
* outstanding write commit size, flush and restart.
|
||||
*/
|
||||
if (wouldcommit > nmp->nm_wcommitsize)
|
||||
needrestart = 1;
|
||||
}
|
||||
if (needrestart) {
|
||||
if (haverslock) {
|
||||
nfs_rsunlock(np, td);
|
||||
haverslock = 0;
|
||||
}
|
||||
goto flush_and_restart;
|
||||
}
|
||||
}
|
||||
|
||||
do {
|
||||
nfsstats.biocache_writes++;
|
||||
|
@ -41,6 +41,8 @@ __FBSDID("$FreeBSD$");
|
||||
#include <sys/param.h>
|
||||
#include <sys/systm.h>
|
||||
#include <sys/kernel.h>
|
||||
#include <sys/bio.h>
|
||||
#include <sys/buf.h>
|
||||
#include <sys/lock.h>
|
||||
#include <sys/malloc.h>
|
||||
#include <sys/mbuf.h>
|
||||
@ -633,6 +635,12 @@ nfs_decode_args(struct mount *mp, struct nfsmount *nmp, struct nfs_args *argp)
|
||||
else
|
||||
nmp->nm_readahead = NFS_MAXRAHEAD;
|
||||
}
|
||||
if ((argp->flags & NFSMNT_WCOMMITSIZE) && argp->wcommitsize >= 0) {
|
||||
if (argp->wcommitsize < nmp->nm_wsize)
|
||||
nmp->nm_wcommitsize = nmp->nm_wsize;
|
||||
else
|
||||
nmp->nm_wcommitsize = argp->wcommitsize;
|
||||
}
|
||||
if ((argp->flags & NFSMNT_DEADTHRESH) && argp->deadthresh >= 0) {
|
||||
if (argp->deadthresh <= NFS_MAXDEADTHRESH)
|
||||
nmp->nm_deadthresh = argp->deadthresh;
|
||||
@ -815,6 +823,7 @@ mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam,
|
||||
nmp->nm_wsize = NFS_WSIZE;
|
||||
nmp->nm_rsize = NFS_RSIZE;
|
||||
}
|
||||
nmp->nm_wcommitsize = hibufspace / (desiredvnodes / 1000);
|
||||
nmp->nm_readdirsize = NFS_READDIRSIZE;
|
||||
nmp->nm_numgrps = NFS_MAXGRPS;
|
||||
nmp->nm_readahead = NFS_DEFRAHEAD;
|
||||
|
@ -56,7 +56,7 @@ struct nfs_args {
|
||||
int retrans; /* times to retry send */
|
||||
int maxgrouplist; /* Max. size of group list */
|
||||
int readahead; /* # of blocks to readahead */
|
||||
int __pad1; /* was "leaseterm" */
|
||||
int wcommitsize; /* Max. write commit size in bytes */
|
||||
int deadthresh; /* Retrans threshold */
|
||||
char *hostname; /* server's name */
|
||||
int acregmin; /* cache attrs for reg files min time */
|
||||
@ -80,7 +80,7 @@ struct nfs_args {
|
||||
#define NFSMNT_NFSV3 0x00000200 /* Use NFS Version 3 protocol */
|
||||
/* 0x400 free, was NFSMNT_KERB */
|
||||
#define NFSMNT_DUMBTIMR 0x00000800 /* Don't estimate rtt dynamically */
|
||||
/* 0x1000 free, was NFSMNT_LEASETERM */
|
||||
#define NFSMNT_WCOMMITSIZE 0x00001000 /* set max write commit size */
|
||||
#define NFSMNT_READAHEAD 0x00002000 /* set read ahead */
|
||||
#define NFSMNT_DEADTHRESH 0x00004000 /* set dead server retry thresh */
|
||||
#define NFSMNT_RESVPORT 0x00008000 /* Allocate a reserved port */
|
||||
|
@ -74,6 +74,7 @@ struct nfsmount {
|
||||
int nm_wsize; /* Max size of write rpc */
|
||||
int nm_readdirsize; /* Size of a readdir rpc */
|
||||
int nm_readahead; /* Num. of blocks to readahead */
|
||||
int nm_wcommitsize; /* Max size of commit for write */
|
||||
int nm_acdirmin; /* Directory attr cache min lifetime */
|
||||
int nm_acdirmax; /* Directory attr cache max lifetime */
|
||||
int nm_acregmin; /* Reg file attr cache min lifetime */
|
||||
|
@ -465,6 +465,7 @@ extern int nbuf; /* The number of buffer headers */
|
||||
extern int maxswzone; /* Max KVA for swap structures */
|
||||
extern int maxbcache; /* Max KVA for buffer cache */
|
||||
extern int runningbufspace;
|
||||
extern int hibufspace;
|
||||
extern int buf_maxio; /* nominal maximum I/O for buffer */
|
||||
extern struct buf *buf; /* The buffer headers. */
|
||||
extern char *buffers; /* The buffer contents. */
|
||||
|
@ -109,13 +109,13 @@ struct bufobj {
|
||||
|
||||
#define BO_LOCK(bo) \
|
||||
do { \
|
||||
KASSERT (bo->bo_mtx != NULL, ("No lock in bufobj")); \
|
||||
KASSERT((bo)->bo_mtx != NULL, ("No lock in bufobj")); \
|
||||
mtx_lock((bo)->bo_mtx); \
|
||||
} while (0)
|
||||
|
||||
#define BO_UNLOCK(bo) \
|
||||
do { \
|
||||
KASSERT (bo->bo_mtx != NULL, ("No lock in bufobj")); \
|
||||
KASSERT((bo)->bo_mtx != NULL, ("No lock in bufobj")); \
|
||||
mtx_unlock((bo)->bo_mtx); \
|
||||
} while (0)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user