Fix two issues with bufdaemon that often cause processes to hang in
the "nbufkv" sleep.

First, the ffs background cylinder-group (cg) block write requests a
new buffer for the shadow copy. When ffs_bufwrite() is called from the
bufdaemon due to a buffer shortage, requesting the buffer deadlocks
the bufdaemon. Introduce a new flag for getnewbuf(), GB_NOWAIT_BD, to
request that getblk() not block while allocating the buffer and
instead return failure. Add a flags argument to geteblk() so that the
flags can be passed down to getnewbuf(). Do not repeat the getnewbuf()
call from geteblk() if the buffer allocation failed, GB_NOWAIT_BD is
specified, and geteblk() is called from the bufdaemon (or its helper,
see below). In ffs_bufwrite(), fall back to a synchronous cg block
write if the shadow block allocation fails.
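
In outline (a sketch condensed from the geteblk() and ffs_bufwrite()
hunks in the diff below):

    /* geteblk(): retry, but give up instead of sleeping when asked to. */
    while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) {
            if ((flags & GB_NOWAIT_BD) &&
                (curthread->td_pflags & TDP_BUFNEED) != 0)
                    return (NULL);  /* the caller must cope with failure */
    }

    /* ffs_bufwrite(): fall back to a synchronous write on failure. */
    newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD);
    if (newbp == NULL)
            goto normal_write;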

Since r107847, buffer write assumes that the vnode owning the buffer
is locked. The second problem is that the buffer cache may accumulate
many buffers belonging to a limited number of vnodes. Under such a
workload, the threads that own the locks on those vnodes quite often
try to read another block from the same vnodes and, due to buffer
cache exhaustion, ask the bufdaemon for help. The bufdaemon cannot
make any substantial progress because flushing the dirty buffers
requires the very vnode locks that the sleeping threads hold.

Allow the threads owning vnode locks to help the bufdaemon by doing a
flush pass over the buffer cache before getnewbuf() goes to
uninterruptible sleep. Move the flushing code from buf_daemon() to the
new helper function buf_do_flush(), which is called from getnewbuf().
The number of buffers flushed by a single call to buf_do_flush() from
getnewbuf() is limited by the new sysctl vfs.flushbufqtarget. Prevent
recursive calls to buf_do_flush() by marking the bufdaemon and the
threads that temporarily help it with the TDP_BUFNEED flag.
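
Schematically, the help path added to getnewbuf() looks as follows
(condensed from the diff below; the nblock lock handling is elided):

    if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) {
            /* Compute a mask that later restores TDP_NORUNNINGBUF. */
            norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
                (td->td_pflags & TDP_NORUNNINGBUF);
            /* Pose as the bufdaemon; TDP_BUFNEED prevents recursion. */
            td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
            fl = buf_do_flush(vp); /* flush buffers of the locked vnode */
            td->td_pflags &= norunbuf;
            if (fl != 0)
                    continue;      /* made progress, retry the allocation */
    }

Each such pass writes at most vfs.flushbufqtarget buffers (100 by
default); for example, "sysctl vfs.flushbufqtarget=200" would let a
helping thread flush more buffers per pass.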

In collaboration with:	pho
Reviewed by:	 tegge (previous version)
Tested by:	 glebius, yandex ...
MFC after:	 3 weeks
Author: Konstantin Belousov
Date:   2009-03-16 15:39:46 +00:00
Commit: c1d8b5e82c (parent ef39bc9f18)
Notes:  svn2git 2020-12-20 02:59:44 +00:00
        svn path=/head/; revision=189878

5 changed files with 126 additions and 50 deletions

Index: sys/gnu/fs/xfs/FreeBSD/xfs_buf.c

@@ -81,7 +81,7 @@ xfs_buf_get_empty(size_t size, xfs_buftarg_t *target)
 {
         struct buf *bp;
 
-        bp = geteblk(0);
+        bp = geteblk(0, 0);
         if (bp != NULL) {
                 bp->b_bufsize = size;
                 bp->b_bcount = size;
@@ -100,7 +100,7 @@ xfs_buf_get_noaddr(size_t len, xfs_buftarg_t *target)
         if (len >= MAXPHYS)
                 return (NULL);
 
-        bp = geteblk(len);
+        bp = geteblk(len, 0);
         if (bp != NULL) {
                 BUF_ASSERT_HELD(bp);

Index: sys/kern/vfs_bio.c

@@ -106,7 +106,8 @@ static void vfs_setdirty_locked_object(struct buf *bp);
 static void vfs_vmio_release(struct buf *bp);
 static int vfs_bio_clcheck(struct vnode *vp, int size,
                 daddr_t lblkno, daddr_t blkno);
-static int flushbufqueues(int, int);
+static int buf_do_flush(struct vnode *vp);
+static int flushbufqueues(struct vnode *, int, int);
 static void buf_daemon(void);
 static void bremfreel(struct buf *bp);
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
@@ -198,6 +199,9 @@ SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
 static int getnewbufrestarts;
 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
     "Number of times getnewbuf has had to restart a buffer aquisition");
+static int flushbufqtarget = 100;
+SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
+    "Amount of work to do in flushbufqueues when helping bufdaemon");
 
 /*
  * Wakeup point for bufdaemon, as well as indicator of whether it is already
@@ -258,6 +262,7 @@ static struct mtx nblock;
 #define QUEUE_DIRTY_GIANT 3     /* B_DELWRI buffers that need giant */
 #define QUEUE_EMPTYKVA  4       /* empty buffer headers w/KVA assignment */
 #define QUEUE_EMPTY     5       /* empty buffer headers */
+#define QUEUE_SENTINEL  1024    /* not an queue index, but mark for sentinel */
 
 /* Queues for free buffers with various properties */
 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
@@ -1710,21 +1715,23 @@ vfs_bio_awrite(struct buf *bp)
  */
 static struct buf *
-getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
+getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
+    int gbflags)
 {
+        struct thread *td;
         struct buf *bp;
         struct buf *nbp;
         int defrag = 0;
         int nqindex;
         static int flushingbufs;
 
+        td = curthread;
         /*
          * We can't afford to block since we might be holding a vnode lock,
          * which may prevent system daemons from running.  We deal with
          * low-memory situations by proactively returning memory and running
          * async I/O rather then sync I/O.
          */
         atomic_add_int(&getnewbufcalls, 1);
         atomic_subtract_int(&getnewbufrestarts, 1);
 restart:
@@ -1956,8 +1963,9 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
          */
         if (bp == NULL) {
-                int flags;
+                int flags, norunbuf;
                 char *waitmsg;
+                int fl;
 
                 if (defrag) {
                         flags = VFS_BIO_NEED_BUFSPACE;
@@ -1975,9 +1983,35 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
                 mtx_unlock(&bqlock);
 
                 bd_speedup();   /* heeeelp */
+                if (gbflags & GB_NOWAIT_BD)
+                        return (NULL);
 
                 mtx_lock(&nblock);
                 while (needsbuffer & flags) {
+                        if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) {
+                                mtx_unlock(&nblock);
+                                /*
+                                 * getblk() is called with a vnode
+                                 * locked, and some majority of the
+                                 * dirty buffers may as well belong to
+                                 * the vnode. Flushing the buffers
+                                 * there would make a progress that
+                                 * cannot be achieved by the
+                                 * buf_daemon, that cannot lock the
+                                 * vnode.
+                                 */
+                                norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
+                                    (td->td_pflags & TDP_NORUNNINGBUF);
+                                /* play bufdaemon */
+                                td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
+                                fl = buf_do_flush(vp);
+                                td->td_pflags &= norunbuf;
+                                mtx_lock(&nblock);
+                                if (fl != 0)
+                                        continue;
+                                if ((needsbuffer & flags) == 0)
+                                        break;
+                        }
                         if (msleep(&needsbuffer, &nblock,
                             (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) {
                                 mtx_unlock(&nblock);
@@ -2046,6 +2080,35 @@ static struct kproc_desc buf_kp = {
 };
 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
 
+static int
+buf_do_flush(struct vnode *vp)
+{
+        int flushed;
+
+        flushed = flushbufqueues(vp, QUEUE_DIRTY, 0);
+        /* The list empty check here is slightly racy */
+        if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) {
+                mtx_lock(&Giant);
+                flushed += flushbufqueues(vp, QUEUE_DIRTY_GIANT, 0);
+                mtx_unlock(&Giant);
+        }
+        if (flushed == 0) {
+                /*
+                 * Could not find any buffers without rollback
+                 * dependencies, so just write the first one
+                 * in the hopes of eventually making progress.
+                 */
+                flushbufqueues(vp, QUEUE_DIRTY, 1);
+                if (!TAILQ_EMPTY(
+                    &bufqueues[QUEUE_DIRTY_GIANT])) {
+                        mtx_lock(&Giant);
+                        flushbufqueues(vp, QUEUE_DIRTY_GIANT, 1);
+                        mtx_unlock(&Giant);
+                }
+        }
+        return (flushed);
+}
+
 static void
 buf_daemon()
 {
@@ -2059,7 +2122,7 @@ buf_daemon()
         /*
          * This process is allowed to take the buffer cache to the limit
          */
-        curthread->td_pflags |= TDP_NORUNNINGBUF;
+        curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
         mtx_lock(&bdlock);
         for (;;) {
                 bd_request = 0;
@@ -2074,30 +2137,8 @@ buf_daemon()
                  * normally would so they can run in parallel with our drain.
                  */
                 while (numdirtybuffers > lodirtybuffers) {
-                        int flushed;
-
-                        flushed = flushbufqueues(QUEUE_DIRTY, 0);
-                        /* The list empty check here is slightly racy */
-                        if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) {
-                                mtx_lock(&Giant);
-                                flushed += flushbufqueues(QUEUE_DIRTY_GIANT, 0);
-                                mtx_unlock(&Giant);
-                        }
-                        if (flushed == 0) {
-                                /*
-                                 * Could not find any buffers without rollback
-                                 * dependencies, so just write the first one
-                                 * in the hopes of eventually making progress.
-                                 */
-                                flushbufqueues(QUEUE_DIRTY, 1);
-                                if (!TAILQ_EMPTY(
-                                    &bufqueues[QUEUE_DIRTY_GIANT])) {
-                                        mtx_lock(&Giant);
-                                        flushbufqueues(QUEUE_DIRTY_GIANT, 1);
-                                        mtx_unlock(&Giant);
-                                }
+                        if (buf_do_flush(NULL) == 0)
                                 break;
-                        }
                         uio_yield();
                 }
@@ -2143,7 +2184,7 @@ SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
     0, "Number of buffers flushed with dependecies that require rollbacks");
 
 static int
-flushbufqueues(int queue, int flushdeps)
+flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
 {
         struct buf sentinel;
         struct vnode *vp;
@@ -2153,20 +2194,37 @@ flushbufqueues(int queue, int flushdeps)
         int flushed;
         int target;
 
-        target = numdirtybuffers - lodirtybuffers;
-        if (flushdeps && target > 2)
-                target /= 2;
+        if (lvp == NULL) {
+                target = numdirtybuffers - lodirtybuffers;
+                if (flushdeps && target > 2)
+                        target /= 2;
+        } else
+                target = flushbufqtarget;
         flushed = 0;
         bp = NULL;
+        sentinel.b_qindex = QUEUE_SENTINEL;
         mtx_lock(&bqlock);
-        TAILQ_INSERT_TAIL(&bufqueues[queue], &sentinel, b_freelist);
+        TAILQ_INSERT_HEAD(&bufqueues[queue], &sentinel, b_freelist);
         while (flushed != target) {
-                bp = TAILQ_FIRST(&bufqueues[queue]);
-                if (bp == &sentinel)
+                bp = TAILQ_NEXT(&sentinel, b_freelist);
+                if (bp != NULL) {
+                        TAILQ_REMOVE(&bufqueues[queue], &sentinel, b_freelist);
+                        TAILQ_INSERT_AFTER(&bufqueues[queue], bp, &sentinel,
+                            b_freelist);
+                } else
                         break;
-                TAILQ_REMOVE(&bufqueues[queue], bp, b_freelist);
-                TAILQ_INSERT_TAIL(&bufqueues[queue], bp, b_freelist);
+                /*
+                 * Skip sentinels inserted by other invocations of the
+                 * flushbufqueues(), taking care to not reorder them.
+                 */
+                if (bp->b_qindex == QUEUE_SENTINEL)
+                        continue;
+                /*
+                 * Only flush the buffers that belong to the
+                 * vnode locked by the curthread.
+                 */
+                if (lvp != NULL && bp->b_vp != lvp)
+                        continue;
                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
                         continue;
                 if (bp->b_pin_count > 0) {
@@ -2214,16 +2272,27 @@ flushbufqueues(int queue, int flushdeps)
                         BUF_UNLOCK(bp);
                         continue;
                 }
-                if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
+                if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_CANRECURSE) == 0) {
                         mtx_unlock(&bqlock);
                         CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
                             bp, bp->b_vp, bp->b_flags);
-                        vfs_bio_awrite(bp);
+                        if (curproc == bufdaemonproc)
+                                vfs_bio_awrite(bp);
+                        else {
+                                bremfree(bp);
+                                bwrite(bp);
+                        }
                         vn_finished_write(mp);
                         VOP_UNLOCK(vp, 0);
                         flushwithdeps += hasdeps;
                         flushed++;
-                        waitrunningbufspace();
+
+                        /*
+                         * Sleeping on runningbufspace while holding
+                         * vnode lock leads to deadlock.
+                         */
+                        if (curproc == bufdaemonproc)
+                                waitrunningbufspace();
                         numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
                         mtx_lock(&bqlock);
                         continue;
@@ -2605,7 +2674,7 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo,
         maxsize = vmio ? size + (offset & PAGE_MASK) : size;
         maxsize = imax(maxsize, bsize);
 
-        bp = getnewbuf(slpflag, slptimeo, size, maxsize);
+        bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
         if (bp == NULL) {
                 if (slpflag || slptimeo)
                         return NULL;
@@ -2680,14 +2749,17 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo,
  * set to B_INVAL.
  */
 struct buf *
-geteblk(int size)
+geteblk(int size, int flags)
 {
         struct buf *bp;
         int maxsize;
 
         maxsize = (size + BKVAMASK) & ~BKVAMASK;
-        while ((bp = getnewbuf(0, 0, size, maxsize)) == 0)
-                continue;
+        while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) {
+                if ((flags & GB_NOWAIT_BD) &&
+                    (curthread->td_pflags & TDP_BUFNEED) != 0)
+                        return (NULL);
+        }
         allocbuf(bp, size);
         bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
         BUF_ASSERT_HELD(bp);

Index: sys/sys/buf.h

@@ -443,6 +443,7 @@ buf_countdeps(struct buf *bp, int i)
  */
 #define GB_LOCK_NOWAIT  0x0001  /* Fail if we block on a buf lock. */
 #define GB_NOCREAT      0x0002  /* Don't create a buf if not found. */
+#define GB_NOWAIT_BD    0x0004  /* Do not wait for bufdaemon */
 
 #ifdef _KERNEL
 extern int nbuf;                /* The number of buffer headers */
@@ -487,7 +488,7 @@ struct buf *getpbuf(int *);
 struct buf *incore(struct bufobj *, daddr_t);
 struct buf *gbincore(struct bufobj *, daddr_t);
 struct buf *getblk(struct vnode *, daddr_t, int, int, int, int);
-struct buf *geteblk(int);
+struct buf *geteblk(int, int);
 int     bufwait(struct buf *);
 int     bufwrite(struct buf *);
 void    bufdone(struct buf *);

Index: sys/sys/proc.h

@@ -345,7 +345,7 @@ do { \
 #define TDP_OLDMASK     0x00000001 /* Need to restore mask after suspend. */
 #define TDP_INKTR       0x00000002 /* Thread is currently in KTR code. */
 #define TDP_INKTRACE    0x00000004 /* Thread is currently in KTRACE code. */
-#define TDP_UNUSED8     0x00000008 /* available */
+#define TDP_BUFNEED     0x00000008 /* Do not recurse into the buf flush */
 #define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */
 #define TDP_ALTSTACK    0x00000020 /* Have alternate signal stack. */
 #define TDP_DEADLKTREAT 0x00000040 /* Lock aquisition - deadlock treatment. */

Index: sys/ufs/ffs/ffs_vnops.c

@@ -1845,7 +1845,9 @@ ffs_bufwrite(struct buf *bp)
             ("bufwrite: needs chained iodone (%p)", bp->b_iodone));
 
         /* get a new block */
-        newbp = geteblk(bp->b_bufsize);
+        newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD);
+        if (newbp == NULL)
+                goto normal_write;
 
         /*
          * set it to be identical to the old block.  We have to
@@ -1885,6 +1887,7 @@ ffs_bufwrite(struct buf *bp)
         }
 
         /* Let the normal bufwrite do the rest for us */
+normal_write:
         return (bufwrite(bp));
 }