Implement a low-memory deadlock solution.

Removed most of the hacks that previously tried to deal with low-memory
    situations.

    The new code is based on the concept that I/O must be able to function
    in a low-memory situation.  All major modules related to I/O (except
    networking) have been adjusted to allow allocation out of the system
    reserve memory pool.  These modules now detect a low-memory situation
    but, rather than block, continue to operate and then return resources
    to the memory pool instead of caching them or leaving them wired.

    Code has been added to stall in a low-memory situation prior to a vnode
    being locked.

    Thus situations where a process blocks in a low-memory condition while
    holding a locked vnode have been reduced to near nothing.  Not only will
    I/O continue to operate, but many prior deadlock conditions simply no
    longer exist.
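
    As a rough illustration of the release-instead-of-cache pattern (a
    sketch distilled from the bqrelse()/brelse() hunks further below, not
    additional code in this commit; the helper name is hypothetical and
    the kernel headers already used by vfs_bio.c are assumed):

        static void
        buf_release_lowmem(struct buf *bp)
        {
                if (vm_page_count_severe()) {
                        /*
                         * Too low on memory: free the buffer now so the
                         * wired pages making up its backing store are
                         * returned to the VM page queues instead of being
                         * cached on a clean queue.
                         */
                        brelse(bp);
                } else {
                        /* Normal case: requeue the buffer for reuse. */
                        bqrelse(bp);
                }
        }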

Implement a number of VFS/BIO fixes

	(found by Ian): in the bogus-page replacement code in biodone(), the
        loop was not properly incrementing its loop variables prior to a
        continue statement.  We do not believe this code can be hit anyway,
        but we aren't taking any chances.  We'll turn the whole section into
        a panic (as it already is in brelse()) after the release is rolled.

	In biodone(), the foff calculation was incorrectly clamped to the
        iosize, causing the wrong foff to be calculated for pages in the
        case of an I/O error or of biodone() being called without initiating
        I/O.  The problem always caused a panic before; now it doesn't.  It
        is mainly an issue with NFS.
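
	In outline, the fixed loop in bufdone() (paraphrased from the hunk
        further below; names are as in that code) derives each page's resid
        from the next page boundary and always advances foff by a whole
        page, including on the early-continue path:

        for (i = 0; i < bp->b_npages; i++) {
                int resid;

                /* Bytes of this I/O that fall within the current page. */
                resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
                if (resid > iosize)
                        resid = iosize;

                /* ... per-page completion or error handling ... */

                /* Always step to the next page boundary, never by a clamped resid. */
                foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
                iosize -= resid;
        }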

	Fixed casts for ~PAGE_MASK.  This code worked properly before only
        because the calculations use signed arithmetic.  It is better to
        properly extend PAGE_MASK first before inverting it for the 64-bit
        masking op.
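
	As a standalone illustration (not part of the patch; a 4K PAGE_MASK
        and an int64_t stand-in for off_t are assumed):

        #include <stdio.h>
        #include <stdint.h>

        #define PAGE_MASK 0xfff         /* 4K pages, as an int */

        int
        main(void)
        {
                int64_t foff = 0x123456789aLL;  /* an offset above 4GB */

                /*
                 * ~PAGE_MASK is the int 0xfffff000, i.e. -4096, which
                 * sign-extends to 0xfffffffffffff000 when promoted, so
                 * this happens to produce the right answer.
                 */
                int64_t sloppy = foff & ~PAGE_MASK;

                /* Extend to 64 bits first, then invert: correct by design. */
                int64_t clean = foff & ~(int64_t)PAGE_MASK;

                /* Had PAGE_MASK been unsigned, the high bits would be lost. */
                int64_t broken = foff & ~(uint32_t)PAGE_MASK;

                printf("%jx %jx %jx\n", (uintmax_t)sloppy, (uintmax_t)clean,
                    (uintmax_t)broken);
                return (0);
        }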

	In brelse(), the bogus_page fixup code was improperly throwing
        away the original contents of 'm' when it did the j-loop to
        fix the bogus pages.  The result was that it would potentially
        invalidate parts of the *WRONG* page(!), leading to corruption.
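
	Schematically, the fix (condensed from the brelse() hunk further
        below) uses a scratch variable for the inner loop so that 'm', which
        still refers to page i, is not clobbered:

        for (j = i; j < bp->b_npages; j++) {
                vm_page_t mtmp = bp->b_pages[j];

                if (mtmp == bogus_page) {
                        mtmp = vm_page_lookup(obj, poff + j);
                        if (mtmp == NULL)
                                panic("brelse: page missing");
                        bp->b_pages[j] = mtmp;
                }
        }
        /* 'm' itself may have been a bogus entry; reload it from the array. */
        m = bp->b_pages[i];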

	There may still be cases where a background bitmap write is
        being duplicated, causing potential corruption.  We have identified
        a potentially serious bug related to this, but the fix is still TBD.
        Instead, this patch contains a KASSERT to detect the problem and
        panic the machine rather than continue to corrupt the filesystem.
        The problem does not occur very often; it is very hard to reproduce,
        and it may or may not be the cause of the corruption people have
        reported.
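
	The guard appears to be the KASSERT added in vfs_backgroundwritedone()
        (see the vfs_bio.c hunks below): if BX_BKGRDINPROG is no longer set
        on the original buffer when the background write completes, the
        buffer was released and re-instantiated illegally, and the machine
        panics:

        KASSERT((origbp->b_xflags & BX_BKGRDINPROG),
            ("backgroundwritedone: lost buffer2"));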

Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>)
Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
Author:  Matthew Dillon  2000-11-18 23:06:26 +00:00
Commit:  936524aa02  (parent ef0646f9d8)
Notes:   svn2git 2020-12-20 02:59:44 +00:00
         svn path=/head/; revision=68885

14 changed files with 315 additions and 185 deletions

View File

@ -597,8 +597,14 @@ bwrite(struct buf * bp)
* If this buffer is marked for background writing and we
* do not have to wait for it, make a copy and write the
* copy so as to leave this buffer ready for further use.
*
* This optimization eats a lot of memory. If we have a page
* or buffer shortfall we can't do it.
*/
if ((bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC)) {
if ((bp->b_xflags & BX_BKGRDWRITE) &&
(bp->b_flags & B_ASYNC) &&
!vm_page_count_severe() &&
!buf_dirty_count_severe()) {
if (bp->b_iodone != NULL) {
printf("bp->b_iodone = %p\n", bp->b_iodone);
panic("bwrite: need chained iodone");
@ -682,7 +688,10 @@ vfs_backgroundwritedone(bp)
/*
* Clear the BX_BKGRDINPROG flag in the original buffer
* and awaken it if it is waiting for the write to complete.
* If BX_BKGRDINPROG is not set in the original buffer it must
* have been released and re-instantiated - which is not legal.
*/
KASSERT((origbp->b_xflags & BX_BKGRDINPROG), ("backgroundwritedone: lost buffer2"));
origbp->b_xflags &= ~BX_BKGRDINPROG;
if (origbp->b_xflags & BX_BKGRDWAIT) {
origbp->b_xflags &= ~BX_BKGRDWAIT;
@ -902,6 +911,15 @@ bwillwrite(void)
}
}
/*
* Return true if we have too many dirty buffers.
*/
int
buf_dirty_count_severe(void)
{
return(numdirtybuffers >= hidirtybuffers);
}
/*
* brelse:
*
@ -964,10 +982,14 @@ brelse(struct buf * bp)
*
* We still allow the B_INVAL case to call vfs_vmio_release(), even
* if B_DELWRI is set.
*
* If B_DELWRI is not set we may have to set B_RELBUF if we are low
* on pages to return pages to the VM page queues.
*/
if (bp->b_flags & B_DELWRI)
bp->b_flags &= ~B_RELBUF;
else if (vm_page_count_severe() && !(bp->b_xflags & BX_BKGRDINPROG))
bp->b_flags |= B_RELBUF;
/*
* VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
@ -989,8 +1011,7 @@ brelse(struct buf * bp)
if ((bp->b_flags & B_VMIO)
&& !(bp->b_vp->v_tag == VT_NFS &&
!vn_isdisk(bp->b_vp, NULL) &&
(bp->b_flags & B_DELWRI) &&
(bp->b_xflags & BX_BKGRDINPROG))
(bp->b_flags & B_DELWRI))
) {
int i, j, resid;
@ -1017,32 +1038,40 @@ brelse(struct buf * bp)
*
* See man buf(9) for more information
*/
resid = bp->b_bufsize;
foff = bp->b_offset;
for (i = 0; i < bp->b_npages; i++) {
int had_bogus = 0;
m = bp->b_pages[i];
vm_page_flag_clear(m, PG_ZERO);
if (m == bogus_page) {
/*
* If we hit a bogus page, fixup *all* the bogus pages
* now.
*/
if (m == bogus_page) {
VOP_GETVOBJECT(vp, &obj);
poff = OFF_TO_IDX(bp->b_offset);
had_bogus = 1;
for (j = i; j < bp->b_npages; j++) {
m = bp->b_pages[j];
if (m == bogus_page) {
m = vm_page_lookup(obj, poff + j);
if (!m) {
vm_page_t mtmp;
mtmp = bp->b_pages[j];
if (mtmp == bogus_page) {
mtmp = vm_page_lookup(obj, poff + j);
if (!mtmp) {
panic("brelse: page missing\n");
}
bp->b_pages[j] = m;
bp->b_pages[j] = mtmp;
}
}
if ((bp->b_flags & B_INVAL) == 0) {
pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
}
m = bp->b_pages[i];
}
if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) {
int poffset = foff & PAGE_MASK;
@ -1051,9 +1080,11 @@ brelse(struct buf * bp)
KASSERT(presid >= 0, ("brelse: extra page"));
vm_page_set_invalid(m, poffset, presid);
if (had_bogus)
printf("avoided corruption bug in bogus_page/brelse code\n");
}
resid -= PAGE_SIZE - (foff & PAGE_MASK);
foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
}
if (bp->b_flags & (B_INVAL | B_RELBUF))
@ -1171,7 +1202,7 @@ brelse(struct buf * bp)
/*
* Release a buffer back to the appropriate queue but do not try to free
* it.
* it. The buffer is expected to be used again soon.
*
* bqrelse() is used by bdwrite() to requeue a delayed write, and used by
* biodone() to requeue an async I/O on completion. It is also used when
@ -1203,6 +1234,15 @@ bqrelse(struct buf * bp)
} else if (bp->b_flags & B_DELWRI) {
bp->b_qindex = QUEUE_DIRTY;
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
} else if (vm_page_count_severe()) {
/*
* We are too low on memory, we have to try to free the
* buffer (most importantly: the wired pages making up its
* backing store) *now*.
*/
splx(s);
brelse(bp);
return;
} else {
bp->b_qindex = QUEUE_CLEAN;
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
@ -1264,6 +1304,8 @@ vfs_vmio_release(bp)
vm_page_busy(m);
vm_page_protect(m, VM_PROT_NONE);
vm_page_free(m);
} else if (vm_page_count_severe()) {
vm_page_try_to_cache(m);
}
}
}
@ -1419,15 +1461,15 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
struct buf *nbp;
int defrag = 0;
int nqindex;
int isspecial;
static int flushingbufs;
if (curproc != idleproc &&
(curproc->p_flag & (P_COWINPROGRESS|P_BUFEXHAUST)) == 0)
isspecial = 0;
else
isspecial = 1;
/*
* We can't afford to block since we might be holding a vnode lock,
* which may prevent system daemons from running. We deal with
* low-memory situations by proactively returning memory and running
* async I/O rather then sync I/O.
*/
++getnewbufcalls;
--getnewbufrestarts;
restart:
@ -1445,42 +1487,28 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
* However, there are a number of cases (defragging, reusing, ...)
* where we cannot backup.
*/
nqindex = QUEUE_EMPTYKVA;
nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
if (isspecial == 0 && numfreebuffers < lofreebuffers) {
if (nbp == NULL) {
/*
* This will cause an immediate failure
* If no EMPTYKVA buffers and we are either
* defragging or reusing, locate a CLEAN buffer
* to free or reuse. If bufspace useage is low
* skip this step so we can allocate a new buffer.
*/
nqindex = QUEUE_CLEAN;
nbp = NULL;
} else {
if (defrag || bufspace >= lobufspace) {
nqindex = QUEUE_CLEAN;
nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
}
/*
* Locate a buffer which already has KVA assigned. First
* try EMPTYKVA buffers.
* Nada. If we are allowed to allocate an EMPTY
* buffer, go get one.
*/
nqindex = QUEUE_EMPTYKVA;
nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
if (nbp == NULL) {
/*
* If no EMPTYKVA buffers and we are either
* defragging or reusing, locate a CLEAN buffer
* to free or reuse. If bufspace useage is low
* skip this step so we can allocate a new buffer.
*/
if (defrag || bufspace >= lobufspace) {
nqindex = QUEUE_CLEAN;
nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
}
/*
* Nada. If we are allowed to allocate an EMPTY
* buffer, go get one.
*/
if (nbp == NULL && defrag == 0 &&
(isspecial || bufspace < hibufspace)) {
nqindex = QUEUE_EMPTY;
nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
}
if (nbp == NULL && defrag == 0 && bufspace < hibufspace) {
nqindex = QUEUE_EMPTY;
nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
}
}
@ -1610,26 +1638,16 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
goto restart;
}
/*
* If we are a normal process then deal with bufspace
* hysteresis. A normal process tries to keep bufspace
* between lobufspace and hibufspace. Note: if we encounter
* a buffer with b_kvasize == 0 then it means we started
* our scan on the EMPTY list and should allocate a new
* buffer.
*/
if (isspecial == 0) {
if (bufspace > hibufspace)
flushingbufs = 1;
if (flushingbufs && bp->b_kvasize != 0) {
bp->b_flags |= B_INVAL;
bfreekva(bp);
brelse(bp);
goto restart;
}
if (bufspace < lobufspace)
flushingbufs = 0;
if (bufspace >= hibufspace)
flushingbufs = 1;
if (flushingbufs && bp->b_kvasize != 0) {
bp->b_flags |= B_INVAL;
bfreekva(bp);
brelse(bp);
goto restart;
}
if (bufspace < lobufspace)
flushingbufs = 0;
break;
}
@ -1705,6 +1723,7 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
return(bp);
}
#if 0
/*
* waitfreebuffers:
*
@ -1723,6 +1742,8 @@ waitfreebuffers(int slpflag, int slptimeo)
}
}
#endif
/*
* buf_daemon:
*
@ -2073,8 +2094,12 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
* If this check ever becomes a bottleneck it may be better to
* move it into the else, when gbincore() fails. At the moment
* it isn't a problem.
*
* XXX remove if 0 sections (clean this up after its proven)
*/
#if 0
if (curproc == idleproc || (curproc->p_flag & P_BUFEXHAUST)) {
#endif
if (numfreebuffers == 0) {
if (curproc == idleproc)
return NULL;
@ -2082,9 +2107,11 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
slptimeo);
}
#if 0
} else if (numfreebuffers < lofreebuffers) {
waitfreebuffers(slpflag, slptimeo);
}
#endif
if ((bp = gbincore(vp, blkno))) {
/*
@ -2468,7 +2495,13 @@ allocbuf(struct buf *bp, int size)
pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
if ((m = vm_page_lookup(obj, pi)) == NULL) {
m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL);
/*
* note: must allocate system pages
* since blocking here could intefere
* with paging I/O, no matter which
* process we are.
*/
m = vm_page_alloc(obj, pi, VM_ALLOC_SYSTEM);
if (m == NULL) {
VM_WAIT;
vm_pageout_deficit += desiredpages - bp->b_npages;
@ -2671,7 +2704,7 @@ bufdone(struct buf *bp)
buf_complete(bp);
if (bp->b_flags & B_VMIO) {
int i, resid;
int i;
vm_ooffset_t foff;
vm_page_t m;
vm_object_t obj;
@ -2722,16 +2755,29 @@ bufdone(struct buf *bp)
for (i = 0; i < bp->b_npages; i++) {
int bogusflag = 0;
int resid;
resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
if (resid > iosize)
resid = iosize;
/*
* cleanup bogus pages, restoring the originals
*/
m = bp->b_pages[i];
if (m == bogus_page) {
bogusflag = 1;
m = vm_page_lookup(obj, OFF_TO_IDX(foff));
if (!m) {
panic("biodone: page disappeared!");
#if defined(VFS_BIO_DEBUG)
printf("biodone: page disappeared\n");
#endif
vm_object_pip_subtract(obj, 1);
bp->b_flags &= ~B_CACHE;
foff = (foff + PAGE_SIZE) &
~(off_t)PAGE_MASK;
iosize -= resid;
continue;
}
bp->b_pages[i] = m;
@ -2744,9 +2790,6 @@ bufdone(struct buf *bp)
(unsigned long)foff, m->pindex);
}
#endif
resid = IDX_TO_OFF(m->pindex + 1) - foff;
if (resid > iosize)
resid = iosize;
/*
* In the write case, the valid and clean bits are
@ -2784,7 +2827,7 @@ bufdone(struct buf *bp)
}
vm_page_io_finish(m);
vm_object_pip_subtract(obj, 1);
foff += resid;
foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
iosize -= resid;
}
if (obj)
@ -2862,7 +2905,7 @@ vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
* of the buffer.
*/
soff = off;
eoff = (off + PAGE_SIZE) & ~PAGE_MASK;
eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
if (eoff > bp->b_offset + bp->b_bcount)
eoff = bp->b_offset + bp->b_bcount;
@ -2948,7 +2991,7 @@ vfs_busy_pages(struct buf * bp, int clear_modify)
bp->b_pages[i] = bogus_page;
bogus++;
}
foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
}
if (bogus)
pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
@ -2976,7 +3019,7 @@ vfs_clean_pages(struct buf * bp)
("vfs_clean_pages: no buffer offset"));
for (i = 0; i < bp->b_npages; i++) {
vm_page_t m = bp->b_pages[i];
vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK;
vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
vm_ooffset_t eoff = noff;
if (eoff > bp->b_offset + bp->b_bufsize)
@ -3104,9 +3147,14 @@ vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
tryagain:
/*
* note: must allocate system pages since blocking here
* could intefere with paging I/O, no matter which
* process we are.
*/
p = vm_page_alloc(kernel_object,
((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
VM_ALLOC_NORMAL);
VM_ALLOC_SYSTEM);
if (!p) {
vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
VM_WAIT;

View File

@ -48,6 +48,7 @@
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
@ -665,6 +666,11 @@ cluster_write(bp, filesize, seqcount)
cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
vp->v_clen = 0;
vp->v_cstart = lbn + 1;
} else if (vm_page_count_severe()) {
/*
* We are low on memory, get it going NOW
*/
bawrite(bp);
} else {
/*
* In the middle of a cluster, so just delay the I/O for now.

View File

@ -1438,10 +1438,14 @@ vget(vp, flags, p)
if ((flags & LK_INTERLOCK) == 0)
mtx_enter(&vp->v_interlock, MTX_DEF);
if (vp->v_flag & VXLOCK) {
vp->v_flag |= VXWANT;
mtx_exit(&vp->v_interlock, MTX_DEF);
tsleep((caddr_t)vp, PINOD, "vget", 0);
return (ENOENT);
if (vp->v_vxproc == curproc) {
printf("VXLOCK interlock avoided\n");
} else {
vp->v_flag |= VXWANT;
mtx_exit(&vp->v_interlock, MTX_DEF);
tsleep((caddr_t)vp, PINOD, "vget", 0);
return (ENOENT);
}
}
vp->v_usecount++;
@ -1731,6 +1735,7 @@ vclean(vp, flags, p)
if (vp->v_flag & VXLOCK)
panic("vclean: deadlock");
vp->v_flag |= VXLOCK;
vp->v_vxproc = curproc;
/*
* Even if the count is zero, the VOP_INACTIVE routine may still
* have the object locked while it cleans it out. The VOP_LOCK
@ -1807,6 +1812,7 @@ vclean(vp, flags, p)
vn_pollgone(vp);
vp->v_tag = VT_NON;
vp->v_flag &= ~VXLOCK;
vp->v_vxproc = NULL;
if (vp->v_flag & VXWANT) {
vp->v_flag &= ~VXWANT;
wakeup((caddr_t) vp);

View File

@ -1438,10 +1438,14 @@ vget(vp, flags, p)
if ((flags & LK_INTERLOCK) == 0)
mtx_enter(&vp->v_interlock, MTX_DEF);
if (vp->v_flag & VXLOCK) {
vp->v_flag |= VXWANT;
mtx_exit(&vp->v_interlock, MTX_DEF);
tsleep((caddr_t)vp, PINOD, "vget", 0);
return (ENOENT);
if (vp->v_vxproc == curproc) {
printf("VXLOCK interlock avoided\n");
} else {
vp->v_flag |= VXWANT;
mtx_exit(&vp->v_interlock, MTX_DEF);
tsleep((caddr_t)vp, PINOD, "vget", 0);
return (ENOENT);
}
}
vp->v_usecount++;
@ -1731,6 +1735,7 @@ vclean(vp, flags, p)
if (vp->v_flag & VXLOCK)
panic("vclean: deadlock");
vp->v_flag |= VXLOCK;
vp->v_vxproc = curproc;
/*
* Even if the count is zero, the VOP_INACTIVE routine may still
* have the object locked while it cleans it out. The VOP_LOCK
@ -1807,6 +1812,7 @@ vclean(vp, flags, p)
vn_pollgone(vp);
vp->v_tag = VT_NON;
vp->v_flag &= ~VXLOCK;
vp->v_vxproc = NULL;
if (vp->v_flag & VXWANT) {
vp->v_flag &= ~VXWANT;
wakeup((caddr_t) vp);

View File

@ -642,12 +642,14 @@ debug_vn_lock(vp, flags, p, filename, line)
do {
if ((flags & LK_INTERLOCK) == 0)
mtx_enter(&vp->v_interlock, MTX_DEF);
if (vp->v_flag & VXLOCK) {
if ((vp->v_flag & VXLOCK) && vp->v_vxproc != curproc) {
vp->v_flag |= VXWANT;
mtx_exit(&vp->v_interlock, MTX_DEF);
tsleep((caddr_t)vp, PINOD, "vn_lock", 0);
error = ENOENT;
} else {
if (vp->v_vxproc != NULL)
printf("VXLOCK interlock avoided in vn_lock\n");
#ifdef DEBUG_LOCKS
vp->filename = filename;
vp->line = line;

View File

@ -494,6 +494,7 @@ struct uio;
caddr_t bufhashinit __P((caddr_t));
void bufinit __P((void));
void bwillwrite __P((void));
int buf_dirty_count_severe __P((void));
void bremfree __P((struct buf *));
int bread __P((struct vnode *, daddr_t, int,
struct ucred *, struct buf **));

View File

@ -129,6 +129,7 @@ struct vnode {
short vpi_events; /* what they are looking for */
short vpi_revents; /* what has happened */
} v_pollinfo;
struct proc *v_vxproc; /* proc owning VXLOCK */
#ifdef DEBUG_LOCKS
const char *filename; /* Source file doing locking */
int line; /* Line number doing locking */

View File

@ -45,6 +45,7 @@
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <sys/stat.h>
#include <vm/vm.h>
@ -111,6 +112,8 @@ ffs_update(vp, waitfor)
ino_to_fsbo(fs, ip->i_number)) = ip->i_din;
if (waitfor && !DOINGASYNC(vp)) {
return (bwrite(bp));
} else if (vm_page_count_severe() || buf_dirty_count_severe()) {
return (bwrite(bp));
} else {
if (bp->b_bufsize == fs->fs_bsize)
bp->b_flags |= B_CLUSTEROK;

View File

@ -91,6 +91,8 @@ MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
#define D_PAGEDEP 0
#define D_INODEDEP 1
#define D_NEWBLK 2
@ -802,7 +804,7 @@ pagedep_lookup(ip, lbn, flags, pagedeppp)
goto top;
}
MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
M_WAITOK);
M_SOFTDEP_FLAGS);
bzero(pagedep, sizeof(struct pagedep));
pagedep->pd_list.wk_type = D_PAGEDEP;
pagedep->pd_mnt = mp;
@ -879,7 +881,7 @@ inodedep_lookup(fs, inum, flags, inodedeppp)
}
num_inodedep += 1;
MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
M_INODEDEP, M_WAITOK);
M_INODEDEP, M_SOFTDEP_FLAGS);
inodedep->id_list.wk_type = D_INODEDEP;
inodedep->id_fs = fs;
inodedep->id_ino = inum;
@ -941,7 +943,7 @@ newblk_lookup(fs, newblkno, flags, newblkpp)
if (sema_get(&newblk_in_progress, 0) == 0)
goto top;
MALLOC(newblk, struct newblk *, sizeof(struct newblk),
M_NEWBLK, M_WAITOK);
M_NEWBLK, M_SOFTDEP_FLAGS);
newblk->nb_state = 0;
newblk->nb_fs = fs;
newblk->nb_newblkno = newblkno;
@ -1127,7 +1129,7 @@ bmsafemap_lookup(bp)
return (WK_BMSAFEMAP(wk));
FREE_LOCK(&lk);
MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
M_BMSAFEMAP, M_WAITOK);
M_BMSAFEMAP, M_SOFTDEP_FLAGS);
bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
bmsafemap->sm_list.wk_state = 0;
bmsafemap->sm_buf = bp;
@ -1187,7 +1189,7 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
struct newblk *newblk;
MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
M_ALLOCDIRECT, M_WAITOK);
M_ALLOCDIRECT, M_SOFTDEP_FLAGS);
bzero(adp, sizeof(struct allocdirect));
adp->ad_list.wk_type = D_ALLOCDIRECT;
adp->ad_lbn = lbn;
@ -1339,7 +1341,7 @@ newfreefrag(ip, blkno, size)
if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
panic("newfreefrag: frag size");
MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
M_FREEFRAG, M_WAITOK);
M_FREEFRAG, M_SOFTDEP_FLAGS);
freefrag->ff_list.wk_type = D_FREEFRAG;
freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */
freefrag->ff_inum = ip->i_number;
@ -1408,7 +1410,7 @@ newallocindir(ip, ptrno, newblkno, oldblkno)
struct allocindir *aip;
MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
M_ALLOCINDIR, M_WAITOK);
M_ALLOCINDIR, M_SOFTDEP_FLAGS);
bzero(aip, sizeof(struct allocindir));
aip->ai_list.wk_type = D_ALLOCINDIR;
aip->ai_state = ATTACHED;
@ -1561,7 +1563,7 @@ setup_allocindir_phase2(bp, ip, aip)
if (indirdep)
break;
MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
M_INDIRDEP, M_WAITOK);
M_INDIRDEP, M_SOFTDEP_FLAGS);
newindirdep->ir_list.wk_type = D_INDIRDEP;
newindirdep->ir_state = ATTACHED;
LIST_INIT(&newindirdep->ir_deplisthd);
@ -1623,7 +1625,7 @@ softdep_setup_freeblocks(ip, length)
if (length != 0)
panic("softde_setup_freeblocks: non-zero length");
MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
M_FREEBLKS, M_WAITOK);
M_FREEBLKS, M_SOFTDEP_FLAGS);
bzero(freeblks, sizeof(struct freeblks));
freeblks->fb_list.wk_type = D_FREEBLKS;
freeblks->fb_uid = ip->i_uid;
@ -1870,7 +1872,7 @@ softdep_freefile(pvp, ino, mode)
* This sets up the inode de-allocation dependency.
*/
MALLOC(freefile, struct freefile *, sizeof(struct freefile),
M_FREEFILE, M_WAITOK);
M_FREEFILE, M_SOFTDEP_FLAGS);
freefile->fx_list.wk_type = D_FREEFILE;
freefile->fx_list.wk_state = 0;
freefile->fx_mode = mode;
@ -2186,7 +2188,7 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
fs = dp->i_fs;
lbn = lblkno(fs, diroffset);
offset = blkoff(fs, diroffset);
MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK);
MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_SOFTDEP_FLAGS);
bzero(dap, sizeof(struct diradd));
dap->da_list.wk_type = D_DIRADD;
dap->da_offset = offset;
@ -2198,12 +2200,12 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
} else {
dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
M_WAITOK);
M_SOFTDEP_FLAGS);
mkdir1->md_list.wk_type = D_MKDIR;
mkdir1->md_state = MKDIR_BODY;
mkdir1->md_diradd = dap;
MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
M_WAITOK);
M_SOFTDEP_FLAGS);
mkdir2->md_list.wk_type = D_MKDIR;
mkdir2->md_state = MKDIR_PARENT;
mkdir2->md_diradd = dap;
@ -2438,7 +2440,7 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp)
(void) request_cleanup(FLUSH_REMOVE, 0);
num_dirrem += 1;
MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
M_DIRREM, M_WAITOK);
M_DIRREM, M_SOFTDEP_FLAGS);
bzero(dirrem, sizeof(struct dirrem));
dirrem->dm_list.wk_type = D_DIRREM;
dirrem->dm_state = isrmdir ? RMDIR : 0;
@ -2535,7 +2537,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
*/
if (newinum != WINO) {
MALLOC(dap, struct diradd *, sizeof(struct diradd),
M_DIRADD, M_WAITOK);
M_DIRADD, M_SOFTDEP_FLAGS);
bzero(dap, sizeof(struct diradd));
dap->da_list.wk_type = D_DIRADD;
dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
@ -2841,7 +2843,7 @@ softdep_disk_io_initiation(bp)
* Replace up-to-date version with safe version.
*/
MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
M_INDIRDEP, M_WAITOK);
M_INDIRDEP, M_SOFTDEP_FLAGS);
ACQUIRE_LOCK(&lk);
indirdep->ir_state &= ~ATTACHED;
indirdep->ir_state |= UNDONE;
@ -2942,7 +2944,7 @@ initiate_write_inodeblock(inodedep, bp)
if (inodedep->id_savedino != NULL)
panic("initiate_write_inodeblock: already doing I/O");
MALLOC(inodedep->id_savedino, struct dinode *,
sizeof(struct dinode), M_INODEDEP, M_WAITOK);
sizeof(struct dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
*inodedep->id_savedino = *dp;
bzero((caddr_t)dp, sizeof(struct dinode));
return;

View File

@ -48,6 +48,7 @@
#include <vm/vm_map.h>
#include <vm/vnode_pager.h>
#include <sys/event.h>
#include <sys/vmmeter.h>
#define VN_KNOTE(vp, b) \
KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b))
@ -501,6 +502,9 @@ WRITE(ap)
} else {
bawrite(bp);
}
} else if (vm_page_count_severe() || buf_dirty_count_severe()) {
bp->b_flags |= B_CLUSTEROK;
bawrite(bp);
} else {
bp->b_flags |= B_CLUSTEROK;
bdwrite(bp);

View File

@ -80,6 +80,7 @@
#include <sys/sysctl.h>
#include <sys/blist.h>
#include <sys/lock.h>
#include <sys/vmmeter.h>
#ifndef MAX_PAGEOUT_CLUSTER
#define MAX_PAGEOUT_CLUSTER 16
@ -1619,10 +1620,11 @@ swp_pager_async_iodone(bp)
* status, then finish the I/O ( which decrements the
* busy count and possibly wakes waiter's up ).
*/
vm_page_protect(m, VM_PROT_READ);
pmap_clear_modify(m);
vm_page_undirty(m);
vm_page_io_finish(m);
if (!vm_page_count_severe() || !vm_page_try_to_cache(m))
vm_page_protect(m, VM_PROT_READ);
}
}

View File

@ -860,7 +860,7 @@ vm_page_alloc(object, pindex, page_req)
* Don't wakeup too often - wakeup the pageout daemon when
* we would be nearly out of memory.
*/
if (vm_paging_needed() || cnt.v_free_count < cnt.v_pageout_free_min)
if (vm_paging_needed())
pagedaemon_wakeup();
splx(s);
@ -882,10 +882,10 @@ vm_wait()
s = splvm();
if (curproc == pageproc) {
vm_pageout_pages_needed = 1;
tsleep(&vm_pageout_pages_needed, PSWP, "vmwait", 0);
tsleep(&vm_pageout_pages_needed, PSWP, "VMWait", 0);
} else {
if (!vm_pages_needed) {
vm_pages_needed++;
vm_pages_needed = 1;
wakeup(&vm_pages_needed);
}
tsleep(&cnt.v_free_count, PVM, "vmwait", 0);
@ -1030,7 +1030,8 @@ vm_page_free_wakeup()
* if pageout daemon needs pages, then tell it that there are
* some free.
*/
if (vm_pageout_pages_needed) {
if (vm_pageout_pages_needed &&
cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
wakeup(&vm_pageout_pages_needed);
vm_pageout_pages_needed = 0;
}
@ -1039,9 +1040,9 @@ vm_page_free_wakeup()
* high water mark. And wakeup scheduler process if we have
* lots of memory. this process will swapin processes.
*/
if (vm_pages_needed && vm_page_count_min()) {
wakeup(&cnt.v_free_count);
if (vm_pages_needed && !vm_page_count_min()) {
vm_pages_needed = 0;
wakeup(&cnt.v_free_count);
}
}
@ -1240,6 +1241,9 @@ vm_page_wire(m)
* processes. This optimization causes one-time-use metadata to be
* reused more quickly.
*
* BUT, if we are in a low-memory situation we have no choice but to
* put clean pages on the cache queue.
*
* A number of routines use vm_page_unwire() to guarantee that the page
* will go into either the inactive or active queues, and will NEVER
* be placed in the cache - for example, just after dirtying a page.
@ -1325,6 +1329,25 @@ vm_page_deactivate(vm_page_t m)
_vm_page_deactivate(m, 0);
}
/*
* vm_page_try_to_cache:
*
* Returns 0 on failure, 1 on success
*/
int
vm_page_try_to_cache(vm_page_t m)
{
if (m->dirty || m->hold_count || m->busy || m->wire_count ||
(m->flags & (PG_BUSY|PG_UNMANAGED))) {
return(0);
}
vm_page_test_dirty(m);
if (m->dirty)
return(0);
vm_page_cache(m);
return(1);
}
/*
* vm_page_cache
*

View File

@ -251,6 +251,7 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
#define PG_SWAPINPROG 0x0200 /* swap I/O in progress on page */
#define PG_NOSYNC 0x0400 /* do not collect for syncer */
#define PG_UNMANAGED 0x0800 /* No PV management for page */
#define PG_MARKER 0x1000 /* special queue marker page */
/*
* Misc constants.
@ -403,6 +404,7 @@ void vm_page_activate __P((vm_page_t));
vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int));
vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int));
void vm_page_cache __P((register vm_page_t));
int vm_page_try_to_cache __P((vm_page_t));
void vm_page_dontneed __P((register vm_page_t));
static __inline void vm_page_copy __P((vm_page_t, vm_page_t));
static __inline void vm_page_free __P((vm_page_t));

View File

@ -146,6 +146,7 @@ static int defer_swap_pageouts=0;
static int disable_swap_pageouts=0;
static int max_page_launder=100;
static int vm_pageout_actcmp=0;
#if defined(NO_SWAPPING)
static int vm_swap_enabled=0;
static int vm_swap_idle_enabled=0;
@ -189,6 +190,8 @@ SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
SYSCTL_INT(_vm, OID_AUTO, max_page_launder,
CTLFLAG_RW, &max_page_launder, 0, "Maximum number of pages to clean per pass");
SYSCTL_INT(_vm, OID_AUTO, vm_pageout_actcmp,
CTLFLAG_RD, &vm_pageout_actcmp, 0, "pagedaemon agressiveness");
#define VM_PAGEOUT_PAGE_COUNT 16
@ -372,6 +375,7 @@ vm_pageout_flush(mc, count, flags)
*/
for (i = 0; i < count; i++) {
KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL && mc[i]->dirty == VM_PAGE_BITS_ALL, ("vm_pageout_flush page %p index %d/%d: partially dirty page", mc[i], i, count));
vm_page_io_start(mc[i]);
vm_page_protect(mc[i], VM_PROT_READ);
}
@ -424,6 +428,8 @@ vm_pageout_flush(mc, count, flags)
if (pageout_status[i] != VM_PAGER_PEND) {
vm_object_pip_wakeup(object);
vm_page_io_finish(mt);
if (!vm_page_count_severe() || !vm_page_try_to_cache(mt))
vm_page_protect(mt, VM_PROT_READ);
}
}
return numpagedout;
@ -621,10 +627,10 @@ static int
vm_pageout_scan()
{
vm_page_t m, next;
struct vm_page marker;
int page_shortage, maxscan, pcount;
int addl_page_shortage, addl_page_shortage_init;
int maxlaunder;
int launder_loop = 0;
struct proc *p, *bigproc;
vm_offset_t size, bigsize;
vm_object_t object;
@ -646,33 +652,37 @@ vm_pageout_scan()
/*
* Calculate the number of pages we want to either free or move
* to the cache.
* to the cache. Be more agressive if we aren't making our target.
*/
page_shortage = vm_paging_target() + addl_page_shortage_init;
page_shortage = vm_paging_target() +
addl_page_shortage_init + vm_pageout_actcmp;
/*
* Figure out what to do with dirty pages when they are encountered.
* Assume that 1/3 of the pages on the inactive list are clean. If
* we think we can reach our target, disable laundering (do not
* clean any dirty pages). If we miss the target we will loop back
* up and do a laundering run.
* Figure out how agressively we should flush dirty pages.
*/
{
int factor = vm_pageout_actcmp;
if (cnt.v_inactive_count / 3 > page_shortage) {
maxlaunder = 0;
launder_loop = 0;
} else {
maxlaunder =
(cnt.v_inactive_target > max_page_launder) ?
max_page_launder : cnt.v_inactive_target;
launder_loop = 1;
maxlaunder = cnt.v_inactive_target / 3 + factor;
if (maxlaunder > max_page_launder + factor)
maxlaunder = max_page_launder + factor;
}
/*
* Initialize our marker
*/
bzero(&marker, sizeof(marker));
marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
marker.queue = PQ_INACTIVE;
marker.wire_count = 1;
/*
* Start scanning the inactive queue for pages we can move to the
* cache or free. The scan will stop when the target is reached or
* we have scanned the entire inactive queue.
* we have scanned the entire inactive queue. Note that m->act_count
* is not used to form decisions for the inactive queue, only for the
* active queue.
*/
rescan0:
@ -690,6 +700,12 @@ vm_pageout_scan()
next = TAILQ_NEXT(m, pageq);
/*
* skip marker pages
*/
if (m->flags & PG_MARKER)
continue;
if (m->hold_count) {
s = splvm();
TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
@ -766,7 +782,8 @@ vm_pageout_scan()
--page_shortage;
/*
* Clean pages can be placed onto the cache queue.
* Clean pages can be placed onto the cache queue. This
* effectively frees them.
*/
} else if (m->dirty == 0) {
vm_page_cache(m);
@ -777,7 +794,6 @@ vm_pageout_scan()
* only a limited number of pages per pagedaemon pass.
*/
} else if (maxlaunder > 0) {
int written;
int swap_pageouts_ok;
struct vnode *vp = NULL;
struct mount *mp;
@ -805,29 +821,6 @@ vm_pageout_scan()
continue;
}
/*
* For now we protect against potential memory
* deadlocks by requiring significant memory to be
* free if the object is not OBJT_DEFAULT or OBJT_SWAP.
* We do not 'trust' any other object type to operate
* with low memory, not even OBJT_DEVICE. The VM
* allocator will special case allocations done by
* the pageout daemon so the check below actually
* does have some hysteresis in it. It isn't the best
* solution, though.
*/
if (object->type != OBJT_DEFAULT &&
object->type != OBJT_SWAP &&
cnt.v_free_count < cnt.v_free_reserved) {
s = splvm();
TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m,
pageq);
splx(s);
continue;
}
/*
* Presumably we have sufficient free memory to do
* the more sophisticated checks and locking required
@ -879,10 +872,15 @@ vm_pageout_scan()
}
/*
* The page might have been moved to another queue
* during potential blocking in vget() above.
* The page might have been moved to another
* queue during potential blocking in vget()
* above. The page might have been freed and
* reused for another vnode. The object might
* have been reused for another vnode.
*/
if (m->queue != PQ_INACTIVE) {
if (m->queue != PQ_INACTIVE ||
m->object != object ||
object->handle != vp) {
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
vput(vp);
@ -891,9 +889,10 @@ vm_pageout_scan()
}
/*
* The page may have been busied during the blocking in
* vput(); We don't move the page back onto the end of
* the queue so that statistics are more correct if we don't.
* The page may have been busied during the
* blocking in vput(); We don't move the
* page back onto the end of the queue so that
* statistics are more correct if we don't.
*/
if (m->busy || (m->flags & PG_BUSY)) {
vput(vp);
@ -921,42 +920,57 @@ vm_pageout_scan()
* If a page is dirty, then it is either being washed
* (but not yet cleaned) or it is still in the
* laundry. If it is still in the laundry, then we
* start the cleaning operation.
* start the cleaning operation. maxlaunder nominally
* counts I/O cost (seeks) rather then bytes.
*
* This operation may cluster, invalidating the 'next'
* pointer. To prevent an inordinate number of
* restarts we use our marker to remember our place.
*/
written = vm_pageout_clean(m);
s = splvm();
TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq);
splx(s);
if (vm_pageout_clean(m) != 0)
--maxlaunder;
s = splvm();
next = TAILQ_NEXT(&marker, pageq);
TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
splx(s);
if (vp) {
vput(vp);
vn_finished_write(mp);
}
maxlaunder -= written;
}
}
/*
* If we still have a page shortage and we didn't launder anything,
* run the inactive scan again and launder something this time.
* If we were not able to meet our target, increase actcmp
*/
if (launder_loop == 0 && page_shortage > 0) {
launder_loop = 1;
maxlaunder =
(cnt.v_inactive_target > max_page_launder) ?
max_page_launder : cnt.v_inactive_target;
goto rescan0;
if (vm_page_count_min()) {
if (vm_pageout_actcmp < ACT_MAX / 2)
vm_pageout_actcmp += ACT_ADVANCE;
} else {
if (vm_pageout_actcmp < ACT_DECLINE)
vm_pageout_actcmp = 0;
else
vm_pageout_actcmp -= ACT_DECLINE;
}
/*
* Compute the page shortage from the point of view of having to
* move pages from the active queue to the inactive queue.
* Compute the number of pages we want to try to move from the
* active queue to the inactive queue.
*/
page_shortage = (cnt.v_inactive_target + cnt.v_cache_min) -
(cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
page_shortage = vm_paging_target() +
cnt.v_inactive_target - cnt.v_inactive_count;
page_shortage += addl_page_shortage;
page_shortage += vm_pageout_actcmp;
/*
* Scan the active queue for things we can deactivate
* Scan the active queue for things we can deactivate. We nominally
* track the per-page activity counter and use it to locate
* deactivation candidates.
*/
pcount = cnt.v_active_count;
@ -1026,7 +1040,8 @@ vm_pageout_scan()
} else {
m->act_count -= min(m->act_count, ACT_DECLINE);
if (vm_pageout_algorithm_lru ||
(m->object->ref_count == 0) || (m->act_count == 0)) {
(m->object->ref_count == 0) ||
(m->act_count <= vm_pageout_actcmp)) {
page_shortage--;
if (m->object->ref_count == 0) {
vm_page_protect(m, VM_PROT_NONE);
@ -1111,7 +1126,7 @@ vm_pageout_scan()
* make sure that we have swap space -- if we are low on memory and
* swap -- then kill the biggest process.
*/
if ((vm_swap_size == 0 || swap_pager_full) && vm_page_count_min()) {
if ((vm_swap_size < 64 || swap_pager_full) && vm_page_count_min()) {
bigproc = NULL;
bigsize = 0;
for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
@ -1349,20 +1364,31 @@ vm_pageout()
int error;
int s = splvm();
if (vm_pages_needed && vm_page_count_min()) {
/*
* If we have enough free memory, wakeup waiters. Do
* not clear vm_pages_needed until we reach our target,
* otherwise we may be woken up over and over again and
* waste a lot of cpu.
*/
if (vm_pages_needed && !vm_page_count_min()) {
if (vm_paging_needed() <= 0)
vm_pages_needed = 0;
wakeup(&cnt.v_free_count);
}
if (vm_pages_needed) {
/*
* Still not done, sleep a bit and go again
*/
vm_pages_needed = 0;
tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
} else {
/*
* Good enough, sleep & handle stats
*/
vm_pages_needed = 0;
error = tsleep(&vm_pages_needed,
PVM, "psleep", vm_pageout_stats_interval * hz);
if (error && !vm_pages_needed) {
if (vm_pageout_actcmp > 0)
--vm_pageout_actcmp;
splx(s);
vm_pageout_page_stats();
continue;
@ -1371,11 +1397,9 @@ vm_pageout()
if (vm_pages_needed)
cnt.v_pdwakeups++;
vm_pages_needed = 0;
splx(s);
vm_pageout_scan();
vm_pageout_deficit = 0;
wakeup(&cnt.v_free_count);
}
}