diff --git a/sys/fs/specfs/spec_vnops.c b/sys/fs/specfs/spec_vnops.c index 582bece3ba78..f3d7f11f0644 100644 --- a/sys/fs/specfs/spec_vnops.c +++ b/sys/fs/specfs/spec_vnops.c @@ -684,6 +684,8 @@ spec_getpages(ap) bp->b_bcount = size; bp->b_bufsize = size; bp->b_resid = 0; + bp->b_runningbufspace = bp->b_bufsize; + runningbufspace += bp->b_runningbufspace; cnt.v_vnodein++; cnt.v_vnodepgsin += pcount; diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 21d447d33816..99498138a3ff 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -85,22 +85,24 @@ static void buf_daemon __P((void)); * but the code is intricate enough already. */ vm_page_t bogus_page; -int runningbufspace; int vmiodirenable = FALSE; +int runningbufspace; static vm_offset_t bogus_offset; -static int bufspace, maxbufspace, +static int bufspace, maxbufspace, bufmallocspace, maxbufmallocspace, lobufspace, hibufspace; static int bufreusecnt, bufdefragcnt, buffreekvacnt; -static int maxbdrun; static int needsbuffer; -static int numdirtybuffers, hidirtybuffers; +static int lorunningspace, hirunningspace, runningbufreq; +static int numdirtybuffers, lodirtybuffers, hidirtybuffers; static int numfreebuffers, lofreebuffers, hifreebuffers; static int getnewbufcalls; static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, + &lodirtybuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, @@ -111,6 +113,10 @@ SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, + &lorunningspace, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, + &hirunningspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, @@ -119,8 +125,6 @@ SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, ""); -SYSCTL_INT(_vfs, OID_AUTO, maxbdrun, CTLFLAG_RW, - &maxbdrun, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, @@ -170,9 +174,9 @@ bufhash(struct vnode *vnp, daddr_t bn) */ static __inline void -numdirtywakeup(void) +numdirtywakeup(int level) { - if (numdirtybuffers < hidirtybuffers) { + if (numdirtybuffers <= level) { if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) { needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH; wakeup(&needsbuffer); @@ -203,6 +207,23 @@ bufspacewakeup(void) } } +/* + * runningbufwakeup() - in-progress I/O accounting. + * + */ +static __inline void +runningbufwakeup(struct buf *bp) +{ + if (bp->b_runningbufspace) { + runningbufspace -= bp->b_runningbufspace; + bp->b_runningbufspace = 0; + if (runningbufreq && runningbufspace <= lorunningspace) { + runningbufreq = 0; + wakeup(&runningbufreq); + } + } +} + /* * bufcountwakeup: * @@ -224,6 +245,31 @@ bufcountwakeup(void) } } +/* + * waitrunningbufspace() + * + * runningbufspace is a measure of the amount of I/O currently + * running. This routine is used in async-write situations to + * prevent creating huge backups of pending writes to a device. + * Only asynchronous writes are governed by this function. 
+ * + * Reads will adjust runningbufspace, but will not block based on it. + * The read load has a side effect of reducing the allowed write load. + * + * This does NOT turn an async write into a sync write. It waits + * for earlier writes to complete and generally returns before the + * caller's write has reached the device. + */ +static __inline void +waitrunningbufspace(void) +{ + while (runningbufspace > hirunningspace) { + ++runningbufreq; + tsleep(&runningbufreq, PVM, "wdrain", 0); + } +} + + /* * vfs_buf_test_cache: * @@ -248,7 +294,7 @@ static __inline__ void bd_wakeup(int dirtybuflevel) { - if (numdirtybuffers >= dirtybuflevel && bd_request == 0) { + if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) { bd_request = 1; wakeup(&bd_request); } @@ -330,6 +376,9 @@ bufinit(void) hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10); lobufspace = hibufspace - MAXBSIZE; + lorunningspace = 512 * 1024; + hirunningspace = 1024 * 1024; + /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer @@ -354,6 +403,7 @@ bufinit(void) while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } + lodirtybuffers = hidirtybuffers / 2; /* * Try to keep the number of free buffers in the specified range, @@ -370,8 +420,6 @@ bufinit(void) * based on the number of bytes of I/O in-transit that were initiated * from buf_daemon. */ - if ((maxbdrun = nswbuf / 4) < 4) - maxbdrun = 4; bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); bogus_page = vm_page_alloc(kernel_object, @@ -419,7 +467,6 @@ bremfree(struct buf * bp) KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp)); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_qindex = QUEUE_NONE; - runningbufspace += bp->b_bufsize; } else { if (BUF_REFCNT(bp) <= 1) panic("bremfree: removing a buffer not on a queue"); @@ -659,6 +706,13 @@ bwrite(struct buf * bp) int rtval = bufwait(bp); brelse(bp); return (rtval); + } else { + /* + * don't allow the async write to saturate the I/O + * system. There is no chance of deadlock here because + * we are blocking on I/O that is already in-progress. + */ + waitrunningbufspace(); } return (0); @@ -774,11 +828,11 @@ bdwrite(struct buf * bp) bqrelse(bp); /* - * Wakeup the buffer flushing daemon if we have saturated the - * buffer cache. + * Wakeup the buffer flushing daemon if we have a lot of dirty + * buffers (midpoint between our recovery point and our stall + * point). */ - - bd_wakeup(hidirtybuffers); + bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, @@ -817,7 +871,7 @@ bdirty(bp) bp->b_flags |= B_DONE | B_DELWRI; reassignbuf(bp, bp->b_vp); ++numdirtybuffers; - bd_wakeup(hidirtybuffers); + bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); } } @@ -843,7 +897,7 @@ bundirty(bp) bp->b_flags &= ~B_DELWRI; reassignbuf(bp, bp->b_vp); --numdirtybuffers; - numdirtywakeup(); + numdirtywakeup(lodirtybuffers); } /* * Since it is now being written, we can clear its deferred write flag. 
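A minimal userland sketch of the hi/lo watermark throttle that waitrunningbufspace() and runningbufwakeup() implement above, with a pthread mutex/condvar standing in for tsleep()/wakeup(). The 512K/1M limits mirror the lorunningspace/hirunningspace defaults set in bufinit(); the io_start()/io_done()/io_throttle() names and the locking scaffold are illustrative only, not part of the patch. As in the kernel, reads would charge the counter too but never call the throttle, which is how read load indirectly reduces the allowed write load.

/* Userland model of the runningbufspace hi/lo watermark throttle. */
#include <pthread.h>

static pthread_mutex_t iolock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  iodrain = PTHREAD_COND_INITIALIZER;

static long runningbufspace;                 /* bytes of I/O in flight */
static long lorunningspace = 512 * 1024;     /* release blocked writers below this */
static long hirunningspace = 1024 * 1024;    /* block async writers above this */

/* Charge an I/O before it is started (the vfs_busy_pages() side). */
static void
io_start(long bytes)
{
        pthread_mutex_lock(&iolock);
        runningbufspace += bytes;
        pthread_mutex_unlock(&iolock);
}

/* Credit a completed I/O and wake throttled writers (the runningbufwakeup() side). */
static void
io_done(long bytes)
{
        pthread_mutex_lock(&iolock);
        runningbufspace -= bytes;
        if (runningbufspace <= lorunningspace)
                pthread_cond_broadcast(&iodrain);
        pthread_mutex_unlock(&iolock);
}

/* Async writers call this before queueing more work (the waitrunningbufspace() side). */
static void
io_throttle(void)
{
        pthread_mutex_lock(&iolock);
        while (runningbufspace > hirunningspace)
                pthread_cond_wait(&iodrain, &iolock);
        pthread_mutex_unlock(&iolock);
}

int
main(void)
{
        io_start(768 * 1024);   /* a write is queued */
        io_throttle();          /* 768K is below hirunningspace, so no stall yet */
        io_done(768 * 1024);    /* completion credits the space back */
        return (0);
}

The gap between the two watermarks is deliberate hysteresis: completions below lorunningspace release a batch of blocked writers at once rather than waking one writer per finished buffer.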
@@ -896,14 +950,12 @@ bowrite(struct buf * bp) void bwillwrite(void) { - int slop = hidirtybuffers / 10; - - if (numdirtybuffers > hidirtybuffers + slop) { + if (numdirtybuffers >= hidirtybuffers) { int s; s = splbio(); - while (numdirtybuffers > hidirtybuffers) { - bd_wakeup(hidirtybuffers); + while (numdirtybuffers >= hidirtybuffers) { + bd_wakeup(1); needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH; tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0); } @@ -963,7 +1015,7 @@ brelse(struct buf * bp) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) { --numdirtybuffers; - numdirtywakeup(); + numdirtywakeup(lodirtybuffers); } bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { @@ -1169,11 +1221,9 @@ brelse(struct buf * bp) if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) { bp->b_flags &= ~B_DELWRI; --numdirtybuffers; - numdirtywakeup(); + numdirtywakeup(lodirtybuffers); } - runningbufspace -= bp->b_bufsize; - /* * Fixup numfreebuffers count. The bp is on an appropriate queue * unless locked. We then bump numfreebuffers if it is not B_DELWRI. @@ -1248,8 +1298,6 @@ bqrelse(struct buf * bp) TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); } - runningbufspace -= bp->b_bufsize; - if ((bp->b_flags & B_LOCKED) == 0 && ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) { bufcountwakeup(); @@ -1309,13 +1357,13 @@ vfs_vmio_release(bp) } } } - runningbufspace -= bp->b_bufsize; splx(s); pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); - if (bp->b_bufsize) + if (bp->b_bufsize) { bufspacewakeup(); + bp->b_bufsize = 0; + } bp->b_npages = 0; - bp->b_bufsize = 0; bp->b_flags &= ~B_VMIO; if (bp->b_vp) brelvp(bp); @@ -1723,27 +1771,6 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize) return(bp); } -#if 0 -/* - * waitfreebuffers: - * - * Wait for sufficient free buffers. Only called from normal processes. - */ - -static void -waitfreebuffers(int slpflag, int slptimeo) -{ - while (numfreebuffers < hifreebuffers) { - if (numfreebuffers >= hifreebuffers) - break; - needsbuffer |= VFS_BIO_NEED_FREE; - if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo)) - break; - } -} - -#endif - /* * buf_daemon: * @@ -1753,9 +1780,6 @@ waitfreebuffers(int slpflag, int slptimeo) */ static struct proc *bufdaemonproc; -static int bd_interval; -static int bd_flushto; -static int bd_flushinc; static struct kproc_desc buf_kp = { "bufdaemon", @@ -1783,65 +1807,50 @@ buf_daemon() curproc->p_flag |= P_BUFEXHAUST; s = splbio(); - bd_interval = 5 * hz; /* dynamically adjusted */ - bd_flushto = hidirtybuffers; /* dynamically adjusted */ - bd_flushinc = 1; - for (;;) { kthread_suspend_check(bufdaemonproc); bd_request = 0; /* - * Do the flush. Limit the number of buffers we flush in one - * go. The failure condition occurs when processes are writing - * buffers faster then we can dispose of them. In this case - * we may be flushing so often that the previous set of flushes - * have not had time to complete, causing us to run out of - * physical buffers and block. + * Do the flush. Limit the amount of in-transit I/O we + * allow to build up, otherwise we would completely saturate + * the I/O system. Wakeup any waiting processes before we + * normally would so they can run in parallel with our drain. 
*/ - { - int runcount = maxbdrun; - - while (numdirtybuffers > bd_flushto && runcount) { - --runcount; - if (flushbufqueues() == 0) - break; - } - } - - if (bd_request || - tsleep(&bd_request, PVM, "psleep", bd_interval) == 0) { - /* - * Another request is pending or we were woken up - * without timing out. Flush more. - */ - --bd_flushto; - if (bd_flushto >= numdirtybuffers - 5) { - bd_flushto = numdirtybuffers - 10; - bd_flushinc = 1; - } - if (bd_flushto < 2) - bd_flushto = 2; - } else { - /* - * We slept and timed out, we can slow down. - */ - bd_flushto += bd_flushinc; - if (bd_flushto > hidirtybuffers) - bd_flushto = hidirtybuffers; - ++bd_flushinc; - if (bd_flushinc > hidirtybuffers / 20 + 1) - bd_flushinc = hidirtybuffers / 20 + 1; + while (numdirtybuffers > lodirtybuffers) { + if (flushbufqueues() == 0) + break; + waitrunningbufspace(); + numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2); } /* - * Set the interval on a linear scale based on hidirtybuffers - * with a maximum frequency of 1/10 second. + * Only clear bd_request if we have reached our low water + * mark. The buf_daemon normally waits 5 seconds and + * then incrementally flushes any dirty buffers that have + * built up, within reason. + * + * If we were unable to hit our low water mark and couldn't + * find any flushable buffers, we sleep half a second. + * Otherwise we loop immediately. */ - bd_interval = bd_flushto * 5 * hz / hidirtybuffers; - if (bd_interval < hz / 10) - bd_interval = hz / 10; + if (numdirtybuffers <= lodirtybuffers) { + /* + * We reached our low water mark, reset the + * request and sleep until we are needed again. + * The sleep is just so the suspend code works. + */ + bd_request = 0; + tsleep(&bd_request, PVM, "psleep", hz); + } else { + /* + * We couldn't find any flushable dirty buffers but + * still have too many dirty buffers, we + * have to sleep and try again. 
(rare) + */ + tsleep(&bd_request, PVM, "qsleep", hz / 2); + } } } @@ -2097,21 +2106,11 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) * * XXX remove if 0 sections (clean this up after its proven) */ -#if 0 - if (curproc == idleproc || (curproc->p_flag & P_BUFEXHAUST)) { -#endif - if (numfreebuffers == 0) { - if (curproc == idleproc) - return NULL; - needsbuffer |= VFS_BIO_NEED_ANY; - tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf", - slptimeo); - } -#if 0 - } else if (numfreebuffers < lofreebuffers) { - waitfreebuffers(slpflag, slptimeo); + if (numfreebuffers == 0) { + if (curproc == idleproc) + return NULL; + needsbuffer |= VFS_BIO_NEED_ANY; } -#endif if ((bp = gbincore(vp, blkno))) { /* @@ -2357,12 +2356,12 @@ allocbuf(struct buf *bp, int size) bp->b_bcount = size; } else { free(bp->b_data, M_BIOBUF); - bufmallocspace -= bp->b_bufsize; - runningbufspace -= bp->b_bufsize; - if (bp->b_bufsize) + if (bp->b_bufsize) { + bufmallocspace -= bp->b_bufsize; bufspacewakeup(); + bp->b_bufsize = 0; + } bp->b_data = bp->b_kvabase; - bp->b_bufsize = 0; bp->b_bcount = 0; bp->b_flags &= ~B_MALLOC; } @@ -2389,7 +2388,6 @@ allocbuf(struct buf *bp, int size) bp->b_bcount = size; bp->b_flags |= B_MALLOC; bufmallocspace += mbsize; - runningbufspace += bp->b_bufsize; return 1; } #endif @@ -2404,11 +2402,11 @@ allocbuf(struct buf *bp, int size) origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; - bufmallocspace -= bp->b_bufsize; - runningbufspace -= bp->b_bufsize; - if (bp->b_bufsize) + if (bp->b_bufsize) { + bufmallocspace -= bp->b_bufsize; bufspacewakeup(); - bp->b_bufsize = 0; + bp->b_bufsize = 0; + } bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } @@ -2601,7 +2599,6 @@ allocbuf(struct buf *bp, int size) (vm_offset_t)(bp->b_offset & PAGE_MASK)); } } - runningbufspace += (newbsize - bp->b_bufsize); if (newbsize < bp->b_bufsize) bufspacewakeup(); bp->b_bufsize = newbsize; /* actual buffer allocation */ @@ -2681,6 +2678,7 @@ bufdone(struct buf *bp) KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); bp->b_flags |= B_DONE; + runningbufwakeup(bp); if (bp->b_iocmd == BIO_DELETE) { brelse(bp); @@ -2768,18 +2766,8 @@ bufdone(struct buf *bp) if (m == bogus_page) { bogusflag = 1; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); - if (!m) { + if (m == NULL) panic("biodone: page disappeared!"); -#if defined(VFS_BIO_DEBUG) - printf("biodone: page disappeared\n"); -#endif - vm_object_pip_subtract(obj, 1); - bp->b_flags &= ~B_CACHE; - foff = (foff + PAGE_SIZE) & - ~(off_t)PAGE_MASK; - iosize -= resid; - continue; - } bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } @@ -2833,6 +2821,7 @@ bufdone(struct buf *bp) if (obj) vm_object_pip_wakeupn(obj, 0); } + /* * For asynchronous completions, release the buffer now. 
The brelse * will do a wakeup there if necessary - so no need to do a wakeup @@ -2860,6 +2849,7 @@ vfs_unbusy_pages(struct buf * bp) { int i; + runningbufwakeup(bp); if (bp->b_flags & B_VMIO) { struct vnode *vp = bp->b_vp; vm_object_t obj; @@ -2939,6 +2929,9 @@ vfs_busy_pages(struct buf * bp, int clear_modify) { int i, bogus; + bp->b_runningbufspace = bp->b_bufsize; + runningbufspace += bp->b_runningbufspace; + if (bp->b_flags & B_VMIO) { struct vnode *vp = bp->b_vp; vm_object_t obj; diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index 29a1879a3e14..088dc405589b 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -247,8 +247,12 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) printf("S(%ld,%ld,%d) ", (long)bp->b_lblkno, bp->b_bcount, seqcount); #endif - if ((bp->b_flags & B_CLUSTER) == 0) + if ((bp->b_flags & B_CLUSTER) == 0) { vfs_busy_pages(bp, 0); + } else { + bp->b_runningbufspace = bp->b_bufsize; + runningbufspace += bp->b_runningbufspace; + } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL) @@ -283,8 +287,12 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) } #endif - if ((rbp->b_flags & B_CLUSTER) == 0) + if ((rbp->b_flags & B_CLUSTER) == 0) { vfs_busy_pages(rbp, 0); + } else { + rbp->b_runningbufspace = rbp->b_bufsize; + runningbufspace += rbp->b_runningbufspace; + } rbp->b_flags &= ~B_INVAL; rbp->b_ioflags &= ~BIO_ERROR; if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 582bece3ba78..f3d7f11f0644 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -684,6 +684,8 @@ spec_getpages(ap) bp->b_bcount = size; bp->b_bufsize = size; bp->b_resid = 0; + bp->b_runningbufspace = bp->b_bufsize; + runningbufspace += bp->b_runningbufspace; cnt.v_vnodein++; cnt.v_vnodepgsin += pcount; diff --git a/sys/sys/buf.h b/sys/sys/buf.h index a10083fcd96c..223c03621bbb 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -110,6 +110,7 @@ struct buf { unsigned char b_xflags; /* extra flags */ struct lock b_lock; /* Buffer lock */ long b_bufsize; /* Allocated buffer size. */ + long b_runningbufspace; /* when I/O is running, pipelining */ caddr_t b_kvabase; /* base kva for buffer */ int b_kvasize; /* size of kva for buffer */ daddr_t b_lblkno; /* Logical block number. */ @@ -480,6 +481,7 @@ buf_countdeps(struct buf *bp, int i) #ifdef _KERNEL extern int nbuf; /* The number of buffer headers */ +extern int runningbufspace; extern int buf_maxio; /* nominal maximum I/O for buffer */ extern struct buf *buf; /* The buffer headers. */ extern char *buffers; /* The buffer contents. */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 75462f62f526..2ab6f3f7f523 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -213,6 +213,7 @@ struct vattr { #define IO_NDELAY 0x10 /* FNDELAY flag set in file table */ #define IO_VMIO 0x20 /* data already in VMIO space */ #define IO_INVAL 0x40 /* invalidate after I/O */ +#define IO_ASYNC 0x80 /* bawrite rather then bdwrite */ /* * Modes. Some values same as Ixxx entries from inode.h for now. 
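The buf_daemon rewrite earlier in this patch applies the same two-watermark idea to dirty buffer counts: bdwrite()/bdirty() kick the daemon at the midpoint of lodirtybuffers and hidirtybuffers, bwillwrite() stalls heavy writers once numdirtybuffers reaches hidirtybuffers, and the daemon drains down to lodirtybuffers before sleeping again. The self-contained model below compresses that control loop into a single pass; the scaled-down thresholds and the flushbufqueues_model()/buf_daemon_pass() helpers are stand-ins for the real kernel pieces, not code from the diff.

#include <stdio.h>

/* Watermarks, scaled down from the kernel's buffer counts for the demo. */
static int lodirtybuffers = 25;
static int hidirtybuffers = 50;
static int numdirtybuffers;

/* Pretend to flush one dirty buffer; returns 0 when nothing was flushable. */
static int
flushbufqueues_model(void)
{
        if (numdirtybuffers == 0)
                return (0);
        numdirtybuffers--;
        return (1);
}

/* One wakeup of the daemon: drain to the low watermark, then stop. */
static void
buf_daemon_pass(void)
{
        while (numdirtybuffers > lodirtybuffers) {
                if (flushbufqueues_model() == 0)
                        break;  /* nothing flushable: the real daemon sleeps hz/2 and retries */
        }
        /* At or below lodirtybuffers: the real daemon clears bd_request and sleeps for hz. */
}

int
main(void)
{
        /* A writer dirties buffers; it would call bd_wakeup() at the midpoint
         * and block in bwillwrite() once numdirtybuffers >= hidirtybuffers. */
        for (numdirtybuffers = 0; numdirtybuffers < hidirtybuffers; numdirtybuffers++) {
                if (numdirtybuffers == (lodirtybuffers + hidirtybuffers) / 2)
                        printf("midpoint reached: bd_wakeup()\n");
        }
        printf("writer stalls in bwillwrite() at %d dirty buffers\n", numdirtybuffers);
        buf_daemon_pass();
        printf("daemon drained to %d (lodirtybuffers=%d)\n", numdirtybuffers, lodirtybuffers);
        return (0);
}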
diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c index e1d775c8e97f..62ec9e309b40 100644 --- a/sys/ufs/ufs/ufs_readwrite.c +++ b/sys/ufs/ufs/ufs_readwrite.c @@ -504,7 +504,9 @@ WRITE(ap) if (ioflag & IO_SYNC) { (void)bwrite(bp); - } else if (vm_page_count_severe() || buf_dirty_count_severe()) { + } else if (vm_page_count_severe() || + buf_dirty_count_severe() || + (ioflag & IO_ASYNC)) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else if (xfersize + blkoffset == fs->fs_bsize) { diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 1b2db6e48dab..7cbe750f1a60 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -1273,6 +1273,7 @@ vm_page_unwire(m, activate) vm_page_queues[PQ_ACTIVE].lcnt++; cnt.v_active_count++; } else { + vm_page_flag_clear(m, PG_WINATCFLS); TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq); m->queue = PQ_INACTIVE; vm_page_queues[PQ_INACTIVE].lcnt++; @@ -1311,6 +1312,7 @@ _vm_page_deactivate(vm_page_t m, int athead) if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) { if ((m->queue - m->pc) == PQ_CACHE) cnt.v_reactivated++; + vm_page_flag_clear(m, PG_WINATCFLS); vm_page_unqueue(m); if (athead) TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, pageq); diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 4c31df9dd913..dc8290e34a07 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -242,6 +242,7 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT]; */ #define PG_BUSY 0x0001 /* page is in transit (O) */ #define PG_WANTED 0x0002 /* someone is waiting for page (O) */ +#define PG_WINATCFLS 0x0004 /* flush dirty page on inactive q */ #define PG_FICTITIOUS 0x0008 /* physical page doesn't exist (O) */ #define PG_WRITEABLE 0x0010 /* page is mapped writeable */ #define PG_MAPPED 0x0020 /* page is mapped */ diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index dbea3d64debe..943fb1178e52 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -106,7 +106,7 @@ /* the kernel process "vm_pageout"*/ static void vm_pageout __P((void)); static int vm_pageout_clean __P((vm_page_t)); -static int vm_pageout_scan __P((void)); +static void vm_pageout_scan __P((int pass)); static int vm_pageout_free_page_calc __P((vm_size_t count)); struct proc *pageproc; @@ -140,14 +140,13 @@ static int vm_pageout_req_swapout; /* XXX */ static int vm_daemon_needed; #endif extern int vm_swap_size; +static int vm_max_launder = 32; static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0; static int vm_pageout_full_stats_interval = 0; -static int vm_pageout_stats_free_max=0, vm_pageout_algorithm_lru=0; +static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0; static int defer_swap_pageouts=0; static int disable_swap_pageouts=0; -static int max_page_launder=100; -static int vm_pageout_actcmp=0; #if defined(NO_SWAPPING) static int vm_swap_enabled=0; static int vm_swap_idle_enabled=0; @@ -157,7 +156,10 @@ static int vm_swap_idle_enabled=0; #endif SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm, - CTLFLAG_RW, &vm_pageout_algorithm_lru, 0, "LRU page mgmt"); + CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt"); + +SYSCTL_INT(_vm, OID_AUTO, max_launder, + CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max, CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length"); @@ -189,12 +191,6 @@ SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); 
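The ufs_readwrite.c hunk above is where the new IO_ASYNC flag takes effect: a sync write still goes through bwrite(), severe memory or dirty-buffer pressure or IO_ASYNC forces an immediate bawrite(), and everything else stays on the delayed/clustered path. The sketch below restates that ladder as standalone C; the full-block clustering branch is not visible in the hunk, so it is folded into the delayed case, and only IO_ASYNC's bit value is taken from the diff (the IO_SYNC value and the helper names here are assumptions for the demo).

#include <stdio.h>

#define IO_SYNC   0x04   /* assumed value, not shown in the diff */
#define IO_ASYNC  0x80   /* bawrite rather than bdwrite (from the patch) */

enum write_kind { WRITE_SYNC, WRITE_ASYNC, WRITE_DELAYED };

/*
 * Decision ladder modeled on the patched tail of WRITE() in ufs_readwrite.c.
 * memory_severe/dirty_severe stand in for vm_page_count_severe() and
 * buf_dirty_count_severe().
 */
static enum write_kind
choose_write(int ioflag, int memory_severe, int dirty_severe)
{
        if (ioflag & IO_SYNC)
                return (WRITE_SYNC);            /* bwrite(): caller waits for the I/O */
        if (memory_severe || dirty_severe || (ioflag & IO_ASYNC))
                return (WRITE_ASYNC);           /* bawrite(): start the write now, don't wait */
        return (WRITE_DELAYED);                 /* bdwrite()/cluster_write(): leave it dirty */
}

int
main(void)
{
        /* Pageout-driven writes now pass IO_ASYNC, so they go out immediately. */
        printf("pageout write -> %d (expect %d)\n", choose_write(IO_ASYNC, 0, 0), WRITE_ASYNC);
        printf("ordinary write -> %d (expect %d)\n", choose_write(0, 0, 0), WRITE_DELAYED);
        return (0);
}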
-SYSCTL_INT(_vm, OID_AUTO, max_page_launder, - CTLFLAG_RW, &max_page_launder, 0, "Maximum number of pages to clean per pass"); -SYSCTL_INT(_vm, OID_AUTO, vm_pageout_actcmp, - CTLFLAG_RD, &vm_pageout_actcmp, 0, "pagedaemon agressiveness"); - - #define VM_PAGEOUT_PAGE_COUNT 16 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; @@ -509,7 +505,7 @@ vm_pageout_object_deactivate_pages(map, object, desired, map_remove_only) } else if (p->queue == PQ_ACTIVE) { if ((p->flags & PG_REFERENCED) == 0) { p->act_count -= min(p->act_count, ACT_DECLINE); - if (!remove_mode && (vm_pageout_algorithm_lru || (p->act_count == 0))) { + if (!remove_mode && (vm_pageout_algorithm || (p->act_count == 0))) { vm_page_protect(p, VM_PROT_NONE); vm_page_deactivate(p); } else { @@ -627,20 +623,21 @@ vm_pageout_page_free(vm_page_t m) { /* * vm_pageout_scan does the dirty work for the pageout daemon. */ -static int -vm_pageout_scan() +static void +vm_pageout_scan(int pass) { vm_page_t m, next; struct vm_page marker; + int save_page_shortage; + int save_inactive_count; int page_shortage, maxscan, pcount; int addl_page_shortage, addl_page_shortage_init; - int maxlaunder; struct proc *p, *bigproc; vm_offset_t size, bigsize; vm_object_t object; - int force_wakeup = 0; int actcount; int vnodes_skipped = 0; + int maxlaunder; int s; /* @@ -651,27 +648,13 @@ vm_pageout_scan() addl_page_shortage_init = vm_pageout_deficit; vm_pageout_deficit = 0; - if (max_page_launder == 0) - max_page_launder = 1; - /* * Calculate the number of pages we want to either free or move - * to the cache. Be more agressive if we aren't making our target. + * to the cache. */ - - page_shortage = vm_paging_target() + - addl_page_shortage_init + vm_pageout_actcmp; - - /* - * Figure out how agressively we should flush dirty pages. - */ - { - int factor = vm_pageout_actcmp; - - maxlaunder = cnt.v_inactive_target / 3 + factor; - if (maxlaunder > max_page_launder + factor) - maxlaunder = max_page_launder + factor; - } + page_shortage = vm_paging_target() + addl_page_shortage_init; + save_page_shortage = page_shortage; + save_inactive_count = cnt.v_inactive_count; /* * Initialize our marker @@ -687,8 +670,22 @@ vm_pageout_scan() * we have scanned the entire inactive queue. Note that m->act_count * is not used to form decisions for the inactive queue, only for the * active queue. + * + * maxlaunder limits the number of dirty pages we flush per scan. + * For most systems a smaller value (16 or 32) is more robust under + * extreme memory and disk pressure because any unnecessary writes + * to disk can result in extreme performance degredation. However, + * systems with excessive dirty pages (especially when MAP_NOSYNC is + * used) will die horribly with limited laundering. If the pageout + * daemon cannot clean enough pages in the first pass, we let it go + * all out in succeeding passes. */ + if ((maxlaunder = vm_max_launder) <= 1) + maxlaunder = 1; + if (pass) + maxlaunder = 10000; + rescan0: addl_page_shortage = addl_page_shortage_init; maxscan = cnt.v_inactive_count; @@ -792,12 +789,32 @@ vm_pageout_scan() } else if (m->dirty == 0) { vm_page_cache(m); --page_shortage; - - /* - * Dirty pages need to be paged out. Note that we clean - * only a limited number of pages per pagedaemon pass. - */ + } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) { + /* + * Dirty pages need to be paged out, but flushing + * a page is extremely expensive verses freeing + * a clean page. 
Rather then artificially limiting + * the number of pages we can flush, we instead give + * dirty pages extra priority on the inactive queue + * by forcing them to be cycled through the queue + * twice before being flushed, after which the + * (now clean) page will cycle through once more + * before being freed. This significantly extends + * the thrash point for a heavily loaded machine. + */ + s = splvm(); + vm_page_flag_set(m, PG_WINATCFLS); + TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq); + TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq); + splx(s); } else if (maxlaunder > 0) { + /* + * We always want to try to flush some dirty pages if + * we encounter them, to keep the system stable. + * Normally this number is small, but under extreme + * pressure where there are insufficient clean pages + * on the inactive queue, we may have to go all out. + */ int swap_pageouts_ok; struct vnode *vp = NULL; struct mount *mp; @@ -826,29 +843,24 @@ vm_pageout_scan() } /* - * Presumably we have sufficient free memory to do - * the more sophisticated checks and locking required - * for vnodes. + * The object is already known NOT to be dead. It + * is possible for the vget() to block the whole + * pageout daemon, but the new low-memory handling + * code should prevent it. * - * The object is already known NOT to be dead. The - * vget() may still block, though, because - * VOP_ISLOCKED() doesn't check to see if an inode - * (v_data) is associated with the vnode. If it isn't, - * vget() will load in it from disk. Worse, vget() - * may actually get stuck waiting on "inode" if another - * process is in the process of bringing the inode in. - * This is bad news for us either way. + * The previous code skipped locked vnodes and, worse, + * reordered pages in the queue. This results in + * completely non-deterministic operation and, on a + * busy system, can lead to extremely non-optimal + * pageouts. For example, it can cause clean pages + * to be freed and dirty pages to be moved to the end + * of the queue. Since dirty pages are also moved to + * the end of the queue once-cleaned, this gives + * way too large a weighting to defering the freeing + * of dirty pages. * - * So for the moment we check v_data == NULL as a - * workaround. This means that vnodes which do not - * use v_data in the way we expect probably will not - * wind up being paged out by the pager and it will be - * up to the syncer to get them. That's better then - * us blocking here. - * - * This whole code section is bogus - we need to fix - * the vnode pager to handle vm_page_t's without us - * having to do any sophisticated VOP tests. + * XXX we need to be able to apply a timeout to the + * vget() lock attempt. */ if (object->type == OBJT_VNODE) { @@ -857,19 +869,8 @@ vm_pageout_scan() mp = NULL; if (vp->v_type == VREG) vn_start_write(vp, &mp, V_NOWAIT); - if (VOP_ISLOCKED(vp, NULL) || - vp->v_data == NULL || - vget(vp, LK_EXCLUSIVE|LK_NOOBJ, curproc)) { + if (vget(vp, LK_EXCLUSIVE|LK_NOOBJ, curproc)) { vn_finished_write(mp); - if ((m->queue == PQ_INACTIVE) && - (m->hold_count == 0) && - (m->busy == 0) && - (m->flags & PG_BUSY) == 0) { - s = splvm(); - TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq); - TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq); - splx(s); - } if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; continue; @@ -924,18 +925,23 @@ vm_pageout_scan() * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. 
If it is still in the laundry, then we - * start the cleaning operation. maxlaunder nominally - * counts I/O cost (seeks) rather then bytes. + * start the cleaning operation. * * This operation may cluster, invalidating the 'next' * pointer. To prevent an inordinate number of * restarts we use our marker to remember our place. + * + * decrement page_shortage on success to account for + * the (future) cleaned page. Otherwise we could wind + * up laundering or cleaning too many pages. */ s = splvm(); TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq); splx(s); - if (vm_pageout_clean(m) != 0) + if (vm_pageout_clean(m) != 0) { + --page_shortage; --maxlaunder; + } s = splvm(); next = TAILQ_NEXT(&marker, pageq); TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq); @@ -947,29 +953,13 @@ vm_pageout_scan() } } - /* - * If we were not able to meet our target, increase actcmp - */ - - if (vm_page_count_min()) { - if (vm_pageout_actcmp < ACT_MAX / 2) - vm_pageout_actcmp += ACT_ADVANCE; - } else { - if (vm_pageout_actcmp < ACT_DECLINE) - vm_pageout_actcmp = 0; - else - vm_pageout_actcmp -= ACT_DECLINE; - } - /* * Compute the number of pages we want to try to move from the * active queue to the inactive queue. */ - page_shortage = vm_paging_target() + cnt.v_inactive_target - cnt.v_inactive_count; page_shortage += addl_page_shortage; - page_shortage += vm_pageout_actcmp; /* * Scan the active queue for things we can deactivate. We nominally @@ -1043,9 +1033,9 @@ vm_pageout_scan() splx(s); } else { m->act_count -= min(m->act_count, ACT_DECLINE); - if (vm_pageout_algorithm_lru || - (m->object->ref_count == 0) || - (m->act_count <= vm_pageout_actcmp)) { + if (vm_pageout_algorithm || + m->object->ref_count == 0 || + m->act_count == 0) { page_shortage--; if (m->object->ref_count == 0) { vm_page_protect(m, VM_PROT_NONE); @@ -1175,7 +1165,6 @@ vm_pageout_scan() wakeup(&cnt.v_free_count); } } - return force_wakeup; } /* @@ -1254,11 +1243,13 @@ vm_pageout_page_stats() } else { if (m->act_count == 0) { /* - * We turn off page access, so that we have more accurate - * RSS stats. We don't do this in the normal page deactivation - * when the system is loaded VM wise, because the cost of - * the large number of page protect operations would be higher - * than the value of doing the operation. + * We turn off page access, so that we have + * more accurate RSS stats. We don't do this + * in the normal page deactivation when the + * system is loaded VM wise, because the + * cost of the large number of page protect + * operations would be higher than the value + * of doing the operation. */ vm_page_protect(m, VM_PROT_NONE); vm_page_deactivate(m); @@ -1307,6 +1298,7 @@ vm_size_t count; static void vm_pageout() { + int pass; mtx_enter(&Giant, MTX_DEF); @@ -1320,11 +1312,18 @@ vm_pageout() vm_pageout_free_page_calc(cnt.v_page_count); /* - * free_reserved needs to include enough for the largest swap pager - * structures plus enough for any pv_entry structs when paging. + * v_free_target and v_cache_min control pageout hysteresis. Note + * that these are more a measure of the VM cache queue hysteresis + * then the VM free queue. Specifically, v_free_target is the + * high water mark (free+cache pages). + * + * v_free_reserved + v_cache_min (mostly means v_cache_min) is the + * low water mark, while v_free_min is the stop. v_cache_min must + * be big enough to handle memory needs while the pageout daemon + * is signalled and run to free more pages. 
*/ if (cnt.v_free_count > 6144) - cnt.v_free_target = 3 * cnt.v_free_min + cnt.v_free_reserved; + cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved; else cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved; @@ -1362,10 +1361,9 @@ vm_pageout() if (vm_pageout_stats_free_max == 0) vm_pageout_stats_free_max = 5; - max_page_launder = (cnt.v_page_count > 1800 ? 32 : 16); - curproc->p_flag |= P_BUFEXHAUST; swap_pager_swap_init(); + pass = 0; /* * The pageout daemon is never done, so loop forever. */ @@ -1386,19 +1384,27 @@ vm_pageout() } if (vm_pages_needed) { /* - * Still not done, sleep a bit and go again + * Still not done, take a second pass without waiting + * (unlimited dirty cleaning), otherwise sleep a bit + * and try again. */ - tsleep(&vm_pages_needed, PVM, "psleep", hz/2); + ++pass; + if (pass > 1) + tsleep(&vm_pages_needed, PVM, "psleep", hz/2); } else { /* - * Good enough, sleep & handle stats + * Good enough, sleep & handle stats. Prime the pass + * for the next run. */ + if (pass > 1) + pass = 1; + else + pass = 0; error = tsleep(&vm_pages_needed, PVM, "psleep", vm_pageout_stats_interval * hz); if (error && !vm_pages_needed) { - if (vm_pageout_actcmp > 0) - --vm_pageout_actcmp; splx(s); + pass = 0; vm_pageout_page_stats(); continue; } @@ -1407,7 +1413,7 @@ vm_pageout() if (vm_pages_needed) cnt.v_pdwakeups++; splx(s); - vm_pageout_scan(); + vm_pageout_scan(pass); vm_pageout_deficit = 0; } } diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index 3dd12ec9b8ad..c79f62ac8252 100644 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -300,10 +300,29 @@ vnode_pager_setsize(vp, nsize) m = vm_page_lookup(object, OFF_TO_IDX(nsize)); if (m) { + int base = (int)nsize & PAGE_MASK; + int size = PAGE_SIZE - base; + + /* + * Clear out partial-page garbage in case + * the page has been mapped. + */ kva = vm_pager_map_page(m); - bzero((caddr_t) kva + (nsize & PAGE_MASK), - (int) (round_page(nsize) - nsize)); + bzero((caddr_t)kva + base, size); vm_pager_unmap_page(kva); + + /* + * Clear out partial-page dirty bits. This + * has the side effect of setting the valid + * bits, but that is ok. There are a bunch + * of places in the VM system where we expected + * m->dirty == VM_PAGE_BITS_ALL. The file EOF + * case is one of them. If the page is still + * partially dirty, make it fully dirty. + */ + vm_page_set_validclean(m, base, size); + if (m->dirty != 0) + m->dirty = VM_PAGE_BITS_ALL; } } } @@ -424,6 +443,8 @@ vnode_pager_input_smlfs(object, m) pbgetvp(dp, bp); bp->b_bcount = bsize; bp->b_bufsize = bsize; + bp->b_runningbufspace = bp->b_bufsize; + runningbufspace += bp->b_runningbufspace; /* do the input */ BUF_STRATEGY(bp); @@ -742,6 +763,8 @@ vnode_pager_generic_getpages(vp, m, bytecount, reqpage) pbgetvp(dp, bp); bp->b_bcount = size; bp->b_bufsize = size; + bp->b_runningbufspace = bp->b_bufsize; + runningbufspace += bp->b_runningbufspace; cnt.v_vnodein++; cnt.v_vnodepgsin += count; @@ -888,6 +911,11 @@ vnode_pager_putpages(object, m, count, sync, rtvals) /* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_PUTPAGES. + * + * This is typically called indirectly via the pageout daemon and + * clustering has already typically occured, so in general we ask the + * underlying filesystem to write the data out asynchronously rather + * then delayed. 
*/ int vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals) @@ -938,8 +966,13 @@ vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals) } } + /* + * pageouts are already clustered, use IO_ASYNC to force a bawrite() + * rather than a bdwrite() to prevent paging I/O from saturating + * the buffer cache. + */ ioflags = IO_VMIO; - ioflags |= (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) ? IO_SYNC: 0; + ioflags |= (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) ? IO_SYNC: IO_ASYNC; ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0; aiov.iov_base = (caddr_t) 0;
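Stepping back to the vm_pageout_scan() changes earlier in the patch: on the first pass after a wakeup, a dirty inactive page is no longer flushed on sight; it is tagged PG_WINATCFLS and requeued at the tail so it must survive one more trip through the inactive queue, while clean pages are freed immediately and maxlaunder (vm_max_launder, effectively unlimited on later passes) budgets the flushes that do happen. A toy rendering of that per-page decision follows; the toy_page structure, the enum of actions, and the driver are simplified stand-ins for the kernel's page and queue machinery.

#include <stdio.h>

#define PG_WINATCFLS 0x0004     /* dirty page has already cycled the inactive queue once */

struct toy_page {
        int dirty;
        int flags;
};

enum action { FREE_PAGE, REQUEUE_TAIL, LAUNDER, SKIP };

/*
 * Per-page decision for the inactive-queue scan, modeled on the patched
 * vm_pageout_scan().  'pass' is 0 for the first scan after a wakeup; later
 * passes raise maxlaunder so dirty pages are no longer deferred.
 */
static enum action
inactive_page_action(struct toy_page *p, int pass, int *maxlaunder)
{
        if (p->dirty == 0)
                return (FREE_PAGE);             /* vm_page_cache(): cheap to reclaim */
        if ((p->flags & PG_WINATCFLS) == 0 && pass == 0) {
                p->flags |= PG_WINATCFLS;       /* give it one more trip through the queue */
                return (REQUEUE_TAIL);
        }
        if (*maxlaunder > 0) {
                (*maxlaunder)--;                /* account the flush against the budget */
                return (LAUNDER);               /* vm_pageout_clean() */
        }
        return (SKIP);                          /* over the flush budget for this scan */
}

int
main(void)
{
        struct toy_page p = { 1, 0 };
        int maxlaunder = 32;                    /* vm_max_launder default from the patch */

        printf("first encounter:  %d (expect REQUEUE_TAIL=%d)\n",
            inactive_page_action(&p, 0, &maxlaunder), REQUEUE_TAIL);
        printf("second encounter: %d (expect LAUNDER=%d)\n",
            inactive_page_action(&p, 0, &maxlaunder), LAUNDER);
        return (0);
}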