diff --git a/sys/sys/vmmeter.h b/sys/sys/vmmeter.h
index e382d900f6df..2ae45a378cdc 100644
--- a/sys/sys/vmmeter.h
+++ b/sys/sys/vmmeter.h
@@ -91,9 +91,95 @@ struct vmmeter {
 	u_int v_cache_max;	/* max number of pages in cached obj */
 	u_int v_pageout_free_min;	/* min number pages reserved for kernel */
 	u_int v_interrupt_free_min;	/* reserved number of pages for int code */
+	u_int v_free_severe;	/* severe depletion of pages below this pt */
 };
 #ifdef KERNEL
+
 extern struct vmmeter cnt;
+
+/*
+ * Return TRUE if we are under our reserved low-free-pages threshold
+ */
+
+static __inline
+int
+vm_page_count_reserved(void)
+{
+	return (cnt.v_free_reserved > (cnt.v_free_count + cnt.v_cache_count));
+}
+
+/*
+ * Return TRUE if we are under our severe low-free-pages threshold
+ *
+ * This routine is typically used at the user<->system interface to determine
+ * whether we need to block in order to avoid a low memory deadlock.
+ */
+
+static __inline
+int
+vm_page_count_severe(void)
+{
+	return (cnt.v_free_severe > (cnt.v_free_count + cnt.v_cache_count));
+}
+
+/*
+ * Return TRUE if we are under our minimum low-free-pages threshold.
+ *
+ * This routine is typically used within the system to determine whether
+ * we can execute potentially very expensive code in terms of memory.  It
+ * is also used by the pageout daemon to calculate when to sleep, when
+ * to wake waiters up, and when (after making a pass) to become more
+ * desperate.
+ */
+
+static __inline
+int
+vm_page_count_min(void)
+{
+	return (cnt.v_free_min > (cnt.v_free_count + cnt.v_cache_count));
+}
+
+/*
+ * Return TRUE if we have not reached our free page target during
+ * free page recovery operations.
+ */
+
+static __inline
+int
+vm_page_count_target(void)
+{
+	return (cnt.v_free_target > (cnt.v_free_count + cnt.v_cache_count));
+}
+
+/*
+ * Return the number of pages we need to free up or cache.
+ * A positive number indicates that we do not have enough free pages.
+ */
+
+static __inline
+int
+vm_paging_target(void)
+{
+	return (
+	    (cnt.v_free_target + cnt.v_cache_min) -
+	    (cnt.v_free_count + cnt.v_cache_count)
+	);
+}
+
+/*
+ * Return a positive number if the pagedaemon needs to be woken up.
+ */
+
+static __inline
+int
+vm_paging_needed(void)
+{
+	return (
+	    (cnt.v_free_reserved + cnt.v_cache_min) >
+	    (cnt.v_free_count + cnt.v_cache_count)
+	);
+}
+
 #endif
 
 /* systemwide totals computed every five seconds */
diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c
index e53079a43a0f..1d7157c7450a 100644
--- a/sys/vm/vm_glue.c
+++ b/sys/vm/vm_glue.c
@@ -209,19 +209,9 @@ vm_fork(p1, p2, flags)
 		p1->p_vmspace->vm_refcnt++;
 	}
 
-	/*
-	 * Great, so we have a memory-heavy process and the
-	 * entire machine comes to a screaching halt because
-	 * nobody can fork/exec anything.  What we really need
-	 * to do is fix the process swapper so it swaps out the right
-	 * processes.
-	 */
-#if 0
-	while ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
-		vm_pageout_deficit += (UPAGES + VM_INITIAL_PAGEIN);
+	while (vm_page_count_severe()) {
 		VM_WAIT;
 	}
-#endif
 
 	if ((flags & RFMEM) == 0) {
 		p2->p_vmspace = vmspace_fork(p1->p_vmspace);
@@ -339,8 +329,9 @@ scheduler(dummy)
 	int ppri;
 
 loop:
-	while ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
+	if (vm_page_count_min()) {
 		VM_WAIT;
+		goto loop;
 	}
 
 	pp = NULL;
diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c
index 4e7f0fb12c3d..6c695624bf52 100644
--- a/sys/vm/vm_meter.c
+++ b/sys/vm/vm_meter.c
@@ -119,6 +119,8 @@
 SYSCTL_INT(_vm, VM_V_CACHE_MAX, v_cache_max,
 	CTLFLAG_RW, &cnt.v_cache_max, 0, "");
 SYSCTL_INT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min,
 	CTLFLAG_RW, &cnt.v_pageout_free_min, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, v_free_severe,
+	CTLFLAG_RW, &cnt.v_free_severe, 0, "");
 SYSCTL_STRUCT(_vm, VM_LOADAVG, loadavg, CTLFLAG_RD,
     &averunnable, loadavg, "Machine loadaverage history");
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index f6db00ef98a0..533ba37fff6f 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -615,8 +615,7 @@ vm_page_unqueue(m)
 		(*pq->cnt)--;
 		pq->lcnt--;
 		if ((queue - m->pc) == PQ_CACHE) {
-			if ((cnt.v_cache_count + cnt.v_free_count) <
-			    (cnt.v_free_reserved + cnt.v_cache_min))
+			if (vm_paging_needed())
 				pagedaemon_wakeup();
 		}
 	}
@@ -871,9 +870,7 @@ loop:
 	 * Don't wakeup too often - wakeup the pageout daemon when
 	 * we would be nearly out of memory.
 	 */
-	if (((cnt.v_free_count + cnt.v_cache_count) <
-	    (cnt.v_free_reserved + cnt.v_cache_min)) ||
-	    (cnt.v_free_count < cnt.v_pageout_free_min))
+	if (vm_paging_needed() || cnt.v_free_count < cnt.v_pageout_free_min)
 		pagedaemon_wakeup();
 
 	splx(s);
@@ -991,6 +988,8 @@ vm_page_asleep(vm_page_t m, char *msg, char *busy) {
  *	vm_page_activate:
  *
  *	Put the specified page on the active list (if appropriate).
+ *	Ensure that act_count is at least ACT_INIT but do not otherwise
+ *	mess with it.
  *
  *	The page queues must be locked.
  *	This routine may not block.
@@ -1050,8 +1049,7 @@ vm_page_free_wakeup()
 	 * high water mark. And wakeup scheduler process if we have
 	 * lots of memory. this process will swapin processes.
 	 */
-	if (vm_pages_needed &&
-	    ((cnt.v_free_count + cnt.v_cache_count) >= cnt.v_free_min)) {
+	if (vm_pages_needed && !vm_page_count_min()) {
 		wakeup(&cnt.v_free_count);
 		vm_pages_needed = 0;
 	}
@@ -1261,11 +1259,14 @@ vm_page_unwire(m, activate)
  *	Move the specified page to the inactive queue.  If the page has
  *	any associated swap, the swap is deallocated.
  *
+ *	Normally athead is 0, resulting in LRU operation.  athead is set
+ *	to 1 if we want this page to be 'as if it were placed in the cache',
+ *	except without unmapping it from the process address space.
+ *
  *	This routine may not block.
  */
-void
-vm_page_deactivate(m)
-	register vm_page_t m;
+static __inline void
+_vm_page_deactivate(vm_page_t m, int athead)
 {
 	int s;
 
@@ -1280,7 +1281,10 @@ vm_page_deactivate(m)
 		if ((m->queue - m->pc) == PQ_CACHE)
 			cnt.v_reactivated++;
 		vm_page_unqueue(m);
-		TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq);
+		if (athead)
+			TAILQ_INSERT_HEAD(&vm_page_queue_inactive, m, pageq);
+		else
+			TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq);
 		m->queue = PQ_INACTIVE;
 		vm_page_queues[PQ_INACTIVE].lcnt++;
 		cnt.v_inactive_count++;
@@ -1288,6 +1292,12 @@ vm_page_deactivate(m)
 	splx(s);
 }
 
+void
+vm_page_deactivate(vm_page_t m)
+{
+	_vm_page_deactivate(m, 0);
+}
+
 /*
  *	vm_page_cache
  *
@@ -1332,6 +1342,70 @@ vm_page_cache(m)
 	splx(s);
 }
 
+/*
+ *	vm_page_dontneed
+ *
+ *	Cache, deactivate, or do nothing as appropriate.  This routine
+ *	is typically used by madvise() MADV_DONTNEED.
+ *
+ *	Generally speaking we want to move the page into the cache so
+ *	it gets reused quickly.  However, this can result in a silly syndrome
+ *	due to the page recycling too quickly.  Small objects will not be
+ *	fully cached.  On the other hand, if we move the page to the inactive
+ *	queue we wind up with a problem whereby very large objects
+ *	unnecessarily blow away our inactive and cache queues.
+ *
+ *	The solution is to move the pages based on a fixed weighting.  We
+ *	either leave them alone, deactivate them, or move them to the cache,
+ *	where moving them to the cache has the highest weighting.
+ *	By forcing some pages into other queues we eventually force the
+ *	system to balance the queues, potentially recovering other unrelated
+ *	space from active.  The idea is to not force this to happen too
+ *	often.
+ */
+
+void
+vm_page_dontneed(m)
+	vm_page_t m;
+{
+	static int dnweight;
+	int dnw;
+	int head;
+
+	dnw = ++dnweight;
+
+	/*
+	 * occasionally leave the page alone
+	 */
+
+	if ((dnw & 0x01F0) == 0 ||
+	    m->queue == PQ_INACTIVE ||
+	    m->queue - m->pc == PQ_CACHE
+	) {
+		if (m->act_count >= ACT_INIT)
+			--m->act_count;
+		return;
+	}
+
+	if (m->dirty == 0)
+		vm_page_test_dirty(m);
+
+	if (m->dirty || (dnw & 0x0070) == 0) {
+		/*
+		 * Deactivate the page 3 times out of 32.
+		 */
+		head = 0;
+	} else {
+		/*
+		 * Cache the page 28 times out of every 32.  Note that
+		 * the page is deactivated instead of cached, but placed
+		 * at the head of the queue instead of the tail.
+		 */
+		head = 1;
+	}
+	_vm_page_deactivate(m, head);
+}
+
 /*
  * Grab a page, waiting until we are waken up due to the page
  * changing state.  We keep on waiting, if the page continues
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 6ffb8676eebb..2d7e7401dfe2 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -136,7 +136,8 @@ struct vm_page {
 };
 
 /*
- * note SWAPBLK_NONE is a flag, basically the high bit.
+ * note: we currently use SWAPBLK_NONE as an absolute value rather than
+ * a flag bit.
  */
 
 #define SWAPBLK_MASK	((daddr_t)((u_daddr_t)-1 >> 1))		/* mask */
@@ -391,6 +392,7 @@ void vm_page_activate __P((vm_page_t));
 vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int));
 vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int));
 void vm_page_cache __P((register vm_page_t));
+void vm_page_dontneed __P((register vm_page_t));
 static __inline void vm_page_copy __P((vm_page_t, vm_page_t));
 static __inline void vm_page_free __P((vm_page_t));
 static __inline void vm_page_free_zero __P((vm_page_t));
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index bc8784cc587d..d24e51cec6f8 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -219,7 +219,7 @@ vm_pageout_clean(m)
 	register vm_object_t object;
 	vm_page_t mc[2*vm_pageout_page_count];
 	int pageout_count;
-	int i, forward_okay, backward_okay, page_base;
+	int ib, is, page_base;
 	vm_pindex_t pindex = m->pindex;
 
 	object = m->object;
@@ -243,11 +243,9 @@ vm_pageout_clean(m)
 	mc[vm_pageout_page_count] = m;
 	pageout_count = 1;
 	page_base = vm_pageout_page_count;
-	forward_okay = TRUE;
-	if (pindex != 0)
-		backward_okay = TRUE;
-	else
-		backward_okay = FALSE;
+	ib = 1;
+	is = 1;
+
 	/*
 	 * Scan object for clusterable pages.
 	 *
@@ -258,81 +256,83 @@ vm_pageout_clean(m)
 	 *   active page.
 	 * -or-
	 * 2) we force the issue.
+	 *
+	 * During heavy mmap/modification loads the pageout
+	 * daemon can really fragment the underlying file
+	 * due to flushing pages out of order and not trying to
+	 * align the clusters (which leaves sporadic out-of-order
+	 * holes).  To solve this problem we do the reverse scan
+	 * first and attempt to align our cluster, then do a
+	 * forward scan if room remains.
 	 */
-	for (i = 1; (i < vm_pageout_page_count) && (forward_okay || backward_okay); i++) {
+
+more:
+	while (ib && pageout_count < vm_pageout_page_count) {
 		vm_page_t p;
 
-		/*
-		 * See if forward page is clusterable.
-		 */
-		if (forward_okay) {
-			/*
-			 * Stop forward scan at end of object.
-			 */
-			if ((pindex + i) > object->size) {
-				forward_okay = FALSE;
-				goto do_backward;
-			}
-			p = vm_page_lookup(object, pindex + i);
-			if (p) {
-				if (((p->queue - p->pc) == PQ_CACHE) ||
-					(p->flags & PG_BUSY) || p->busy) {
-					forward_okay = FALSE;
-					goto do_backward;
-				}
-				vm_page_test_dirty(p);
-				if ((p->dirty & p->valid) != 0 &&
-				    (p->queue == PQ_INACTIVE) &&
-				    (p->wire_count == 0) &&
-				    (p->hold_count == 0)) {
-					mc[vm_pageout_page_count + i] = p;
-					pageout_count++;
-					if (pageout_count == vm_pageout_page_count)
-						break;
-				} else {
-					forward_okay = FALSE;
-				}
-			} else {
-				forward_okay = FALSE;
-			}
+		if (ib > pindex) {
+			ib = 0;
+			break;
 		}
-do_backward:
-		/*
-		 * See if backward page is clusterable.
-		 */
-		if (backward_okay) {
-			/*
-			 * Stop backward scan at beginning of object.
-			 */
-			if ((pindex - i) == 0) {
-				backward_okay = FALSE;
-			}
-			p = vm_page_lookup(object, pindex - i);
-			if (p) {
-				if (((p->queue - p->pc) == PQ_CACHE) ||
-					(p->flags & PG_BUSY) || p->busy) {
-					backward_okay = FALSE;
-					continue;
-				}
-				vm_page_test_dirty(p);
-				if ((p->dirty & p->valid) != 0 &&
-				    (p->queue == PQ_INACTIVE) &&
-				    (p->wire_count == 0) &&
-				    (p->hold_count == 0)) {
-					mc[vm_pageout_page_count - i] = p;
-					pageout_count++;
-					page_base--;
-					if (pageout_count == vm_pageout_page_count)
-						break;
-				} else {
-					backward_okay = FALSE;
-				}
-			} else {
-				backward_okay = FALSE;
-			}
+
+		if ((p = vm_page_lookup(object, pindex - ib)) == NULL) {
+			ib = 0;
+			break;
 		}
+		if (((p->queue - p->pc) == PQ_CACHE) ||
+		    (p->flags & PG_BUSY) || p->busy) {
+			ib = 0;
+			break;
+		}
+		vm_page_test_dirty(p);
+		if ((p->dirty & p->valid) == 0 ||
+		    p->queue != PQ_INACTIVE ||
+		    p->wire_count != 0 ||
+		    p->hold_count != 0) {
+			ib = 0;
+			break;
+		}
+		mc[--page_base] = p;
+		++pageout_count;
+		++ib;
+		/*
+		 * alignment boundary, stop here and switch directions.  Do
+		 * not clear ib.
+		 */
+		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
+			break;
 	}
+
+	while (pageout_count < vm_pageout_page_count &&
+	    pindex + is < object->size) {
+		vm_page_t p;
+
+		if ((p = vm_page_lookup(object, pindex + is)) == NULL)
+			break;
+		if (((p->queue - p->pc) == PQ_CACHE) ||
+		    (p->flags & PG_BUSY) || p->busy) {
+			break;
+		}
+		vm_page_test_dirty(p);
+		if ((p->dirty & p->valid) == 0 ||
+		    p->queue != PQ_INACTIVE ||
+		    p->wire_count != 0 ||
+		    p->hold_count != 0) {
+			break;
+		}
+		mc[page_base + pageout_count] = p;
+		++pageout_count;
+		++is;
+	}
+
+	/*
+	 * If we exhausted our forward scan, continue with the reverse scan
+	 * when possible, even past a page boundary.  This catches boundary
+	 * conditions.
+	 */
+	if (ib && pageout_count < vm_pageout_page_count)
+		goto more;
+
 	/*
 	 * we allow reads during pageouts...
 	 */
@@ -397,7 +397,7 @@ vm_pageout_flush(mc, count, flags)
 			 * worked.
 			 */
 			pmap_clear_modify(VM_PAGE_TO_PHYS(mt));
-			mt->dirty = 0;
+			vm_page_undirty(mt);
 			break;
 		case VM_PAGER_ERROR:
 		case VM_PAGER_FAIL:
@@ -646,9 +646,7 @@ vm_pageout_scan()
 	 * to the cache.
 	 */
 
-	page_shortage = (cnt.v_free_target + cnt.v_cache_min) -
-	    (cnt.v_free_count + cnt.v_cache_count);
-	page_shortage += addl_page_shortage_init;
+	page_shortage = vm_paging_target() + addl_page_shortage_init;
 
 	/*
 	 * Figure out what to do with dirty pages when they are encountered.
@@ -787,7 +785,7 @@ rescan0:
 		} else {
 			swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
 			swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
-				(cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min);
+				vm_page_count_min());
 		}
 
@@ -1082,15 +1080,11 @@ rescan0:
 	 * in a writeable object, wakeup the sync daemon.  And kick swapout
 	 * if we did not get enough free pages.
 	 */
-	if ((cnt.v_cache_count + cnt.v_free_count) <
-	    (cnt.v_free_target + cnt.v_cache_min) ) {
-		if (vnodes_skipped &&
-		    (cnt.v_cache_count + cnt.v_free_count) < cnt.v_free_min) {
+	if (vm_paging_target() > 0) {
+		if (vnodes_skipped && vm_page_count_min())
 			(void) speedup_syncer();
-		}
 #if !defined(NO_SWAPPING)
-		if (vm_swap_enabled &&
-		    (cnt.v_free_count + cnt.v_cache_count < cnt.v_free_target)) {
+		if (vm_swap_enabled && vm_page_count_target()) {
 			vm_req_vmdaemon();
 			vm_pageout_req_swapout |= VM_SWAP_NORMAL;
 		}
@@ -1101,8 +1095,7 @@
 	 * make sure that we have swap space -- if we are low on memory and
 	 * swap -- then kill the biggest process.
 	 */
-	if ((vm_swap_size == 0 || swap_pager_full) &&
-	    ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min)) {
+	if ((vm_swap_size == 0 || swap_pager_full) && vm_page_count_min()) {
 		bigproc = NULL;
 		bigsize = 0;
 		for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
@@ -1160,8 +1153,10 @@ vm_pageout_page_stats()
 	static int fullintervalcount = 0;
 	int page_shortage;
 
-	page_shortage = (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
+	page_shortage =
+	    (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
 	    (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
+
 	if (page_shortage <= 0)
 		return;
 
@@ -1253,7 +1248,9 @@ vm_size_t count;
 		cnt.v_interrupt_free_min;
 	cnt.v_free_reserved = vm_pageout_page_count +
 		cnt.v_pageout_free_min + (count / 768) + PQ_L2_SIZE;
+	cnt.v_free_severe = cnt.v_free_min / 2;
 	cnt.v_free_min += cnt.v_free_reserved;
+	cnt.v_free_severe += cnt.v_free_reserved;
 	return 1;
 }
 
@@ -1326,8 +1323,17 @@ vm_pageout()
 	while (TRUE) {
 		int error;
 		int s = splvm();
-		if (!vm_pages_needed ||
-		    ((cnt.v_free_count + cnt.v_cache_count) > cnt.v_free_min)) {
+
+		if (vm_pages_needed && vm_page_count_min()) {
+			/*
+			 * Still not done, sleep a bit and go again
+			 */
+			vm_pages_needed = 0;
+			tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
+		} else {
+			/*
+			 * Good enough, sleep & handle stats
+			 */
 			vm_pages_needed = 0;
 			error = tsleep(&vm_pages_needed, PVM, "psleep",
 			    vm_pageout_stats_interval * hz);
 			if (error && !vm_pages_needed) {
 				splx(s);
 				vm_pageout_page_stats();
 				continue;
 			}
-		} else if (vm_pages_needed) {
-			vm_pages_needed = 0;
-			tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
 		}
 
 		if (vm_pages_needed)
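
The inline predicates added to sys/sys/vmmeter.h all compare a fixed threshold against v_free_count + v_cache_count, so with the usual tuning (v_free_target above v_free_min) they trip in a fixed order as free memory shrinks: target, then min, then severe, then reserved.  The standalone program below is only a sketch of that ordering; the fake_vmmeter struct, the helper names and every threshold number in it are hypothetical stand-ins rather than values from the patch (the real thresholds are derived from the physical page count in vm_pageout_free_page_calc()).

/*
 * Standalone sketch (not kernel code) of how the new vmmeter.h predicates
 * nest.  The fake_vmmeter struct and all numbers below are hypothetical.
 */
#include <stdio.h>

struct fake_vmmeter {
	int v_free_reserved;
	int v_free_severe;
	int v_free_min;
	int v_free_target;
	int v_cache_min;
	int v_free_count;
	int v_cache_count;
};

static struct fake_vmmeter cnt = {
	.v_free_reserved = 256,		/* hypothetical */
	.v_free_severe = 512,		/* pre-reserve min/2 + reserved */
	.v_free_min = 768,		/* pre-reserve min + reserved */
	.v_free_target = 1536,		/* hypothetical */
	.v_cache_min = 256,		/* hypothetical */
};

/* Same comparisons as the inlines added to sys/sys/vmmeter.h. */
static int
page_count_target(void)
{
	return (cnt.v_free_target > cnt.v_free_count + cnt.v_cache_count);
}

static int
page_count_min(void)
{
	return (cnt.v_free_min > cnt.v_free_count + cnt.v_cache_count);
}

static int
page_count_severe(void)
{
	return (cnt.v_free_severe > cnt.v_free_count + cnt.v_cache_count);
}

static int
paging_target(void)
{
	return ((cnt.v_free_target + cnt.v_cache_min) -
	    (cnt.v_free_count + cnt.v_cache_count));
}

int
main(void)
{
	int nfree;

	cnt.v_cache_count = 128;
	/* walk the free count down and watch the predicates trip in order */
	for (nfree = 1800; ; nfree -= 300) {
		cnt.v_free_count = nfree;
		printf("free+cache=%4d  target=%d min=%d severe=%d  paging_target=%5d\n",
		    cnt.v_free_count + cnt.v_cache_count,
		    page_count_target(), page_count_min(), page_count_severe(),
		    paging_target());
		if (nfree < 300)
			break;
	}
	return (0);
}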
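
vm_page_dontneed() picks between leaving a page alone, deactivating it, and "caching" it (placing it at the head of the inactive queue) with bitmask tests on the monotonically increasing dnweight counter, which for a clean page works out to roughly 1, 3 and 28 decisions out of every 32 calls.  The sketch below reproduces only that counter arithmetic; ignoring page state (dirty, already inactive or cached) is an assumption made purely for illustration.

/*
 * Sketch of the vm_page_dontneed() weighting for clean pages: classify
 * 512 consecutive values of the dnweight counter the way the patch does.
 */
#include <stdio.h>

int
main(void)
{
	int dnw;
	int leave = 0, deact = 0, cache = 0;

	for (dnw = 0; dnw < 512; dnw++) {
		if ((dnw & 0x01F0) == 0)
			leave++;	/* leave the page alone */
		else if ((dnw & 0x0070) == 0)
			deact++;	/* tail of the inactive queue */
		else
			cache++;	/* head of the inactive queue ("cache") */
	}
	printf("out of 512 calls: leave %d, deactivate %d, cache %d (%d/%d/%d per 32)\n",
	    leave, deact, cache, leave / 16, deact / 16, cache / 16);
	return (0);
}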
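
The rewritten cluster loop in vm_pageout_clean() scans backwards first and stops as soon as the lowest page gathered sits on a vm_pageout_page_count boundary, then fills the remainder of the cluster going forward.  The program below is an index-only sketch of that arithmetic; it assumes every neighbouring page is eligible and omits the "goto more" pass that resumes the reverse scan when the forward scan leaves room, so it shows the alignment behaviour rather than the full algorithm.

/*
 * Index-only sketch of the reverse-then-forward clustering order.
 * Every neighbouring page is assumed eligible, purely for illustration;
 * the real code stops early on busy, clean, wired, held or
 * non-inactive pages.
 */
#include <stdio.h>

#define PAGEOUT_PAGE_COUNT	16	/* stands in for vm_pageout_page_count */

static void
cluster(unsigned long pindex, unsigned long object_size)
{
	int ib = 1, is = 1;
	int pageout_count = 1;		/* the page being cleaned itself */
	unsigned long low = pindex, high = pindex;

	/* reverse scan first, stopping on an alignment boundary */
	while (ib && pageout_count < PAGEOUT_PAGE_COUNT) {
		if ((unsigned long)ib > pindex) {
			ib = 0;
			break;
		}
		low = pindex - ib;
		pageout_count++;
		ib++;
		if ((pindex - (ib - 1)) % PAGEOUT_PAGE_COUNT == 0)
			break;
	}
	/* then fill whatever room is left going forward */
	while (pageout_count < PAGEOUT_PAGE_COUNT &&
	    pindex + is < object_size) {
		high = pindex + is;
		pageout_count++;
		is++;
	}
	printf("pindex %3lu -> cluster [%3lu,%3lu], %2d pages, low %% %d = %lu\n",
	    pindex, low, high, pageout_count, PAGEOUT_PAGE_COUNT,
	    low % PAGEOUT_PAGE_COUNT);
}

int
main(void)
{
	unsigned long pindex;

	for (pindex = 3; pindex < 80; pindex += 19)
		cluster(pindex, 1024);
	return (0);
}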