Reviewed by: Alan Cox <alc@cs.rice.edu>, David Greenman <dg@root.com>

Replace the various VM-related page count calculations strewn over the
    VM code with inlines, to aid readability and to reduce fragility in
    code where modules depend on the same test being performed in order
    to sleep and wake up properly.
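
A rough user-space sketch of the pattern (editorial, not part of the commit):
the struct below mocks only the vmmeter fields this test needs, the counter
values are made up, and vm_page_count_min() is a copy of the new inline from
vm/vmmeter.h shown in the first hunk.

#include <stdio.h>

struct vmmeter {
	unsigned int v_free_min;	/* low-memory threshold (pages) */
	unsigned int v_free_count;	/* free pages */
	unsigned int v_cache_count;	/* cached, reclaimable pages */
};

static struct vmmeter cnt = { 128, 40, 50 };	/* mock values */

/* Same comparison as the new inline in vm/vmmeter.h. */
static __inline int
vm_page_count_min(void)
{
	return (cnt.v_free_min > (cnt.v_free_count + cnt.v_cache_count));
}

int
main(void)
{
	/* Before: each caller open-coded the comparison by hand. */
	int old_way = (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min;

	/* After: every module asks the same question the same way. */
	int new_way = vm_page_count_min();

	printf("old=%d new=%d\n", old_way, new_way);	/* prints old=1 new=1 */
	return (0);
}

The other helpers added to vmmeter.h follow the same shape, differing only in
the threshold compared against (v_free_reserved, v_free_severe, v_free_min,
v_free_target), or, for vm_paging_target(), in returning the signed page
deficit instead of a boolean.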

    Split out a portion of the page deactivation code into an inline
    in vm_page.c to support vm_page_dontneed().

Add vm_page_dontneed(), which handles the madvise MADV_DONTNEED
    feature in a related commit coming up for vm_map.c/vm_object.c.  This
    code prevents degenerate cases where an essentially active page may
    be rotated through a subset of the paging lists, resulting in premature
    disposal.
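
The 1/32, 3/32, 28/32 split described in the vm_page_dontneed() comment below
can be checked with a small stand-alone sketch (editorial; the enum names,
dontneed_action(), and the driver loop are made up, but the two mask tests
mirror the vm_page.c hunk further down; the real routine additionally leaves
pages that are already on the inactive or cache queues alone):

#include <stdio.h>

enum action { LEAVE_ALONE, DEACTIVATE_TAIL, DEACTIVATE_HEAD };

/* 'dnw' stands in for the ++dnweight counter, 'dirty' for the page's
 * dirty bits after vm_page_test_dirty(). */
static enum action
dontneed_action(int dnw, int dirty)
{
	if ((dnw & 0x01F0) == 0)		/* 1 in 32: leave the page alone */
		return (LEAVE_ALONE);
	if (dirty || (dnw & 0x0070) == 0)	/* dirty, or 3 in 32 when clean */
		return (DEACTIVATE_TAIL);	/* normal LRU deactivation */
	return (DEACTIVATE_HEAD);		/* 28 in 32: "as if cached" */
}

int
main(void)
{
	int counts[3] = { 0, 0, 0 };
	int dnw;

	/* One full period of the counter, all pages clean. */
	for (dnw = 1; dnw <= 512; dnw++)
		counts[dontneed_action(dnw, 0)]++;
	printf("leave=%d tail=%d head=%d of 512\n",
	    counts[0], counts[1], counts[2]);	/* 16, 48, 448 */
	return (0);
}

Placing the page at the head of the inactive queue rather than on the cache
queue keeps it mapped, which is why the new comment in vm_page_deactivate()
describes it as "as if it were placed in the cache" without the unmapping.
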
Matthew Dillon 1999-09-17 04:56:40 +00:00
parent 1ed9e51a4d
commit 90ecac61c0
6 changed files with 276 additions and 118 deletions

@@ -91,9 +91,95 @@ struct vmmeter {
u_int v_cache_max; /* max number of pages in cached obj */
u_int v_pageout_free_min; /* min number pages reserved for kernel */
u_int v_interrupt_free_min; /* reserved number of pages for int code */
u_int v_free_severe; /* severe depletion of pages below this pt */
};
#ifdef KERNEL
extern struct vmmeter cnt;
/*
* Return TRUE if we are under our reserved low-free-pages threshold
*/
static __inline
int
vm_page_count_reserved(void)
{
return (cnt.v_free_reserved > (cnt.v_free_count + cnt.v_cache_count));
}
/*
* Return TRUE if we are under our severe low-free-pages threshold
*
* This routine is typically used at the user<->system interface to determine
* whether we need to block in order to avoid a low memory deadlock.
*/
static __inline
int
vm_page_count_severe(void)
{
return (cnt.v_free_severe > (cnt.v_free_count + cnt.v_cache_count));
}
/*
* Return TRUE if we are under our minimum low-free-pages threshold.
*
* This routine is typically used within the system to determine whether
* we can execute potentially very expensive code in terms of memory. It
* is also used by the pageout daemon to calculate when to sleep, when
* to wake waiters up, and when (after making a pass) to become more
* desperate.
*/
static __inline
int
vm_page_count_min(void)
{
return (cnt.v_free_min > (cnt.v_free_count + cnt.v_cache_count));
}
/*
* Return TRUE if we have not reached our free page target during
* free page recovery operations.
*/
static __inline
int
vm_page_count_target(void)
{
return (cnt.v_free_target > (cnt.v_free_count + cnt.v_cache_count));
}
/*
* Return the number of pages we need to free up or cache.
* A positive number indicates that we do not have enough free pages.
*/
static __inline
int
vm_paging_target(void)
{
return (
(cnt.v_free_target + cnt.v_cache_min) -
(cnt.v_free_count + cnt.v_cache_count)
);
}
/*
* Return a positive number if the pagedaemon needs to be woken up.
*/
static __inline
int
vm_paging_needed(void)
{
return (
(cnt.v_free_reserved + cnt.v_cache_min) >
(cnt.v_free_count + cnt.v_cache_count)
);
}
#endif
/* systemwide totals computed every five seconds */

@@ -209,19 +209,9 @@ vm_fork(p1, p2, flags)
p1->p_vmspace->vm_refcnt++;
}
/*
* Great, so we have a memory-heavy process and the
* entire machine comes to a screaching halt because
* nobody can fork/exec anything. What we really need
* to do is fix the process swapper so it swaps out the right
* processes.
*/
#if 0
while ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
vm_pageout_deficit += (UPAGES + VM_INITIAL_PAGEIN);
while (vm_page_count_severe()) {
VM_WAIT;
}
#endif
if ((flags & RFMEM) == 0) {
p2->p_vmspace = vmspace_fork(p1->p_vmspace);
@@ -339,8 +329,9 @@ scheduler(dummy)
int ppri;
loop:
while ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
if (vm_page_count_min()) {
VM_WAIT;
goto loop;
}
pp = NULL;

@@ -119,6 +119,8 @@ SYSCTL_INT(_vm, VM_V_CACHE_MAX, v_cache_max,
CTLFLAG_RW, &cnt.v_cache_max, 0, "");
SYSCTL_INT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min,
CTLFLAG_RW, &cnt.v_pageout_free_min, 0, "");
SYSCTL_INT(_vm, OID_AUTO, v_free_severe,
CTLFLAG_RW, &cnt.v_free_severe, 0, "");
SYSCTL_STRUCT(_vm, VM_LOADAVG, loadavg, CTLFLAG_RD,
&averunnable, loadavg, "Machine loadaverage history");

@@ -615,8 +615,7 @@ vm_page_unqueue(m)
(*pq->cnt)--;
pq->lcnt--;
if ((queue - m->pc) == PQ_CACHE) {
if ((cnt.v_cache_count + cnt.v_free_count) <
(cnt.v_free_reserved + cnt.v_cache_min))
if (vm_paging_needed())
pagedaemon_wakeup();
}
}
@@ -871,9 +870,7 @@ loop:
* Don't wakeup too often - wakeup the pageout daemon when
* we would be nearly out of memory.
*/
if (((cnt.v_free_count + cnt.v_cache_count) <
(cnt.v_free_reserved + cnt.v_cache_min)) ||
(cnt.v_free_count < cnt.v_pageout_free_min))
if (vm_paging_needed() || cnt.v_free_count < cnt.v_pageout_free_min)
pagedaemon_wakeup();
splx(s);
@@ -991,6 +988,8 @@ vm_page_asleep(vm_page_t m, char *msg, char *busy) {
* vm_page_activate:
*
* Put the specified page on the active list (if appropriate).
* Ensure that act_count is at least ACT_INIT but do not otherwise
* mess with it.
*
* The page queues must be locked.
* This routine may not block.
@@ -1050,8 +1049,7 @@ vm_page_free_wakeup()
* high water mark. And wakeup scheduler process if we have
* lots of memory. this process will swapin processes.
*/
if (vm_pages_needed &&
((cnt.v_free_count + cnt.v_cache_count) >= cnt.v_free_min)) {
if (vm_pages_needed && vm_page_count_min()) {
wakeup(&cnt.v_free_count);
vm_pages_needed = 0;
}
@@ -1261,11 +1259,14 @@ vm_page_unwire(m, activate)
* Move the specified page to the inactive queue. If the page has
* any associated swap, the swap is deallocated.
*
* Normally athead is 0 resulting in LRU operation. athead is set
* to 1 if we want this page to be 'as if it were placed in the cache',
* except without unmapping it from the process address space.
*
* This routine may not block.
*/
void
vm_page_deactivate(m)
register vm_page_t m;
static __inline void
_vm_page_deactivate(vm_page_t m, int athead)
{
int s;
@@ -1280,7 +1281,10 @@ vm_page_deactivate(m)
if ((m->queue - m->pc) == PQ_CACHE)
cnt.v_reactivated++;
vm_page_unqueue(m);
TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq);
if (athead)
TAILQ_INSERT_HEAD(&vm_page_queue_inactive, m, pageq);
else
TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq);
m->queue = PQ_INACTIVE;
vm_page_queues[PQ_INACTIVE].lcnt++;
cnt.v_inactive_count++;
@@ -1288,6 +1292,12 @@ vm_page_deactivate(m)
splx(s);
}
void
vm_page_deactivate(vm_page_t m)
{
_vm_page_deactivate(m, 0);
}
/*
* vm_page_cache
*
@@ -1332,6 +1342,70 @@ vm_page_cache(m)
splx(s);
}
/*
* vm_page_dontneed
*
* Cache, deactivate, or do nothing as appropriate. This routine
* is typically used by madvise() MADV_DONTNEED.
*
* Generally speaking we want to move the page into the cache so
* it gets reused quickly. However, this can result in a silly syndrome
* due to the page recycling too quickly. Small objects will not be
* fully cached. On the other hand, if we move the page to the inactive
* queue we wind up with a problem whereby very large objects
* unnecessarily blow away our inactive and cache queues.
*
* The solution is to move the pages based on a fixed weighting. We
* either leave them alone, deactivate them, or move them to the cache,
* where moving them to the cache has the highest weighting.
* By forcing some pages into other queues we eventually force the
* system to balance the queues, potentially recovering other unrelated
* space from active. The idea is to not force this to happen too
* often.
*/
void
vm_page_dontneed(m)
vm_page_t m;
{
static int dnweight;
int dnw;
int head;
dnw = ++dnweight;
/*
* occasionally leave the page alone
*/
if ((dnw & 0x01F0) == 0 ||
m->queue == PQ_INACTIVE ||
m->queue - m->pc == PQ_CACHE
) {
if (m->act_count >= ACT_INIT)
--m->act_count;
return;
}
if (m->dirty == 0)
vm_page_test_dirty(m);
if (m->dirty || (dnw & 0x0070) == 0) {
/*
* Deactivate the page 3 times out of 32.
*/
head = 0;
} else {
/*
* Cache the page 28 times out of every 32. Note that
* the page is deactivated instead of cached, but placed
* at the head of the queue instead of the tail.
*/
head = 1;
}
_vm_page_deactivate(m, head);
}
/*
* Grab a page, waiting until we are waken up due to the page
* changing state. We keep on waiting, if the page continues

@@ -136,7 +136,8 @@ struct vm_page {
};
/*
* note SWAPBLK_NONE is a flag, basically the high bit.
* note: we currently use SWAPBLK_NONE as an absolute value rather than
* a flag bit.
*/
#define SWAPBLK_MASK ((daddr_t)((u_daddr_t)-1 >> 1)) /* mask */
@@ -391,6 +392,7 @@ void vm_page_activate __P((vm_page_t));
vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int));
vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int));
void vm_page_cache __P((register vm_page_t));
void vm_page_dontneed __P((register vm_page_t));
static __inline void vm_page_copy __P((vm_page_t, vm_page_t));
static __inline void vm_page_free __P((vm_page_t));
static __inline void vm_page_free_zero __P((vm_page_t));

@@ -219,7 +219,7 @@ vm_pageout_clean(m)
register vm_object_t object;
vm_page_t mc[2*vm_pageout_page_count];
int pageout_count;
int i, forward_okay, backward_okay, page_base;
int ib, is, page_base;
vm_pindex_t pindex = m->pindex;
object = m->object;
@@ -243,11 +243,9 @@ vm_pageout_clean(m)
mc[vm_pageout_page_count] = m;
pageout_count = 1;
page_base = vm_pageout_page_count;
forward_okay = TRUE;
if (pindex != 0)
backward_okay = TRUE;
else
backward_okay = FALSE;
ib = 1;
is = 1;
/*
* Scan object for clusterable pages.
*
@@ -258,81 +256,83 @@ vm_pageout_clean(m)
* active page.
* -or-
* 2) we force the issue.
*
* During heavy mmap/modification loads the pageout
* daemon can really fragment the underlying file
* due to flushing pages out of order and not trying to
* align the clusters (which leaves sporadic out-of-order
* holes).  To solve this problem we do the reverse scan
* first and attempt to align our cluster, then do a
* forward scan if room remains.
*/
for (i = 1; (i < vm_pageout_page_count) && (forward_okay || backward_okay); i++) {
more:
while (ib && pageout_count < vm_pageout_page_count) {
vm_page_t p;
/*
* See if forward page is clusterable.
*/
if (forward_okay) {
/*
* Stop forward scan at end of object.
*/
if ((pindex + i) > object->size) {
forward_okay = FALSE;
goto do_backward;
}
p = vm_page_lookup(object, pindex + i);
if (p) {
if (((p->queue - p->pc) == PQ_CACHE) ||
(p->flags & PG_BUSY) || p->busy) {
forward_okay = FALSE;
goto do_backward;
}
vm_page_test_dirty(p);
if ((p->dirty & p->valid) != 0 &&
(p->queue == PQ_INACTIVE) &&
(p->wire_count == 0) &&
(p->hold_count == 0)) {
mc[vm_pageout_page_count + i] = p;
pageout_count++;
if (pageout_count == vm_pageout_page_count)
break;
} else {
forward_okay = FALSE;
}
} else {
forward_okay = FALSE;
}
if (ib > pindex) {
ib = 0;
break;
}
do_backward:
/*
* See if backward page is clusterable.
*/
if (backward_okay) {
/*
* Stop backward scan at beginning of object.
*/
if ((pindex - i) == 0) {
backward_okay = FALSE;
}
p = vm_page_lookup(object, pindex - i);
if (p) {
if (((p->queue - p->pc) == PQ_CACHE) ||
(p->flags & PG_BUSY) || p->busy) {
backward_okay = FALSE;
continue;
}
vm_page_test_dirty(p);
if ((p->dirty & p->valid) != 0 &&
(p->queue == PQ_INACTIVE) &&
(p->wire_count == 0) &&
(p->hold_count == 0)) {
mc[vm_pageout_page_count - i] = p;
pageout_count++;
page_base--;
if (pageout_count == vm_pageout_page_count)
break;
} else {
backward_okay = FALSE;
}
} else {
backward_okay = FALSE;
}
if ((p = vm_page_lookup(object, pindex - ib)) == NULL) {
ib = 0;
break;
}
if (((p->queue - p->pc) == PQ_CACHE) ||
(p->flags & PG_BUSY) || p->busy) {
ib = 0;
break;
}
vm_page_test_dirty(p);
if ((p->dirty & p->valid) == 0 ||
p->queue != PQ_INACTIVE ||
p->wire_count != 0 ||
p->hold_count != 0) {
ib = 0;
break;
}
mc[--page_base] = p;
++pageout_count;
++ib;
/*
* alignment boundary, stop here and switch directions.  Do
* not clear ib.
*/
if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
break;
}
while (pageout_count < vm_pageout_page_count &&
pindex + is < object->size) {
vm_page_t p;
if ((p = vm_page_lookup(object, pindex + is)) == NULL)
break;
if (((p->queue - p->pc) == PQ_CACHE) ||
(p->flags & PG_BUSY) || p->busy) {
break;
}
vm_page_test_dirty(p);
if ((p->dirty & p->valid) == 0 ||
p->queue != PQ_INACTIVE ||
p->wire_count != 0 ||
p->hold_count != 0) {
break;
}
mc[page_base + pageout_count] = p;
++pageout_count;
++is;
}
/*
* If we exhausted our forward scan, continue with the reverse scan
* when possible, even past a page boundary. This catches boundary
* conditions.
*/
if (ib && pageout_count < vm_pageout_page_count)
goto more;
/*
* we allow reads during pageouts...
*/
@@ -397,7 +397,7 @@ vm_pageout_flush(mc, count, flags)
* worked.
*/
pmap_clear_modify(VM_PAGE_TO_PHYS(mt));
mt->dirty = 0;
vm_page_undirty(mt);
break;
case VM_PAGER_ERROR:
case VM_PAGER_FAIL:
@@ -646,9 +646,7 @@ vm_pageout_scan()
* to the cache.
*/
page_shortage = (cnt.v_free_target + cnt.v_cache_min) -
(cnt.v_free_count + cnt.v_cache_count);
page_shortage += addl_page_shortage_init;
page_shortage = vm_paging_target() + addl_page_shortage_init;
/*
* Figure out what to do with dirty pages when they are encountered.
@@ -787,7 +785,7 @@ rescan0:
} else {
swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
(cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min);
vm_page_count_min());
}
@@ -1082,15 +1080,11 @@ rescan0:
* in a writeable object, wakeup the sync daemon. And kick swapout
* if we did not get enough free pages.
*/
if ((cnt.v_cache_count + cnt.v_free_count) <
(cnt.v_free_target + cnt.v_cache_min) ) {
if (vnodes_skipped &&
(cnt.v_cache_count + cnt.v_free_count) < cnt.v_free_min) {
if (vm_paging_target() > 0) {
if (vnodes_skipped && vm_page_count_min())
(void) speedup_syncer();
}
#if !defined(NO_SWAPPING)
if (vm_swap_enabled &&
(cnt.v_free_count + cnt.v_cache_count < cnt.v_free_target)) {
if (vm_swap_enabled && vm_page_count_target()) {
vm_req_vmdaemon();
vm_pageout_req_swapout |= VM_SWAP_NORMAL;
}
@@ -1101,8 +1095,7 @@ rescan0:
* make sure that we have swap space -- if we are low on memory and
* swap -- then kill the biggest process.
*/
if ((vm_swap_size == 0 || swap_pager_full) &&
((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min)) {
if ((vm_swap_size == 0 || swap_pager_full) && vm_page_count_min()) {
bigproc = NULL;
bigsize = 0;
for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
@@ -1160,8 +1153,10 @@ vm_pageout_page_stats()
static int fullintervalcount = 0;
int page_shortage;
page_shortage = (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
page_shortage =
(cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
(cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
if (page_shortage <= 0)
return;
@@ -1253,7 +1248,9 @@ vm_size_t count;
cnt.v_interrupt_free_min;
cnt.v_free_reserved = vm_pageout_page_count +
cnt.v_pageout_free_min + (count / 768) + PQ_L2_SIZE;
cnt.v_free_severe = cnt.v_free_min / 2;
cnt.v_free_min += cnt.v_free_reserved;
cnt.v_free_severe += cnt.v_free_reserved;
return 1;
}
@@ -1326,8 +1323,17 @@ vm_pageout()
while (TRUE) {
int error;
int s = splvm();
if (!vm_pages_needed ||
((cnt.v_free_count + cnt.v_cache_count) > cnt.v_free_min)) {
if (vm_pages_needed && vm_page_count_min()) {
/*
* Still not done, sleep a bit and go again
*/
vm_pages_needed = 0;
tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
} else {
/*
* Good enough, sleep & handle stats
*/
vm_pages_needed = 0;
error = tsleep(&vm_pages_needed,
PVM, "psleep", vm_pageout_stats_interval * hz);
@@ -1336,9 +1342,6 @@ vm_pageout()
vm_pageout_page_stats();
continue;
}
} else if (vm_pages_needed) {
vm_pages_needed = 0;
tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
}
if (vm_pages_needed)