Rework the test which raises OOM condition. Right now, the code

checks for the swap space consumption plus checks that the amount of the free pages exceeds some limit, in case pagedeamon did not coped with the page shortage in one of the late passes. This is wrong because it does not account for the presence of the reclamaible pages in the queues which are not selectable for reclaim immediately. E.g., on the swap-less systems, large active queue easily triggered OOM. Instead, only raise OOM when pagedaemon is unable to produce a free page in several back-to-back passes. Track the failed passes per pagedaemon thread. The number of passes to trigger OOM was selected empirically and tested both on small (32M-64M i386 VM) and large (32G amd64) configurations. If the specifics of the load require tuning, sysctl vm.pageout_oom_seq sets the number of back-to-back passes which must fail before OOM is raised. Each pass takes 1/2 of seconds. Less the value, more sensible the pagedaemon is to the page shortage. In future, some heuristic to calculate the value of the tunable might be designed based on the system configuration and load. But before it can be done, the i/o system must be fixed to reliably time-out pagedaemon writes, even if waiting for the memory to proceed. Then, code can account for the in-flight page-outs and postpone OOM until all of them finished, which should reduce the need in tuning. Right now, ignoring the in-flight writes and the counter allows to break deadlocks due to write path doing sleepable memory allocations. Reported by: Dmitry Sivachenko, bde, many others Tested by: pho, bde, tuexen (arm) Reviewed by: alc Discussed with: bde, imp Sponsored by: The FreeBSD Foundation MFC after: 3 weeks
svn path=/head/; revision=290920
2015-11-16 06:26:26 +00:00 · 2015-11-16 06:26:26 +00:00 · 76386c7ecd · 2020-12-20 02:59:44 +00:00
commit 76386c7ecd
parent 650dae4419
2 changed files with 31 additions and 14 deletions
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@ -227,6 +227,7 @@ struct vm_domain {
 	long vmd_segs;	/* bitmask of the segments */
 	boolean_t vmd_oom;
 	int vmd_pass;	/* local pagedaemon pass */
+	int vmd_oom_seq;
 	int vmd_last_active_scan;
 	struct vm_page vmd_marker; /* marker for pagedaemon private use */
 	struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@ -122,7 +122,8 @@ static void vm_pageout_init(void);
 static int vm_pageout_clean(vm_page_t m);
 static int vm_pageout_cluster(vm_page_t m);
 static void vm_pageout_scan(struct vm_domain *vmd, int pass);
-static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass);
+static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
+    int starting_page_shortage);

 SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
    NULL);
@ -158,6 +159,7 @@ SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
 int vm_pages_needed;		/* Event on which pageout daemon sleeps */
 int vm_pageout_deficit;		/* Estimated number of pages deficit */
 int vm_pageout_wakeup_thresh;
+static int vm_pageout_oom_seq = 12;

 #if !defined(NO_SWAPPING)
 static int vm_pageout_req_swapout;	/* XXX */
@ -223,6 +225,10 @@ static int pageout_lock_miss;
 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
 	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");

+SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
+	CTLFLAG_RW, &vm_pageout_oom_seq, 0,
+	"back-to-back calls to oom detector to start OOM");
+
 #define VM_PAGEOUT_PAGE_COUNT 16
 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;

@ -1041,7 +1047,8 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
 	vm_object_t object;
 	long min_scan;
 	int act_delta, addl_page_shortage, deficit, error, maxlaunder, maxscan;
-	int page_shortage, scan_tick, scanned, vnodes_skipped;
+	int page_shortage, scan_tick, scanned, starting_page_shortage;
+	int vnodes_skipped;
 	boolean_t pageout_ok, queues_locked;

 	/*
@ -1080,6 +1087,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
 		page_shortage = vm_paging_target() + deficit;
 	} else
 		page_shortage = deficit = 0;
+	starting_page_shortage = page_shortage;

 	/*
 	 * maxlaunder limits the number of dirty pages we flush per scan.
@ -1342,6 +1350,12 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
 	    vm_cnt.v_free_min)
 		(void)speedup_syncer();

+	/*
+	 * If the inactive queue scan fails repeatedly to meet its
+	 * target, kill the largest process.
+	 */
+	vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage);
+
 	/*
 	 * Compute the number of pages we want to try to move from the
 	 * active queue to the inactive queue.
@ -1453,15 +1467,6 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
 		}
 	}
 #endif
-
-	/*
-	 * If we are critically low on one of RAM or swap and low on
-	 * the other, kill the largest process.  However, we avoid
-	 * doing this on the first pass in order to give ourselves a
-	 * chance to flush out dirty vnode-backed pages and to allow
-	 * active pages to be moved to the inactive queue and reclaimed.
-	 */
-	vm_pageout_mightbe_oom(vmd, pass);
 }

 static int vm_pageout_oom_vote;
@ -1472,12 +1477,17 @@ static int vm_pageout_oom_vote;
 * failed to reach free target is premature.
 */
 static void
-vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass)
+vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
+    int starting_page_shortage)
 {
 	int old_vote;

-	if (pass <= 1 || !((swap_pager_avail < 64 && vm_page_count_min()) ||
-	    (swap_pager_full && vm_paging_target() > 0))) {
+	if (starting_page_shortage <= 0 || starting_page_shortage !=
+	    page_shortage)
+		vmd->vmd_oom_seq = 0;
+	else
+		vmd->vmd_oom_seq++;
+	if (vmd->vmd_oom_seq < vm_pageout_oom_seq) {
 		if (vmd->vmd_oom) {
 			vmd->vmd_oom = FALSE;
 			atomic_subtract_int(&vm_pageout_oom_vote, 1);
@ -1485,6 +1495,12 @@ vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass)
 		return;
 	}

+	/*
+	 * Do not follow the call sequence until OOM condition is
+	 * cleared.
+	 */
+	vmd->vmd_oom_seq = 0;
+
 	if (vmd->vmd_oom)
 		return;