Rework the test which raises OOM condition. Right now, the code

checks for the swap space consumption plus checks that the amount of
the free pages exceeds some limit, in case pagedaemon did not cope
with the page shortage in one of the late passes.  This is wrong
because it does not account for the presence of the reclaimable pages
in the queues which are not selectable for reclaim immediately.  E.g.,
on swap-less systems, a large active queue easily triggered OOM.

Instead, only raise OOM when pagedaemon is unable to produce a free
page in several back-to-back passes.  Track the failed passes per
pagedaemon thread.

The number of passes to trigger OOM was selected empirically and
tested both on small (32M-64M i386 VM) and large (32G amd64)
configurations.  If the specifics of the load require tuning, sysctl
vm.pageout_oom_seq sets the number of back-to-back passes which must
fail before OOM is raised.  Each pass takes 1/2 of a second.  The lower
the value, the more sensitive the pagedaemon is to the page shortage.

In future, some heuristic to calculate the value of the tunable might
be designed based on the system configuration and load.  But before it
can be done, the i/o system must be fixed to reliably time-out
pagedaemon writes, even if waiting for the memory to proceed.  Then,
code can account for the in-flight page-outs and postpone OOM until
all of them have finished, which should reduce the need for tuning.
Right now, ignoring the in-flight writes and relying on the counter
allows breaking deadlocks due to write path doing sleepable memory
allocations.

Reported by:	Dmitry Sivachenko, bde, many others
Tested by:	pho, bde, tuexen (arm)
Reviewed by:	alc
Discussed with:	bde, imp
Sponsored by:	The FreeBSD Foundation
MFC after:	3 weeks
This commit is contained in:
Konstantin Belousov 2015-11-16 06:26:26 +00:00
parent 650dae4419
commit 76386c7ecd
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=290920
2 changed files with 31 additions and 14 deletions

View File

@ -227,6 +227,7 @@ struct vm_domain {
long vmd_segs; /* bitmask of the segments */
boolean_t vmd_oom;
int vmd_pass; /* local pagedaemon pass */
int vmd_oom_seq;
int vmd_last_active_scan;
struct vm_page vmd_marker; /* marker for pagedaemon private use */
struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */

View File

@ -122,7 +122,8 @@ static void vm_pageout_init(void);
static int vm_pageout_clean(vm_page_t m);
static int vm_pageout_cluster(vm_page_t m);
static void vm_pageout_scan(struct vm_domain *vmd, int pass);
static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass);
static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
int starting_page_shortage);
SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
NULL);
@ -158,6 +159,7 @@ SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
int vm_pages_needed; /* Event on which pageout daemon sleeps */
int vm_pageout_deficit; /* Estimated number of pages deficit */
int vm_pageout_wakeup_thresh;
static int vm_pageout_oom_seq = 12;
#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout; /* XXX */
@ -223,6 +225,10 @@ static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
CTLFLAG_RW, &vm_pageout_oom_seq, 0,
"back-to-back calls to oom detector to start OOM");
#define VM_PAGEOUT_PAGE_COUNT 16
int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
@ -1041,7 +1047,8 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
vm_object_t object;
long min_scan;
int act_delta, addl_page_shortage, deficit, error, maxlaunder, maxscan;
int page_shortage, scan_tick, scanned, vnodes_skipped;
int page_shortage, scan_tick, scanned, starting_page_shortage;
int vnodes_skipped;
boolean_t pageout_ok, queues_locked;
/*
@ -1080,6 +1087,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
page_shortage = vm_paging_target() + deficit;
} else
page_shortage = deficit = 0;
starting_page_shortage = page_shortage;
/*
* maxlaunder limits the number of dirty pages we flush per scan.
@ -1342,6 +1350,12 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
vm_cnt.v_free_min)
(void)speedup_syncer();
/*
* If the inactive queue scan fails repeatedly to meet its
* target, kill the largest process.
*/
vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage);
/*
* Compute the number of pages we want to try to move from the
* active queue to the inactive queue.
@ -1453,15 +1467,6 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
}
}
#endif
/*
* If we are critically low on one of RAM or swap and low on
* the other, kill the largest process. However, we avoid
* doing this on the first pass in order to give ourselves a
* chance to flush out dirty vnode-backed pages and to allow
* active pages to be moved to the inactive queue and reclaimed.
*/
vm_pageout_mightbe_oom(vmd, pass);
}
static int vm_pageout_oom_vote;
@ -1472,12 +1477,17 @@ static int vm_pageout_oom_vote;
* failed to reach free target is premature.
*/
static void
vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass)
vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
int starting_page_shortage)
{
int old_vote;
if (pass <= 1 || !((swap_pager_avail < 64 && vm_page_count_min()) ||
(swap_pager_full && vm_paging_target() > 0))) {
if (starting_page_shortage <= 0 || starting_page_shortage !=
page_shortage)
vmd->vmd_oom_seq = 0;
else
vmd->vmd_oom_seq++;
if (vmd->vmd_oom_seq < vm_pageout_oom_seq) {
if (vmd->vmd_oom) {
vmd->vmd_oom = FALSE;
atomic_subtract_int(&vm_pageout_oom_vote, 1);
@ -1485,6 +1495,12 @@ vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass)
return;
}
/*
* Do not follow the call sequence until OOM condition is
* cleared.
*/
vmd->vmd_oom_seq = 0;
if (vmd->vmd_oom)
return;