diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c
index e316dd5cae0d..d7d4d8986f15 100644
--- a/sys/vm/vm_meter.c
+++ b/sys/vm/vm_meter.c
@@ -552,6 +552,9 @@ vm_domain_stats_init(struct vm_domain *vmd, struct sysctl_oid *parent)
 	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "free_severe",
 	    CTLFLAG_RD, &vmd->vmd_free_severe, 0, "Severe free pages");
+	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "inactive_pps", CTLFLAG_RD, &vmd->vmd_inactive_pps, 0,
+	    "inactive pages freed/second");
 }
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 593af4b5a2e4..036d0075ca7e 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -421,7 +421,7 @@ sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS)
  * In principle, this function only needs to set the flag PG_MARKER.
  * Nonetheless, it write busies the page as a safety precaution.
  */
-static void
+void
 vm_page_init_marker(vm_page_t marker, int queue, uint16_t aflags)
 {
@@ -2488,7 +2488,7 @@ vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags)
 	 * main purpose is to replenish the store of free pages.
 	 */
 	if (vmd->vmd_severeset || curproc == pageproc ||
-	    !_vm_domain_allocate(vmd, VM_ALLOC_NORMAL, cnt))
+	    !_vm_domain_allocate(vmd, VM_ALLOC_SYSTEM, cnt))
 		return (0);
 	domain = vmd->vmd_domain;
 	vm_domain_free_lock(vmd);
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 039e467491d0..a44e31f506d0 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -630,6 +630,7 @@ vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
 void vm_page_free_invalid(vm_page_t);
 vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr);
 void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
+void vm_page_init_marker(vm_page_t marker, int queue, uint16_t aflags);
 int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
 void vm_page_invalid(vm_page_t m);
 void vm_page_launder(vm_page_t m);
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index db2aa5f1c1cf..286528905eaa 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -82,6 +82,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include <sys/blockcount.h>
 #include
 #include
 #include
@@ -163,6 +164,12 @@ SYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
 SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
     CTLFLAG_RWTUN, &vm_pageout_update_period, 0,
     "Maximum active LRU update period");
+
+/* Access with get_pageout_threads_per_domain(). */
+static int pageout_threads_per_domain = 1;
+SYSCTL_INT(_vm, OID_AUTO, pageout_threads_per_domain, CTLFLAG_RDTUN,
+    &pageout_threads_per_domain, 0,
+    "Number of worker threads comprising each per-domain pagedaemon");
 SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0,
     "Low memory callback period");
@@ -1414,22 +1421,22 @@ vm_pageout_reinsert_inactive(struct scan_state *ss, struct vm_batchqueue *bq,
 	vm_batchqueue_init(bq);
 }
 
-/*
- * Attempt to reclaim the requested number of pages from the inactive queue.
- * Returns true if the shortage was addressed.
- */
-static int
-vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage,
-    int *addl_shortage)
+static void
+vm_pageout_scan_inactive(struct vm_domain *vmd, int page_shortage)
 {
+	struct timeval start, end;
 	struct scan_state ss;
 	struct vm_batchqueue rq;
+	struct vm_page marker_page;
 	vm_page_t m, marker;
 	struct vm_pagequeue *pq;
 	vm_object_t object;
 	vm_page_astate_t old, new;
-	int act_delta, addl_page_shortage, deficit, page_shortage, refs;
-	int starting_page_shortage;
+	int act_delta, addl_page_shortage, starting_page_shortage, refs;
+
+	object = NULL;
+	vm_batchqueue_init(&rq);
+	getmicrouptime(&start);
 
 	/*
 	 * The addl_page_shortage is an estimate of the number of temporarily
@@ -1439,25 +1446,15 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage,
 	 */
 	addl_page_shortage = 0;
 
-	/*
-	 * vmd_pageout_deficit counts the number of pages requested in
-	 * allocations that failed because of a free page shortage.  We assume
-	 * that the allocations will be reattempted and thus include the deficit
-	 * in our scan target.
-	 */
-	deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit);
-	starting_page_shortage = page_shortage = shortage + deficit;
-
-	object = NULL;
-	vm_batchqueue_init(&rq);
-
 	/*
 	 * Start scanning the inactive queue for pages that we can free.  The
 	 * scan will stop when we reach the target or we have scanned the
 	 * entire queue.  (Note that m->a.act_count is not used to make
 	 * decisions for the inactive queue, only for the active queue.)
 	 */
-	marker = &vmd->vmd_markers[PQ_INACTIVE];
+	starting_page_shortage = page_shortage;
+	marker = &marker_page;
+	vm_page_init_marker(marker, PQ_INACTIVE, 0);
 	pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
 	vm_pagequeue_lock(pq);
 	vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
@@ -1637,7 +1634,97 @@ vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage,
 	vm_pageout_end_scan(&ss);
 	vm_pagequeue_unlock(pq);
 
-	VM_CNT_ADD(v_dfree, starting_page_shortage - page_shortage);
+	/*
+	 * Record the remaining shortage and the progress and rate at which it
+	 * was made.
+	 */
+	atomic_add_int(&vmd->vmd_addl_shortage, addl_page_shortage);
+	getmicrouptime(&end);
+	timevalsub(&end, &start);
+	atomic_add_int(&vmd->vmd_inactive_us,
+	    end.tv_sec * 1000000 + end.tv_usec);
+	atomic_add_int(&vmd->vmd_inactive_freed,
+	    starting_page_shortage - page_shortage);
+}
+
+/*
+ * Dispatch a number of inactive threads according to load and collect the
+ * results to present a coherent view of paging activity on this domain.
+ */
+static int
+vm_pageout_inactive_dispatch(struct vm_domain *vmd, int shortage)
+{
+	u_int freed, pps, threads, us;
+
+	vmd->vmd_inactive_shortage = shortage;
+
+	/*
+	 * If we have more work than we can do in a quarter of our interval, we
+	 * fire off multiple threads to process it.
+	 */
+	if (vmd->vmd_inactive_threads > 1 && vmd->vmd_inactive_pps != 0 &&
+	    shortage > vmd->vmd_inactive_pps / VM_INACT_SCAN_RATE / 4) {
+		threads = vmd->vmd_inactive_threads;
+		vm_domain_pageout_lock(vmd);
+		vmd->vmd_inactive_shortage /= threads;
+		blockcount_acquire(&vmd->vmd_inactive_starting, threads - 1);
+		blockcount_acquire(&vmd->vmd_inactive_running, threads - 1);
+		wakeup(&vmd->vmd_inactive_shortage);
+		vm_domain_pageout_unlock(vmd);
+	}
+
+	/* Run the local thread scan. */
+	vm_pageout_scan_inactive(vmd, vmd->vmd_inactive_shortage);
+
+	/*
+	 * Block until helper threads report results and then accumulate
+	 * totals.
+	 */
+	blockcount_wait(&vmd->vmd_inactive_running, NULL, "vmpoid", PVM);
+	freed = atomic_readandclear_int(&vmd->vmd_inactive_freed);
+	VM_CNT_ADD(v_dfree, freed);
+
+	/*
+	 * Calculate the per-thread paging rate with an exponential decay of
+	 * prior results.  Careful to avoid integer rounding errors with large
+	 * us values.
+	 */
+	us = max(atomic_readandclear_int(&vmd->vmd_inactive_us), 1);
+	if (us > 1000000)
+		/* Keep rounding to tenths. */
+		pps = (freed * 10) / ((us * 10) / 1000000);
+	else
+		pps = (1000000 / us) * freed;
+	vmd->vmd_inactive_pps = (vmd->vmd_inactive_pps / 2) + (pps / 2);
+
+	return (shortage - freed);
+}
+
+/*
+ * Attempt to reclaim the requested number of pages from the inactive queue.
+ * Returns true if the shortage was addressed.
+ */
+static int
+vm_pageout_inactive(struct vm_domain *vmd, int shortage, int *addl_shortage)
+{
+	struct vm_pagequeue *pq;
+	u_int addl_page_shortage, deficit, page_shortage;
+	u_int starting_page_shortage;
+
+	/*
+	 * vmd_pageout_deficit counts the number of pages requested in
+	 * allocations that failed because of a free page shortage.  We assume
+	 * that the allocations will be reattempted and thus include the deficit
+	 * in our scan target.
+	 */
+	deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit);
+	starting_page_shortage = shortage + deficit;
+
+	/*
+	 * Run the inactive scan on as many threads as is necessary.
+	 */
+	page_shortage = vm_pageout_inactive_dispatch(vmd, starting_page_shortage);
+	addl_page_shortage = atomic_readandclear_int(&vmd->vmd_addl_shortage);
 
 	/*
 	 * Wake up the laundry thread so that it can perform any needed
@@ -2066,7 +2153,7 @@ vm_pageout_worker(void *arg)
 			if (vm_pageout_lowmem() && vmd->vmd_free_count > ofree)
 				shortage -= min(vmd->vmd_free_count - ofree,
 				    (u_int)shortage);
-			target_met = vm_pageout_scan_inactive(vmd, shortage,
+			target_met = vm_pageout_inactive(vmd, shortage,
 			    &addl_shortage);
 		} else
 			addl_shortage = 0;
@@ -2081,6 +2168,72 @@ vm_pageout_worker(void *arg)
 	}
 }
 
+/*
+ * vm_pageout_helper runs additional pageout daemons in times of high paging
+ * activity.
+ */
+static void
+vm_pageout_helper(void *arg)
+{
+	struct vm_domain *vmd;
+	int domain;
+
+	domain = (uintptr_t)arg;
+	vmd = VM_DOMAIN(domain);
+
+	vm_domain_pageout_lock(vmd);
+	for (;;) {
+		msleep(&vmd->vmd_inactive_shortage,
+		    vm_domain_pageout_lockptr(vmd), PVM, "psleep", 0);
+		blockcount_release(&vmd->vmd_inactive_starting, 1);
+
+		vm_domain_pageout_unlock(vmd);
+		vm_pageout_scan_inactive(vmd, vmd->vmd_inactive_shortage);
+		vm_domain_pageout_lock(vmd);
+
+		/*
+		 * Release the running count while the pageout lock is held to
+		 * prevent wakeup races.
+		 */
+		blockcount_release(&vmd->vmd_inactive_running, 1);
+	}
+}
+
+static int
+get_pageout_threads_per_domain(void)
+{
+	static bool resolved = false;
+	int half_cpus_per_dom;
+
+	/*
+	 * This is serialized externally by the sorted autoconfig portion of
+	 * boot.
+	 */
+	if (__predict_true(resolved))
+		return (pageout_threads_per_domain);
+
+	/*
+	 * Semi-arbitrarily constrain pagedaemon threads to less than half the
+	 * total number of threads in the system as an insane upper limit.
+	 */
+	half_cpus_per_dom = (mp_ncpus / vm_ndomains) / 2;
+
+	if (pageout_threads_per_domain < 1) {
+		printf("Invalid tuneable vm.pageout_threads_per_domain value: "
+		    "%d out of valid range: [1-%d]; clamping to 1\n",
+		    pageout_threads_per_domain, half_cpus_per_dom);
+		pageout_threads_per_domain = 1;
+	} else if (pageout_threads_per_domain > half_cpus_per_dom) {
+		printf("Invalid tuneable vm.pageout_threads_per_domain value: "
+		    "%d out of valid range: [1-%d]; clamping to %d\n",
+		    pageout_threads_per_domain, half_cpus_per_dom,
+		    half_cpus_per_dom);
+		pageout_threads_per_domain = half_cpus_per_dom;
+	}
+	resolved = true;
+	return (pageout_threads_per_domain);
+}
+
 /*
  * Initialize basic pageout daemon settings.  See the comment above the
  * definition of vm_domain for some explanation of how these thresholds are
@@ -2134,6 +2287,8 @@ vm_pageout_init_domain(int domain)
 	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO,
 	    "pidctrl", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 	pidctrl_init_sysctl(&vmd->vmd_pid, SYSCTL_CHILDREN(oid));
+
+	vmd->vmd_inactive_threads = get_pageout_threads_per_domain();
 }
 
 static void
@@ -2184,10 +2339,11 @@ vm_pageout(void)
 {
 	struct proc *p;
 	struct thread *td;
-	int error, first, i;
+	int error, first, i, j, pageout_threads;
 
 	p = curproc;
 	td = curthread;
+	pageout_threads = get_pageout_threads_per_domain();
 	mtx_init(&vm_oom_ratelim_mtx, "vmoomr", NULL, MTX_DEF);
 	swap_pager_swap_init();
@@ -2207,6 +2363,14 @@ vm_pageout(void)
 			panic("starting pageout for domain %d: %d\n",
 			    i, error);
 		}
+		for (j = 0; j < pageout_threads - 1; j++) {
+			error = kthread_add(vm_pageout_helper,
+			    (void *)(uintptr_t)i, p, NULL, 0, 0,
+			    "dom%d helper%d", i, j);
+			if (error != 0)
+				panic("starting pageout helper %d for domain "
+				    "%d: %d\n", j, i, error);
+		}
 		error = kthread_add(vm_pageout_laundry_worker,
 		    (void *)(uintptr_t)i, p, NULL, 0, 0, "laundry: dom%d", i);
 		if (error != 0)
diff --git a/sys/vm/vm_pagequeue.h b/sys/vm/vm_pagequeue.h
index 0573457db675..0e5d1c911b8d 100644
--- a/sys/vm/vm_pagequeue.h
+++ b/sys/vm/vm_pagequeue.h
@@ -84,6 +84,7 @@ struct vm_batchqueue {
 } __aligned(CACHE_LINE_SIZE);
 
 #include
+#include <sys/_blockcount.h>
 #include
 
 struct sysctl_oid;
@@ -254,6 +255,14 @@ struct vm_domain {
 	/* Paging control variables, used within single threaded page daemon. */
 	struct pidctrl vmd_pid;		/* Pageout controller. */
 	boolean_t vmd_oom;
+	u_int vmd_inactive_threads;
+	u_int vmd_inactive_shortage;	/* Per-thread shortage. */
+	blockcount_t vmd_inactive_running; /* Number of inactive threads. */
+	blockcount_t vmd_inactive_starting; /* Number of threads started. */
+	volatile u_int vmd_addl_shortage; /* Shortage accumulator. */
+	volatile u_int vmd_inactive_freed; /* Successful inactive frees. */
+	volatile u_int vmd_inactive_us;	/* Microseconds for above. */
+	u_int vmd_inactive_pps;		/* Exponential decay frees/second. */
 	int vmd_oom_seq;
 	int vmd_last_active_scan;
 	struct vm_page vmd_markers[PQ_COUNT]; /* (q) markers for queue scans */
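
The behaviour added by this patch is configured and observed entirely through sysctl: vm.pageout_threads_per_domain is a boot-time tunable (CTLFLAG_RDTUN, typically set in loader.conf), and each domain exports its decayed paging rate as the inactive_pps statistic created in vm_meter.c. The userland sketch below is illustrative only and is not part of the patch; it reads both values with sysctlbyname(3) and assumes the conventional vm.domain.0.stats path for NUMA domain 0.

/*
 * Illustrative sketch (not part of the patch): query the new pageout
 * tunable and the per-domain inactive paging rate.  The
 * "vm.domain.0.stats.inactive_pps" OID path is an assumption based on the
 * default per-domain sysctl layout.
 */
#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
	u_int pps;
	int threads;
	size_t len;

	/* Boot-time tunable: worker threads per per-domain pagedaemon. */
	len = sizeof(threads);
	if (sysctlbyname("vm.pageout_threads_per_domain", &threads, &len,
	    NULL, 0) != 0)
		err(1, "vm.pageout_threads_per_domain");

	/* Decayed estimate of inactive pages freed per second, domain 0. */
	len = sizeof(pps);
	if (sysctlbyname("vm.domain.0.stats.inactive_pps", &pps, &len,
	    NULL, 0) != 0)
		err(1, "vm.domain.0.stats.inactive_pps");

	printf("%d pageout thread(s) per domain; domain 0 recently freed "
	    "%u inactive pages/second\n", threads, pps);
	return (0);
}

Note that even with the tunable set above one, vm_pageout_inactive_dispatch() only wakes the extra helpers when the shortage exceeds what a single thread has been freeing in a quarter of a scan interval (vmd_inactive_pps / VM_INACT_SCAN_RATE / 4), so light paging loads continue to be handled by a single thread per domain.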