Eliminate pageout wakeup races.  Take another step towards lockless
vmd_free_count manipulation.  Reduce the scope of the free lock by
using a pageout lock to synchronize sleep and wakeup.  Only trigger
the pageout daemon on transitions between states.  Drive all wakeup
operations directly as side-effects from freeing memory rather than
requiring an additional function call.

Reviewed by:	markj, kib
Tested by:	pho
Sponsored by:	Netflix, Dell/EMC Isilon
Differential Revision:	https://reviews.freebsd.org/D14612
Author:	Jeff Roberson
Date:	2018-03-15 19:23:07 +00:00
Parent:	d09fcbd30e
Commit:	30fbfdda6c
Notes:	svn2git 2020-12-20 02:59:44 +00:00
	svn path=/head/; revision=331018
4 changed files with 151 additions and 118 deletions
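
In miniature, the scheme the log describes looks like the following userspace
sketch, assuming C11 atomics in place of the kernel's atomic_fetchadd_int();
the names free_count, free_min, daemon_wakeup, and freecnt_dec are
illustrative stand-ins, not the kernel symbols.  A free-count update computes
the old and new values from a single atomic op and issues a wakeup only when
the count crosses a threshold, so the wakeup is a side effect of the update
itself and fires once per transition rather than once per call.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned free_count = 1024;
static unsigned free_min = 256;

/* Stand-in for pagedaemon_wakeup(). */
static void
daemon_wakeup(void)
{
	printf("wakeup: free count crossed below free_min\n");
}

/* Decrement without a lock; wake only on the downward transition. */
static void
freecnt_dec(unsigned adj)
{
	unsigned old, new;

	old = atomic_fetch_sub(&free_count, adj);
	new = old - adj;
	if (old >= free_min && new < free_min)
		daemon_wakeup();	/* fires once per crossing, not per call */
}

int
main(void)
{
	for (int i = 0; i < 16; i++)
		freecnt_dec(64);	/* exactly one call triggers the wakeup */
	return (0);
}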

sys/vm/vm_page.c

@ -139,14 +139,15 @@ extern int vmem_startup_count(void);
struct vm_domain vm_dom[MAXMEMDOM];
struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT];
struct mtx_padalign __exclusive_cache_line vm_domainset_lock;
/* The following fields are protected by the domainset lock. */
domainset_t __exclusive_cache_line vm_min_domains;
domainset_t __exclusive_cache_line vm_severe_domains;
static int vm_min_waiters;
static int vm_severe_waiters;
static int vm_pageproc_waiters;
/*
* bogus page -- for I/O to/from partially complete buffers,
* or for paging into sparsely invalid regions.
@ -184,7 +185,6 @@ static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
vm_page_t mpred);
static int vm_page_reclaim_run(int req_class, int domain, u_long npages,
vm_page_t m_run, vm_paddr_t high);
static void vm_domain_free_wakeup(struct vm_domain *);
static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object,
int req);
@ -430,6 +430,7 @@ vm_page_domain_init(int domain)
MTX_DEF | MTX_DUPOK);
}
mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF);
mtx_init(&vmd->vmd_pageout_mtx, "vm pageout lock", NULL, MTX_DEF);
snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain);
}
@ -731,8 +732,8 @@ vm_page_startup(vm_offset_t vaddr)
vmd = VM_DOMAIN(seg->domain);
vm_domain_free_lock(vmd);
vm_phys_free_contig(m, pagecount);
vm_domain_freecnt_adj(vmd, (int)pagecount);
vm_domain_free_unlock(vmd);
vm_domain_freecnt_inc(vmd, pagecount);
vm_cnt.v_page_count += (u_int)pagecount;
vmd = VM_DOMAIN(seg->domain);
@ -1694,7 +1695,6 @@ vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain,
struct vm_domain *vmd;
vm_page_t m;
int flags;
u_int free_count;
KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
(object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
@ -1747,6 +1747,9 @@ vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain,
#endif
}
}
if (m != NULL)
vm_domain_freecnt_dec(vmd, 1);
vm_domain_free_unlock(vmd);
if (m == NULL) {
/*
* Not allocatable, give up.
@ -1760,15 +1763,7 @@ vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain,
* At this point we had better have found a good page.
*/
KASSERT(m != NULL, ("missing page"));
free_count = vm_domain_freecnt_adj(vmd, -1);
vm_domain_free_unlock(vmd);
/*
* Don't wakeup too often - wakeup the pageout daemon when
* we would be nearly out of memory.
*/
if (vm_paging_needed(vmd, free_count))
pagedaemon_wakeup(vmd->vmd_domain);
#if VM_NRESERVLEVEL > 0
found:
#endif
@ -1804,7 +1799,6 @@ vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain,
if (object != NULL) {
if (vm_page_insert_after(m, object, pindex, mpred)) {
pagedaemon_wakeup(domain);
if (req & VM_ALLOC_WIRED) {
vm_wire_sub(1);
m->wire_count = 0;
@ -1961,13 +1955,14 @@ vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain,
goto retry;
#endif
}
if (m_ret != NULL)
vm_domain_freecnt_dec(vmd, npages);
vm_domain_free_unlock(vmd);
if (m_ret == NULL) {
if (vm_domain_alloc_fail(vmd, object, req))
goto again;
return (NULL);
}
vm_domain_freecnt_adj(vmd, -npages);
vm_domain_free_unlock(vmd);
#if VM_NRESERVLEVEL > 0
found:
#endif
@ -2006,7 +2001,6 @@ vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain,
m->oflags = oflags;
if (object != NULL) {
if (vm_page_insert_after(m, object, pindex, mpred)) {
pagedaemon_wakeup(domain);
if ((req & VM_ALLOC_WIRED) != 0)
vm_wire_sub(npages);
KASSERT(m->object == NULL,
@ -2035,9 +2029,6 @@ vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain,
pmap_page_set_memattr(m, memattr);
pindex++;
}
vmd = VM_DOMAIN(domain);
if (vm_paging_needed(vmd, vmd->vmd_free_count))
pagedaemon_wakeup(domain);
return (m_ret);
}
@ -2100,7 +2091,7 @@ vm_page_alloc_freelist_domain(int domain, int freelist, int req)
{
struct vm_domain *vmd;
vm_page_t m;
u_int flags, free_count;
u_int flags;
/*
* Do not allocate reserved pages unless the req has asked for it.
@ -2111,13 +2102,14 @@ vm_page_alloc_freelist_domain(int domain, int freelist, int req)
if (vm_domain_available(vmd, req, 1))
m = vm_phys_alloc_freelist_pages(domain, freelist,
VM_FREEPOOL_DIRECT, 0);
if (m != NULL)
vm_domain_freecnt_dec(vmd, 1);
vm_domain_free_unlock(vmd);
if (m == NULL) {
if (vm_domain_alloc_fail(vmd, NULL, req))
goto again;
return (NULL);
}
free_count = vm_domain_freecnt_adj(vmd, -1);
vm_domain_free_unlock(vmd);
vm_page_alloc_check(m);
/*
@ -2138,8 +2130,6 @@ vm_page_alloc_freelist_domain(int domain, int freelist, int req)
}
/* Unmanaged pages don't use "act_count". */
m->oflags = VPO_UNMANAGED;
if (vm_paging_needed(vmd, free_count))
pagedaemon_wakeup(domain);
return (m);
}
@ -2539,15 +2529,19 @@ vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run,
if (m_mtx != NULL)
mtx_unlock(m_mtx);
if ((m = SLIST_FIRST(&free)) != NULL) {
int cnt;
vmd = VM_DOMAIN(domain);
cnt = 0;
vm_domain_free_lock(vmd);
do {
MPASS(vm_phys_domain(m) == domain);
SLIST_REMOVE_HEAD(&free, plinks.s.ss);
vm_page_free_phys(vmd, m);
cnt++;
} while ((m = SLIST_FIRST(&free)) != NULL);
vm_domain_free_wakeup(vmd);
vm_domain_free_unlock(vmd);
vm_domain_freecnt_inc(vmd, cnt);
}
return (error);
}
@ -2710,7 +2704,7 @@ vm_domain_set(struct vm_domain *vmd)
/*
* Clear the domain from the appropriate page level domainset.
*/
static void
void
vm_domain_clear(struct vm_domain *vmd)
{
@ -2731,6 +2725,22 @@ vm_domain_clear(struct vm_domain *vmd)
wakeup(&vm_severe_domains);
}
}
/*
* If pageout daemon needs pages, then tell it that there are
* some free.
*/
if (vmd->vmd_pageout_pages_needed &&
vmd->vmd_free_count >= vmd->vmd_pageout_free_min) {
wakeup(&vmd->vmd_pageout_pages_needed);
vmd->vmd_pageout_pages_needed = 0;
}
/* See comments in vm_wait_doms(). */
if (vm_pageproc_waiters) {
vm_pageproc_waiters = 0;
wakeup(&vm_pageproc_waiters);
}
mtx_unlock(&vm_domainset_lock);
}
@ -2769,7 +2779,7 @@ u_int
vm_wait_count(void)
{
return (vm_severe_waiters + vm_min_waiters);
return (vm_severe_waiters + vm_min_waiters + vm_pageproc_waiters);
}
static void
@ -2787,9 +2797,8 @@ vm_wait_doms(const domainset_t *wdoms)
if (curproc == pageproc) {
mtx_lock(&vm_domainset_lock);
vm_pageproc_waiters++;
msleep(&vm_pageproc_waiters, &vm_domainset_lock, PVM,
msleep(&vm_pageproc_waiters, &vm_domainset_lock, PVM | PDROP,
"pageprocwait", 1);
mtx_unlock(&vm_domainset_lock);
} else {
/*
* XXX Ideally we would wait only until the allocation could
@ -2819,14 +2828,17 @@ vm_wait_domain(int domain)
domainset_t wdom;
vmd = VM_DOMAIN(domain);
vm_domain_free_assert_locked(vmd);
vm_domain_free_assert_unlocked(vmd);
if (curproc == pageproc) {
vmd->vmd_pageout_pages_needed = 1;
msleep(&vmd->vmd_pageout_pages_needed,
vm_domain_free_lockptr(vmd), PDROP | PSWP, "VMWait", 0);
mtx_lock(&vm_domainset_lock);
if (vmd->vmd_free_count < vmd->vmd_pageout_free_min) {
vmd->vmd_pageout_pages_needed = 1;
msleep(&vmd->vmd_pageout_pages_needed,
&vm_domainset_lock, PDROP | PSWP, "VMWait", 0);
} else
mtx_unlock(&vm_domainset_lock);
} else {
vm_domain_free_unlock(vmd);
if (pageproc == NULL)
panic("vm_wait in early boot");
DOMAINSET_ZERO(&wdom);
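
The reworked vm_wait_domain() above follows the classic lost-wakeup
discipline: re-check the predicate under the same lock the waker holds
(vm_domainset_lock) before sleeping.  Below is a hedged pthreads analogue of
that pattern; the names are hypothetical stand-ins for msleep()/wakeup() and
the kernel fields, not the kernel API.

#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t domainset_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t pages_needed_cv = PTHREAD_COND_INITIALIZER;
static unsigned free_count;		/* protected by domainset_lock here */
static unsigned pageout_free_min = 64;
static int pageout_pages_needed;

static void
wait_for_pages(void)
{
	pthread_mutex_lock(&domainset_lock);
	/*
	 * Re-check under the lock the waker holds: if pages were freed
	 * between an unlocked check and here, no wakeup can be lost.
	 */
	while (free_count < pageout_free_min) {
		pageout_pages_needed = 1;
		pthread_cond_wait(&pages_needed_cv, &domainset_lock);
	}
	pthread_mutex_unlock(&domainset_lock);
}

static void
pages_freed(unsigned n)
{
	pthread_mutex_lock(&domainset_lock);
	free_count += n;
	if (pageout_pages_needed && free_count >= pageout_free_min) {
		pageout_pages_needed = 0;
		pthread_cond_broadcast(&pages_needed_cv);
	}
	pthread_mutex_unlock(&domainset_lock);
}

static void *
waiter(void *arg)
{
	(void)arg;
	wait_for_pages();
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	sleep(1);
	pages_freed(128);	/* crosses pageout_free_min; wakes the waiter */
	pthread_join(t, NULL);
	return (0);
}
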
@ -2876,7 +2888,7 @@ static int
vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req)
{
vm_domain_free_assert_locked(vmd);
vm_domain_free_assert_unlocked(vmd);
atomic_add_int(&vmd->vmd_pageout_deficit,
max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
@ -2888,10 +2900,8 @@ vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req)
VM_OBJECT_WLOCK(object);
if (req & VM_ALLOC_WAITOK)
return (EAGAIN);
} else {
vm_domain_free_unlock(vmd);
pagedaemon_wakeup(vmd->vmd_domain);
}
return (0);
}
@ -3062,46 +3072,6 @@ vm_page_activate(vm_page_t m)
}
}
/*
* vm_domain_free_wakeup:
*
* Helper routine for vm_page_free_toq(). This routine is called
* when a page is added to the free queues.
*
* The page queues must be locked.
*/
static void
vm_domain_free_wakeup(struct vm_domain *vmd)
{
vm_domain_free_assert_locked(vmd);
/*
* if pageout daemon needs pages, then tell it that there are
* some free.
*/
if (vmd->vmd_pageout_pages_needed &&
vmd->vmd_free_count >= vmd->vmd_pageout_free_min) {
wakeup(&vmd->vmd_pageout_pages_needed);
vmd->vmd_pageout_pages_needed = 0;
}
/*
* wakeup processes that are waiting on memory if we hit a
* high water mark. And wakeup scheduler process if we have
* lots of memory. this process will swapin processes.
*/
if ((vmd->vmd_minset && !vm_paging_min(vmd)) ||
(vmd->vmd_severeset && !vm_paging_severe(vmd)))
vm_domain_clear(vmd);
/* See comments in vm_wait(); */
if (vm_pageproc_waiters) {
vm_pageproc_waiters = 0;
wakeup(&vm_pageproc_waiters);
}
}
/*
* vm_page_free_prep:
*
@ -3183,7 +3153,8 @@ vm_page_free_prep(vm_page_t m, bool pagequeue_locked)
/*
* Insert the page into the physical memory allocator's free page
* queues. This is the last step to free a page.
* queues. This is the last step to free a page. The caller is
* responsible for adjusting the free page count.
*/
static void
vm_page_free_phys(struct vm_domain *vmd, vm_page_t m)
@ -3191,7 +3162,6 @@ vm_page_free_phys(struct vm_domain *vmd, vm_page_t m)
vm_domain_free_assert_locked(vmd);
vm_domain_freecnt_adj(vmd, 1);
#if VM_NRESERVLEVEL > 0
if (!vm_reserv_free_page(m))
#endif
@ -3203,24 +3173,28 @@ vm_page_free_phys_pglist(struct pglist *tq)
{
struct vm_domain *vmd;
vm_page_t m;
int cnt;
if (TAILQ_EMPTY(tq))
return;
vmd = NULL;
cnt = 0;
TAILQ_FOREACH(m, tq, listq) {
if (vmd != vm_pagequeue_domain(m)) {
if (vmd != NULL) {
vm_domain_free_wakeup(vmd);
vm_domain_free_unlock(vmd);
vm_domain_freecnt_inc(vmd, cnt);
cnt = 0;
}
vmd = vm_pagequeue_domain(m);
vm_domain_free_lock(vmd);
}
vm_page_free_phys(vmd, m);
cnt++;
}
if (vmd != NULL) {
vm_domain_free_wakeup(vmd);
vm_domain_free_unlock(vmd);
vm_domain_freecnt_inc(vmd, cnt);
}
}
@ -3243,8 +3217,8 @@ vm_page_free_toq(vm_page_t m)
vmd = vm_pagequeue_domain(m);
vm_domain_free_lock(vmd);
vm_page_free_phys(vmd, m);
vm_domain_free_wakeup(vmd);
vm_domain_free_unlock(vmd);
vm_domain_freecnt_inc(vmd, 1);
}
/*

sys/vm/vm_pageout.c

@ -1790,7 +1790,13 @@ vm_pageout_worker(void *arg)
* The pageout daemon worker is never done, so loop forever.
*/
while (TRUE) {
vm_domain_free_lock(vmd);
vm_domain_pageout_lock(vmd);
/*
* We need to clear wanted before we check the limits. This
* prevents races with wakers who will check wanted after they
* reach the limit.
*/
atomic_store_int(&vmd->vmd_pageout_wanted, 0);
/*
* Might the page daemon need to run again?
@ -1801,7 +1807,7 @@ vm_pageout_worker(void *arg)
* we have performed a level >= 1 (page reclamation)
* scan, then sleep a bit and try again.
*/
vm_domain_free_unlock(vmd);
vm_domain_pageout_unlock(vmd);
if (pass > 1)
pause("pwait", hz / VM_INACT_SCAN_RATE);
} else {
@ -1809,12 +1815,18 @@ vm_pageout_worker(void *arg)
* No, sleep until the next wakeup or until pages
* need to have their reference stats updated.
*/
vmd->vmd_pageout_wanted = false;
if (mtx_sleep(&vmd->vmd_pageout_wanted,
vm_domain_free_lockptr(vmd), PDROP | PVM,
vm_domain_pageout_lockptr(vmd), PDROP | PVM,
"psleep", hz / VM_INACT_SCAN_RATE) == 0)
VM_CNT_INC(v_pdwakeups);
}
/* Prevent spurious wakeups by ensuring that wanted is set. */
atomic_store_int(&vmd->vmd_pageout_wanted, 1);
/*
* Use the controller to calculate how many pages to free in
* this interval.
*/
shortage = pidctrl_daemon(&vmd->vmd_pid, vmd->vmd_free_count);
if (shortage && pass == 0)
pass = 1;
@ -1970,10 +1982,14 @@ pagedaemon_wakeup(int domain)
struct vm_domain *vmd;
vmd = VM_DOMAIN(domain);
vm_domain_free_assert_unlocked(vmd);
vm_domain_pageout_assert_unlocked(vmd);
if (curproc == pageproc)
return;
if (!vmd->vmd_pageout_wanted && curthread->td_proc != pageproc) {
vmd->vmd_pageout_wanted = true;
if (atomic_fetchadd_int(&vmd->vmd_pageout_wanted, 1) == 0) {
vm_domain_pageout_lock(vmd);
atomic_store_int(&vmd->vmd_pageout_wanted, 1);
wakeup(&vmd->vmd_pageout_wanted);
vm_domain_pageout_unlock(vmd);
}
}
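
The two hunks above form a handshake around vmd_pageout_wanted: the worker
clears it under the pageout lock before checking the limits, and wakers
advance it with a fetchadd so that only the 0 -> 1 transition takes the lock
and issues a wakeup.  A rough pthreads/C11 rendering of the same protocol
follows; the names are stand-ins, and the sleep(1) in main() is a crude
substitute for the kernel's timed mtx_sleep().

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <unistd.h>

static pthread_mutex_t pageout_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t pageout_cv = PTHREAD_COND_INITIALIZER;
static atomic_int pageout_wanted;

/* Stand-in for the free-count limit checks; pretend no work is pending. */
static bool
work_available(void)
{
	return (false);
}

static void
worker_iteration(void)
{
	pthread_mutex_lock(&pageout_lock);
	/*
	 * Clear 'wanted' before checking the limits, so a waker that
	 * checks after hitting a limit is guaranteed to observe 0 and
	 * issue a wakeup instead of assuming one is already pending.
	 */
	atomic_store(&pageout_wanted, 0);
	if (!work_available())
		pthread_cond_wait(&pageout_cv, &pageout_lock);
	pthread_mutex_unlock(&pageout_lock);
	/* Keep 'wanted' set while scanning to absorb redundant wakeups. */
	atomic_store(&pageout_wanted, 1);
	/* ... scan and reclaim pages here ... */
}

static void
daemon_wakeup(void)
{
	/* Only the waker that moves 0 -> 1 pays for the lock and wakeup. */
	if (atomic_fetch_add(&pageout_wanted, 1) == 0) {
		pthread_mutex_lock(&pageout_lock);
		atomic_store(&pageout_wanted, 1);	/* collapse the count */
		pthread_cond_signal(&pageout_cv);
		pthread_mutex_unlock(&pageout_lock);
	}
}

static void *
worker(void *arg)
{
	(void)arg;
	worker_iteration();
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	sleep(1);	/* crude: let the worker reach pthread_cond_wait() */
	daemon_wakeup();
	pthread_join(t, NULL);
	return (0);
}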

sys/vm/vm_pagequeue.h

@ -76,17 +76,31 @@ struct vm_pagequeue {
#include <sys/pidctrl.h>
struct sysctl_oid;
/*
* One vm_domain per-numa domain. Contains pagequeues, free page structures,
* and accounting.
*
* Lock Key:
* f vmd_free_mtx
* p vmd_pageout_mtx
* d vm_domainset_lock
* a atomic
* c const after boot
*/
struct vm_domain {
struct vm_pagequeue vmd_pagequeues[PQ_COUNT];
struct mtx_padalign vmd_free_mtx;
struct vmem *vmd_kernel_arena;
u_int vmd_domain; /* Domain number. */
u_int vmd_page_count;
long vmd_segs; /* bitmask of the segments */
struct mtx_padalign vmd_pageout_mtx;
struct vmem *vmd_kernel_arena; /* (c) per-domain kva arena. */
u_int vmd_domain; /* (c) Domain number. */
u_int vmd_page_count; /* (c) Total page count. */
long vmd_segs; /* (c) bitmask of the segments */
u_int __aligned(CACHE_LINE_SIZE) vmd_free_count; /* (a,f) free page count */
u_int vmd_pageout_deficit; /* (a) Estimated number of pages deficit */
uint8_t vmd_pad[CACHE_LINE_SIZE - (sizeof(u_int) * 2)];
/* Paging control variables, locked by domain_free_mtx. */
/* Paging control variables, used within single threaded page daemon. */
struct pidctrl vmd_pid; /* Pageout controller. */
u_int vmd_free_count;
boolean_t vmd_oom;
int vmd_oom_seq;
int vmd_last_active_scan;
@ -94,11 +108,10 @@ struct vm_domain {
struct vm_page vmd_marker; /* marker for pagedaemon private use */
struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */
int vmd_pageout_pages_needed; /* page daemon waiting for pages? */
int vmd_pageout_deficit; /* Estimated number of pages deficit */
bool vmd_pageout_wanted; /* pageout daemon wait channel */
bool vmd_minset; /* Are we in vm_min_domains? */
bool vmd_severeset; /* Are we in vm_severe_domains? */
int vmd_pageout_wanted; /* (a, p) pageout daemon wait channel */
int vmd_pageout_pages_needed; /* (d) page daemon waiting for pages? */
bool vmd_minset; /* (d) Are we in vm_min_domains? */
bool vmd_severeset; /* (d) Are we in vm_severe_domains? */
int vmd_inactq_scans;
enum {
VM_LAUNDRY_IDLE = 0,
@ -142,6 +155,17 @@ extern struct vm_domain vm_dom[MAXMEMDOM];
#define vm_domain_free_unlock(d) \
mtx_unlock(vm_domain_free_lockptr((d)))
#define vm_domain_pageout_lockptr(d) \
(&(d)->vmd_pageout_mtx)
#define vm_domain_pageout_assert_locked(n) \
mtx_assert(vm_domain_pageout_lockptr((n)), MA_OWNED)
#define vm_domain_pageout_assert_unlocked(n) \
mtx_assert(vm_domain_pageout_lockptr((n)), MA_NOTOWNED)
#define vm_domain_pageout_lock(d) \
mtx_lock(vm_domain_pageout_lockptr((d)))
#define vm_domain_pageout_unlock(d) \
mtx_unlock(vm_domain_pageout_lockptr((d)))
static __inline void
vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend)
{
@ -155,6 +179,7 @@ vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend)
#define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1)
void vm_domain_set(struct vm_domain *vmd);
void vm_domain_clear(struct vm_domain *vmd);
int vm_domain_available(struct vm_domain *vmd, int req, int npages);
/*
@ -221,18 +246,40 @@ vm_laundry_target(struct vm_domain *vmd)
return (vm_paging_target(vmd));
}
static inline u_int
vm_domain_freecnt_adj(struct vm_domain *vmd, int adj)
void pagedaemon_wakeup(int domain);
static inline void
vm_domain_freecnt_inc(struct vm_domain *vmd, int adj)
{
u_int ret;
u_int old, new;
vm_domain_free_assert_locked(vmd);
ret = vmd->vmd_free_count += adj;
if ((!vmd->vmd_minset && vm_paging_min(vmd)) ||
(!vmd->vmd_severeset && vm_paging_severe(vmd)))
old = atomic_fetchadd_int(&vmd->vmd_free_count, adj);
new = old + adj;
/*
* Only update bitsets on transitions. Notice we short-circuit the
* rest of the checks if we're above min already.
*/
if (old < vmd->vmd_free_min && (new >= vmd->vmd_free_min ||
(old < vmd->vmd_free_severe && new >= vmd->vmd_free_severe) ||
(old < vmd->vmd_pageout_free_min &&
new >= vmd->vmd_pageout_free_min)))
vm_domain_clear(vmd);
}
static inline void
vm_domain_freecnt_dec(struct vm_domain *vmd, int adj)
{
u_int old, new;
old = atomic_fetchadd_int(&vmd->vmd_free_count, -adj);
new = old - adj;
KASSERT(new >= 0, ("vm_domain_freecnt_dec: free count underflow"));
if (vm_paging_needed(vmd, new) && !vm_paging_needed(vmd, old))
pagedaemon_wakeup(vmd->vmd_domain);
/* Only update bitsets on transitions. */
if ((old >= vmd->vmd_free_min && new < vmd->vmd_free_min) ||
(old >= vmd->vmd_free_severe && new < vmd->vmd_free_severe))
vm_domain_set(vmd);
return (ret);
}
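
vm_domain_freecnt_inc() above reports only upward threshold crossings,
short-circuiting once the count already started at or above the minimum.  A
compilable sketch of that transition test, with domain_clear() as a stand-in
for vm_domain_clear() and illustrative thresholds:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned free_count;
static unsigned free_min = 256, free_severe = 64;

/* Stand-in for vm_domain_clear(): leave the min/severe domain sets. */
static void
domain_clear(void)
{
	printf("cleared: crossed a threshold upward\n");
}

static void
freecnt_inc(unsigned adj)
{
	unsigned old, new;

	old = atomic_fetch_add(&free_count, adj);
	new = old + adj;
	/*
	 * Short-circuit: once the count starts at or above free_min, no
	 * upward transition remains to report.
	 */
	if (old < free_min && (new >= free_min ||
	    (old < free_severe && new >= free_severe)))
		domain_clear();
}

int
main(void)
{
	/* One batched adjustment per group of freed pages. */
	freecnt_inc(128);	/* crosses free_severe */
	freecnt_inc(512);	/* crosses free_min */
	freecnt_inc(512);	/* already above free_min: no call */
	return (0);
}

This pairs with the batched callers earlier in the diff, vm_page_reclaim_run()
and vm_page_free_phys_pglist(), which count pages freed under the free lock
and make a single vm_domain_freecnt_inc() call per domain, so the atomic and
the transition check run once per batch rather than once per page.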

sys/vm/vm_reserv.c

@ -593,7 +593,7 @@ vm_reserv_extend_contig(int req, vm_object_t object, vm_pindex_t pindex,
}
for (i = 0; i < npages; i++)
vm_reserv_populate(rv, index + i);
vm_domain_freecnt_adj(vmd, -npages);
vm_domain_freecnt_dec(vmd, npages);
out:
vm_domain_free_unlock(vmd);
return (m);
@ -789,7 +789,7 @@ vm_reserv_extend(int req, vm_object_t object, vm_pindex_t pindex, int domain,
struct vm_domain *vmd;
vm_page_t m, msucc;
vm_reserv_t rv;
int index, free_count;
int index;
VM_OBJECT_ASSERT_WLOCKED(object);
@ -822,14 +822,10 @@ vm_reserv_extend(int req, vm_object_t object, vm_pindex_t pindex, int domain,
m = NULL;
if (m != NULL) {
vm_reserv_populate(rv, index);
free_count = vm_domain_freecnt_adj(vmd, -1);
} else
free_count = vmd->vmd_free_count;
vm_domain_freecnt_dec(vmd, 1);
}
vm_domain_free_unlock(vmd);
if (vm_paging_needed(vmd, free_count))
pagedaemon_wakeup(domain);
return (m);
}