Improve VM page queue scalability.

Currently both the page lock and a page queue lock must be held in
order to enqueue, dequeue or requeue a page in a given page queue.
The queue locks are a scalability bottleneck in many workloads. This
change reduces page queue lock contention by batching queue operations.
To detangle the page and page queue locks, per-CPU batch queues are
used to reference pages with pending queue operations. The requested
operation is encoded in the page's aflags field with the page lock
held, after which the page is enqueued for a deferred batch operation.
Page queue scans are similarly optimized to minimize the amount of
work performed with a page queue lock held.

Reviewed by:	kib, jeff (previous versions)
Tested by:	pho
Sponsored by:	Dell EMC Isilon
Differential Revision:	https://reviews.freebsd.org/D14893
Mark Johnston 2018-04-24 21:15:54 +00:00
parent 55ba21d4fd
commit 5cd29d0f3c
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=332974
9 changed files with 905 additions and 559 deletions
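In outline, the deferred-operation scheme works as sketched below. This is a minimal userspace model only: the names (struct pq_batch, pq_submit(), OP_DEQUEUE, and so on) are invented for illustration, and the per-CPU batch lookup, critical sections and trylock handling of the real vm_pqbatch_submit_page() in the diff are omitted.

#include <pthread.h>

#define	BATCH_SIZE	7
#define	OP_DEQUEUE	0x01	/* analogous to PGA_DEQUEUE */
#define	OP_REQUEUE	0x02	/* analogous to PGA_REQUEUE */

struct page {
	unsigned int	aflags;	/* pending operation, set with the page lock held */
};

struct pagequeue {
	pthread_mutex_t	lock;	/* the contended per-queue lock */
	int		cnt;
};

struct pq_batch {
	struct page	*pages[BATCH_SIZE];
	int		cnt;
};

/* Apply one deferred operation; called with the queue lock held. */
static void
pq_process_page(struct pagequeue *pq, struct page *m)
{
	if ((m->aflags & OP_DEQUEUE) != 0)
		pq->cnt--;
	else if ((m->aflags & OP_REQUEUE) != 0)
		;	/* move the page to the queue tail (elided) */
	m->aflags = 0;
}

/* Flush an entire batch with a single queue lock acquisition. */
static void
pq_flush(struct pagequeue *pq, struct pq_batch *bq)
{
	pthread_mutex_lock(&pq->lock);
	for (int i = 0; i < bq->cnt; i++)
		pq_process_page(pq, bq->pages[i]);
	pthread_mutex_unlock(&pq->lock);
	bq->cnt = 0;
}

/*
 * Record the requested operation in the page's flags (done with only the
 * page lock held) and defer the queue update until the batch fills up.
 */
static void
pq_submit(struct pagequeue *pq, struct pq_batch *bq, struct page *m,
    unsigned int op)
{
	m->aflags |= op;
	if (bq->cnt < BATCH_SIZE) {
		bq->pages[bq->cnt++] = m;
		return;
	}
	pq_flush(pq, bq);
	bq->pages[bq->cnt++] = m;
}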


@ -227,4 +227,10 @@
#define ZERO_REGION_SIZE (2 * 1024 * 1024) /* 2MB */
/*
* Use a fairly large batch size since we expect amd64 systems to have lots of
* memory.
*/
#define VM_BATCHQUEUE_SIZE 31
#endif /* _MACHINE_VMPARAM_H_ */


@ -603,7 +603,6 @@ static struct witness_order_list_entry order_lists[] = {
* CDEV
*/
{ "vm map (system)", &lock_class_mtx_sleep },
{ "vm pagequeue", &lock_class_mtx_sleep },
{ "vnode interlock", &lock_class_mtx_sleep },
{ "cdev", &lock_class_mtx_sleep },
{ NULL, NULL },
@ -613,11 +612,11 @@ static struct witness_order_list_entry order_lists[] = {
{ "vm map (user)", &lock_class_sx },
{ "vm object", &lock_class_rw },
{ "vm page", &lock_class_mtx_sleep },
{ "vm pagequeue", &lock_class_mtx_sleep },
{ "pmap pv global", &lock_class_rw },
{ "pmap", &lock_class_mtx_sleep },
{ "pmap pv list", &lock_class_rw },
{ "vm page free queue", &lock_class_mtx_sleep },
{ "vm pagequeue", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* kqueue/VFS interaction


@ -720,14 +720,11 @@ static void
vm_object_terminate_pages(vm_object_t object)
{
vm_page_t p, p_next;
struct mtx *mtx, *mtx1;
struct vm_pagequeue *pq, *pq1;
int dequeued;
struct mtx *mtx;
VM_OBJECT_ASSERT_WLOCKED(object);
mtx = NULL;
pq = NULL;
/*
* Free any remaining pageable pages. This also removes them from the
@ -737,60 +734,21 @@ vm_object_terminate_pages(vm_object_t object)
*/
TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
vm_page_assert_unbusied(p);
if ((object->flags & OBJ_UNMANAGED) == 0) {
if ((object->flags & OBJ_UNMANAGED) == 0)
/*
* vm_page_free_prep() only needs the page
* lock for managed pages.
*/
mtx1 = vm_page_lockptr(p);
if (mtx1 != mtx) {
if (mtx != NULL)
mtx_unlock(mtx);
if (pq != NULL) {
vm_pagequeue_cnt_add(pq, dequeued);
vm_pagequeue_unlock(pq);
pq = NULL;
}
mtx = mtx1;
mtx_lock(mtx);
}
}
vm_page_change_lock(p, &mtx);
p->object = NULL;
if (p->wire_count != 0)
goto unlist;
VM_CNT_INC(v_pfree);
p->flags &= ~PG_ZERO;
if (p->queue != PQ_NONE) {
KASSERT(p->queue < PQ_COUNT, ("vm_object_terminate: "
"page %p is not queued", p));
pq1 = vm_page_pagequeue(p);
if (pq != pq1) {
if (pq != NULL) {
vm_pagequeue_cnt_add(pq, dequeued);
vm_pagequeue_unlock(pq);
}
pq = pq1;
vm_pagequeue_lock(pq);
dequeued = 0;
}
p->queue = PQ_NONE;
TAILQ_REMOVE(&pq->pq_pl, p, plinks.q);
dequeued--;
}
if (vm_page_free_prep(p, true))
continue;
unlist:
TAILQ_REMOVE(&object->memq, p, listq);
}
if (pq != NULL) {
vm_pagequeue_cnt_add(pq, dequeued);
vm_pagequeue_unlock(pq);
VM_CNT_INC(v_pfree);
vm_page_free(p);
}
if (mtx != NULL)
mtx_unlock(mtx);
vm_page_free_phys_pglist(&object->memq);
/*
* If the object contained any pages, then reset it to an empty state.
* None of the object's fields, including "resident_page_count", were
@ -1973,7 +1931,6 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
{
vm_page_t p, next;
struct mtx *mtx;
struct pglist pgl;
VM_OBJECT_ASSERT_WLOCKED(object);
KASSERT((object->flags & OBJ_UNMANAGED) == 0 ||
@ -1982,7 +1939,6 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
if (object->resident_page_count == 0)
return;
vm_object_pip_add(object, 1);
TAILQ_INIT(&pgl);
again:
p = vm_page_find_least(object, start);
mtx = NULL;
@ -2036,13 +1992,10 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
}
if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0)
pmap_remove_all(p);
p->flags &= ~PG_ZERO;
if (vm_page_free_prep(p, false))
TAILQ_INSERT_TAIL(&pgl, p, listq);
vm_page_free(p);
}
if (mtx != NULL)
mtx_unlock(mtx);
vm_page_free_phys_pglist(&pgl);
vm_object_pip_wakeup(object);
}


@ -102,6 +102,7 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
@ -131,13 +132,10 @@ extern int uma_startup_count(int);
extern void uma_startup(void *, int);
extern int vmem_startup_count(void);
/*
* Associated with page of user-allocatable memory is a
* page structure.
*/
struct vm_domain vm_dom[MAXMEMDOM];
static DPCPU_DEFINE(struct vm_batchqueue, pqbatch[MAXMEMDOM][PQ_COUNT]);
struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT];
struct mtx_padalign __exclusive_cache_line vm_domainset_lock;
@ -176,7 +174,8 @@ static uma_zone_t fakepg_zone;
static void vm_page_alloc_check(vm_page_t m);
static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
static void vm_page_enqueue(uint8_t queue, vm_page_t m);
static void vm_page_dequeue_complete(vm_page_t m);
static void vm_page_enqueue(vm_page_t m, uint8_t queue);
static void vm_page_init(void *dummy);
static int vm_page_insert_after(vm_page_t m, vm_object_t object,
vm_pindex_t pindex, vm_page_t mpred);
@ -443,12 +442,13 @@ sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS)
* Nonetheless, it write busies and initializes the hold count to one as
* safety precautions.
*/
void
vm_page_init_marker(vm_page_t marker, int queue)
static void
vm_page_init_marker(vm_page_t marker, int queue, uint8_t aflags)
{
bzero(marker, sizeof(*marker));
marker->flags = PG_MARKER;
marker->aflags = aflags;
marker->busy_lock = VPB_SINGLE_EXCLUSIVER;
marker->queue = queue;
marker->hold_count = 1;
@ -481,14 +481,32 @@ vm_page_domain_init(int domain)
TAILQ_INIT(&pq->pq_pl);
mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
MTX_DEF | MTX_DUPOK);
vm_page_init_marker(&vmd->vmd_markers[i], i);
vm_page_init_marker(&vmd->vmd_markers[i], i, 0);
}
mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF);
mtx_init(&vmd->vmd_pageout_mtx, "vm pageout lock", NULL, MTX_DEF);
vm_page_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE);
snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain);
/*
* inacthead is used to provide FIFO ordering for LRU-bypassing
* insertions.
*/
vm_page_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE, PGA_ENQUEUED);
TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl,
&vmd->vmd_inacthead, plinks.q);
snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain);
/*
* The clock pages are used to implement active queue scanning without
* requeues. Scans start at clock[0], which is advanced after the scan
* ends. When the two clock hands meet, they are reset and scanning
* resumes from the head of the queue.
*/
vm_page_init_marker(&vmd->vmd_clock[0], PQ_ACTIVE, PGA_ENQUEUED);
vm_page_init_marker(&vmd->vmd_clock[1], PQ_ACTIVE, PGA_ENQUEUED);
TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl,
&vmd->vmd_clock[0], plinks.q);
TAILQ_INSERT_TAIL(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl,
&vmd->vmd_clock[1], plinks.q);
}
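The two-hand active queue scan described in the comment above can be illustrated with a simplified, single-threaded sketch using the <sys/queue.h> macros. The names here (struct entry, clock_scan()) are invented for the example, and the locking and batching performed by the real scan code are ignored.

#include <sys/queue.h>
#include <stdbool.h>
#include <stddef.h>

struct entry {
	TAILQ_ENTRY(entry) link;
	bool		marker;		/* true for the two clock hands */
};
TAILQ_HEAD(entryq, entry);

/*
 * Scan up to maxscan real entries starting at the hand clock[0], which
 * precedes clock[1] in the queue. If the hands meet, move them back to the
 * head and tail so that the next scan resumes from the head; otherwise
 * advance clock[0] to the point where this scan stopped.
 */
static int
clock_scan(struct entryq *q, struct entry clock[2], int maxscan)
{
	struct entry *e;
	int scanned;

	scanned = 0;
	for (e = TAILQ_NEXT(&clock[0], link); e != NULL && scanned < maxscan;
	    e = TAILQ_NEXT(e, link)) {
		if (e == &clock[1]) {
			TAILQ_REMOVE(q, &clock[0], link);
			TAILQ_REMOVE(q, &clock[1], link);
			TAILQ_INSERT_HEAD(q, &clock[0], link);
			TAILQ_INSERT_TAIL(q, &clock[1], link);
			return (scanned);
		}
		if (e->marker)
			continue;
		scanned++;	/* a real scan would process the entry here */
	}

	/* Advance clock[0]; e is the first entry left unscanned, if any. */
	TAILQ_REMOVE(q, &clock[0], link);
	if (e != NULL)
		TAILQ_INSERT_BEFORE(e, &clock[0], link);
	else
		TAILQ_INSERT_HEAD(q, &clock[0], link);
	return (scanned);
}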
/*
@ -1847,6 +1865,7 @@ vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain,
KASSERT(m != NULL, ("missing page"));
found:
vm_page_dequeue(m);
vm_page_alloc_check(m);
/*
@ -2043,8 +2062,10 @@ vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain,
#if VM_NRESERVLEVEL > 0
found:
#endif
for (m = m_ret; m < &m_ret[npages]; m++)
for (m = m_ret; m < &m_ret[npages]; m++) {
vm_page_dequeue(m);
vm_page_alloc_check(m);
}
/*
* Initialize the pages. Only the PG_ZERO flag is inherited.
@ -2188,6 +2209,7 @@ vm_page_alloc_freelist_domain(int domain, int freelist, int req)
goto again;
return (NULL);
}
vm_page_dequeue(m);
vm_page_alloc_check(m);
/*
@ -2381,7 +2403,7 @@ vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end,
vm_reserv_size(level)) - pa);
#endif
} else if (object->memattr == VM_MEMATTR_DEFAULT &&
m->queue != PQ_NONE && !vm_page_busied(m)) {
vm_page_enqueued(m) && !vm_page_busied(m)) {
/*
* The page is allocated but eligible for
* relocation. Extend the current run by one
@ -2532,7 +2554,7 @@ vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run,
error = EINVAL;
else if (object->memattr != VM_MEMATTR_DEFAULT)
error = EINVAL;
else if (m->queue != PQ_NONE && !vm_page_busied(m)) {
else if (vm_page_enqueued(m) && !vm_page_busied(m)) {
KASSERT(pmap_page_get_memattr(m) ==
VM_MEMATTR_DEFAULT,
("page %p has an unexpected memattr", m));
@ -2592,7 +2614,8 @@ vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run,
*/
if (object->ref_count != 0)
pmap_remove_all(m);
m_new->aflags = m->aflags;
m_new->aflags = m->aflags &
~PGA_QUEUE_STATE_MASK;
KASSERT(m_new->oflags == VPO_UNMANAGED,
("page %p is managed", m_new));
m_new->oflags = m->oflags & VPO_NOSYNC;
@ -2604,7 +2627,7 @@ vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run,
vm_page_remque(m);
vm_page_replace_checked(m_new, object,
m->pindex, m);
if (vm_page_free_prep(m, false))
if (vm_page_free_prep(m))
SLIST_INSERT_HEAD(&free, m,
plinks.s.ss);
@ -2618,7 +2641,7 @@ vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run,
m->flags &= ~PG_ZERO;
vm_page_remque(m);
vm_page_remove(m);
if (vm_page_free_prep(m, false))
if (vm_page_free_prep(m))
SLIST_INSERT_HEAD(&free, m,
plinks.s.ss);
KASSERT(m->dirty == 0,
@ -3061,113 +3084,297 @@ vm_page_pagequeue(vm_page_t m)
return (&vm_pagequeue_domain(m)->vmd_pagequeues[m->queue]);
}
static struct mtx *
vm_page_pagequeue_lockptr(vm_page_t m)
{
if (m->queue == PQ_NONE)
return (NULL);
return (&vm_page_pagequeue(m)->pq_mutex);
}
static inline void
vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m)
{
struct vm_domain *vmd;
uint8_t aflags;
vm_pagequeue_assert_locked(pq);
KASSERT(pq == vm_page_pagequeue(m),
("page %p doesn't belong to %p", m, pq));
aflags = m->aflags;
if ((aflags & PGA_DEQUEUE) != 0) {
if (__predict_true((aflags & PGA_ENQUEUED) != 0)) {
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
vm_pagequeue_cnt_dec(pq);
}
vm_page_dequeue_complete(m);
} else if ((aflags & (PGA_REQUEUE | PGA_REQUEUE_HEAD)) != 0) {
if ((aflags & PGA_ENQUEUED) != 0)
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
else {
vm_pagequeue_cnt_inc(pq);
vm_page_aflag_set(m, PGA_ENQUEUED);
}
if ((aflags & PGA_REQUEUE_HEAD) != 0) {
KASSERT(m->queue == PQ_INACTIVE,
("head enqueue not supported for page %p", m));
vmd = vm_pagequeue_domain(m);
TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q);
} else
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
/*
* PGA_REQUEUE and PGA_REQUEUE_HEAD must be cleared after
* setting PGA_ENQUEUED in order to synchronize with the
* page daemon.
*/
vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD);
}
}
static void
vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq,
uint8_t queue)
{
vm_page_t m;
int i;
for (i = 0; i < bq->bq_cnt; i++) {
m = bq->bq_pa[i];
if (__predict_false(m->queue != queue))
continue;
vm_pqbatch_process_page(pq, m);
}
vm_batchqueue_init(bq);
}
static void
vm_pqbatch_submit_page(vm_page_t m, uint8_t queue)
{
struct vm_batchqueue *bq;
struct vm_pagequeue *pq;
int domain;
vm_page_assert_locked(m);
KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue));
domain = vm_phys_domain(m);
pq = &vm_pagequeue_domain(m)->vmd_pagequeues[queue];
critical_enter();
bq = DPCPU_PTR(pqbatch[domain][queue]);
if (vm_batchqueue_insert(bq, m)) {
critical_exit();
return;
}
if (!vm_pagequeue_trylock(pq)) {
critical_exit();
vm_pagequeue_lock(pq);
critical_enter();
bq = DPCPU_PTR(pqbatch[domain][queue]);
}
vm_pqbatch_process(pq, bq, queue);
/*
* The page may have been logically dequeued before we acquired the
* page queue lock. In this case, the page lock prevents the page
* from being logically enqueued elsewhere.
*/
if (__predict_true(m->queue == queue))
vm_pqbatch_process_page(pq, m);
else {
KASSERT(m->queue == PQ_NONE,
("invalid queue transition for page %p", m));
KASSERT((m->aflags & PGA_ENQUEUED) == 0,
("page %p is enqueued with invalid queue index", m));
vm_page_aflag_clear(m, PGA_QUEUE_STATE_MASK);
}
vm_pagequeue_unlock(pq);
critical_exit();
}
/*
* vm_page_dequeue:
* vm_page_drain_pqbatch: [ internal use only ]
*
* Remove the given page from its current page queue.
* Force all per-CPU page queue batch queues to be drained. This is
* intended for use in severe memory shortages, to ensure that pages
* do not remain stuck in the batch queues.
*/
void
vm_page_drain_pqbatch(void)
{
struct thread *td;
struct vm_domain *vmd;
struct vm_pagequeue *pq;
int cpu, domain, queue;
td = curthread;
CPU_FOREACH(cpu) {
thread_lock(td);
sched_bind(td, cpu);
thread_unlock(td);
for (domain = 0; domain < vm_ndomains; domain++) {
vmd = VM_DOMAIN(domain);
for (queue = 0; queue < PQ_COUNT; queue++) {
pq = &vmd->vmd_pagequeues[queue];
vm_pagequeue_lock(pq);
critical_enter();
vm_pqbatch_process(pq,
DPCPU_PTR(pqbatch[domain][queue]), queue);
critical_exit();
vm_pagequeue_unlock(pq);
}
}
}
thread_lock(td);
sched_unbind(td);
thread_unlock(td);
}
/*
* Complete the logical removal of a page from a page queue. We must be
* careful to synchronize with the page daemon, which may be concurrently
* examining the page with only the page lock held. The page must not be
* in a state where it appears to be logically enqueued.
*/
static void
vm_page_dequeue_complete(vm_page_t m)
{
m->queue = PQ_NONE;
atomic_thread_fence_rel();
vm_page_aflag_clear(m, PGA_QUEUE_STATE_MASK);
}
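The release fence above pairs with the acquire fence in vm_page_enqueued() and vm_pageout_page_queued() below. A standalone sketch of the pattern in C11 atomics follows; the names are hypothetical, and the kernel itself uses the atomic(9) fence primitives rather than <stdatomic.h>.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define	Q_NONE		0xff	/* stand-in for PQ_NONE */
#define	F_DEQUEUE	0x10	/* stand-in for PGA_DEQUEUE */

struct mpage {
	_Atomic uint8_t	queue;
	_Atomic uint8_t	aflags;
};

/* Writer side, with the page queue lock held (cf. vm_page_dequeue_complete). */
static void
mpage_dequeue_complete(struct mpage *m)
{
	atomic_store_explicit(&m->queue, Q_NONE, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);
	atomic_fetch_and_explicit(&m->aflags, (uint8_t)~F_DEQUEUE,
	    memory_order_relaxed);
}

/* Reader side, with only the page lock held (cf. vm_page_enqueued). */
static bool
mpage_enqueued(struct mpage *m)
{
	/* A pending dequeue means the page is not logically enqueued. */
	if ((atomic_load_explicit(&m->aflags, memory_order_relaxed) &
	    F_DEQUEUE) != 0)
		return (false);
	/*
	 * If the flag was observed clear because a dequeue completed, the
	 * fence pairing guarantees that the queue index reads as Q_NONE.
	 */
	atomic_thread_fence(memory_order_acquire);
	return (atomic_load_explicit(&m->queue, memory_order_relaxed) !=
	    Q_NONE);
}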
/*
* vm_page_dequeue_deferred: [ internal use only ]
*
* Request removal of the given page from its current page
* queue. Physical removal from the queue may be deferred
* indefinitely.
*
* The page must be locked.
*/
void
vm_page_dequeue(vm_page_t m)
vm_page_dequeue_deferred(vm_page_t m)
{
struct vm_pagequeue *pq;
int queue;
vm_page_assert_locked(m);
KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued",
m));
pq = vm_page_pagequeue(m);
vm_pagequeue_lock(pq);
m->queue = PQ_NONE;
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
vm_pagequeue_cnt_dec(pq);
vm_pagequeue_unlock(pq);
queue = m->queue;
if (queue == PQ_NONE) {
KASSERT((m->aflags & PGA_QUEUE_STATE_MASK) == 0,
("page %p has queue state", m));
return;
}
if ((m->aflags & PGA_DEQUEUE) == 0)
vm_page_aflag_set(m, PGA_DEQUEUE);
vm_pqbatch_submit_page(m, queue);
}
/*
* vm_page_dequeue_locked:
*
* Remove the given page from its current page queue.
* Remove the page from its page queue, which must be locked.
* If the page lock is not held, there is no guarantee that the
* page will not be enqueued by another thread before this function
* returns. In this case, it is up to the caller to ensure that
* no other threads hold a reference to the page.
*
* The page and page queue must be locked.
* The page queue lock must be held. If the page is not already
* logically dequeued, the page lock must be held as well.
*/
void
vm_page_dequeue_locked(vm_page_t m)
{
struct vm_pagequeue *pq;
vm_page_lock_assert(m, MA_OWNED);
pq = vm_page_pagequeue(m);
KASSERT(m->queue != PQ_NONE,
("%s: page %p queue field is PQ_NONE", __func__, m));
vm_pagequeue_assert_locked(pq);
m->queue = PQ_NONE;
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
vm_pagequeue_cnt_dec(pq);
KASSERT((m->aflags & PGA_DEQUEUE) != 0 ||
mtx_owned(vm_page_lockptr(m)),
("%s: queued unlocked page %p", __func__, m));
if ((m->aflags & PGA_ENQUEUED) != 0) {
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
vm_pagequeue_cnt_dec(pq);
}
vm_page_dequeue_complete(m);
}
/*
* vm_page_enqueue:
* vm_page_dequeue:
*
* Add the given page to the specified page queue.
*
* The page must be locked.
* Remove the page from whichever page queue it's in, if any.
* If the page lock is not held, there is no guarantee that the
* page will not be enqueued by another thread before this function
* returns. In this case, it is up to the caller to ensure that
* no other threads hold a reference to the page.
*/
void
vm_page_dequeue(vm_page_t m)
{
struct mtx *lock, *lock1;
lock = vm_page_pagequeue_lockptr(m);
for (;;) {
if (lock == NULL)
return;
mtx_lock(lock);
if ((lock1 = vm_page_pagequeue_lockptr(m)) == lock)
break;
mtx_unlock(lock);
lock = lock1;
}
KASSERT(lock == vm_page_pagequeue_lockptr(m),
("%s: page %p migrated directly between queues", __func__, m));
vm_page_dequeue_locked(m);
mtx_unlock(lock);
}
/*
* Schedule the given page for insertion into the specified page queue.
* Physical insertion of the page may be deferred indefinitely.
*/
static void
vm_page_enqueue(uint8_t queue, vm_page_t m)
vm_page_enqueue(vm_page_t m, uint8_t queue)
{
struct vm_pagequeue *pq;
vm_page_lock_assert(m, MA_OWNED);
KASSERT(queue < PQ_COUNT,
("vm_page_enqueue: invalid queue %u request for page %p",
queue, m));
pq = &vm_pagequeue_domain(m)->vmd_pagequeues[queue];
vm_pagequeue_lock(pq);
vm_page_assert_locked(m);
KASSERT(m->queue == PQ_NONE && (m->aflags & PGA_QUEUE_STATE_MASK) == 0,
("%s: page %p is already enqueued", __func__, m));
m->queue = queue;
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
vm_pagequeue_cnt_inc(pq);
vm_pagequeue_unlock(pq);
if ((m->aflags & PGA_REQUEUE) == 0)
vm_page_aflag_set(m, PGA_REQUEUE);
vm_pqbatch_submit_page(m, queue);
}
/*
* vm_page_requeue:
* vm_page_requeue: [ internal use only ]
*
* Move the given page to the tail of its current page queue.
* Schedule a requeue of the given page.
*
* The page must be locked.
*/
void
vm_page_requeue(vm_page_t m)
{
struct vm_pagequeue *pq;
vm_page_lock_assert(m, MA_OWNED);
vm_page_assert_locked(m);
KASSERT(m->queue != PQ_NONE,
("vm_page_requeue: page %p is not queued", m));
pq = vm_page_pagequeue(m);
vm_pagequeue_lock(pq);
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
vm_pagequeue_unlock(pq);
}
("%s: page %p is not logically enqueued", __func__, m));
/*
* vm_page_requeue_locked:
*
* Move the given page to the tail of its current page queue.
*
* The page queue must be locked.
*/
void
vm_page_requeue_locked(vm_page_t m)
{
struct vm_pagequeue *pq;
KASSERT(m->queue != PQ_NONE,
("vm_page_requeue_locked: page %p is not queued", m));
pq = vm_page_pagequeue(m);
vm_pagequeue_assert_locked(pq);
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
if ((m->aflags & PGA_REQUEUE) == 0)
vm_page_aflag_set(m, PGA_REQUEUE);
vm_pqbatch_submit_page(m, m->queue);
}
/*
@ -3185,18 +3392,18 @@ vm_page_activate(vm_page_t m)
int queue;
vm_page_lock_assert(m, MA_OWNED);
if ((queue = m->queue) != PQ_ACTIVE) {
if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
if (m->act_count < ACT_INIT)
m->act_count = ACT_INIT;
if (queue != PQ_NONE)
vm_page_dequeue(m);
vm_page_enqueue(PQ_ACTIVE, m);
}
} else {
if (m->act_count < ACT_INIT)
if ((queue = m->queue) == PQ_ACTIVE || m->wire_count > 0 ||
(m->oflags & VPO_UNMANAGED) != 0) {
if (queue == PQ_ACTIVE && m->act_count < ACT_INIT)
m->act_count = ACT_INIT;
return;
}
vm_page_remque(m);
if (m->act_count < ACT_INIT)
m->act_count = ACT_INIT;
vm_page_enqueue(m, PQ_ACTIVE);
}
/*
@ -3207,11 +3414,10 @@ vm_page_activate(vm_page_t m)
* the page to the free list only if this function returns true.
*
* The object must be locked. The page must be locked if it is
* managed. For a queued managed page, the pagequeue_locked
* argument specifies whether the page queue is already locked.
* managed.
*/
bool
vm_page_free_prep(vm_page_t m, bool pagequeue_locked)
vm_page_free_prep(vm_page_t m)
{
#if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP)
@ -3227,14 +3433,14 @@ vm_page_free_prep(vm_page_t m, bool pagequeue_locked)
if ((m->oflags & VPO_UNMANAGED) == 0) {
vm_page_lock_assert(m, MA_OWNED);
KASSERT(!pmap_page_is_mapped(m),
("vm_page_free_toq: freeing mapped page %p", m));
("vm_page_free_prep: freeing mapped page %p", m));
} else
KASSERT(m->queue == PQ_NONE,
("vm_page_free_toq: unmanaged page %p is queued", m));
("vm_page_free_prep: unmanaged page %p is queued", m));
VM_CNT_INC(v_tfree);
if (vm_page_sbusied(m))
panic("vm_page_free: freeing busy page %p", m);
panic("vm_page_free_prep: freeing busy page %p", m);
vm_page_remove(m);
@ -3250,21 +3456,23 @@ vm_page_free_prep(vm_page_t m, bool pagequeue_locked)
return (false);
}
if (m->queue != PQ_NONE) {
if (pagequeue_locked)
vm_page_dequeue_locked(m);
else
vm_page_dequeue(m);
}
/*
* Pages need not be dequeued before they are returned to the physical
* memory allocator, but they must at least be marked for a deferred
* dequeue.
*/
if ((m->oflags & VPO_UNMANAGED) == 0)
vm_page_dequeue_deferred(m);
m->valid = 0;
vm_page_undirty(m);
if (m->wire_count != 0)
panic("vm_page_free: freeing wired page %p", m);
panic("vm_page_free_prep: freeing wired page %p", m);
if (m->hold_count != 0) {
m->flags &= ~PG_ZERO;
KASSERT((m->flags & PG_UNHOLDFREE) == 0,
("vm_page_free: freeing PG_UNHOLDFREE page %p", m));
("vm_page_free_prep: freeing PG_UNHOLDFREE page %p", m));
m->flags |= PG_UNHOLDFREE;
return (false);
}
@ -3283,36 +3491,6 @@ vm_page_free_prep(vm_page_t m, bool pagequeue_locked)
return (true);
}
void
vm_page_free_phys_pglist(struct pglist *tq)
{
struct vm_domain *vmd;
vm_page_t m;
int cnt;
if (TAILQ_EMPTY(tq))
return;
vmd = NULL;
cnt = 0;
TAILQ_FOREACH(m, tq, listq) {
if (vmd != vm_pagequeue_domain(m)) {
if (vmd != NULL) {
vm_domain_free_unlock(vmd);
vm_domain_freecnt_inc(vmd, cnt);
cnt = 0;
}
vmd = vm_pagequeue_domain(m);
vm_domain_free_lock(vmd);
}
vm_phys_free_pages(m, 0);
cnt++;
}
if (vmd != NULL) {
vm_domain_free_unlock(vmd);
vm_domain_freecnt_inc(vmd, cnt);
}
}
/*
* vm_page_free_toq:
*
@ -3327,7 +3505,7 @@ vm_page_free_toq(vm_page_t m)
{
struct vm_domain *vmd;
if (!vm_page_free_prep(m, false))
if (!vm_page_free_prep(m))
return;
vmd = vm_pagequeue_domain(m);
@ -3425,22 +3603,25 @@ vm_page_unwire(vm_page_t m, uint8_t queue)
KASSERT(queue < PQ_COUNT || queue == PQ_NONE,
("vm_page_unwire: invalid queue %u request for page %p",
queue, m));
if ((m->oflags & VPO_UNMANAGED) == 0)
vm_page_assert_locked(m);
unwired = vm_page_unwire_noq(m);
if (unwired && (m->oflags & VPO_UNMANAGED) == 0 && m->object != NULL) {
if (m->queue == queue) {
if (!unwired || (m->oflags & VPO_UNMANAGED) != 0 || m->object == NULL)
return (unwired);
if (m->queue == queue) {
if (queue == PQ_ACTIVE)
vm_page_reference(m);
else if (queue != PQ_NONE)
vm_page_requeue(m);
} else {
vm_page_dequeue(m);
if (queue != PQ_NONE) {
vm_page_enqueue(m, queue);
if (queue == PQ_ACTIVE)
vm_page_reference(m);
else if (queue != PQ_NONE)
vm_page_requeue(m);
} else {
vm_page_remque(m);
if (queue != PQ_NONE) {
vm_page_enqueue(queue, m);
if (queue == PQ_ACTIVE)
/* Initialize act_count. */
vm_page_activate(m);
}
/* Initialize act_count. */
vm_page_activate(m);
}
}
return (unwired);
@ -3476,52 +3657,8 @@ vm_page_unwire_noq(vm_page_t m)
}
/*
* Move the specified page to the inactive queue, or requeue the page if it is
* already in the inactive queue.
*
* Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive
* queue. However, setting "noreuse" to TRUE will accelerate the specified
* page's reclamation, but it will not unmap the page from any address space.
* This is implemented by inserting the page near the head of the inactive
* queue, using a marker page to guide FIFO insertion ordering.
*
* The page must be locked.
*/
static inline void
_vm_page_deactivate(vm_page_t m, boolean_t noreuse)
{
struct vm_pagequeue *pq;
int queue;
vm_page_assert_locked(m);
if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
pq = &vm_pagequeue_domain(m)->vmd_pagequeues[PQ_INACTIVE];
/* Avoid multiple acquisitions of the inactive queue lock. */
queue = m->queue;
if (queue == PQ_INACTIVE) {
vm_pagequeue_lock(pq);
vm_page_dequeue_locked(m);
} else {
if (queue != PQ_NONE)
vm_page_dequeue(m);
vm_pagequeue_lock(pq);
}
m->queue = PQ_INACTIVE;
if (noreuse)
TAILQ_INSERT_BEFORE(
&vm_pagequeue_domain(m)->vmd_inacthead, m,
plinks.q);
else
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
vm_pagequeue_cnt_inc(pq);
vm_pagequeue_unlock(pq);
}
}
/*
* Move the specified page to the inactive queue, or requeue the page if it is
* already in the inactive queue.
* Move the specified page to the tail of the inactive queue, or requeue
* the page if it is already in the inactive queue.
*
* The page must be locked.
*/
@ -3529,12 +3666,23 @@ void
vm_page_deactivate(vm_page_t m)
{
_vm_page_deactivate(m, FALSE);
vm_page_assert_locked(m);
if (m->wire_count > 0 || (m->oflags & VPO_UNMANAGED) != 0)
return;
if (!vm_page_inactive(m)) {
vm_page_remque(m);
vm_page_enqueue(m, PQ_INACTIVE);
} else
vm_page_requeue(m);
}
/*
* Move the specified page to the inactive queue with the expectation
* that it is unlikely to be reused.
* Move the specified page close to the head of the inactive queue,
* bypassing LRU. A marker page is used to maintain FIFO ordering.
* As with regular enqueues, we use a per-CPU batch queue to reduce
* contention on the page queue lock.
*
* The page must be locked.
*/
@ -3542,7 +3690,17 @@ void
vm_page_deactivate_noreuse(vm_page_t m)
{
_vm_page_deactivate(m, TRUE);
vm_page_assert_locked(m);
if (m->wire_count > 0 || (m->oflags & VPO_UNMANAGED) != 0)
return;
if (!vm_page_inactive(m))
vm_page_remque(m);
m->queue = PQ_INACTIVE;
if ((m->aflags & PGA_REQUEUE_HEAD) == 0)
vm_page_aflag_set(m, PGA_REQUEUE_HEAD);
vm_pqbatch_submit_page(m, PQ_INACTIVE);
}
/*
@ -3555,13 +3713,14 @@ vm_page_launder(vm_page_t m)
{
vm_page_assert_locked(m);
if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
if (m->queue == PQ_LAUNDRY)
vm_page_requeue(m);
else {
vm_page_remque(m);
vm_page_enqueue(PQ_LAUNDRY, m);
}
if (m->wire_count > 0 || (m->oflags & VPO_UNMANAGED) != 0)
return;
if (m->queue == PQ_LAUNDRY)
vm_page_requeue(m);
else {
vm_page_remque(m);
vm_page_enqueue(m, PQ_LAUNDRY);
}
}
@ -3577,9 +3736,9 @@ vm_page_unswappable(vm_page_t m)
vm_page_assert_locked(m);
KASSERT(m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0,
("page %p already unswappable", m));
if (m->queue != PQ_NONE)
vm_page_dequeue(m);
vm_page_enqueue(PQ_UNSWAPPABLE, m);
vm_page_remque(m);
vm_page_enqueue(m, PQ_UNSWAPPABLE);
}
/*


@ -93,8 +93,11 @@
*
* In general, operations on this structure's mutable fields are
* synchronized using either one of or a combination of the lock on the
* object that the page belongs to (O), the pool lock for the page (P),
* or the lock for either the free or paging queue (Q). If a field is
* object that the page belongs to (O), the page lock (P),
* the per-domain lock for the free queues (F), or the page's queue
* lock (Q). The physical address of a page is used to select its page
* lock from a pool. The queue lock for a page depends on the value of
its queue field and is described in detail below. If a field is
* annotated below with two of these locks, then holding either lock is
* sufficient for read access, but both locks are required for write
* access. An annotation of (C) indicates that the field is immutable.
@ -143,6 +146,29 @@
* causing the thread to block. vm_page_sleep_if_busy() can be used to
* sleep until the page's busy state changes, after which the caller
* must re-lookup the page and re-evaluate its state.
*
* The queue field is the index of the page queue containing the
* page, or PQ_NONE if the page is not enqueued. The queue lock of a
* page is the page queue lock corresponding to the page queue index,
* or the page lock (P) for the page if it is not enqueued. To modify
* the queue field, the queue lock for the old value of the field must
* be held. It is invalid for a page's queue field to transition
* between two distinct page queue indices. That is, when updating
* the queue field, either the new value or the old value must be
* PQ_NONE.
*
* To avoid contention on page queue locks, page queue operations
* (enqueue, dequeue, requeue) are batched using per-CPU queues.
* A deferred operation is requested by inserting an entry into a
* batch queue; the entry is simply a pointer to the page, and the
* request type is encoded in the page's aflags field using the values
in PGA_QUEUE_STATE_MASK. The type-stability of struct vm_page is
* crucial to this scheme since the processing of entries in a given
* batch queue may be deferred indefinitely. In particular, a page
* may be freed before its pending batch queue entries have been
* processed. The page lock (P) must be held to schedule a batched
* queue operation, and the page queue lock must be held in order to
* process batch queue entries for the page queue.
*/
#if PAGE_SIZE == 4096
@ -174,7 +200,7 @@ struct vm_page {
TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */
vm_object_t object; /* which object am I in (O,P) */
vm_pindex_t pindex; /* offset into object (O,P) */
vm_paddr_t phys_addr; /* physical address of page */
vm_paddr_t phys_addr; /* physical address of page (C) */
struct md_page md; /* machine dependent stuff */
u_int wire_count; /* wired down maps refs (P) */
volatile u_int busy_lock; /* busy owners lock */
@ -182,11 +208,11 @@ struct vm_page {
uint16_t flags; /* page PG_* flags (P) */
uint8_t aflags; /* access is atomic */
uint8_t oflags; /* page VPO_* flags (O) */
uint8_t queue; /* page queue index (P,Q) */
uint8_t queue; /* page queue index (Q) */
int8_t psind; /* pagesizes[] index (O) */
int8_t segind; /* vm_phys segment index (C) */
uint8_t order; /* index of the buddy queue */
uint8_t pool; /* vm_phys freepool index (Q) */
uint8_t order; /* index of the buddy queue (F) */
uint8_t pool; /* vm_phys freepool index (F) */
u_char act_count; /* page usage count (P) */
/* NOTE that these must support one bit per DEV_BSIZE in a page */
/* so, on normal X86 kernels, they must be at least 8 bits wide */
@ -314,10 +340,38 @@ extern struct mtx_padalign pa_lock[];
*
* PGA_EXECUTABLE may be set by pmap routines, and indicates that a page has
* at least one executable mapping. It is not consumed by the MI VM layer.
*
* PGA_ENQUEUED is set and cleared when a page is inserted into or removed
* from a page queue, respectively. It determines whether the plinks.q field
* of the page is valid. To set or clear this flag, the queue lock for the
* page must be held: the page queue lock corresponding to the page's "queue"
* field if its value is not PQ_NONE, and the page lock otherwise.
*
* PGA_DEQUEUE is set when the page is scheduled to be dequeued from a page
* queue, and cleared when the dequeue request is processed. A page may
* have PGA_DEQUEUE set and PGA_ENQUEUED cleared, for instance if a dequeue
* is requested after the page is scheduled to be enqueued but before it is
* actually inserted into the page queue. The page lock must be held to set
* this flag, and the queue lock for the page must be held to clear it.
*
* PGA_REQUEUE is set when the page is scheduled to be enqueued or requeued
* in its page queue. The page lock must be held to set this flag, and the
* queue lock for the page must be held to clear it.
*
* PGA_REQUEUE_HEAD is a special flag for enqueuing pages near the head of
* the inactive queue, thus bypassing LRU. The page lock must be held to
* set this flag, and the queue lock for the page must be held to clear it.
*/
#define PGA_WRITEABLE 0x01 /* page may be mapped writeable */
#define PGA_REFERENCED 0x02 /* page has been referenced */
#define PGA_EXECUTABLE 0x04 /* page may be mapped executable */
#define PGA_ENQUEUED 0x08 /* page is enqueued in a page queue */
#define PGA_DEQUEUE 0x10 /* page is due to be dequeued */
#define PGA_REQUEUE 0x20 /* page is due to be requeued */
#define PGA_REQUEUE_HEAD 0x40 /* page requeue should bypass LRU */
#define PGA_QUEUE_STATE_MASK (PGA_ENQUEUED | PGA_DEQUEUE | PGA_REQUEUE | \
PGA_REQUEUE_HEAD)
/*
* Page flags. If changed at any other time than page allocation or
@ -484,13 +538,13 @@ int vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
void vm_page_deactivate(vm_page_t);
void vm_page_deactivate_noreuse(vm_page_t);
void vm_page_dequeue(vm_page_t m);
void vm_page_dequeue_deferred(vm_page_t m);
void vm_page_dequeue_locked(vm_page_t m);
void vm_page_drain_pqbatch(void);
vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
void vm_page_free_phys_pglist(struct pglist *tq);
bool vm_page_free_prep(vm_page_t m, bool pagequeue_locked);
bool vm_page_free_prep(vm_page_t m);
vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr);
void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
void vm_page_init_marker(vm_page_t m, int queue);
int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
void vm_page_launder(vm_page_t m);
vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
@ -752,6 +806,24 @@ vm_page_in_laundry(vm_page_t m)
return (m->queue == PQ_LAUNDRY || m->queue == PQ_UNSWAPPABLE);
}
/*
* vm_page_enqueued:
*
* Return true if the page is logically enqueued and no deferred
* dequeue is pending.
*/
static inline bool
vm_page_enqueued(vm_page_t m)
{
vm_page_assert_locked(m);
if ((m->aflags & PGA_DEQUEUE) != 0)
return (false);
atomic_thread_fence_acq();
return (m->queue != PQ_NONE);
}
/*
* vm_page_held:
*


@ -201,103 +201,134 @@ SYSCTL_INT(_vm, OID_AUTO, max_wired,
CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
static u_int isqrt(u_int num);
static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
static int vm_pageout_launder(struct vm_domain *vmd, int launder,
bool in_shortfall);
static void vm_pageout_laundry_worker(void *arg);
static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
/*
* vm_pageout_fallback_object_lock:
*
* Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is
* known to have failed and page queue must be either PQ_ACTIVE or
* PQ_INACTIVE. To avoid lock order violation, unlock the page queue
* while locking the vm object. Use marker page to detect page queue
* changes and maintain notion of next page on page queue. Return
* TRUE if no changes were detected, FALSE otherwise. vm object is
* locked on return.
*
* This function depends on both the lock portion of struct vm_object
* and normal struct vm_page being type stable.
*/
static boolean_t
vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
{
struct vm_page marker;
struct scan_state {
struct vm_batchqueue bq;
struct vm_pagequeue *pq;
boolean_t unchanged;
vm_object_t object;
int queue;
vm_page_t marker;
int maxscan;
int scanned;
};
queue = m->queue;
vm_page_init_marker(&marker, queue);
pq = vm_page_pagequeue(m);
object = m->object;
TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
static void
vm_pageout_init_scan(struct scan_state *ss, struct vm_pagequeue *pq,
vm_page_t marker, vm_page_t after, int maxscan)
{
vm_pagequeue_assert_locked(pq);
KASSERT((marker->aflags & PGA_ENQUEUED) == 0,
("marker %p already enqueued", marker));
if (after == NULL)
TAILQ_INSERT_HEAD(&pq->pq_pl, marker, plinks.q);
else
TAILQ_INSERT_AFTER(&pq->pq_pl, after, marker, plinks.q);
vm_page_aflag_set(marker, PGA_ENQUEUED);
vm_batchqueue_init(&ss->bq);
ss->pq = pq;
ss->marker = marker;
ss->maxscan = maxscan;
ss->scanned = 0;
vm_pagequeue_unlock(pq);
vm_page_unlock(m);
VM_OBJECT_WLOCK(object);
vm_page_lock(m);
vm_pagequeue_lock(pq);
}
/*
* The page's object might have changed, and/or the page might
* have moved from its original position in the queue. If the
* page's object has changed, then the caller should abandon
* processing the page because the wrong object lock was
* acquired. Use the marker's plinks.q, not the page's, to
* determine if the page has been moved. The state of the
* page's plinks.q can be indeterminate; whereas, the marker's
* plinks.q must be valid.
*/
*next = TAILQ_NEXT(&marker, plinks.q);
unchanged = m->object == object &&
m == TAILQ_PREV(&marker, pglist, plinks.q);
KASSERT(!unchanged || m->queue == queue,
("page %p queue %d %d", m, queue, m->queue));
TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
return (unchanged);
static void
vm_pageout_end_scan(struct scan_state *ss)
{
struct vm_pagequeue *pq;
pq = ss->pq;
vm_pagequeue_assert_locked(pq);
KASSERT((ss->marker->aflags & PGA_ENQUEUED) != 0,
("marker %p not enqueued", ss->marker));
TAILQ_REMOVE(&pq->pq_pl, ss->marker, plinks.q);
vm_page_aflag_clear(ss->marker, PGA_ENQUEUED);
VM_CNT_ADD(v_pdpages, ss->scanned);
}
/*
* Lock the page while holding the page queue lock. Use marker page
* to detect page queue changes and maintain notion of next page on
* page queue. Return TRUE if no changes were detected, FALSE
* otherwise. The page is locked on return. The page queue lock might
* be dropped and reacquired.
*
* This function depends on normal struct vm_page being type stable.
* Ensure that the page has not been dequeued after a pageout batch was
* collected. See vm_page_dequeue_complete().
*/
static boolean_t
vm_pageout_page_lock(vm_page_t m, vm_page_t *next)
static inline bool
vm_pageout_page_queued(vm_page_t m, int queue)
{
vm_page_assert_locked(m);
if ((m->aflags & PGA_DEQUEUE) != 0)
return (false);
atomic_thread_fence_acq();
return (m->queue == queue);
}
/*
* Add a small number of queued pages to a batch queue for later processing
* without the corresponding queue lock held. The caller must have enqueued a
* marker page at the desired start point for the scan. Pages will be
* physically dequeued if the caller so requests. Otherwise, the returned
* batch may contain marker pages, and it is up to the caller to handle them.
*
* When processing the batch queue, vm_pageout_page_queued() must be used to
* determine whether the page was logically dequeued by another thread. Once
* this check is performed, the page lock guarantees that the page will not be
* disassociated from the queue.
*/
static __always_inline void
vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue)
{
struct vm_page marker;
struct vm_pagequeue *pq;
boolean_t unchanged;
int queue;
vm_page_t m, marker;
vm_page_lock_assert(m, MA_NOTOWNED);
if (vm_page_trylock(m))
return (TRUE);
marker = ss->marker;
pq = ss->pq;
queue = m->queue;
vm_page_init_marker(&marker, queue);
pq = vm_page_pagequeue(m);
KASSERT((marker->aflags & PGA_ENQUEUED) != 0,
("marker %p not enqueued", ss->marker));
TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
vm_pagequeue_unlock(pq);
vm_page_lock(m);
vm_pagequeue_lock(pq);
for (m = TAILQ_NEXT(marker, plinks.q); m != NULL &&
ss->scanned < ss->maxscan && ss->bq.bq_cnt < VM_BATCHQUEUE_SIZE;
m = TAILQ_NEXT(m, plinks.q), ss->scanned++) {
if ((m->flags & PG_MARKER) == 0) {
KASSERT((m->aflags & PGA_ENQUEUED) != 0,
("page %p not enqueued", m));
KASSERT((m->flags & PG_FICTITIOUS) == 0,
("Fictitious page %p cannot be in page queue", m));
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("Unmanaged page %p cannot be in page queue", m));
} else if (dequeue)
continue;
/* Page queue might have changed. */
*next = TAILQ_NEXT(&marker, plinks.q);
unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q);
KASSERT(!unchanged || m->queue == queue,
("page %p queue %d %d", m, queue, m->queue));
TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
return (unchanged);
(void)vm_batchqueue_insert(&ss->bq, m);
if (dequeue) {
TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
vm_page_aflag_clear(m, PGA_ENQUEUED);
}
}
TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q);
if (__predict_true(m != NULL))
TAILQ_INSERT_BEFORE(m, marker, plinks.q);
else
TAILQ_INSERT_TAIL(&pq->pq_pl, marker, plinks.q);
if (dequeue)
vm_pagequeue_cnt_add(pq, -ss->bq.bq_cnt);
vm_pagequeue_unlock(pq);
}
/* Return the next page to be scanned, or NULL if the scan is complete. */
static __always_inline vm_page_t
vm_pageout_next(struct scan_state *ss, const bool dequeue)
{
if (ss->bq.bq_cnt == 0)
vm_pageout_collect_batch(ss, dequeue);
return (vm_batchqueue_pop(&ss->bq));
}
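For reference, the scan helpers above compose into the following skeleton, condensed from vm_pageout_launder() and vm_pageout_scan() below; the per-page work and error handling are elided, and the function name is invented.

static void
scan_skeleton(struct vm_pagequeue *pq, vm_page_t marker)
{
	struct scan_state ss;
	vm_page_t m;

	vm_pagequeue_lock(pq);
	vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
	while ((m = vm_pageout_next(&ss, false)) != NULL) {
		if (__predict_false((m->flags & PG_MARKER) != 0))
			continue;
		/*
		 * Lock the page, revalidate its queue state with
		 * vm_pageout_page_queued(), and only then act on it.
		 */
	}
	vm_pagequeue_lock(pq);
	vm_pageout_end_scan(&ss);
	vm_pagequeue_unlock(pq);
}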
/*
@ -353,12 +384,12 @@ vm_pageout_cluster(vm_page_t m)
break;
}
vm_page_test_dirty(p);
if (p->dirty == 0) {
if (p->dirty == 0 || !vm_page_in_laundry(p)) {
ib = 0;
break;
}
vm_page_lock(p);
if (!vm_page_in_laundry(p) || vm_page_held(p)) {
if (vm_page_held(p)) {
vm_page_unlock(p);
ib = 0;
break;
@ -381,10 +412,10 @@ vm_pageout_cluster(vm_page_t m)
if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
break;
vm_page_test_dirty(p);
if (p->dirty == 0)
if (p->dirty == 0 || !vm_page_in_laundry(p))
break;
vm_page_lock(p);
if (!vm_page_in_laundry(p) || vm_page_held(p)) {
if (vm_page_held(p)) {
vm_page_unlock(p);
break;
}
@ -675,13 +706,18 @@ vm_pageout_clean(vm_page_t m, int *numpagedout)
static int
vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
{
struct scan_state ss;
struct vm_pagequeue *pq;
struct mtx *mtx;
vm_object_t object;
vm_page_t m, marker, next;
int act_delta, error, maxscan, numpagedout, queue, starting_target;
vm_page_t m, marker;
int act_delta, error, numpagedout, queue, starting_target;
int vnodes_skipped;
bool pageout_ok, queue_locked;
bool obj_locked, pageout_ok;
mtx = NULL;
obj_locked = false;
object = NULL;
starting_target = launder;
vnodes_skipped = 0;
@ -691,10 +727,6 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
* we've reached the end of the queue. A single iteration of this loop
* may cause more than one page to be laundered because of clustering.
*
* maxscan ensures that we don't re-examine requeued pages. Any
* additional pages written as part of a cluster are subtracted from
* maxscan since they must be taken from the laundry queue.
*
* As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no
* swap devices are configured.
*/
@ -704,53 +736,68 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
queue = PQ_LAUNDRY;
scan:
pq = &vmd->vmd_pagequeues[queue];
marker = &vmd->vmd_markers[queue];
pq = &vmd->vmd_pagequeues[queue];
vm_pagequeue_lock(pq);
maxscan = pq->pq_cnt;
queue_locked = true;
for (m = TAILQ_FIRST(&pq->pq_pl);
m != NULL && maxscan-- > 0 && launder > 0;
m = next) {
vm_pagequeue_assert_locked(pq);
KASSERT(queue_locked, ("unlocked laundry queue"));
KASSERT(vm_page_in_laundry(m),
("page %p has an inconsistent queue", m));
next = TAILQ_NEXT(m, plinks.q);
if ((m->flags & PG_MARKER) != 0)
vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
while (launder > 0 && (m = vm_pageout_next(&ss, false)) != NULL) {
if (__predict_false((m->flags & PG_MARKER) != 0))
continue;
KASSERT((m->flags & PG_FICTITIOUS) == 0,
("PG_FICTITIOUS page %p cannot be in laundry queue", m));
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("VPO_UNMANAGED page %p cannot be in laundry queue", m));
if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
vm_page_unlock(m);
vm_page_change_lock(m, &mtx);
recheck:
/*
* The page may have been disassociated from the queue
* while locks were dropped.
*/
if (!vm_pageout_page_queued(m, queue))
continue;
}
if (m->wire_count != 0) {
vm_page_dequeue_locked(m);
vm_page_unlock(m);
continue;
}
object = m->object;
if ((!VM_OBJECT_TRYWLOCK(object) &&
(!vm_pageout_fallback_object_lock(m, &next) ||
vm_page_held(m))) || vm_page_busied(m)) {
VM_OBJECT_WUNLOCK(object);
if (m->wire_count != 0 && vm_page_pagequeue(m) == pq)
vm_page_dequeue_locked(m);
vm_page_unlock(m);
/*
* A requeue was requested, so this page gets a second
* chance.
*/
if ((m->aflags & PGA_REQUEUE) != 0) {
vm_page_requeue(m);
continue;
}
/*
* Unlock the laundry queue, invalidating the 'next' pointer.
* Use a marker to remember our place in the laundry queue.
* Held pages are essentially stuck in the queue.
*
* Wired pages may not be freed. Complete their removal
* from the queue now to avoid needless revisits during
* future scans.
*/
TAILQ_INSERT_AFTER(&pq->pq_pl, m, marker, plinks.q);
vm_pagequeue_unlock(pq);
queue_locked = false;
if (m->hold_count != 0)
continue;
if (m->wire_count != 0) {
vm_page_dequeue_deferred(m);
continue;
}
if (object != m->object) {
if (obj_locked) {
VM_OBJECT_WUNLOCK(object);
obj_locked = false;
}
object = m->object;
}
if (!obj_locked) {
if (!VM_OBJECT_TRYWLOCK(object)) {
mtx_unlock(mtx);
/* Depends on type-stability. */
VM_OBJECT_WLOCK(object);
obj_locked = true;
mtx_lock(mtx);
goto recheck;
} else
obj_locked = true;
}
if (vm_page_busied(m))
continue;
/*
* Invalid pages can be easily freed. They cannot be
@ -799,9 +846,11 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
*/
if (!in_shortfall)
launder--;
goto drop_page;
} else if ((object->flags & OBJ_DEAD) == 0)
goto requeue_page;
continue;
} else if ((object->flags & OBJ_DEAD) == 0) {
vm_page_requeue(m);
continue;
}
}
/*
@ -836,11 +885,8 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
else
pageout_ok = true;
if (!pageout_ok) {
requeue_page:
vm_pagequeue_lock(pq);
queue_locked = true;
vm_page_requeue_locked(m);
goto drop_page;
vm_page_requeue(m);
continue;
}
/*
@ -859,24 +905,25 @@ vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
error = vm_pageout_clean(m, &numpagedout);
if (error == 0) {
launder -= numpagedout;
maxscan -= numpagedout - 1;
ss.scanned += numpagedout;
} else if (error == EDEADLK) {
pageout_lock_miss++;
vnodes_skipped++;
}
goto relock_queue;
mtx = NULL;
obj_locked = false;
}
drop_page:
vm_page_unlock(m);
VM_OBJECT_WUNLOCK(object);
relock_queue:
if (!queue_locked) {
vm_pagequeue_lock(pq);
queue_locked = true;
}
next = TAILQ_NEXT(marker, plinks.q);
TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q);
}
if (mtx != NULL) {
mtx_unlock(mtx);
mtx = NULL;
}
if (obj_locked) {
VM_OBJECT_WUNLOCK(object);
obj_locked = false;
}
vm_pagequeue_lock(pq);
vm_pageout_end_scan(&ss);
vm_pagequeue_unlock(pq);
if (launder > 0 && queue == PQ_UNSWAPPABLE) {
@ -1077,6 +1124,56 @@ vm_pageout_laundry_worker(void *arg)
}
}
static int
vm_pageout_reinsert_inactive_page(struct scan_state *ss, vm_page_t m)
{
struct vm_domain *vmd;
if (!vm_page_inactive(m) || (m->aflags & PGA_ENQUEUED) != 0)
return (0);
vm_page_aflag_set(m, PGA_ENQUEUED);
if ((m->aflags & PGA_REQUEUE_HEAD) != 0) {
vmd = vm_pagequeue_domain(m);
TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q);
vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD);
} else if ((m->aflags & PGA_REQUEUE) != 0) {
TAILQ_INSERT_TAIL(&ss->pq->pq_pl, m, plinks.q);
vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD);
} else
TAILQ_INSERT_BEFORE(ss->marker, m, plinks.q);
return (1);
}
/*
* Re-add stuck pages to the inactive queue. We will examine them again
* during the next scan. If the queue state of a page has changed since
* it was physically removed from the page queue in
* vm_pageout_collect_batch(), don't do anything with that page.
*/
static void
vm_pageout_reinsert_inactive(struct scan_state *ss, struct vm_batchqueue *bq,
vm_page_t m)
{
struct vm_pagequeue *pq;
int delta;
delta = 0;
pq = ss->pq;
if (m != NULL) {
if (vm_batchqueue_insert(bq, m))
return;
vm_pagequeue_lock(pq);
delta += vm_pageout_reinsert_inactive_page(ss, m);
} else
vm_pagequeue_lock(pq);
while ((m = vm_batchqueue_pop(bq)) != NULL)
delta += vm_pageout_reinsert_inactive_page(ss, m);
vm_pagequeue_cnt_add(pq, delta);
vm_pagequeue_unlock(pq);
vm_batchqueue_init(bq);
}
/*
* vm_pageout_scan does the dirty work for the pageout daemon.
*
@ -1089,13 +1186,16 @@ vm_pageout_laundry_worker(void *arg)
static bool
vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
{
vm_page_t m, marker, next;
struct scan_state ss;
struct vm_batchqueue rq;
struct mtx *mtx;
vm_page_t m, marker;
struct vm_pagequeue *pq;
vm_object_t object;
long min_scan;
int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan;
int page_shortage, scan_tick, scanned, starting_page_shortage;
boolean_t queue_locked;
int act_delta, addl_page_shortage, deficit, inactq_shortage, max_scan;
int page_shortage, scan_tick, starting_page_shortage;
bool obj_locked;
/*
* If we need to reclaim memory ask kernel caches to return
@ -1136,79 +1236,85 @@ vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
page_shortage = deficit = 0;
starting_page_shortage = page_shortage;
mtx = NULL;
obj_locked = false;
object = NULL;
vm_batchqueue_init(&rq);
/*
* Start scanning the inactive queue for pages that we can free. The
* scan will stop when we reach the target or we have scanned the
* entire queue. (Note that m->act_count is not used to make
* decisions for the inactive queue, only for the active queue.)
*/
pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
marker = &vmd->vmd_markers[PQ_INACTIVE];
maxscan = pq->pq_cnt;
pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
vm_pagequeue_lock(pq);
queue_locked = TRUE;
for (m = TAILQ_FIRST(&pq->pq_pl);
m != NULL && maxscan-- > 0 && page_shortage > 0;
m = next) {
vm_pagequeue_assert_locked(pq);
KASSERT(queue_locked, ("unlocked inactive queue"));
KASSERT(vm_page_inactive(m), ("Inactive queue %p", m));
vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
while (page_shortage > 0 && (m = vm_pageout_next(&ss, true)) != NULL) {
KASSERT((m->flags & PG_MARKER) == 0,
("marker page %p was dequeued", m));
VM_CNT_INC(v_pdpages);
next = TAILQ_NEXT(m, plinks.q);
vm_page_change_lock(m, &mtx);
recheck:
/*
* skip marker pages
* The page may have been disassociated from the queue
* while locks were dropped.
*/
if (m->flags & PG_MARKER)
if (!vm_pageout_page_queued(m, PQ_INACTIVE)) {
addl_page_shortage++;
continue;
KASSERT((m->flags & PG_FICTITIOUS) == 0,
("Fictitious page %p cannot be in inactive queue", m));
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("Unmanaged page %p cannot be in inactive queue", m));
}
/*
* The page or object lock acquisitions fail if the
* page was removed from the queue or moved to a
* different position within the queue. In either
* case, addl_page_shortage should not be incremented.
* The page was re-enqueued after the page queue lock was
* dropped, or a requeue was requested. This page gets a second
* chance.
*/
if (!vm_pageout_page_lock(m, &next))
goto unlock_page;
else if (m->wire_count != 0) {
/*
* Wired pages may not be freed, and unwiring a queued
* page will cause it to be requeued. Thus, remove them
* from the queue now to avoid unnecessary revisits.
*/
vm_page_dequeue_locked(m);
if ((m->aflags & (PGA_ENQUEUED | PGA_REQUEUE |
PGA_REQUEUE_HEAD)) != 0)
goto reinsert;
/*
* Held pages are essentially stuck in the queue. So,
* they ought to be discounted from the inactive count.
* See the calculation of inactq_shortage before the
* loop over the active queue below.
*
* Wired pages may not be freed. Complete their removal
* from the queue now to avoid needless revisits during
* future scans.
*/
if (m->hold_count != 0) {
addl_page_shortage++;
goto unlock_page;
} else if (m->hold_count != 0) {
/*
* Held pages are essentially stuck in the
* queue. So, they ought to be discounted
* from the inactive count. See the
* calculation of inactq_shortage before the
* loop over the active queue below.
*/
addl_page_shortage++;
goto unlock_page;
goto reinsert;
}
object = m->object;
if (!VM_OBJECT_TRYWLOCK(object)) {
if (!vm_pageout_fallback_object_lock(m, &next))
goto unlock_object;
else if (m->wire_count != 0) {
vm_page_dequeue_locked(m);
addl_page_shortage++;
goto unlock_object;
} else if (m->hold_count != 0) {
addl_page_shortage++;
goto unlock_object;
if (m->wire_count != 0) {
addl_page_shortage++;
vm_page_dequeue_deferred(m);
continue;
}
if (object != m->object) {
if (obj_locked) {
VM_OBJECT_WUNLOCK(object);
obj_locked = false;
}
object = m->object;
}
if (!obj_locked) {
if (!VM_OBJECT_TRYWLOCK(object)) {
mtx_unlock(mtx);
/* Depends on type-stability. */
VM_OBJECT_WLOCK(object);
obj_locked = true;
mtx_lock(mtx);
goto recheck;
} else
obj_locked = true;
}
if (vm_page_busied(m)) {
/*
* Don't mess with busy pages. Leave them at
@ -1219,26 +1325,8 @@ vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
* inactive count.
*/
addl_page_shortage++;
unlock_object:
VM_OBJECT_WUNLOCK(object);
unlock_page:
vm_page_unlock(m);
continue;
goto reinsert;
}
KASSERT(!vm_page_held(m), ("Held page %p", m));
/*
* Dequeue the inactive page and unlock the inactive page
* queue, invalidating the 'next' pointer. Dequeueing the
* page here avoids a later reacquisition (and release) of
* the inactive page queue lock when vm_page_activate(),
* vm_page_free(), or vm_page_launder() is called. Use a
* marker to remember our place in the inactive queue.
*/
TAILQ_INSERT_AFTER(&pq->pq_pl, m, marker, plinks.q);
vm_page_dequeue_locked(m);
vm_pagequeue_unlock(pq);
queue_locked = FALSE;
/*
* Invalid pages can be easily freed. They cannot be
@ -1276,14 +1364,10 @@ vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
* queue.
*/
m->act_count += act_delta + ACT_ADVANCE;
goto drop_page;
continue;
} else if ((object->flags & OBJ_DEAD) == 0) {
vm_pagequeue_lock(pq);
queue_locked = TRUE;
m->queue = PQ_INACTIVE;
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
vm_pagequeue_cnt_inc(pq);
goto drop_page;
vm_page_aflag_set(m, PGA_REQUEUE);
goto reinsert;
}
}
@ -1309,23 +1393,39 @@ vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
*/
if (m->dirty == 0) {
free_page:
/*
* Because we dequeued the page and have already
* checked for concurrent dequeue and enqueue
* requests, we can safely disassociate the page
* from the inactive queue.
*/
KASSERT((m->aflags & PGA_QUEUE_STATE_MASK) == 0,
("page %p has queue state", m));
m->queue = PQ_NONE;
vm_page_free(m);
VM_CNT_INC(v_dfree);
--page_shortage;
page_shortage--;
} else if ((object->flags & OBJ_DEAD) == 0)
vm_page_launder(m);
drop_page:
vm_page_unlock(m);
VM_OBJECT_WUNLOCK(object);
if (!queue_locked) {
vm_pagequeue_lock(pq);
queue_locked = TRUE;
}
next = TAILQ_NEXT(marker, plinks.q);
TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q);
continue;
reinsert:
vm_pageout_reinsert_inactive(&ss, &rq, m);
}
if (mtx != NULL) {
mtx_unlock(mtx);
mtx = NULL;
}
if (obj_locked) {
VM_OBJECT_WUNLOCK(object);
obj_locked = false;
}
vm_pageout_reinsert_inactive(&ss, &rq, NULL);
vm_pageout_reinsert_inactive(&ss, &ss.bq, NULL);
vm_pagequeue_lock(pq);
vm_pageout_end_scan(&ss);
vm_pagequeue_unlock(pq);
VM_CNT_ADD(v_dfree, starting_page_shortage - page_shortage);
/*
* Wake up the laundry thread so that it can perform any needed
* laundering. If we didn't meet our target, we're in shortfall and
@ -1386,9 +1486,9 @@ vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
vm_paging_target(vmd) + deficit + addl_page_shortage;
inactq_shortage *= act_scan_laundry_weight;
marker = &vmd->vmd_markers[PQ_ACTIVE];
pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
vm_pagequeue_lock(pq);
maxscan = pq->pq_cnt;
/*
* If we're just idle polling attempt to visit every
@ -1401,43 +1501,55 @@ vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
min_scan /= hz * vm_pageout_update_period;
} else
min_scan = 0;
if (min_scan > 0 || (inactq_shortage > 0 && maxscan > 0))
if (min_scan > 0 || (inactq_shortage > 0 && pq->pq_cnt > 0))
vmd->vmd_last_active_scan = scan_tick;
/*
* Scan the active queue for pages that can be deactivated. Update
* the per-page activity counter and use it to identify deactivation
* candidates. Held pages may be deactivated.
*
* To avoid requeuing each page that remains in the active queue, we
* implement the CLOCK algorithm. To maintain consistency in the
* generic page queue code, pages are inserted at the tail of the
* active queue. We thus use two hands, represented by marker pages:
* scans begin at the first hand, which precedes the second hand in
* the queue. When the two hands meet, they are moved back to the
* head and tail of the queue, respectively, and scanning resumes.
*/
for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned <
min_scan || (inactq_shortage > 0 && scanned < maxscan)); m = next,
scanned++) {
KASSERT(m->queue == PQ_ACTIVE,
("vm_pageout_scan: page %p isn't active", m));
next = TAILQ_NEXT(m, plinks.q);
if ((m->flags & PG_MARKER) != 0)
continue;
KASSERT((m->flags & PG_FICTITIOUS) == 0,
("Fictitious page %p cannot be in active queue", m));
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("Unmanaged page %p cannot be in active queue", m));
if (!vm_pageout_page_lock(m, &next)) {
vm_page_unlock(m);
continue;
max_scan = inactq_shortage > 0 ? pq->pq_cnt : min_scan;
act_scan:
vm_pageout_init_scan(&ss, pq, marker, &vmd->vmd_clock[0], max_scan);
while ((m = vm_pageout_next(&ss, false)) != NULL) {
if (__predict_false(m == &vmd->vmd_clock[1])) {
vm_pagequeue_lock(pq);
TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q);
TAILQ_INSERT_HEAD(&pq->pq_pl, &vmd->vmd_clock[0],
plinks.q);
TAILQ_INSERT_TAIL(&pq->pq_pl, &vmd->vmd_clock[1],
plinks.q);
max_scan -= ss.scanned;
vm_pageout_end_scan(&ss);
goto act_scan;
}
if (__predict_false((m->flags & PG_MARKER) != 0))
continue;
vm_page_change_lock(m, &mtx);
/*
* The count for page daemon pages is updated after checking
* the page for eligibility.
* The page may have been disassociated from the queue
* while locks were dropped.
*/
VM_CNT_INC(v_pdpages);
if (!vm_pageout_page_queued(m, PQ_ACTIVE))
continue;
/*
* Wired pages are dequeued lazily.
*/
if (m->wire_count != 0) {
vm_page_dequeue_locked(m);
vm_page_unlock(m);
vm_page_dequeue_deferred(m);
continue;
}
@ -1476,14 +1588,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
} else
m->act_count -= min(m->act_count, ACT_DECLINE);
/*
* Move this page to the tail of the active, inactive or laundry
* queue depending on usage.
*/
if (m->act_count == 0) {
/* Dequeue to avoid later lock recursion. */
vm_page_dequeue_locked(m);
/*
* When not short for inactive pages, let dirty pages go
* through the inactive queue before moving to the
@ -1515,11 +1620,18 @@ vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
inactq_shortage--;
}
}
} else
vm_page_requeue_locked(m);
vm_page_unlock(m);
}
}
if (mtx != NULL) {
mtx_unlock(mtx);
mtx = NULL;
}
vm_pagequeue_lock(pq);
TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
TAILQ_INSERT_AFTER(&pq->pq_pl, marker, &vmd->vmd_clock[0], plinks.q);
vm_pageout_end_scan(&ss);
vm_pagequeue_unlock(pq);
if (pass > 0)
vm_swapout_run_idle();
return (page_shortage <= 0);


@ -73,8 +73,17 @@ struct vm_pagequeue {
const char * const pq_name;
} __aligned(CACHE_LINE_SIZE);
#include <sys/pidctrl.h>
#ifndef VM_BATCHQUEUE_SIZE
#define VM_BATCHQUEUE_SIZE 7
#endif
struct vm_batchqueue {
vm_page_t bq_pa[VM_BATCHQUEUE_SIZE];
int bq_cnt;
} __aligned(CACHE_LINE_SIZE);
#include <vm/uma.h>
#include <sys/pidctrl.h>
struct sysctl_oid;
/*
@ -82,12 +91,12 @@ struct sysctl_oid;
* and accounting.
*
* Lock Key:
* f vmd_free_mtx
* p vmd_pageout_mtx
* d vm_domainset_lock
* a atomic
* c const after boot
* q page queue lock
* f vmd_free_mtx
* p vmd_pageout_mtx
* d vm_domainset_lock
* a atomic
* c const after boot
* q page queue lock
*/
struct vm_domain {
struct vm_pagequeue vmd_pagequeues[PQ_COUNT];
@ -107,8 +116,9 @@ struct vm_domain {
boolean_t vmd_oom;
int vmd_oom_seq;
int vmd_last_active_scan;
struct vm_page vmd_markers[PQ_COUNT]; /* markers for queue scans */
struct vm_page vmd_markers[PQ_COUNT]; /* (q) markers for queue scans */
struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */
struct vm_page vmd_clock[2]; /* markers for active queue scan */
int vmd_pageout_wanted; /* (a, p) pageout daemon wait channel */
int vmd_pageout_pages_needed; /* (d) page daemon waiting for pages? */
@ -144,6 +154,7 @@ extern struct vm_domain vm_dom[MAXMEMDOM];
#define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED)
#define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex)
#define vm_pagequeue_lockptr(pq) (&(pq)->pq_mutex)
#define vm_pagequeue_trylock(pq) mtx_trylock(&(pq)->pq_mutex)
#define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex)
#define vm_domain_free_assert_locked(n) \
@ -154,6 +165,8 @@ extern struct vm_domain vm_dom[MAXMEMDOM];
mtx_lock(vm_domain_free_lockptr((d)))
#define vm_domain_free_lockptr(d) \
(&(d)->vmd_free_mtx)
#define vm_domain_free_trylock(d) \
mtx_trylock(vm_domain_free_lockptr((d)))
#define vm_domain_free_unlock(d) \
mtx_unlock(vm_domain_free_lockptr((d)))
@ -172,14 +185,39 @@ static __inline void
vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend)
{
#ifdef notyet
vm_pagequeue_assert_locked(pq);
#endif
pq->pq_cnt += addend;
}
#define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1)
#define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1)
static inline void
vm_batchqueue_init(struct vm_batchqueue *bq)
{
bq->bq_cnt = 0;
}
static inline bool
vm_batchqueue_insert(struct vm_batchqueue *bq, vm_page_t m)
{
if (bq->bq_cnt < nitems(bq->bq_pa)) {
bq->bq_pa[bq->bq_cnt++] = m;
return (true);
}
return (false);
}
static inline vm_page_t
vm_batchqueue_pop(struct vm_batchqueue *bq)
{
if (bq->bq_cnt == 0)
return (NULL);
return (bq->bq_pa[--bq->bq_cnt]);
}
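A small illustrative drain helper (not part of the change) shows how these primitives compose; note that entries are popped in LIFO order.

static inline int
vm_batchqueue_drain(struct vm_batchqueue *bq)
{
	vm_page_t m;
	int n;

	n = 0;
	while ((m = vm_batchqueue_pop(bq)) != NULL) {
		(void)m;	/* a real caller would act on the page here */
		n++;
	}
	return (n);
}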
void vm_domain_set(struct vm_domain *vmd);
void vm_domain_clear(struct vm_domain *vmd);
int vm_domain_allocate(struct vm_domain *vmd, int req, int npages);


@ -354,9 +354,9 @@ vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
m->order = order;
if (tail)
TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q);
TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
else
TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q);
TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
fl[order].lcnt++;
}
@ -364,7 +364,7 @@ static void
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{
TAILQ_REMOVE(&fl[order].pl, m, plinks.q);
TAILQ_REMOVE(&fl[order].pl, m, listq);
fl[order].lcnt--;
m->order = VM_NFREEORDER;
}
@ -1196,7 +1196,7 @@ vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages,
oind++) {
for (pind = 0; pind < VM_NFREEPOOL; pind++) {
fl = (*seg->free_queues)[pind];
TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
/*
* Is the size of this allocation request
* larger than the largest block size?


@ -399,8 +399,15 @@ vm_daemon(void)
swapout_flags = vm_pageout_req_swapout;
vm_pageout_req_swapout = 0;
mtx_unlock(&vm_daemon_mtx);
if (swapout_flags)
if (swapout_flags != 0) {
/*
* Drain the per-CPU page queue batches as a deadlock
* avoidance measure.
*/
if ((swapout_flags & VM_SWAP_NORMAL) != 0)
vm_page_drain_pqbatch();
swapout_procs(swapout_flags);
}
/*
* scan the processes for exceeding their rlimits or if