vm: reduce lock contention when processing vm batchqueues

Rather than waiting until the batchqueue is full to acquire the lock and
process the queue, we now start trying to acquire the lock with trylocks
once the batchqueue is 1/2 full. This removes almost all contention on the
vm pagequeue mutex for our busy sendfile()-based web workload.
It also greatly reduces the amount of time a network driver ithread
remains blocked on a mutex, and eliminates some packet drops under
heavy load.

So that the system does not lose the benefit of processing large
batchqueues, I've doubled the size of the batchqueues. This way, when
there is no contention, we process the same batch size as before: the
half-full trylock threshold of the doubled queue matches the old
full-queue size.
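
For illustration, here is a minimal userspace sketch of the heuristic,
using a pthread mutex in place of the vm pagequeue mutex; BATCH_SIZE,
batch_submit(), and process_batch() are made-up names, not the kernel's:

#include <pthread.h>

#define BATCH_SIZE 64   /* doubled, like VM_BATCHQUEUE_SIZE going 31 -> 63 */

struct batch {
        void *items[BATCH_SIZE];
        int cnt;
};

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

/* Drain the batch into the shared structure; queue_lock must be held. */
static void
process_batch(struct batch *b)
{
        b->cnt = 0;
}

static void
batch_submit(struct batch *b, void *item)
{
        b->items[b->cnt++] = item;
        if (b->cnt <= BATCH_SIZE / 2)
                return;         /* plenty of room: keep batching */
        if (b->cnt < BATCH_SIZE) {
                /* Half full or more: drain only if the lock is free now. */
                if (pthread_mutex_trylock(&queue_lock) == 0) {
                        process_batch(b);
                        pthread_mutex_unlock(&queue_lock);
                }
                return;
        }
        /* Completely full: no choice but to block on the lock. */
        pthread_mutex_lock(&queue_lock);
        process_batch(b);
        pthread_mutex_unlock(&queue_lock);
}

When the lock is uncontended, the first trylock just past the half-full
mark succeeds, so we drain batches of roughly half the (doubled) queue,
the same size as before; under contention, the batch keeps growing and
the blocking lock acquisition happens only once the queue is full.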

This has been run for several months on a busy Netflix server, as well
as on my personal desktop.

Reviewed by: markj
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D37305
Andrew Gallatin  2022-12-14 14:34:07 -05:00
parent c4a4b2633d
commit 1cac76c93f
5 changed files with 25 additions and 10 deletions

sys/amd64/include/vmparam.h

@@ -293,7 +293,7 @@
  * Use a fairly large batch size since we expect amd64 systems to have lots of
  * memory.
  */
-#define VM_BATCHQUEUE_SIZE      31
+#define VM_BATCHQUEUE_SIZE      63
 
 /*
  * The pmap can create non-transparent large page mappings.

sys/powerpc/include/vmparam.h

@@ -263,7 +263,7 @@ extern int vm_level_0_order;
  * memory.
  */
 #ifdef __powerpc64__
-#define VM_BATCHQUEUE_SIZE      31
+#define VM_BATCHQUEUE_SIZE      63
 #endif
 
 /*

sys/vm/vm_page.c

@@ -3662,19 +3662,32 @@ vm_page_pqbatch_submit(vm_page_t m, uint8_t queue)
 {
        struct vm_batchqueue *bq;
        struct vm_pagequeue *pq;
-       int domain;
+       int domain, slots_remaining;
 
        KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue));
 
        domain = vm_page_domain(m);
        critical_enter();
        bq = DPCPU_PTR(pqbatch[domain][queue]);
-       if (vm_batchqueue_insert(bq, m)) {
+       slots_remaining = vm_batchqueue_insert(bq, m);
+       if (slots_remaining > (VM_BATCHQUEUE_SIZE >> 1)) {
+               /* Keep building the bq. */
+               critical_exit();
+               return;
+       } else if (slots_remaining > 0) {
+               /* Try to process the bq if we can get the lock. */
+               pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue];
+               if (vm_pagequeue_trylock(pq)) {
+                       vm_pqbatch_process(pq, bq, queue);
+                       vm_pagequeue_unlock(pq);
+               }
                critical_exit();
                return;
        }
        critical_exit();
 
+       /* If we make it here, the bq is full, so wait for the lock. */
        pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue];
        vm_pagequeue_lock(pq);
        critical_enter();

sys/vm/vm_pageout.c

@@ -1405,7 +1405,7 @@ vm_pageout_reinsert_inactive(struct scan_state *ss, struct vm_batchqueue *bq,
 
        pq = ss->pq;
 
        if (m != NULL) {
-               if (vm_batchqueue_insert(bq, m))
+               if (vm_batchqueue_insert(bq, m) != 0)
                        return;
                vm_pagequeue_lock(pq);
                delta += vm_pageout_reinsert_inactive_page(pq, marker, m);

sys/vm/vm_pagequeue.h

@@ -75,7 +75,7 @@ struct vm_pagequeue {
 } __aligned(CACHE_LINE_SIZE);
 
 #ifndef VM_BATCHQUEUE_SIZE
-#define VM_BATCHQUEUE_SIZE      7
+#define VM_BATCHQUEUE_SIZE      15
 #endif
 
 struct vm_batchqueue {
@@ -356,15 +356,17 @@ vm_batchqueue_init(struct vm_batchqueue *bq)
        bq->bq_cnt = 0;
 }
 
-static inline bool
+static inline int
 vm_batchqueue_insert(struct vm_batchqueue *bq, vm_page_t m)
 {
+       int slots_free;
 
-       if (bq->bq_cnt < nitems(bq->bq_pa)) {
+       slots_free = nitems(bq->bq_pa) - bq->bq_cnt;
+       if (slots_free > 0) {
                bq->bq_pa[bq->bq_cnt++] = m;
-               return (true);
+               return (slots_free);
        }
-       return (false);
+       return (slots_free);
 }
 
 static inline vm_page_t
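
One subtlety of the new contract is worth spelling out: the function now
returns the number of slots that were free on entry, so a return of 0
means the queue was full and the page was not inserted. A sketch of how
a caller decodes the result (this mirrors vm_page_pqbatch_submit() above;
the comments are mine):

        slots_remaining = vm_batchqueue_insert(bq, m);
        if (slots_remaining == 0) {
                /* Full: m was NOT inserted; lock the pagequeue, process. */
        } else if (slots_remaining <= (VM_BATCHQUEUE_SIZE >> 1)) {
                /* Inserted, but at least half full: opportunistic trylock. */
        } else {
                /* Inserted with room to spare: defer processing. */
        }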