Replace vm_fault()'s heuristic for automatic cache behind with a heuristic
that performs the equivalent of an automatic madvise(..., MADV_DONTNEED).
The current heuristic, even with the improvements that I made a few years
ago, is a good example of making the wrong trade-off, i.e., optimizing for
the infrequent case: reading a single file that is much larger than memory
using mmap(2).  And, in that case, the page daemon isn't the bottleneck;
the I/O is.
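For context, madvise(2)'s MADV_DONTNEED tells the VM system that a range
of a mapping won't be reused soon, making its pages prime candidates for
reclamation.  The following minimal userland sketch (not part of this
commit; the file path and chunk size are arbitrary example values) shows
the pattern that the new heuristic effectively applies automatically
behind a sequential reader:

/*
 * Userland sketch: read an mmap(2)'ed file sequentially and discard
 * each 1 MB chunk once the read position has moved past it.
 */
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	struct stat st;
	char *p;
	off_t off;
	size_t chunk = 1024 * 1024;	/* arbitrary example chunk size */
	volatile char c;
	int fd;

	fd = open("/tmp/bigfile", O_RDONLY);	/* hypothetical input file */
	if (fd == -1 || fstat(fd, &st) != 0)
		return (1);
	p = mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return (1);
	for (off = 0; off < st.st_size; off += 4096) {
		c = p[off];		/* fault in each page */
		/* A full chunk now lies behind us; let the VM reclaim it. */
		if (off >= (off_t)chunk && off % (off_t)chunk == 0)
			madvise(p + off - chunk, chunk, MADV_DONTNEED);
	}
	munmap(p, (size_t)st.st_size);
	close(fd);
	return (0);
}

On FreeBSD this call reaches roughly the same vm_page_advise(...,
MADV_DONTNEED) path that the new vm_fault_dontneed() below invokes
directly from the fault handler.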

In all other cases, the current heuristic has too many false positives,
i.e., it caches too many pages that are later reused.  To give one
example, thousands of pages are cached by the current heuristic during a
buildworld and all of them are reactivated before the buildworld
completes.  In particular, clang reads source files using mmap(2) and
there are some relatively large source files in our source tree, e.g.,
sqlite, that are read multiple times.  With the new heuristic, I see fewer
false positives, and those that do occur have a much lower cost.

I actually tried something like this more than two years ago and it
didn't perform as well as the cache behind heuristic.  However, that was
before the changes to the page daemon in late summer of 2013 and the
existence of pmap_advise().  In particular, with the page daemon doing
its work more frequently and in smaller batches, it now completes its
work while the application accessing the file is blocked on I/O,
whereas previously the page daemon appeared to hog the CPU for so long
that it caused "hiccups" in the application's execution.

Finally, I'll add that the elimination of cache pages is a prerequisite
for NUMA support.

Reviewed by:	jeff, kib
Sponsored by:	EMC / Isilon Storage Division
Alan Cox 2015-04-04 19:10:22 +00:00
parent a8b295acf3
commit a8b0f1009d

@@ -81,6 +81,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
+#include <sys/mman.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
@@ -113,7 +114,8 @@ static int vm_fault_additional_pages(vm_page_t, int, int, vm_page_t *, int *);
 #define VM_FAULT_READ_MAX (1 + VM_FAULT_READ_AHEAD_MAX)
 #define VM_FAULT_NINCR (VM_FAULT_READ_MAX / VM_FAULT_READ_BEHIND)
 #define VM_FAULT_SUM (VM_FAULT_NINCR * (VM_FAULT_NINCR + 1) / 2)
-#define VM_FAULT_CACHE_BEHIND (VM_FAULT_READ_BEHIND * VM_FAULT_SUM)
+#define VM_FAULT_DONTNEED_MIN 1048576

 struct faultstate {
 	vm_page_t m;
@@ -128,7 +130,8 @@ struct faultstate {
 	struct vnode *vp;
 };

-static void vm_fault_cache_behind(const struct faultstate *fs, int distance);
+static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr,
+    int ahead);
 static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
     int faultcount, int reqpage);
@@ -566,8 +569,7 @@ RetryFault:;
 				nera = VM_FAULT_READ_AHEAD_MAX;
 				ahead = nera;
 				if (fs.pindex == fs.entry->next_read)
-					vm_fault_cache_behind(&fs,
-					    VM_FAULT_READ_MAX);
+					vm_fault_dontneed(&fs, vaddr, ahead);
 			} else if (fs.pindex == fs.entry->next_read) {
 				/*
 				 * This is a sequential fault.  Arithmetically
@@ -585,8 +587,7 @@ RetryFault:;
 				}
 				ahead = nera;
 				if (era == VM_FAULT_READ_AHEAD_MAX)
-					vm_fault_cache_behind(&fs,
-					    VM_FAULT_CACHE_BEHIND);
+					vm_fault_dontneed(&fs, vaddr, ahead);
 			} else {
 				/*
 				 * This is a non-sequential fault.  Request a
@@ -1034,56 +1035,69 @@ RetryFault:;
 }

 /*
- * Speed up the reclamation of up to "distance" pages that precede the
- * faulting pindex within the first object of the shadow chain.
+ * Speed up the reclamation of pages that precede the faulting pindex within
+ * the first object of the shadow chain.  Essentially, perform the equivalent
+ * to madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes
+ * the faulting pindex by the cluster size when the pages read by vm_fault()
+ * cross a cluster-size boundary.  The cluster size is the greater of the
+ * smallest superpage size and VM_FAULT_DONTNEED_MIN.
+ *
+ * When "fs->first_object" is a shadow object, the pages in the backing object
+ * that precede the faulting pindex are deactivated by vm_fault().  So, this
+ * function must only be concerned with pages in the first object.
  */
 static void
-vm_fault_cache_behind(const struct faultstate *fs, int distance)
+vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead)
 {
+	vm_map_entry_t entry;
 	vm_object_t first_object, object;
-	vm_page_t m, m_prev;
-	vm_pindex_t pindex;
+	vm_offset_t end, start;
+	vm_page_t m, m_next;
+	vm_pindex_t pend, pstart;
+	vm_size_t size;

 	object = fs->object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	first_object = fs->first_object;
 	if (first_object != object) {
-		if (!VM_OBJECT_TRYWLOCK(first_object)) {
+		if (!VM_OBJECT_TRYRLOCK(first_object)) {
 			VM_OBJECT_WUNLOCK(object);
-			VM_OBJECT_WLOCK(first_object);
+			VM_OBJECT_RLOCK(first_object);
 			VM_OBJECT_WLOCK(object);
 		}
 	}
-	/* Neither fictitious nor unmanaged pages can be cached. */
+	/* Neither fictitious nor unmanaged pages can be reclaimed. */
 	if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) {
-		if (fs->first_pindex < distance)
-			pindex = 0;
-		else
-			pindex = fs->first_pindex - distance;
-		if (pindex < OFF_TO_IDX(fs->entry->offset))
-			pindex = OFF_TO_IDX(fs->entry->offset);
-		m = first_object != object ? fs->first_m : fs->m;
-		vm_page_assert_xbusied(m);
-		m_prev = vm_page_prev(m);
-		while ((m = m_prev) != NULL && m->pindex >= pindex &&
-		    m->valid == VM_PAGE_BITS_ALL) {
-			m_prev = vm_page_prev(m);
-			if (vm_page_busied(m))
-				continue;
-			vm_page_lock(m);
-			if (m->hold_count == 0 && m->wire_count == 0) {
-				pmap_remove_all(m);
-				vm_page_aflag_clear(m, PGA_REFERENCED);
-				if (m->dirty != 0)
-					vm_page_deactivate(m);
-				else
-					vm_page_cache(m);
+		size = VM_FAULT_DONTNEED_MIN;
+		if (MAXPAGESIZES > 1 && size < pagesizes[1])
+			size = pagesizes[1];
+		end = rounddown2(vaddr, size);
+		if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) &&
+		    (entry = fs->entry)->start < end) {
+			if (end - entry->start < size)
+				start = entry->start;
+			else
+				start = end - size;
+			pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED);
+			pstart = OFF_TO_IDX(entry->offset) + atop(start -
+			    entry->start);
+			m_next = vm_page_find_least(first_object, pstart);
+			pend = OFF_TO_IDX(entry->offset) + atop(end -
+			    entry->start);
+			while ((m = m_next) != NULL && m->pindex < pend) {
+				m_next = TAILQ_NEXT(m, listq);
+				if (m->valid != VM_PAGE_BITS_ALL ||
+				    vm_page_busied(m))
+					continue;
+				vm_page_lock(m);
+				if (m->hold_count == 0 && m->wire_count == 0)
+					vm_page_advise(m, MADV_DONTNEED);
+				vm_page_unlock(m);
 			}
-			vm_page_unlock(m);
 		}
 	}
 	if (first_object != object)
-		VM_OBJECT_WUNLOCK(first_object);
+		VM_OBJECT_RUNLOCK(first_object);
 }

 /*
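To make the cluster arithmetic in vm_fault_dontneed()'s comment concrete,
here is a hypothetical standalone sketch, not kernel code.  It assumes
4 KB pages and a 2 MB smallest superpage (pagesizes[1]), as on amd64,
reproduces rounddown2() from sys/param.h, and uses arbitrary example
addresses and an arbitrary read-ahead count:

/*
 * Standalone sketch of vm_fault_dontneed()'s trigger arithmetic.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE		4096UL		/* assumed 4 KB pages */
#define VM_FAULT_DONTNEED_MIN	1048576UL	/* 1 MB, from the patch */
#define SUPERPAGE_SIZE		(2UL * 1024 * 1024) /* assumed pagesizes[1] */

/* rounddown2() as defined in sys/param.h. */
#define rounddown2(x, y)	((x) & ~((y) - 1))

int
main(void)
{
	uintptr_t end, entry_start, start, vaddr;
	unsigned long size;
	int ahead = 7;			/* example read-ahead, in pages */

	/* Cluster size: the greater of the superpage size and the minimum. */
	size = VM_FAULT_DONTNEED_MIN;
	if (size < SUPERPAGE_SIZE)
		size = SUPERPAGE_SIZE;	/* 2 MB here */

	entry_start = 0x200000;		/* example map entry start */
	vaddr = 0x5ff000;		/* example faulting address */
	end = rounddown2(vaddr, size);	/* cluster boundary behind vaddr */

	/*
	 * Fire only when this fault's page plus its read-ahead window
	 * crosses the next cluster-size boundary, i.e., when
	 * vaddr + PAGE_SIZE + ptoa(ahead) reaches end + size.
	 */
	if (vaddr - end >= size - PAGE_SIZE - ahead * PAGE_SIZE &&
	    entry_start < end) {
		if (end - entry_start < size)
			start = entry_start;
		else
			start = end - size;
		printf("advise MADV_DONTNEED on [%#lx, %#lx)\n",
		    (unsigned long)start, (unsigned long)end);
	} else
		printf("no cluster boundary crossed by this fault\n");
	return (0);
}

With these example numbers the condition holds (0x5ff000 - 0x400000 =
0x1ff000 >= 0x1f8000), so the 2 MB cluster [0x200000, 0x400000) behind
the faulting address would be advised away, mirroring what the kernel
code above does via pmap_advise() and vm_page_advise().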