Give vm_fault()'s sequential access optimization a makeover.
There are two aspects to the sequential access optimization: (1) read ahead of pages that are expected to be accessed in the near future and (2) unmap and cache behind of pages that are not expected to be accessed again. This revision changes both aspects. The read ahead optimization is now more effective. It starts with the same initial read window as before, but arithmetically grows the window on sequential page faults. This can yield increased read bandwidth. For example, on one of my machines, a program using mmap() to read a file that is several times larger than the machine's physical memory takes about 17% less time to complete. The unmap and cache behind optimization is now more selectively applied. The read ahead window must grow to its maximum size before unmap and cache behind is performed. This significantly reduces the number of times that pages are unmapped and cached only to be reactivated a short time later. The unmap and cache behind optimization now clears each page's referenced flag. Previously, in the case of dirty pages, if the containing file was still mapped at the time that the page daemon examined the dirty pages, they would be reactivated. From a stylistic standpoint, this revision also cleanly separates the implementation of the read ahead and unmap/cache behind optimizations. Glanced at: kib MFC after: 2 weeks
This commit is contained in:
parent
eecb732d59
commit
13458803f4
@ -118,9 +118,11 @@ static int prefault_pageorder[] = {
|
||||
static int vm_fault_additional_pages(vm_page_t, int, int, vm_page_t *, int *);
|
||||
static void vm_fault_prefault(pmap_t, vm_offset_t, vm_map_entry_t);
|
||||
|
||||
#define VM_FAULT_READ_AHEAD 8
|
||||
#define VM_FAULT_READ_BEHIND 7
|
||||
#define VM_FAULT_READ (VM_FAULT_READ_AHEAD+VM_FAULT_READ_BEHIND+1)
|
||||
#define VM_FAULT_READ_BEHIND 8
|
||||
#define VM_FAULT_READ_MAX (1 + VM_FAULT_READ_AHEAD_MAX)
|
||||
#define VM_FAULT_NINCR (VM_FAULT_READ_MAX / VM_FAULT_READ_BEHIND)
|
||||
#define VM_FAULT_SUM (VM_FAULT_NINCR * (VM_FAULT_NINCR + 1) / 2)
|
||||
#define VM_FAULT_CACHE_BEHIND (VM_FAULT_READ_BEHIND * VM_FAULT_SUM)
|
||||
|
||||
struct faultstate {
|
||||
vm_page_t m;
|
||||
@ -136,6 +138,8 @@ struct faultstate {
|
||||
int vfslocked;
|
||||
};
|
||||
|
||||
static void vm_fault_cache_behind(const struct faultstate *fs, int distance);
|
||||
|
||||
static inline void
|
||||
release_page(struct faultstate *fs)
|
||||
{
|
||||
@ -236,13 +240,13 @@ vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
|
||||
int fault_flags, vm_page_t *m_hold)
|
||||
{
|
||||
vm_prot_t prot;
|
||||
int is_first_object_locked, result;
|
||||
boolean_t growstack, wired;
|
||||
long ahead, behind;
|
||||
int alloc_req, era, faultcount, nera, reqpage, result;
|
||||
boolean_t growstack, is_first_object_locked, wired;
|
||||
int map_generation;
|
||||
vm_object_t next_object;
|
||||
vm_page_t marray[VM_FAULT_READ], mt, mt_prev;
|
||||
vm_page_t marray[VM_FAULT_READ_MAX];
|
||||
int hardfault;
|
||||
int faultcount, ahead, behind, alloc_req;
|
||||
struct faultstate fs;
|
||||
struct vnode *vp;
|
||||
int locked, error;
|
||||
@ -252,7 +256,7 @@ vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
|
||||
PCPU_INC(cnt.v_vm_faults);
|
||||
fs.vp = NULL;
|
||||
fs.vfslocked = 0;
|
||||
faultcount = behind = 0;
|
||||
faultcount = reqpage = 0;
|
||||
|
||||
RetryFault:;
|
||||
|
||||
@ -460,75 +464,47 @@ RetryFault:;
|
||||
*/
|
||||
if (TRYPAGER) {
|
||||
int rv;
|
||||
int reqpage = 0;
|
||||
u_char behavior = vm_map_entry_behavior(fs.entry);
|
||||
|
||||
if (behavior == MAP_ENTRY_BEHAV_RANDOM ||
|
||||
P_KILLED(curproc)) {
|
||||
ahead = 0;
|
||||
behind = 0;
|
||||
ahead = 0;
|
||||
} else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) {
|
||||
behind = 0;
|
||||
ahead = atop(fs.entry->end - vaddr) - 1;
|
||||
if (ahead > VM_FAULT_READ_AHEAD_MAX)
|
||||
ahead = VM_FAULT_READ_AHEAD_MAX;
|
||||
if (fs.pindex == fs.entry->next_read)
|
||||
vm_fault_cache_behind(&fs,
|
||||
VM_FAULT_READ_MAX);
|
||||
} else {
|
||||
behind = (vaddr - fs.entry->start) >> PAGE_SHIFT;
|
||||
/*
|
||||
* If this is a sequential page fault, then
|
||||
* arithmetically increase the number of pages
|
||||
* in the read-ahead window. Otherwise, reset
|
||||
* the read-ahead window to its smallest size.
|
||||
*/
|
||||
behind = atop(vaddr - fs.entry->start);
|
||||
if (behind > VM_FAULT_READ_BEHIND)
|
||||
behind = VM_FAULT_READ_BEHIND;
|
||||
|
||||
ahead = ((fs.entry->end - vaddr) >> PAGE_SHIFT) - 1;
|
||||
if (ahead > VM_FAULT_READ_AHEAD)
|
||||
ahead = VM_FAULT_READ_AHEAD;
|
||||
ahead = atop(fs.entry->end - vaddr) - 1;
|
||||
era = fs.entry->read_ahead;
|
||||
if (fs.pindex == fs.entry->next_read) {
|
||||
nera = era + behind;
|
||||
if (nera > VM_FAULT_READ_AHEAD_MAX)
|
||||
nera = VM_FAULT_READ_AHEAD_MAX;
|
||||
behind = 0;
|
||||
if (ahead > nera)
|
||||
ahead = nera;
|
||||
if (era == VM_FAULT_READ_AHEAD_MAX)
|
||||
vm_fault_cache_behind(&fs,
|
||||
VM_FAULT_CACHE_BEHIND);
|
||||
} else if (ahead > VM_FAULT_READ_AHEAD_MIN)
|
||||
ahead = VM_FAULT_READ_AHEAD_MIN;
|
||||
if (era != ahead)
|
||||
fs.entry->read_ahead = ahead;
|
||||
}
|
||||
is_first_object_locked = FALSE;
|
||||
if ((behavior == MAP_ENTRY_BEHAV_SEQUENTIAL ||
|
||||
(behavior != MAP_ENTRY_BEHAV_RANDOM &&
|
||||
fs.pindex >= fs.entry->lastr &&
|
||||
fs.pindex < fs.entry->lastr + VM_FAULT_READ)) &&
|
||||
(fs.first_object == fs.object ||
|
||||
(is_first_object_locked = VM_OBJECT_TRYLOCK(fs.first_object))) &&
|
||||
fs.first_object->type != OBJT_DEVICE &&
|
||||
fs.first_object->type != OBJT_PHYS &&
|
||||
fs.first_object->type != OBJT_SG) {
|
||||
vm_pindex_t firstpindex;
|
||||
|
||||
if (fs.first_pindex < 2 * VM_FAULT_READ)
|
||||
firstpindex = 0;
|
||||
else
|
||||
firstpindex = fs.first_pindex - 2 * VM_FAULT_READ;
|
||||
mt = fs.first_object != fs.object ?
|
||||
fs.first_m : fs.m;
|
||||
KASSERT(mt != NULL, ("vm_fault: missing mt"));
|
||||
KASSERT((mt->oflags & VPO_BUSY) != 0,
|
||||
("vm_fault: mt %p not busy", mt));
|
||||
mt_prev = vm_page_prev(mt);
|
||||
|
||||
/*
|
||||
* note: partially valid pages cannot be
|
||||
* included in the lookahead - NFS piecemeal
|
||||
* writes will barf on it badly.
|
||||
*/
|
||||
while ((mt = mt_prev) != NULL &&
|
||||
mt->pindex >= firstpindex &&
|
||||
mt->valid == VM_PAGE_BITS_ALL) {
|
||||
mt_prev = vm_page_prev(mt);
|
||||
if (mt->busy ||
|
||||
(mt->oflags & VPO_BUSY))
|
||||
continue;
|
||||
vm_page_lock(mt);
|
||||
if (mt->hold_count ||
|
||||
mt->wire_count) {
|
||||
vm_page_unlock(mt);
|
||||
continue;
|
||||
}
|
||||
pmap_remove_all(mt);
|
||||
if (mt->dirty != 0)
|
||||
vm_page_deactivate(mt);
|
||||
else
|
||||
vm_page_cache(mt);
|
||||
vm_page_unlock(mt);
|
||||
}
|
||||
ahead += behind;
|
||||
behind = 0;
|
||||
}
|
||||
if (is_first_object_locked)
|
||||
VM_OBJECT_UNLOCK(fs.first_object);
|
||||
|
||||
/*
|
||||
* Call the pager to retrieve the data, if any, after
|
||||
@ -899,7 +875,7 @@ RetryFault:;
|
||||
* without holding a write lock on it.
|
||||
*/
|
||||
if (hardfault)
|
||||
fs.entry->lastr = fs.pindex + faultcount - behind;
|
||||
fs.entry->next_read = fs.pindex + faultcount - reqpage;
|
||||
|
||||
if ((prot & VM_PROT_WRITE) != 0 ||
|
||||
(fault_flags & VM_FAULT_DIRTY) != 0) {
|
||||
@ -991,6 +967,60 @@ RetryFault:;
|
||||
return (KERN_SUCCESS);
|
||||
}
|
||||
|
||||
/*
|
||||
* Speed up the reclamation of up to "distance" pages that precede the
|
||||
* faulting pindex within the first object of the shadow chain.
|
||||
*/
|
||||
static void
|
||||
vm_fault_cache_behind(const struct faultstate *fs, int distance)
|
||||
{
|
||||
vm_object_t first_object, object;
|
||||
vm_page_t m, m_prev;
|
||||
vm_pindex_t pindex;
|
||||
|
||||
object = fs->object;
|
||||
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
|
||||
first_object = fs->first_object;
|
||||
if (first_object != object) {
|
||||
if (!VM_OBJECT_TRYLOCK(first_object)) {
|
||||
VM_OBJECT_UNLOCK(object);
|
||||
VM_OBJECT_LOCK(first_object);
|
||||
VM_OBJECT_LOCK(object);
|
||||
}
|
||||
}
|
||||
if (first_object->type != OBJT_DEVICE &&
|
||||
first_object->type != OBJT_PHYS && first_object->type != OBJT_SG) {
|
||||
if (fs->first_pindex < distance)
|
||||
pindex = 0;
|
||||
else
|
||||
pindex = fs->first_pindex - distance;
|
||||
if (pindex < OFF_TO_IDX(fs->entry->offset))
|
||||
pindex = OFF_TO_IDX(fs->entry->offset);
|
||||
m = first_object != object ? fs->first_m : fs->m;
|
||||
KASSERT((m->oflags & VPO_BUSY) != 0,
|
||||
("vm_fault_cache_behind: page %p is not busy", m));
|
||||
m_prev = vm_page_prev(m);
|
||||
while ((m = m_prev) != NULL && m->pindex >= pindex &&
|
||||
m->valid == VM_PAGE_BITS_ALL) {
|
||||
m_prev = vm_page_prev(m);
|
||||
if (m->busy != 0 || (m->oflags & VPO_BUSY) != 0)
|
||||
continue;
|
||||
vm_page_lock(m);
|
||||
if (m->hold_count == 0 && m->wire_count == 0) {
|
||||
pmap_remove_all(m);
|
||||
vm_page_aflag_clear(m, PGA_REFERENCED);
|
||||
if (m->dirty != 0)
|
||||
vm_page_deactivate(m);
|
||||
else
|
||||
vm_page_cache(m);
|
||||
}
|
||||
vm_page_unlock(m);
|
||||
}
|
||||
}
|
||||
if (first_object != object)
|
||||
VM_OBJECT_UNLOCK(first_object);
|
||||
}
|
||||
|
||||
/*
|
||||
* vm_fault_prefault provides a quick way of clustering
|
||||
* pagefaults into a processes address space. It is a "cousin"
|
||||
|
@ -1300,6 +1300,8 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
|
||||
new_entry->protection = prot;
|
||||
new_entry->max_protection = max;
|
||||
new_entry->wired_count = 0;
|
||||
new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
|
||||
new_entry->next_read = OFF_TO_IDX(offset);
|
||||
|
||||
KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
|
||||
("OVERCOMMIT: vm_map_insert leaks vm_map %p", new_entry));
|
||||
|
@ -112,8 +112,9 @@ struct vm_map_entry {
|
||||
vm_prot_t protection; /* protection code */
|
||||
vm_prot_t max_protection; /* maximum protection */
|
||||
vm_inherit_t inheritance; /* inheritance */
|
||||
uint8_t read_ahead; /* pages in the read-ahead window */
|
||||
int wired_count; /* can be paged if = 0 */
|
||||
vm_pindex_t lastr; /* last read */
|
||||
vm_pindex_t next_read; /* index of the next sequential read */
|
||||
struct ucred *cred; /* tmp storage for creator ref */
|
||||
};
|
||||
|
||||
@ -329,6 +330,14 @@ long vmspace_wired_count(struct vmspace *vmspace);
|
||||
#define VM_FAULT_CHANGE_WIRING 1 /* Change the wiring as appropriate */
|
||||
#define VM_FAULT_DIRTY 2 /* Dirty the page; use w/VM_PROT_COPY */
|
||||
|
||||
/*
|
||||
* Initially, mappings are slightly sequential. The maximum window size must
|
||||
* account for the map entry's "read_ahead" field being defined as an uint8_t.
|
||||
*/
|
||||
#define VM_FAULT_READ_AHEAD_MIN 7
|
||||
#define VM_FAULT_READ_AHEAD_INIT 15
|
||||
#define VM_FAULT_READ_AHEAD_MAX min(atop(MAXPHYS) - 1, UINT8_MAX)
|
||||
|
||||
/*
|
||||
* The following "find_space" options are supported by vm_map_find()
|
||||
*/
|
||||
|
Loading…
Reference in New Issue
Block a user