Remove the panic from vm_radix_insert() and propagate the error to the
callers of vm_page_insert().

The default action for every caller is to unwind the operation, except
for vm_page_rename(), where unwinding has proven impossible.  In that
case the function simply spins, waking the page daemon between attempts,
until the insertion succeeds.  Since vm_page_rename() is comparatively
rare (and the old panic has never been hit in practice), this is thought
to be a very seldom event and not a performance concern.
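
In code terms, the retry policy for that case looks roughly like the
sketch below (the wrapper function name is made up for illustration; the
real change is in the vm_page.c hunk further down):

        /*
         * Illustrative sketch only: a caller that cannot unwind keeps
         * retrying the insertion instead of panicking, nudging the page
         * daemon and backing off between attempts.
         */
        static void
        rename_retry_sketch(vm_page_t m, vm_object_t new_object,
            vm_pindex_t new_pindex)
        {
                u_int i;

                while (vm_page_insert(m, new_object, new_pindex) != 0) {
                        /* Wake the page daemon so memory can be reclaimed. */
                        pagedaemon_wakeup();
                        /* Crude back-off before the next attempt. */
                        for (i = 0; i < 10000000; i++)
                                cpu_spinwait();
                }
        }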

The patch has been tested by using an atomic counter to make the zone
allocator return NULL once every 100000 allocations.  Printf
instrumentation showed that a typical buildkernel triggers this failure
path about 30 times.  The patch survived 2 hours of repeated
buildkernel/world runs.
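
For reference, the failure injection described above can be approximated
along the following lines; the hook function and counter names are
hypothetical, and this debugging code is not part of the commit itself:

        #include <sys/param.h>
        #include <sys/systm.h>
        #include <machine/atomic.h>
        #include <vm/uma.h>

        /* Hypothetical fault-injection counter, not part of the commit. */
        static u_int radix_alloc_calls;

        static struct vm_radix_node *
        radix_node_alloc_sketch(void)
        {

                /* Fail roughly one allocation out of every 100000. */
                if (atomic_fetchadd_int(&radix_alloc_calls, 1) % 100000 == 0)
                        return (NULL);
                return (uma_zalloc(vm_radix_node_zone, M_NOWAIT | M_ZERO));
        }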

Several technical notes:
- vm_page_insert() is moved, in several callers, closer to the failure
  points.  This part could be committed separately, before vmcontention
  hits the tree, just to verify that -CURRENT is happy with it.
- vm_page_rename() no longer requires its callers to hold the page lock:
  that is an implementation detail, so the locking is now done internally.
- vm_page_insert() now returns an int, with 0 meaning everything went
  fine, so the KPI is broken by this patch (a caller-side sketch follows
  below).
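
The caller-side pattern after this change looks roughly as follows (a
hedged sketch with a made-up wrapper name; compare with the
vm_page_alloc() and vm_pgmoveco() hunks below):

        /*
         * Sketch of the new caller convention: vm_page_insert() returns
         * 0 on success and an error when the radix node allocation
         * fails, which the caller handles by undoing its own work.
         */
        static int
        insert_or_unwind_sketch(vm_page_t m, vm_object_t object,
            vm_pindex_t pindex)
        {

                VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
                if (vm_page_insert(m, object, pindex) != 0) {
                        /* Unwind: free the page and report the failure. */
                        vm_page_lock(m);
                        vm_page_free(m);
                        vm_page_unlock(m);
                        return (ENOMEM);
                }
                return (0);
        }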
attilio 2012-02-05 17:37:26 +00:00
parent def86adf8c
commit 6587a6afdd
8 changed files with 135 additions and 28 deletions


@ -85,6 +85,7 @@ vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr)
vm_map_entry_t entry;
vm_pindex_t upindex;
vm_prot_t prot;
vm_page_bits_t vbits;
boolean_t wired;
KASSERT((uaddr & PAGE_MASK) == 0,
@ -95,6 +96,7 @@ vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr)
* unwired in sf_buf_mext().
*/
kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));
vbits = kern_pg->valid;
kern_pg->valid = VM_PAGE_BITS_ALL;
KASSERT(kern_pg->queue == PQ_NONE && kern_pg->wire_count == 1,
("vm_pgmoveco: kern_pg is not correctly wired"));
@ -105,6 +107,13 @@ vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr)
return(EFAULT);
}
VM_OBJECT_LOCK(uobject);
if (vm_page_insert(kern_pg, uobject, upindex) != 0) {
kern_pg->valid = vbits;
VM_OBJECT_UNLOCK(uobject);
vm_map_lookup_done(map, entry);
return(ENOMEM);
}
vm_page_dirty(kern_pg);
retry:
if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
if (vm_page_sleep_if_busy(user_pg, TRUE, "vm_pgmoveco"))
@ -122,8 +131,6 @@ vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr)
if (uobject->backing_object != NULL)
pmap_remove(map->pmap, uaddr, uaddr + PAGE_SIZE);
}
vm_page_insert(kern_pg, uobject, upindex);
vm_page_dirty(kern_pg);
VM_OBJECT_UNLOCK(uobject);
vm_map_lookup_done(map, entry);
return(KERN_SUCCESS);


@ -307,11 +307,14 @@ old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot,
*/
page = vm_page_getfake(paddr, memattr);
VM_OBJECT_LOCK(object);
if (vm_page_insert(page, object, offset) != 0) {
vm_page_putfake(page);
return (VM_PAGER_FAIL);
}
vm_page_lock(*mres);
vm_page_free(*mres);
vm_page_unlock(*mres);
*mres = page;
vm_page_insert(page, object, pidx);
}
page->valid = VM_PAGE_BITS_ALL;
return (VM_PAGER_OK);


@ -179,6 +179,10 @@ sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
/* Construct a new fake page. */
page = vm_page_getfake(paddr, memattr);
VM_OBJECT_LOCK(object);
if (vm_page_insert(page, object, offset) != 0) {
vm_page_putfake(page);
return (VM_PAGER_FAIL);
}
TAILQ_INSERT_TAIL(&object->un_pager.sgp.sgp_pglist, page, pageq);
/* Free the original pages and insert this fake page into the object. */
@ -187,7 +191,6 @@ sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
vm_page_free(m[i]);
vm_page_unlock(m[i]);
}
vm_page_insert(page, object, offset);
m[reqpage] = page;
page->valid = VM_PAGE_BITS_ALL;


@ -769,9 +769,7 @@ RetryFault:;
* process'es object. The page is
* automatically made dirty.
*/
vm_page_lock(fs.m);
vm_page_rename(fs.m, fs.first_object, fs.first_pindex);
vm_page_unlock(fs.m);
vm_page_busy(fs.m);
fs.first_m = fs.m;
fs.m = NULL;


@ -1397,9 +1397,7 @@ vm_object_split(vm_map_entry_t entry)
vm_reserv_rename(m, new_object, orig_object,
offidxstart);
#endif
vm_page_lock(m);
vm_page_rename(m, new_object, idx);
vm_page_unlock(m);
/*
* page automatically made dirty by rename and
* cache handled
@ -1654,9 +1652,7 @@ vm_object_backing_scan(vm_object_t object, int op)
* If the page was mapped to a process, it can remain
* mapped through the rename.
*/
vm_page_lock(p);
vm_page_rename(p, object, new_pindex);
vm_page_unlock(p);
/* page automatically made dirty by rename */
}
}


@ -109,6 +109,7 @@ __FBSDID("$FreeBSD$");
#include <vm/uma.h>
#include <vm/uma_int.h>
#include <machine/cpu.h>
#include <machine/md_var.h>
/*
@ -777,13 +778,15 @@ vm_page_dirty(vm_page_t m)
* The object and page must be locked.
* This routine may not block.
*/
void
int
vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
{
vm_page_t neighbor;
vm_pindex_t cpindex;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
if (m->object != NULL)
panic("vm_page_insert: page already inserted");
cpindex = m->pindex;
/*
* Record the object/offset pair in this page
@ -804,8 +807,13 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
} else
TAILQ_INSERT_TAIL(&object->memq, m, listq);
}
if (vm_radix_insert(&object->rtree, pindex, m) != 0)
panic("vm_page_insert: unable to insert the new page");
if (vm_radix_insert(&object->rtree, pindex, m) != 0) {
TAILQ_REMOVE(&object->memq, m, listq);
m->object = NULL;
m->pindex = cpindex;
return (ENOMEM);
}
/*
* show that the object has one more resident page.
@ -823,6 +831,7 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
*/
if (m->aflags & PGA_WRITEABLE)
vm_object_set_writeable_dirty(object);
return (0);
}
/*
@ -967,9 +976,20 @@ vm_page_prev(vm_page_t m)
void
vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
{
u_int i;
MPASS(m->object != NULL);
VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
vm_page_lock(m);
vm_page_remove(m);
vm_page_insert(m, new_object, new_pindex);
vm_page_unlock(m);
while (vm_page_insert(m, new_object, new_pindex) != 0) {
pagedaemon_wakeup();
for (i = 0; i < 10000000; i++)
cpu_spinwait();
}
vm_page_dirty(m);
}
@ -1250,7 +1270,19 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
if (object->memattr != VM_MEMATTR_DEFAULT &&
object->type != OBJT_DEVICE && object->type != OBJT_SG)
pmap_page_set_memattr(m, object->memattr);
vm_page_insert(m, object, pindex);
if (vm_page_insert(m, object, pindex) != 0) {
/* See the comment below about hold count handling. */
if (vp != NULL)
vdrop(vp);
vm_page_lock(m);
if (req & VM_ALLOC_WIRED)
vm_page_unwire(m, 0);
vm_page_free(m);
vm_page_unlock(m);
pagedaemon_wakeup();
return (NULL);
}
} else
m->pindex = pindex;
@ -1317,6 +1349,7 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
{
struct vnode *drop;
vm_page_t deferred_vdrop_list, m, m_ret;
vm_pindex_t cpindex;
u_int flags, oflags;
int req_class;
@ -1403,6 +1436,7 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
memattr == VM_MEMATTR_DEFAULT)
memattr = object->memattr;
}
cpindex = pindex;
for (m = m_ret; m < &m_ret[npages]; m++) {
m->aflags = 0;
m->flags &= flags;
@ -1412,12 +1446,30 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
m->oflags = oflags;
if (memattr != VM_MEMATTR_DEFAULT)
pmap_page_set_memattr(m, memattr);
if (object != NULL)
vm_page_insert(m, object, pindex);
else
m->pindex = pindex;
pindex++;
}
for (m = m_ret; m < &m_ret[npages]; m++) {
if (object != NULL) {
if (vm_page_insert(m, object, cpindex) != 0) {
while (deferred_vdrop_list != NULL) {
vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
deferred_vdrop_list =
deferred_vdrop_list->pageq.tqe_next;
}
for (m = m_ret; m < &m_ret[npages]; m++) {
vm_page_lock(m);
if (req & VM_ALLOC_WIRED)
vm_page_unwire(m, 0);
vm_page_free(m);
vm_page_unlock(m);
}
pagedaemon_wakeup();
return (NULL);
}
} else
m->pindex = cpindex;
cpindex++;
}
while (deferred_vdrop_list != NULL) {
vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next;
@ -2642,11 +2694,8 @@ vm_page_cowfault(vm_page_t m)
pindex = m->pindex;
retry_alloc:
pmap_remove_all(m);
vm_page_remove(m);
mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
if (mnew == NULL) {
vm_page_insert(m, object, pindex);
vm_page_unlock(m);
VM_OBJECT_UNLOCK(object);
VM_WAIT;
@ -2672,8 +2721,9 @@ vm_page_cowfault(vm_page_t m)
vm_page_lock(mnew);
vm_page_free(mnew);
vm_page_unlock(mnew);
vm_page_insert(m, object, pindex);
} else { /* clear COW & copy page */
pmap_remove_all(m);
vm_page_remove(m);
if (!so_zerocp_fullpage)
pmap_copy_page(m, mnew);
mnew->valid = VM_PAGE_BITS_ALL;


@ -386,7 +386,7 @@ void vm_page_dontneed(vm_page_t);
void vm_page_deactivate (vm_page_t);
vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr);
void vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
vm_page_t vm_page_next(vm_page_t m);
int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *);


@ -55,6 +55,7 @@
#include <sys/kdb.h>
CTASSERT(sizeof(struct vm_radix_node) < PAGE_SIZE);
CTASSERT((sizeof(u_int) * NBBY) >= VM_RADIX_LIMIT);
static uma_zone_t vm_radix_node_zone;
@ -211,6 +212,24 @@ vm_radix_setroot(struct vm_radix *rtree, struct vm_radix_node *rnode,
rtree->rt_root = root;
}
static inline void
vm_radix_unwind_heightup(struct vm_radix *rtree, struct vm_radix_node *root,
struct vm_radix_node *iroot, int ilevel)
{
struct vm_radix_node *rnode;
CTR4(KTR_VM, "unwind: tree %p, root %p, iroot %p, ilevel %d",
rtree, root, iroot, ilevel);
while (iroot != root && root != NULL) {
rnode = root;
MPASS(rnode->rn_count == 0 || rnode->rn_count == 1);
rnode->rn_count = 0;
root = rnode->rn_child[0];
vm_radix_node_put(rnode);
}
vm_radix_setroot(rtree, iroot, ilevel);
}
static inline void *
vm_radix_match(void *child, int color)
{
@ -262,10 +281,9 @@ vm_radix_reclaim_allnodes_internal(struct vm_radix_node *rnode, int level)
int
vm_radix_insert(struct vm_radix *rtree, vm_pindex_t index, void *val)
{
struct vm_radix_node *rnode;
struct vm_radix_node *root;
int level;
int slot;
struct vm_radix_node *iroot, *rnode, *root;
u_int allocmsk;
int clev, ilevel, level, slot;
CTR3(KTR_VM,
"insert: tree %p, index %ju, val %p", rtree, (uintmax_t)index, val);
@ -276,6 +294,8 @@ vm_radix_insert(struct vm_radix *rtree, vm_pindex_t index, void *val)
* Increase the height by adding nodes at the root until
* there is sufficient space.
*/
ilevel = level;
iroot = root;
while (level == 0 || index > VM_RADIX_MAX(level)) {
CTR3(KTR_VM, "insert: expanding %ju > %ju height %d",
(uintmax_t)index, (uintmax_t)VM_RADIX_MAX(level), level);
@ -292,6 +312,8 @@ vm_radix_insert(struct vm_radix *rtree, vm_pindex_t index, void *val)
CTR4(KTR_VM,
"insert: tree %p, root %p, index: %ju, level: %d ENOMEM",
rtree, root, (uintmax_t)index, level);
vm_radix_unwind_heightup(rtree, root, iroot,
ilevel);
return (ENOMEM);
}
/*
@ -309,6 +331,8 @@ vm_radix_insert(struct vm_radix *rtree, vm_pindex_t index, void *val)
}
/* Now that the tree is tall enough, fill in the path to the index. */
allocmsk = 0;
clev = level;
rnode = root;
for (level = level - 1; level > 0; level--) {
slot = vm_radix_slot(index, level);
@ -324,9 +348,35 @@ vm_radix_insert(struct vm_radix *rtree, vm_pindex_t index, void *val)
"insert: tree %p, rnode %p, child %p, count %u ENOMEM",
rtree, rnode, rnode->rn_child[slot],
rnode->rn_count);
MPASS(level != clev || allocmsk == 0);
while (allocmsk != 0) {
rnode = root;
level = clev;
level--;
CTR4(KTR_VM,
"insert: unwind root %p, level %d, slot %d, allocmsk: 0x%x",
root, level, slot, allocmsk);
slot = vm_radix_slot(index, level);
MPASS(level >= (ffs(allocmsk) - 1));
while (level > (ffs(allocmsk) - 1)) {
MPASS(level > 0);
slot = vm_radix_slot(index,
level);
rnode = rnode->rn_child[slot];
level--;
}
MPASS((allocmsk & (1 << level)) != 0);
allocmsk &= ~(1 << level);
rnode->rn_count--;
vm_radix_node_put(rnode->rn_child[slot]);
rnode->rn_child[slot] = NULL;
}
vm_radix_unwind_heightup(rtree, root, iroot,
ilevel);
return (ENOMEM);
}
rnode->rn_count++;
allocmsk |= (1 << level);
}
CTR5(KTR_VM,
"insert: tree %p, index %ju, level %d, slot %d, rnode %p",