Reimplement contigmalloc(9) with an algorithm which stands a greatly-
improved chance of working despite pressure from running programs. Instead of trying to throw a bunch of pages out to swap and hope for the best, only a range that can potentially fulfill contigmalloc(9)'s request will have its contents paged out (potentially, not forcibly) at a time. The new contigmalloc operation still operates in three passes, but it could potentially be tuned to more or less. The first pass only looks at pages in the cache and free pages, so they would be thrown out without having to block. If this is not enough, the subsequent passes page out any unwired memory. To combat memory pressure refragmenting the section of memory being laundered, each page is removed from the system's free memory queue once it has been freed so that blocking later doesn't cause the memory laundered so far to get reallocated. The page-out operations are now blocking, as it would make little sense to try to push out a page, then get its status immediately afterward to remove it from the available free pages queue, if it's unlikely to have been freed. Another change is that if KVA allocation fails, the allocated memory segment will be freed and not leaked. There is a sysctl/tunable, defaulting to on, which causes the old contigmalloc() algorithm to be used. Nonetheless, I have been using vm.old_contigmalloc=0 for over a month. It is safe to switch at run-time to see the difference it makes. A new interface has been added which does not require mapping the allocated pages into KVA: vm_page.h functions vm_page_alloc_contig() and vm_page_release_contig(). These are what vm.old_contigmalloc=0 uses internally, so the sysctl/tunable does not affect their operation. When using the contigmalloc(9) and contigfree(9) interfaces, memory is now tracked with malloc(9) stats. Several functions have been exported from kern_malloc.c to allow other subsystems to use these statistics, as well. This invalidates the BUGS section of the contigmalloc(9) manpage.
This commit is contained in:
parent
4a5be3f70a
commit
4362fada8f
@ -175,6 +175,47 @@ malloc_last_fail(void)
|
||||
return (time_uptime - t_malloc_fail);
|
||||
}
|
||||
|
||||
/*
|
||||
* Add this to the informational malloc_type bucket.
|
||||
*/
|
||||
static void
|
||||
malloc_type_zone_allocated(struct malloc_type *ksp, unsigned long size,
|
||||
int zindx)
|
||||
{
|
||||
mtx_lock(&ksp->ks_mtx);
|
||||
ksp->ks_calls++;
|
||||
if (zindx != -1)
|
||||
ksp->ks_size |= 1 << zindx;
|
||||
if (size != 0) {
|
||||
ksp->ks_memuse += size;
|
||||
ksp->ks_inuse++;
|
||||
if (ksp->ks_memuse > ksp->ks_maxused)
|
||||
ksp->ks_maxused = ksp->ks_memuse;
|
||||
}
|
||||
mtx_unlock(&ksp->ks_mtx);
|
||||
}
|
||||
|
||||
void
|
||||
malloc_type_allocated(struct malloc_type *ksp, unsigned long size)
|
||||
{
|
||||
malloc_type_zone_allocated(ksp, size, -1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove this allocation from the informational malloc_type bucket.
|
||||
*/
|
||||
void
|
||||
malloc_type_freed(struct malloc_type *ksp, unsigned long size)
|
||||
{
|
||||
mtx_lock(&ksp->ks_mtx);
|
||||
KASSERT(size <= ksp->ks_memuse,
|
||||
("malloc(9)/free(9) confusion.\n%s",
|
||||
"Probably freeing with wrong type, but maybe not here."));
|
||||
ksp->ks_memuse -= size;
|
||||
ksp->ks_inuse--;
|
||||
mtx_unlock(&ksp->ks_mtx);
|
||||
}
|
||||
|
||||
/*
|
||||
* malloc:
|
||||
*
|
||||
@ -196,7 +237,6 @@ malloc(size, type, flags)
|
||||
#ifdef DIAGNOSTIC
|
||||
unsigned long osize = size;
|
||||
#endif
|
||||
register struct malloc_type *ksp = type;
|
||||
|
||||
#ifdef INVARIANTS
|
||||
/*
|
||||
@ -242,29 +282,16 @@ malloc(size, type, flags)
|
||||
krequests[size >> KMEM_ZSHIFT]++;
|
||||
#endif
|
||||
va = uma_zalloc(zone, flags);
|
||||
mtx_lock(&ksp->ks_mtx);
|
||||
if (va == NULL)
|
||||
goto out;
|
||||
|
||||
ksp->ks_size |= 1 << indx;
|
||||
size = keg->uk_size;
|
||||
if (va != NULL)
|
||||
size = keg->uk_size;
|
||||
malloc_type_zone_allocated(type, va == NULL ? 0 : size, indx);
|
||||
} else {
|
||||
size = roundup(size, PAGE_SIZE);
|
||||
zone = NULL;
|
||||
keg = NULL;
|
||||
va = uma_large_malloc(size, flags);
|
||||
mtx_lock(&ksp->ks_mtx);
|
||||
if (va == NULL)
|
||||
goto out;
|
||||
malloc_type_allocated(type, va == NULL ? 0 : size);
|
||||
}
|
||||
ksp->ks_memuse += size;
|
||||
ksp->ks_inuse++;
|
||||
out:
|
||||
ksp->ks_calls++;
|
||||
if (ksp->ks_memuse > ksp->ks_maxused)
|
||||
ksp->ks_maxused = ksp->ks_memuse;
|
||||
|
||||
mtx_unlock(&ksp->ks_mtx);
|
||||
if (flags & M_WAITOK)
|
||||
KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL"));
|
||||
else if (va == NULL)
|
||||
@ -289,7 +316,6 @@ free(addr, type)
|
||||
void *addr;
|
||||
struct malloc_type *type;
|
||||
{
|
||||
register struct malloc_type *ksp = type;
|
||||
uma_slab_t slab;
|
||||
u_long size;
|
||||
|
||||
@ -297,7 +323,7 @@ free(addr, type)
|
||||
if (addr == NULL)
|
||||
return;
|
||||
|
||||
KASSERT(ksp->ks_memuse > 0,
|
||||
KASSERT(type->ks_memuse > 0,
|
||||
("malloc(9)/free(9) confusion.\n%s",
|
||||
"Probably freeing with wrong type, but maybe not here."));
|
||||
size = 0;
|
||||
@ -334,13 +360,7 @@ free(addr, type)
|
||||
size = slab->us_size;
|
||||
uma_large_free(slab);
|
||||
}
|
||||
mtx_lock(&ksp->ks_mtx);
|
||||
KASSERT(size <= ksp->ks_memuse,
|
||||
("malloc(9)/free(9) confusion.\n%s",
|
||||
"Probably freeing with wrong type, but maybe not here."));
|
||||
ksp->ks_memuse -= size;
|
||||
ksp->ks_inuse--;
|
||||
mtx_unlock(&ksp->ks_mtx);
|
||||
malloc_type_freed(type, size);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -105,6 +105,8 @@ void free(void *addr, struct malloc_type *type);
|
||||
void *malloc(unsigned long size, struct malloc_type *type, int flags);
|
||||
void malloc_init(void *);
|
||||
int malloc_last_fail(void);
|
||||
void malloc_type_allocated(struct malloc_type *type, unsigned long size);
|
||||
void malloc_type_freed(struct malloc_type *type, unsigned long size);
|
||||
void malloc_uninit(void *);
|
||||
void *realloc(void *addr, unsigned long size, struct malloc_type *type,
|
||||
int flags);
|
||||
|
@ -68,6 +68,9 @@ __FBSDID("$FreeBSD$");
|
||||
#include <sys/malloc.h>
|
||||
#include <sys/mutex.h>
|
||||
#include <sys/proc.h>
|
||||
#include <sys/kernel.h>
|
||||
#include <sys/linker_set.h>
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/vmmeter.h>
|
||||
#include <sys/vnode.h>
|
||||
|
||||
@ -83,49 +86,62 @@ __FBSDID("$FreeBSD$");
|
||||
#include <vm/vm_extern.h>
|
||||
|
||||
static int
|
||||
vm_contig_launder(int queue)
|
||||
vm_contig_launder_page(vm_page_t m)
|
||||
{
|
||||
vm_object_t object;
|
||||
vm_page_t m, m_tmp, next;
|
||||
vm_page_t m_tmp;
|
||||
struct vnode *vp;
|
||||
|
||||
if (vm_page_sleep_if_busy(m, TRUE, "vpctw0")) {
|
||||
vm_page_lock_queues();
|
||||
return (EBUSY);
|
||||
}
|
||||
if (!VM_OBJECT_TRYLOCK(m->object))
|
||||
return (EAGAIN);
|
||||
vm_page_test_dirty(m);
|
||||
if (m->dirty == 0 && m->hold_count == 0)
|
||||
pmap_remove_all(m);
|
||||
if (m->dirty) {
|
||||
object = m->object;
|
||||
if (object->type == OBJT_VNODE) {
|
||||
vm_page_unlock_queues();
|
||||
vp = object->handle;
|
||||
VM_OBJECT_UNLOCK(object);
|
||||
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
|
||||
VM_OBJECT_LOCK(object);
|
||||
vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
|
||||
VM_OBJECT_UNLOCK(object);
|
||||
VOP_UNLOCK(vp, 0, curthread);
|
||||
vm_page_lock_queues();
|
||||
return (0);
|
||||
} else if (object->type == OBJT_SWAP ||
|
||||
object->type == OBJT_DEFAULT) {
|
||||
m_tmp = m;
|
||||
vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC);
|
||||
VM_OBJECT_UNLOCK(object);
|
||||
return (0);
|
||||
}
|
||||
} else if (m->hold_count == 0)
|
||||
vm_page_cache(m);
|
||||
VM_OBJECT_UNLOCK(m->object);
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int
|
||||
vm_contig_launder(int queue)
|
||||
{
|
||||
vm_page_t m, next;
|
||||
int error;
|
||||
|
||||
for (m = TAILQ_FIRST(&vm_page_queues[queue].pl); m != NULL; m = next) {
|
||||
next = TAILQ_NEXT(m, pageq);
|
||||
KASSERT(m->queue == queue,
|
||||
("vm_contig_launder: page %p's queue is not %d", m, queue));
|
||||
if (!VM_OBJECT_TRYLOCK(m->object))
|
||||
continue;
|
||||
if (vm_page_sleep_if_busy(m, TRUE, "vpctw0")) {
|
||||
VM_OBJECT_UNLOCK(m->object);
|
||||
vm_page_lock_queues();
|
||||
error = vm_contig_launder_page(m);
|
||||
if (error == 0)
|
||||
return (TRUE);
|
||||
}
|
||||
vm_page_test_dirty(m);
|
||||
if (m->dirty == 0 && m->busy == 0 && m->hold_count == 0)
|
||||
pmap_remove_all(m);
|
||||
if (m->dirty) {
|
||||
object = m->object;
|
||||
if (object->type == OBJT_VNODE) {
|
||||
vm_page_unlock_queues();
|
||||
vp = object->handle;
|
||||
VM_OBJECT_UNLOCK(object);
|
||||
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
|
||||
VM_OBJECT_LOCK(object);
|
||||
vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
|
||||
VM_OBJECT_UNLOCK(object);
|
||||
VOP_UNLOCK(vp, 0, curthread);
|
||||
vm_page_lock_queues();
|
||||
return (TRUE);
|
||||
} else if (object->type == OBJT_SWAP ||
|
||||
object->type == OBJT_DEFAULT) {
|
||||
m_tmp = m;
|
||||
vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC);
|
||||
VM_OBJECT_UNLOCK(object);
|
||||
return (TRUE);
|
||||
}
|
||||
} else if (m->busy == 0 && m->hold_count == 0)
|
||||
vm_page_cache(m);
|
||||
VM_OBJECT_UNLOCK(m->object);
|
||||
if (error == EBUSY)
|
||||
return (FALSE);
|
||||
}
|
||||
return (FALSE);
|
||||
}
|
||||
@ -308,6 +324,204 @@ contigmalloc1(
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
static void
|
||||
vm_page_release_contigl(vm_page_t m, vm_pindex_t count)
|
||||
{
|
||||
mtx_lock_spin(&vm_page_queue_free_mtx);
|
||||
while (count--) {
|
||||
vm_page_free_toq(m);
|
||||
m++;
|
||||
}
|
||||
mtx_unlock_spin(&vm_page_queue_free_mtx);
|
||||
}
|
||||
|
||||
void
|
||||
vm_page_release_contig(vm_page_t m, vm_pindex_t count)
|
||||
{
|
||||
vm_page_lock_queues();
|
||||
vm_page_release_contigl(m, count);
|
||||
vm_page_unlock_queues();
|
||||
}
|
||||
|
||||
static int
|
||||
vm_contig_unqueue_free(vm_page_t m)
|
||||
{
|
||||
int error = 0;
|
||||
|
||||
mtx_lock_spin(&vm_page_queue_free_mtx);
|
||||
if ((m->queue - m->pc) == PQ_FREE)
|
||||
vm_pageq_remove_nowakeup(m);
|
||||
else
|
||||
error = EAGAIN;
|
||||
mtx_unlock_spin(&vm_page_queue_free_mtx);
|
||||
if (error)
|
||||
return (error);
|
||||
m->valid = VM_PAGE_BITS_ALL;
|
||||
if (m->flags & PG_ZERO)
|
||||
vm_page_zero_count--;
|
||||
/* Don't clear the PG_ZERO flag; we'll need it later. */
|
||||
m->flags = PG_UNMANAGED | (m->flags & PG_ZERO);
|
||||
KASSERT(m->dirty == 0,
|
||||
("contigmalloc2: page %p was dirty", m));
|
||||
m->wire_count = 0;
|
||||
m->busy = 0;
|
||||
m->object = NULL;
|
||||
return (error);
|
||||
}
|
||||
|
||||
vm_page_t
|
||||
vm_page_alloc_contig(vm_pindex_t npages, vm_paddr_t low, vm_paddr_t high,
|
||||
vm_offset_t alignment, vm_offset_t boundary)
|
||||
{
|
||||
vm_object_t object;
|
||||
vm_offset_t size;
|
||||
vm_paddr_t phys;
|
||||
vm_page_t pga = vm_page_array;
|
||||
int i, pass, pqtype, start;
|
||||
|
||||
size = npages << PAGE_SHIFT;
|
||||
if (size == 0)
|
||||
panic("vm_page_alloc_contig: size must not be 0");
|
||||
if ((alignment & (alignment - 1)) != 0)
|
||||
panic("vm_page_alloc_contig: alignment must be a power of 2");
|
||||
if ((boundary & (boundary - 1)) != 0)
|
||||
panic("vm_page_alloc_contig: boundary must be a power of 2");
|
||||
|
||||
for (pass = 0; pass < 2; pass++) {
|
||||
start = vm_page_array_size;
|
||||
vm_page_lock_queues();
|
||||
retry:
|
||||
start--;
|
||||
/*
|
||||
* Find last page in array that is free, within range,
|
||||
* aligned, and such that the boundary won't be crossed.
|
||||
*/
|
||||
for (i = start; i >= 0; i--) {
|
||||
phys = VM_PAGE_TO_PHYS(&pga[i]);
|
||||
pqtype = pga[i].queue - pga[i].pc;
|
||||
if (pass == 0) {
|
||||
if (pqtype != PQ_FREE && pqtype != PQ_CACHE)
|
||||
continue;
|
||||
} else if (pqtype != PQ_FREE && pqtype != PQ_CACHE &&
|
||||
pga[i].queue != PQ_ACTIVE &&
|
||||
pga[i].queue != PQ_INACTIVE)
|
||||
continue;
|
||||
if (phys >= low && phys + size <= high &&
|
||||
((phys & (alignment - 1)) == 0) &&
|
||||
((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0)
|
||||
break;
|
||||
}
|
||||
/* There are no candidates at all. */
|
||||
if (i == -1) {
|
||||
vm_page_unlock_queues();
|
||||
continue;
|
||||
}
|
||||
start = i;
|
||||
/*
|
||||
* Check successive pages for contiguous and free.
|
||||
*/
|
||||
for (i = start + 1; i < start + npages; i++) {
|
||||
pqtype = pga[i].queue - pga[i].pc;
|
||||
if (VM_PAGE_TO_PHYS(&pga[i]) !=
|
||||
VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)
|
||||
goto retry;
|
||||
if (pass == 0) {
|
||||
if (pqtype != PQ_FREE && pqtype != PQ_CACHE)
|
||||
goto retry;
|
||||
} else if (pqtype != PQ_FREE && pqtype != PQ_CACHE &&
|
||||
pga[i].queue != PQ_ACTIVE &&
|
||||
pga[i].queue != PQ_INACTIVE)
|
||||
goto retry;
|
||||
}
|
||||
for (i = start; i < start + npages; i++) {
|
||||
vm_page_t m = &pga[i];
|
||||
|
||||
retry_page:
|
||||
pqtype = m->queue - m->pc;
|
||||
if (pass != 0 && pqtype != PQ_FREE &&
|
||||
pqtype != PQ_CACHE) {
|
||||
switch (m->queue) {
|
||||
case PQ_ACTIVE:
|
||||
case PQ_INACTIVE:
|
||||
if (vm_contig_launder_page(m) != 0)
|
||||
goto cleanup_freed;
|
||||
pqtype = m->queue - m->pc;
|
||||
if (pqtype == PQ_FREE ||
|
||||
pqtype == PQ_CACHE)
|
||||
break;
|
||||
default:
|
||||
cleanup_freed:
|
||||
vm_page_release_contigl(&pga[start],
|
||||
i - start);
|
||||
goto retry;
|
||||
}
|
||||
}
|
||||
if (pqtype == PQ_CACHE) {
|
||||
object = m->object;
|
||||
if (!VM_OBJECT_TRYLOCK(object))
|
||||
goto retry;
|
||||
vm_page_busy(m);
|
||||
vm_page_free(m);
|
||||
VM_OBJECT_UNLOCK(object);
|
||||
}
|
||||
/*
|
||||
* There is no good API for freeing a page
|
||||
* directly to PQ_NONE on our behalf, so spin.
|
||||
*/
|
||||
if (vm_contig_unqueue_free(m) != 0)
|
||||
goto retry_page;
|
||||
}
|
||||
vm_page_unlock_queues();
|
||||
/*
|
||||
* We've found a contiguous chunk that meets are requirements.
|
||||
*/
|
||||
return (&pga[start]);
|
||||
}
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
static void *
|
||||
contigmalloc2(vm_page_t m, vm_pindex_t npages, int flags)
|
||||
{
|
||||
vm_object_t object = kernel_object;
|
||||
vm_map_t map = kernel_map;
|
||||
vm_offset_t addr, tmp_addr;
|
||||
vm_pindex_t i;
|
||||
|
||||
/*
|
||||
* Allocate kernel VM, unfree and assign the physical pages to
|
||||
* it and return kernel VM pointer.
|
||||
*/
|
||||
vm_map_lock(map);
|
||||
if (vm_map_findspace(map, vm_map_min(map), npages << PAGE_SHIFT, &addr)
|
||||
!= KERN_SUCCESS) {
|
||||
vm_map_unlock(map);
|
||||
return (NULL);
|
||||
}
|
||||
vm_object_reference(object);
|
||||
vm_map_insert(map, object, addr - VM_MIN_KERNEL_ADDRESS,
|
||||
addr, addr + (npages << PAGE_SHIFT), VM_PROT_ALL, VM_PROT_ALL, 0);
|
||||
vm_map_unlock(map);
|
||||
tmp_addr = addr;
|
||||
VM_OBJECT_LOCK(object);
|
||||
for (i = 0; i < npages; i++) {
|
||||
vm_page_insert(&m[i], object,
|
||||
OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
|
||||
if ((flags & M_ZERO) && !(m->flags & PG_ZERO))
|
||||
pmap_zero_page(&m[i]);
|
||||
tmp_addr += PAGE_SIZE;
|
||||
}
|
||||
VM_OBJECT_UNLOCK(object);
|
||||
vm_map_wire(map, addr, addr + (npages << PAGE_SHIFT),
|
||||
VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
|
||||
return ((void *)addr);
|
||||
}
|
||||
|
||||
static int vm_old_contigmalloc = 1;
|
||||
SYSCTL_INT(_vm, OID_AUTO, old_contigmalloc,
|
||||
CTLFLAG_RW, &vm_old_contigmalloc, 0, "Use the old contigmalloc algorithm");
|
||||
TUNABLE_INT("vm.old_contigmalloc", &vm_old_contigmalloc);
|
||||
|
||||
void *
|
||||
contigmalloc(
|
||||
unsigned long size, /* should be size_t here and for malloc() */
|
||||
@ -319,17 +533,37 @@ contigmalloc(
|
||||
unsigned long boundary)
|
||||
{
|
||||
void * ret;
|
||||
vm_page_t pages;
|
||||
vm_pindex_t npgs;
|
||||
|
||||
npgs = round_page(size) >> PAGE_SHIFT;
|
||||
mtx_lock(&Giant);
|
||||
ret = contigmalloc1(size, type, flags, low, high, alignment, boundary,
|
||||
kernel_map);
|
||||
if (vm_old_contigmalloc) {
|
||||
ret = contigmalloc1(size, type, flags, low, high, alignment,
|
||||
boundary, kernel_map);
|
||||
} else {
|
||||
pages = vm_page_alloc_contig(npgs, low, high,
|
||||
alignment, boundary);
|
||||
if (pages == NULL) {
|
||||
ret = NULL;
|
||||
} else {
|
||||
ret = contigmalloc2(pages, npgs, flags);
|
||||
if (ret == NULL)
|
||||
vm_page_release_contig(pages, npgs);
|
||||
}
|
||||
|
||||
}
|
||||
mtx_unlock(&Giant);
|
||||
malloc_type_allocated(type, ret == NULL ? 0 : npgs << PAGE_SHIFT);
|
||||
return (ret);
|
||||
}
|
||||
|
||||
void
|
||||
contigfree(void *addr, unsigned long size, struct malloc_type *type)
|
||||
{
|
||||
vm_pindex_t npgs;
|
||||
|
||||
npgs = round_page(size) >> PAGE_SHIFT;
|
||||
kmem_free(kernel_map, (vm_offset_t)addr, size);
|
||||
malloc_type_freed(type, npgs << PAGE_SHIFT);
|
||||
}
|
||||
|
@ -342,6 +342,9 @@ void vm_pageq_requeue(vm_page_t m);
|
||||
|
||||
void vm_page_activate (vm_page_t);
|
||||
vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int);
|
||||
vm_page_t vm_page_alloc_contig (vm_pindex_t, vm_paddr_t, vm_paddr_t,
|
||||
vm_offset_t, vm_offset_t);
|
||||
void vm_page_release_contig (vm_page_t, vm_pindex_t);
|
||||
vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
|
||||
void vm_page_cache (register vm_page_t);
|
||||
int vm_page_try_to_cache (vm_page_t);
|
||||
|
Loading…
Reference in New Issue
Block a user