Eliminate kmem_arena and kmem_object in preparation for further NUMA commits.

The arena argument to kmem_*() is now used only in an assert.  A follow-up
commit will remove the argument altogether before we freeze the API for the
next release.

This replaces the hard limit on kmem size with a soft limit imposed by UMA.
When the soft limit is exceeded, we periodically wake up the UMA reclaim
thread to attempt to shrink KVA.  On 32-bit architectures this should behave
much more gracefully as we exhaust KVA.  On 64-bit architectures the limits
are unlikely ever to be hit.
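
In outline the new accounting works as in the sketch below, which condenses
the sys/vm/uma_core.c hunk further down rather than adding anything new: every
slab allocation adds its size to a global counter, and crossing the soft limit
only wakes the reclaim worker, it never fails the allocation.

/*
 * Condensed sketch of the soft-limit accounting; see the sys/vm/uma_core.c
 * hunk below for the committed version.
 */
static unsigned long uma_kmem_limit;		/* soft cap, set by kmeminit() */
static volatile unsigned long uma_kmem_total;	/* bytes currently under UMA */

static inline void
uma_total_inc(unsigned long size)
{

	if (atomic_fetchadd_long(&uma_kmem_total, size) > uma_kmem_limit)
		uma_reclaim_wakeup();
}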

Reviewed by:	markj, kib (some objections)
Discussed with:	alc
Tested by:	pho
Sponsored by:	Netflix / Dell EMC Isilon
Differential Revision:	https://reviews.freebsd.org/D13187
jeff 2017-11-28 23:40:54 +00:00
parent f93de233c6
commit 990ca74cdc
11 changed files with 140 additions and 79 deletions

sys/kern/kern_malloc.c

@ -239,16 +239,22 @@ sysctl_kmem_map_size(SYSCTL_HANDLER_ARGS)
{
u_long size;
size = vmem_size(kmem_arena, VMEM_ALLOC);
size = uma_size();
return (sysctl_handle_long(oidp, &size, 0, req));
}
static int
sysctl_kmem_map_free(SYSCTL_HANDLER_ARGS)
{
u_long size;
u_long size, limit;
size = vmem_size(kmem_arena, VMEM_FREE);
/* The sysctl is unsigned, implement as a saturation value. */
size = uma_size();
limit = uma_limit();
if (size > limit)
size = 0;
else
size = limit - size;
return (sysctl_handle_long(oidp, &size, 0, req));
}
@ -669,19 +675,6 @@ reallocf(void *addr, unsigned long size, struct malloc_type *mtp, int flags)
return (mem);
}
/*
* Wake the uma reclamation pagedaemon thread when we exhaust KVA. It
* will call the lowmem handler and uma_reclaim() callbacks in a
* context that is safe.
*/
static void
kmem_reclaim(vmem_t *vm, int flags)
{
uma_reclaim_wakeup();
pagedaemon_wakeup();
}
#ifndef __sparc64__
CTASSERT(VM_KMEM_SIZE_SCALE >= 1);
#endif
@ -759,9 +752,7 @@ kmeminit(void)
#else
tmp = vm_kmem_size;
#endif
vmem_init(kmem_arena, "kmem arena", kva_alloc(tmp), tmp, PAGE_SIZE,
0, 0);
vmem_set_reclaim(kmem_arena, kmem_reclaim);
uma_set_limit(tmp);
#ifdef DEBUG_MEMGUARD
/*
@ -769,7 +760,7 @@ kmeminit(void)
* replacement allocator used for detecting tamper-after-free
* scenarios as they occur. It is only used for debugging.
*/
memguard_init(kmem_arena);
memguard_init(kernel_arena);
#endif
}
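
For reference, the effect of the sysctl hunk above can be observed from
userland; a minimal check, assuming the handlers are still exported as
vm.kmem_map_size and vm.kmem_map_free, could look like this:

/* Userland sketch: read the now UMA-backed kmem sysctls. */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	u_long size, headroom;
	size_t len;

	len = sizeof(size);
	if (sysctlbyname("vm.kmem_map_size", &size, &len, NULL, 0) != 0)
		return (1);
	len = sizeof(headroom);
	if (sysctlbyname("vm.kmem_map_free", &headroom, &len, NULL, 0) != 0)
		return (1);
	printf("UMA in use: %lu bytes, headroom: %lu bytes\n", size, headroom);
	return (0);
}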

sys/kern/subr_vmem.c

@ -137,6 +137,7 @@ struct vmem {
int vm_nbusytag;
vmem_size_t vm_inuse;
vmem_size_t vm_size;
vmem_size_t vm_limit;
/* Used on import. */
vmem_import_t *vm_importfn;
@ -228,11 +229,11 @@ static uma_zone_t vmem_bt_zone;
/* boot time arena storage. */
static struct vmem kernel_arena_storage;
static struct vmem kmem_arena_storage;
static struct vmem buffer_arena_storage;
static struct vmem transient_arena_storage;
/* kernel and kmem arenas are aliased for backwards KPI compat. */
vmem_t *kernel_arena = &kernel_arena_storage;
vmem_t *kmem_arena = &kmem_arena_storage;
vmem_t *kmem_arena = &kernel_arena_storage;
vmem_t *buffer_arena = &buffer_arena_storage;
vmem_t *transient_arena = &transient_arena_storage;
@ -254,11 +255,11 @@ bt_fill(vmem_t *vm, int flags)
VMEM_ASSERT_LOCKED(vm);
/*
* Only allow the kmem arena to dip into reserve tags. It is the
* Only allow the kernel arena to dip into reserve tags. It is the
* vmem where new tags come from.
*/
flags &= BT_FLAGS;
if (vm != kmem_arena)
if (vm != kernel_arena)
flags &= ~M_USE_RESERVE;
/*
@ -615,22 +616,22 @@ vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
{
vmem_addr_t addr;
*pflag = UMA_SLAB_KMEM;
*pflag = UMA_SLAB_KERNEL;
/*
* Single thread boundary tag allocation so that the address space
* and memory are added in one atomic operation.
*/
mtx_lock(&vmem_bt_lock);
if (vmem_xalloc(kmem_arena, bytes, 0, 0, 0, VMEM_ADDR_MIN,
if (vmem_xalloc(kernel_arena, bytes, 0, 0, 0, VMEM_ADDR_MIN,
VMEM_ADDR_MAX, M_NOWAIT | M_NOVM | M_USE_RESERVE | M_BESTFIT,
&addr) == 0) {
if (kmem_back(kmem_object, addr, bytes,
if (kmem_back(kernel_object, addr, bytes,
M_NOWAIT | M_USE_RESERVE) == 0) {
mtx_unlock(&vmem_bt_lock);
return ((void *)addr);
}
vmem_xfree(kmem_arena, addr, bytes);
vmem_xfree(kernel_arena, addr, bytes);
mtx_unlock(&vmem_bt_lock);
/*
* Out of memory, not address space. This may not even be
@ -835,7 +836,7 @@ vmem_import(vmem_t *vm, vmem_size_t size, vmem_size_t align, int flags)
int error;
if (vm->vm_importfn == NULL)
return EINVAL;
return (EINVAL);
/*
* To make sure we get a span that meets the alignment we double it
@ -845,6 +846,9 @@ vmem_import(vmem_t *vm, vmem_size_t size, vmem_size_t align, int flags)
size = (align * 2) + size;
size = roundup(size, vm->vm_import_quantum);
if (vm->vm_limit != 0 && vm->vm_limit < vm->vm_size + size)
return (ENOMEM);
/*
* Hide MAXALLOC tags so we're guaranteed to be able to add this
* span and the tag we want to allocate from it.
@ -856,7 +860,7 @@ vmem_import(vmem_t *vm, vmem_size_t size, vmem_size_t align, int flags)
VMEM_LOCK(vm);
vm->vm_nfreetags += BT_MAXALLOC;
if (error)
return ENOMEM;
return (ENOMEM);
vmem_add1(vm, addr, size, BT_TYPE_SPAN);
@ -977,6 +981,15 @@ vmem_set_import(vmem_t *vm, vmem_import_t *importfn,
VMEM_UNLOCK(vm);
}
void
vmem_set_limit(vmem_t *vm, vmem_size_t limit)
{
VMEM_LOCK(vm);
vm->vm_limit = limit;
VMEM_UNLOCK(vm);
}
void
vmem_set_reclaim(vmem_t *vm, vmem_reclaim_t *reclaimfn)
{
@ -1009,6 +1022,7 @@ vmem_init(vmem_t *vm, const char *name, vmem_addr_t base, vmem_size_t size,
vm->vm_quantum_shift = flsl(quantum) - 1;
vm->vm_nbusytag = 0;
vm->vm_size = 0;
vm->vm_limit = 0;
vm->vm_inuse = 0;
qc_init(vm, qcache_max);

sys/sys/vmem.h

@ -75,6 +75,12 @@ void vmem_destroy(vmem_t *);
void vmem_set_import(vmem_t *vm, vmem_import_t *importfn,
vmem_release_t *releasefn, void *arg, vmem_size_t import_quantum);
/*
* Set a limit on the total size of a vmem.
*/
void vmem_set_limit(vmem_t *vm, vmem_size_t limit);
/*
* Set a callback for reclaiming memory when space is exhausted:
*/
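
vmem_set_limit() is the consumer-visible half of the limit check added to
vmem_import() above.  A hypothetical arena set up with a cap might look like
the following; foo_arena, foo_import, foo_release, and the 64 MB figure are
all made up for illustration:

static vmem_t *foo_arena;	/* hypothetical private arena */

static void
foo_arena_init(void)
{

	foo_arena = vmem_create("foo arena", 0, 0, PAGE_SIZE, 0, M_WAITOK);
	vmem_set_import(foo_arena, foo_import, foo_release, NULL, PAGE_SIZE);
	/* Imports beyond 64 MB now fail with ENOMEM in vmem_import(). */
	vmem_set_limit(foo_arena, 64 * 1024 * 1024);
}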

sys/vm/memguard.c

@ -66,7 +66,7 @@ __FBSDID("$FreeBSD$");
static SYSCTL_NODE(_vm, OID_AUTO, memguard, CTLFLAG_RW, NULL, "MemGuard data");
/*
* The vm_memguard_divisor variable controls how much of kmem_map should be
* The vm_memguard_divisor variable controls how much of kernel_arena should be
* reserved for MemGuard.
*/
static u_int vm_memguard_divisor;
@ -157,7 +157,7 @@ SYSCTL_ULONG(_vm_memguard, OID_AUTO, frequency_hits, CTLFLAG_RD,
/*
* Return a fudged value to be used for vm_kmem_size for allocating
* the kmem_map. The memguard memory will be a submap.
* the kernel_arena. The memguard memory will be a submap.
*/
unsigned long
memguard_fudge(unsigned long km_size, const struct vm_map *parent_map)
@ -348,7 +348,7 @@ memguard_alloc(unsigned long req_size, int flags)
addr = origaddr;
if (do_guard)
addr += PAGE_SIZE;
rv = kmem_back(kmem_object, addr, size_p, flags);
rv = kmem_back(kernel_object, addr, size_p, flags);
if (rv != KERN_SUCCESS) {
vmem_xfree(memguard_arena, origaddr, size_v);
memguard_fail_pgs++;
@ -418,7 +418,7 @@ memguard_free(void *ptr)
* vm_map lock to serialize updates to memguard_wasted, since
* we had the lock at increment.
*/
kmem_unback(kmem_object, addr, size);
kmem_unback(kernel_object, addr, size);
if (sizev > size)
addr -= PAGE_SIZE;
vmem_xfree(memguard_arena, addr, sizev);

sys/vm/uma.h

@ -609,12 +609,11 @@ void uma_zone_set_freef(uma_zone_t zone, uma_free freef);
* These flags are setable in the allocf and visible in the freef.
*/
#define UMA_SLAB_BOOT 0x01 /* Slab alloced from boot pages */
#define UMA_SLAB_KMEM 0x02 /* Slab alloced from kmem_map */
#define UMA_SLAB_KERNEL 0x04 /* Slab alloced from kernel_map */
#define UMA_SLAB_PRIV 0x08 /* Slab alloced from priv allocator */
#define UMA_SLAB_OFFP 0x10 /* Slab is managed separately */
#define UMA_SLAB_MALLOC 0x20 /* Slab is a large malloc slab */
/* 0x40 and 0x80 are available */
/* 0x02, 0x40 and 0x80 are available */
/*
* Used to pre-fill a zone with some number of items

sys/vm/uma_core.c

@ -147,6 +147,10 @@ static struct mtx uma_boot_pages_mtx;
static struct sx uma_drain_lock;
/* kmem soft limit. */
static unsigned long uma_kmem_limit;
static volatile unsigned long uma_kmem_total;
/* Is the VM done starting up? */
static int booted = 0;
#define UMA_STARTUP 1
@ -285,6 +289,22 @@ static int zone_warnings = 1;
SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
"Warn when UMA zones becomes full");
/* Adjust bytes under management by UMA. */
static inline void
uma_total_dec(unsigned long size)
{
atomic_subtract_long(&uma_kmem_total, size);
}
static inline void
uma_total_inc(unsigned long size)
{
if (atomic_fetchadd_long(&uma_kmem_total, size) > uma_kmem_limit)
uma_reclaim_wakeup();
}
/*
* This routine checks to see whether or not it's safe to enable buckets.
*/
@ -831,6 +851,7 @@ keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
if (keg->uk_flags & UMA_ZONE_OFFPAGE)
zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
uma_total_dec(PAGE_SIZE * keg->uk_ppera);
}
/*
@ -935,6 +956,7 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait)
{
uma_alloc allocf;
uma_slab_t slab;
unsigned long size;
uint8_t *mem;
uint8_t flags;
int i;
@ -945,6 +967,7 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait)
allocf = keg->uk_allocf;
KEG_UNLOCK(keg);
size = keg->uk_ppera * PAGE_SIZE;
if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
slab = zone_alloc_item(keg->uk_slabzone, NULL, wait);
@ -968,13 +991,14 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait)
wait |= M_NODUMP;
/* zone is passed for legacy reasons. */
mem = allocf(zone, keg->uk_ppera * PAGE_SIZE, &flags, wait);
mem = allocf(zone, size, &flags, wait);
if (mem == NULL) {
if (keg->uk_flags & UMA_ZONE_OFFPAGE)
zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
slab = NULL;
goto out;
}
uma_total_inc(size);
/* Point the slab into the allocated memory */
if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
@ -1079,8 +1103,8 @@ page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
{
void *p; /* Returned page */
*pflag = UMA_SLAB_KMEM;
p = (void *) kmem_malloc(kmem_arena, bytes, wait);
*pflag = UMA_SLAB_KERNEL;
p = (void *) kmem_malloc(kernel_arena, bytes, wait);
return (p);
}
@ -1161,9 +1185,7 @@ page_free(void *mem, vm_size_t size, uint8_t flags)
{
struct vmem *vmem;
if (flags & UMA_SLAB_KMEM)
vmem = kmem_arena;
else if (flags & UMA_SLAB_KERNEL)
if (flags & UMA_SLAB_KERNEL)
vmem = kernel_arena;
else
panic("UMA: page_free used with invalid flags %x", flags);
@ -3132,31 +3154,33 @@ uma_reclaim(void)
sx_xunlock(&uma_drain_lock);
}
static int uma_reclaim_needed;
static volatile int uma_reclaim_needed;
void
uma_reclaim_wakeup(void)
{
uma_reclaim_needed = 1;
wakeup(&uma_reclaim_needed);
if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
wakeup(uma_reclaim);
}
void
uma_reclaim_worker(void *arg __unused)
{
sx_xlock(&uma_drain_lock);
for (;;) {
sx_sleep(&uma_reclaim_needed, &uma_drain_lock, PVM,
"umarcl", 0);
if (uma_reclaim_needed) {
uma_reclaim_needed = 0;
sx_xunlock(&uma_drain_lock);
EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
sx_xlock(&uma_drain_lock);
uma_reclaim_locked(true);
}
sx_xlock(&uma_drain_lock);
while (uma_reclaim_needed == 0)
sx_sleep(uma_reclaim, &uma_drain_lock, PVM, "umarcl",
hz);
sx_xunlock(&uma_drain_lock);
EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
sx_xlock(&uma_drain_lock);
uma_reclaim_locked(true);
atomic_set_int(&uma_reclaim_needed, 0);
sx_xunlock(&uma_drain_lock);
/* Don't fire more than once per-second. */
pause("umarclslp", hz);
}
}
@ -3194,6 +3218,7 @@ uma_large_malloc(vm_size_t size, int wait)
slab->us_data = mem;
slab->us_flags = flags | UMA_SLAB_MALLOC;
slab->us_size = size;
uma_total_inc(size);
} else {
zone_free_item(slabzone, slab, NULL, SKIP_NONE);
}
@ -3206,6 +3231,7 @@ uma_large_free(uma_slab_t slab)
{
page_free(slab->us_data, slab->us_size, slab->us_flags);
uma_total_dec(slab->us_size);
zone_free_item(slabzone, slab, NULL, SKIP_NONE);
}
@ -3221,6 +3247,27 @@ uma_zero_item(void *item, uma_zone_t zone)
bzero(item, zone->uz_size);
}
unsigned long
uma_limit(void)
{
return (uma_kmem_limit);
}
void
uma_set_limit(unsigned long limit)
{
uma_kmem_limit = limit;
}
unsigned long
uma_size(void)
{
return uma_kmem_total;
}
void
uma_print_stats(void)
{

sys/vm/uma_int.h

@ -425,6 +425,13 @@ vsetslab(vm_offset_t va, uma_slab_t slab)
void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag,
int wait);
void uma_small_free(void *mem, vm_size_t size, uint8_t flags);
/* Set a global soft limit on UMA managed memory. */
void uma_set_limit(unsigned long limit);
unsigned long uma_limit(void);
/* Return the amount of memory managed by UMA. */
unsigned long uma_size(void);
#endif /* _KERNEL */
#endif /* VM_UMA_INT_H */
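
The accessors above are what the reworked vm.kmem_map_* sysctl handlers
consume.  A hypothetical in-kernel user could apply the same saturating
arithmetic to gauge pressure against the soft limit; the function name below
is invented for illustration:

/* Hypothetical check: is UMA within roughly 10% of its soft limit? */
static bool
foo_uma_under_pressure(void)
{
	unsigned long limit, used;

	limit = uma_limit();
	used = uma_size();
	if (limit == 0)
		return (false);		/* no soft limit configured */
	if (used >= limit)
		return (true);
	return (limit - used < limit / 10);
}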

sys/vm/vm_kern.c

@ -164,11 +164,13 @@ vm_offset_t
kmem_alloc_attr(vmem_t *vmem, vm_size_t size, int flags, vm_paddr_t low,
vm_paddr_t high, vm_memattr_t memattr)
{
vm_object_t object = vmem == kmem_arena ? kmem_object : kernel_object;
vm_object_t object = kernel_object;
vm_offset_t addr, i, offset;
vm_page_t m;
int pflags, tries;
KASSERT(vmem == kernel_arena,
("kmem_alloc_attr: Only kernel_arena is supported."));
size = round_page(size);
if (vmem_alloc(vmem, size, M_BESTFIT | flags, &addr))
return (0);
@ -220,12 +222,14 @@ kmem_alloc_contig(struct vmem *vmem, vm_size_t size, int flags, vm_paddr_t low,
vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
vm_memattr_t memattr)
{
vm_object_t object = vmem == kmem_arena ? kmem_object : kernel_object;
vm_object_t object = kernel_object;
vm_offset_t addr, offset, tmp;
vm_page_t end_m, m;
u_long npages;
int pflags, tries;
KASSERT(vmem == kernel_arena,
("kmem_alloc_contig: Only kernel_arena is supported."));
size = round_page(size);
if (vmem_alloc(vmem, size, flags | M_BESTFIT, &addr))
return (0);
@ -314,12 +318,13 @@ kmem_malloc(struct vmem *vmem, vm_size_t size, int flags)
vm_offset_t addr;
int rv;
KASSERT(vmem == kernel_arena,
("kmem_malloc: Only kernel_arena is supported."));
size = round_page(size);
if (vmem_alloc(vmem, size, flags | M_BESTFIT, &addr))
return (0);
rv = kmem_back((vmem == kmem_arena) ? kmem_object : kernel_object,
addr, size, flags);
rv = kmem_back(kernel_object, addr, size, flags);
if (rv != KERN_SUCCESS) {
vmem_free(vmem, addr, size);
return (0);
@ -339,8 +344,8 @@ kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags)
vm_page_t m, mpred;
int pflags;
KASSERT(object == kmem_object || object == kernel_object,
("kmem_back: only supports kernel objects."));
KASSERT(object == kernel_object,
("kmem_back: only supports kernel object."));
offset = addr - VM_MIN_KERNEL_ADDRESS;
pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
@ -396,8 +401,8 @@ kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size)
vm_page_t m, next;
vm_offset_t end, offset;
KASSERT(object == kmem_object || object == kernel_object,
("kmem_unback: only supports kernel objects."));
KASSERT(object == kernel_object,
("kmem_unback: only supports kernel object."));
pmap_remove(kernel_pmap, addr, addr + size);
offset = addr - VM_MIN_KERNEL_ADDRESS;
@ -422,9 +427,10 @@ void
kmem_free(struct vmem *vmem, vm_offset_t addr, vm_size_t size)
{
KASSERT(vmem == kernel_arena,
("kmem_free: Only kernel_arena is supported."));
size = round_page(size);
kmem_unback((vmem == kmem_arena) ? kmem_object : kernel_object,
addr, size);
kmem_unback(kernel_object, addr, size);
vmem_free(vmem, addr, size);
}
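
As the commit message notes, the arena argument now only feeds these KASSERTs,
so callers keep passing kernel_arena until the follow-up commit removes the
argument.  A minimal caller, with invented names, might look like:

static vm_offset_t
foo_buf_alloc(vm_size_t len)
{

	/* kernel_arena is the only arena the new KASSERTs accept. */
	return (kmem_malloc(kernel_arena, round_page(len), M_WAITOK));
}

static void
foo_buf_free(vm_offset_t addr, vm_size_t len)
{

	kmem_free(kernel_arena, addr, round_page(len));
}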

sys/vm/vm_map.c

@ -1189,9 +1189,9 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
vm_inherit_t inheritance;
VM_MAP_ASSERT_LOCKED(map);
KASSERT((object != kmem_object && object != kernel_object) ||
KASSERT(object != kernel_object ||
(cow & MAP_COPY_ON_WRITE) == 0,
("vm_map_insert: kmem or kernel object and COW"));
("vm_map_insert: kernel object and COW"));
KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
("vm_map_insert: paradoxical MAP_NOFAULT request"));
KASSERT((prot & ~max) == 0,
@ -2990,7 +2990,7 @@ vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
VM_OBJECT_WLOCK(object);
if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT |
OBJ_ONEMAPPING)) == OBJ_ONEMAPPING ||
object == kernel_object || object == kmem_object)) {
object == kernel_object)) {
vm_object_collapse(object);
/*

sys/vm/vm_object.c

@ -144,7 +144,6 @@ struct object_q vm_object_list;
struct mtx vm_object_list_mtx; /* lock for object list and count */
struct vm_object kernel_object_store;
struct vm_object kmem_object_store;
static SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0,
"VM object stats");
@ -294,14 +293,6 @@ vm_object_init(void)
kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
#endif
rw_init(&kmem_object->lock, "kmem vm object");
_vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS -
VM_MIN_KERNEL_ADDRESS), kmem_object);
#if VM_NRESERVLEVEL > 0
kmem_object->flags |= OBJ_COLORED;
kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
#endif
/*
* The lock portion of struct vm_object must be type stable due
* to vm_pageout_fallback_object_lock locking a vm object

sys/vm/vm_object.h

@ -227,10 +227,10 @@ extern struct object_q vm_object_list; /* list of allocated objects */
extern struct mtx vm_object_list_mtx; /* lock for object list and count */
extern struct vm_object kernel_object_store;
extern struct vm_object kmem_object_store;
/* kernel and kmem are aliased for backwards KPI compat. */
#define kernel_object (&kernel_object_store)
#define kmem_object (&kmem_object_store)
#define kmem_object (&kernel_object_store)
#define VM_OBJECT_ASSERT_LOCKED(object) \
rw_assert(&(object)->lock, RA_LOCKED)