/* * Copyright (c) 2002, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * uma_core.c Implementation of the Universal Memory allocator * * This allocator is intended to replace the multitude of similar object caches * in the standard FreeBSD kernel. The intent is to be flexible as well as * effecient. A primary design goal is to return unused memory to the rest of * the system. This will make the system as a whole more flexible due to the * ability to move memory to subsystems which most need it instead of leaving * pools of reserved memory unused. * * The basic ideas stem from similar slab/zone based allocators whose algorithms * are well known. * */ /* * TODO: * - Improve memory usage for large allocations * - Investigate cache size adjustments */ #include __FBSDID("$FreeBSD$"); /* I should really use ktr.. */ /* #define UMA_DEBUG 1 #define UMA_DEBUG_ALLOC 1 #define UMA_DEBUG_ALLOC_1 1 */ #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * This is the zone from which all zones are spawned. The idea is that even * the zone heads are allocated from the allocator, so we use the bss section * to bootstrap us. */ static struct uma_zone masterzone; static uma_zone_t zones = &masterzone; /* This is the zone from which all of uma_slab_t's are allocated. */ static uma_zone_t slabzone; /* * The initial hash tables come out of this zone so they can be allocated * prior to malloc coming up. */ static uma_zone_t hashzone; /* * Zone that buckets come from. */ static uma_zone_t bucketzone; /* * Are we allowed to allocate buckets? */ static int bucketdisable = 1; /* Linked list of all zones in the system */ static LIST_HEAD(,uma_zone) uma_zones = LIST_HEAD_INITIALIZER(&uma_zones); /* This mutex protects the zone list */ static struct mtx uma_mtx; /* These are the pcpu cache locks */ static struct mtx uma_pcpu_mtx[MAXCPU]; /* Linked list of boot time pages */ static LIST_HEAD(,uma_slab) uma_boot_pages = LIST_HEAD_INITIALIZER(&uma_boot_pages); /* Count of free boottime pages */ static int uma_boot_free = 0; /* Is the VM done starting up? */ static int booted = 0; /* This is the handle used to schedule our working set calculator */ static struct callout uma_callout; /* This is mp_maxid + 1, for use while looping over each cpu */ static int maxcpu; /* * This structure is passed as the zone ctor arg so that I don't have to create * a special allocation function just for zones. */ struct uma_zctor_args { char *name; size_t size; uma_ctor ctor; uma_dtor dtor; uma_init uminit; uma_fini fini; int align; u_int16_t flags; }; /* Prototypes.. */ static void *obj_alloc(uma_zone_t, int, u_int8_t *, int); static void *page_alloc(uma_zone_t, int, u_int8_t *, int); static void page_free(void *, int, u_int8_t); static uma_slab_t slab_zalloc(uma_zone_t, int); static void cache_drain(uma_zone_t, int); static void bucket_drain(uma_zone_t, uma_bucket_t); static void zone_drain_common(uma_zone_t, int); static void zone_ctor(void *, int, void *); static void zone_dtor(void *, int, void *); static void zero_init(void *, int); static void zone_small_init(uma_zone_t zone); static void zone_large_init(uma_zone_t zone); static void zone_foreach(void (*zfunc)(uma_zone_t)); static void zone_timeout(uma_zone_t zone); static int hash_alloc(struct uma_hash *); static int hash_expand(struct uma_hash *, struct uma_hash *); static void hash_free(struct uma_hash *hash); static void uma_timeout(void *); static void uma_startup3(void); static void *uma_zalloc_internal(uma_zone_t, void *, int); static void uma_zfree_internal(uma_zone_t, void *, void *, int); static void bucket_enable(void); static int uma_zalloc_bucket(uma_zone_t zone, int flags); static uma_slab_t uma_zone_slab(uma_zone_t zone, int flags); static void *uma_slab_alloc(uma_zone_t zone, uma_slab_t slab); static __inline void zone_drain(uma_zone_t); void uma_print_zone(uma_zone_t); void uma_print_stats(void); static int sysctl_vm_zone(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm, OID_AUTO, zone, CTLTYPE_STRING|CTLFLAG_RD, NULL, 0, sysctl_vm_zone, "A", "Zone Info"); SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL); /* * This routine checks to see whether or not it's safe to enable buckets. */ static void bucket_enable(void) { if (cnt.v_free_count < cnt.v_free_min) bucketdisable = 1; else bucketdisable = 0; } /* * Routine called by timeout which is used to fire off some time interval * based calculations. (working set, stats, etc.) * * Arguments: * arg Unused * * Returns: * Nothing */ static void uma_timeout(void *unused) { bucket_enable(); zone_foreach(zone_timeout); /* Reschedule this event */ callout_reset(&uma_callout, UMA_WORKING_TIME * hz, uma_timeout, NULL); } /* * Routine to perform timeout driven calculations. This does the working set * as well as hash expanding, and per cpu statistics aggregation. * * Arguments: * zone The zone to operate on * * Returns: * Nothing */ static void zone_timeout(uma_zone_t zone) { uma_cache_t cache; u_int64_t alloc; int cpu; alloc = 0; /* * Aggregate per cpu cache statistics back to the zone. * * I may rewrite this to set a flag in the per cpu cache instead of * locking. If the flag is not cleared on the next round I will have * to lock and do it here instead so that the statistics don't get too * far out of sync. */ if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) { for (cpu = 0; cpu < maxcpu; cpu++) { if (CPU_ABSENT(cpu)) continue; CPU_LOCK(cpu); cache = &zone->uz_cpu[cpu]; /* Add them up, and reset */ alloc += cache->uc_allocs; cache->uc_allocs = 0; CPU_UNLOCK(cpu); } } /* Now push these stats back into the zone.. */ ZONE_LOCK(zone); zone->uz_allocs += alloc; /* * Expand the zone hash table. * * This is done if the number of slabs is larger than the hash size. * What I'm trying to do here is completely reduce collisions. This * may be a little aggressive. Should I allow for two collisions max? */ if (zone->uz_flags & UMA_ZFLAG_HASH && zone->uz_pages / zone->uz_ppera >= zone->uz_hash.uh_hashsize) { struct uma_hash newhash; struct uma_hash oldhash; int ret; /* * This is so involved because allocating and freeing * while the zone lock is held will lead to deadlock. * I have to do everything in stages and check for * races. */ newhash = zone->uz_hash; ZONE_UNLOCK(zone); ret = hash_alloc(&newhash); ZONE_LOCK(zone); if (ret) { if (hash_expand(&zone->uz_hash, &newhash)) { oldhash = zone->uz_hash; zone->uz_hash = newhash; } else oldhash = newhash; ZONE_UNLOCK(zone); hash_free(&oldhash); ZONE_LOCK(zone); } } /* * Here we compute the working set size as the total number of items * left outstanding since the last time interval. This is slightly * suboptimal. What we really want is the highest number of outstanding * items during the last time quantum. This should be close enough. * * The working set size is used to throttle the zone_drain function. * We don't want to return memory that we may need again immediately. */ alloc = zone->uz_allocs - zone->uz_oallocs; zone->uz_oallocs = zone->uz_allocs; zone->uz_wssize = alloc; ZONE_UNLOCK(zone); } /* * Allocate and zero fill the next sized hash table from the appropriate * backing store. * * Arguments: * hash A new hash structure with the old hash size in uh_hashsize * * Returns: * 1 on sucess and 0 on failure. */ static int hash_alloc(struct uma_hash *hash) { int oldsize; int alloc; oldsize = hash->uh_hashsize; /* We're just going to go to a power of two greater */ if (oldsize) { hash->uh_hashsize = oldsize * 2; alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize; /* XXX Shouldn't be abusing DEVBUF here */ hash->uh_slab_hash = (struct slabhead *)malloc(alloc, M_DEVBUF, M_NOWAIT); } else { alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT; hash->uh_slab_hash = uma_zalloc_internal(hashzone, NULL, M_WAITOK); hash->uh_hashsize = UMA_HASH_SIZE_INIT; } if (hash->uh_slab_hash) { bzero(hash->uh_slab_hash, alloc); hash->uh_hashmask = hash->uh_hashsize - 1; return (1); } return (0); } /* * Expands the hash table for OFFPAGE zones. This is done from zone_timeout * to reduce collisions. This must not be done in the regular allocation path, * otherwise, we can recurse on the vm while allocating pages. * * Arguments: * oldhash The hash you want to expand * newhash The hash structure for the new table * * Returns: * Nothing * * Discussion: */ static int hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash) { uma_slab_t slab; int hval; int i; if (!newhash->uh_slab_hash) return (0); if (oldhash->uh_hashsize >= newhash->uh_hashsize) return (0); /* * I need to investigate hash algorithms for resizing without a * full rehash. */ for (i = 0; i < oldhash->uh_hashsize; i++) while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) { slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]); SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink); hval = UMA_HASH(newhash, slab->us_data); SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval], slab, us_hlink); } return (1); } /* * Free the hash bucket to the appropriate backing store. * * Arguments: * slab_hash The hash bucket we're freeing * hashsize The number of entries in that hash bucket * * Returns: * Nothing */ static void hash_free(struct uma_hash *hash) { if (hash->uh_slab_hash == NULL) return; if (hash->uh_hashsize == UMA_HASH_SIZE_INIT) uma_zfree_internal(hashzone, hash->uh_slab_hash, NULL, 0); else free(hash->uh_slab_hash, M_DEVBUF); } /* * Frees all outstanding items in a bucket * * Arguments: * zone The zone to free to, must be unlocked. * bucket The free/alloc bucket with items, cpu queue must be locked. * * Returns: * Nothing */ static void bucket_drain(uma_zone_t zone, uma_bucket_t bucket) { uma_slab_t slab; int mzone; void *item; if (bucket == NULL) return; slab = NULL; mzone = 0; /* We have to lookup the slab again for malloc.. */ if (zone->uz_flags & UMA_ZFLAG_MALLOC) mzone = 1; while (bucket->ub_ptr > -1) { item = bucket->ub_bucket[bucket->ub_ptr]; #ifdef INVARIANTS bucket->ub_bucket[bucket->ub_ptr] = NULL; KASSERT(item != NULL, ("bucket_drain: botched ptr, item is NULL")); #endif bucket->ub_ptr--; /* * This is extremely inefficient. The slab pointer was passed * to uma_zfree_arg, but we lost it because the buckets don't * hold them. This will go away when free() gets a size passed * to it. */ if (mzone) slab = vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK)); uma_zfree_internal(zone, item, slab, 1); } } /* * Drains the per cpu caches for a zone. * * Arguments: * zone The zone to drain, must be unlocked. * destroy Whether or not to destroy the pcpu buckets (from zone_dtor) * * Returns: * Nothing * * This function returns with the zone locked so that the per cpu queues can * not be filled until zone_drain is finished. * */ static void cache_drain(uma_zone_t zone, int destroy) { uma_bucket_t bucket; uma_cache_t cache; int cpu; /* * Flush out the per cpu queues. * * XXX This causes unnecessary thrashing due to immediately having * empty per cpu queues. I need to improve this. */ /* * We have to lock each cpu cache before locking the zone */ ZONE_UNLOCK(zone); for (cpu = 0; cpu < maxcpu; cpu++) { if (CPU_ABSENT(cpu)) continue; CPU_LOCK(cpu); cache = &zone->uz_cpu[cpu]; bucket_drain(zone, cache->uc_allocbucket); bucket_drain(zone, cache->uc_freebucket); if (destroy) { if (cache->uc_allocbucket != NULL) uma_zfree_internal(bucketzone, cache->uc_allocbucket, NULL, 0); if (cache->uc_freebucket != NULL) uma_zfree_internal(bucketzone, cache->uc_freebucket, NULL, 0); cache->uc_allocbucket = cache->uc_freebucket = NULL; } } /* * Drain the bucket queues and free the buckets, we just keep two per * cpu (alloc/free). */ ZONE_LOCK(zone); while ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) { LIST_REMOVE(bucket, ub_link); ZONE_UNLOCK(zone); bucket_drain(zone, bucket); uma_zfree_internal(bucketzone, bucket, NULL, 0); ZONE_LOCK(zone); } /* Now we do the free queue.. */ while ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) { LIST_REMOVE(bucket, ub_link); uma_zfree_internal(bucketzone, bucket, NULL, 0); } /* We unlock here, but they will all block until the zone is unlocked */ for (cpu = 0; cpu < maxcpu; cpu++) { if (CPU_ABSENT(cpu)) continue; CPU_UNLOCK(cpu); } } /* * Frees pages from a zone back to the system. This is done on demand from * the pageout daemon. * * Arguments: * zone The zone to free pages from * all Should we drain all items? * destroy Whether to destroy the zone and pcpu buckets (from zone_dtor) * * Returns: * Nothing. */ static void zone_drain_common(uma_zone_t zone, int destroy) { struct slabhead freeslabs = {}; uma_slab_t slab; uma_slab_t n; u_int64_t extra; u_int8_t flags; u_int8_t *mem; int i; /* * We don't want to take pages from staticly allocated zones at this * time */ if (zone->uz_flags & UMA_ZFLAG_NOFREE || zone->uz_freef == NULL) return; ZONE_LOCK(zone); if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) cache_drain(zone, destroy); if (destroy) zone->uz_wssize = 0; if (zone->uz_free < zone->uz_wssize) goto finished; #ifdef UMA_DEBUG printf("%s working set size: %llu free items: %u\n", zone->uz_name, (unsigned long long)zone->uz_wssize, zone->uz_free); #endif extra = zone->uz_free - zone->uz_wssize; extra /= zone->uz_ipers; /* extra is now the number of extra slabs that we can free */ if (extra == 0) goto finished; slab = LIST_FIRST(&zone->uz_free_slab); while (slab && extra) { n = LIST_NEXT(slab, us_link); /* We have no where to free these to */ if (slab->us_flags & UMA_SLAB_BOOT) { slab = n; continue; } LIST_REMOVE(slab, us_link); zone->uz_pages -= zone->uz_ppera; zone->uz_free -= zone->uz_ipers; if (zone->uz_flags & UMA_ZFLAG_HASH) UMA_HASH_REMOVE(&zone->uz_hash, slab, slab->us_data); SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink); slab = n; extra--; } finished: ZONE_UNLOCK(zone); while ((slab = SLIST_FIRST(&freeslabs)) != NULL) { SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink); if (zone->uz_fini) for (i = 0; i < zone->uz_ipers; i++) zone->uz_fini( slab->us_data + (zone->uz_rsize * i), zone->uz_size); flags = slab->us_flags; mem = slab->us_data; if (zone->uz_flags & UMA_ZFLAG_OFFPAGE) uma_zfree_internal(slabzone, slab, NULL, 0); if (zone->uz_flags & UMA_ZFLAG_MALLOC) { vm_object_t obj; if (flags & UMA_SLAB_KMEM) obj = kmem_object; else obj = NULL; for (i = 0; i < zone->uz_ppera; i++) vsetobj((vm_offset_t)mem + (i * PAGE_SIZE), obj); } #ifdef UMA_DEBUG printf("%s: Returning %d bytes.\n", zone->uz_name, UMA_SLAB_SIZE * zone->uz_ppera); #endif zone->uz_freef(mem, UMA_SLAB_SIZE * zone->uz_ppera, flags); } } static __inline void zone_drain(uma_zone_t zone) { zone_drain_common(zone, 0); } /* * Allocate a new slab for a zone. This does not insert the slab onto a list. * * Arguments: * zone The zone to allocate slabs for * wait Shall we wait? * * Returns: * The slab that was allocated or NULL if there is no memory and the * caller specified M_NOWAIT. * */ static uma_slab_t slab_zalloc(uma_zone_t zone, int wait) { uma_slab_t slab; /* Starting slab */ u_int8_t *mem; u_int8_t flags; int i; slab = NULL; #ifdef UMA_DEBUG printf("slab_zalloc: Allocating a new slab for %s\n", zone->uz_name); #endif ZONE_UNLOCK(zone); if (zone->uz_flags & UMA_ZFLAG_OFFPAGE) { slab = uma_zalloc_internal(slabzone, NULL, wait); if (slab == NULL) { ZONE_LOCK(zone); return NULL; } } /* * This reproduces the old vm_zone behavior of zero filling pages the * first time they are added to a zone. * * Malloced items are zeroed in uma_zalloc. */ if ((zone->uz_flags & UMA_ZFLAG_MALLOC) == 0) wait |= M_ZERO; else wait &= ~M_ZERO; if (booted || (zone->uz_flags & UMA_ZFLAG_PRIVALLOC)) { mem = zone->uz_allocf(zone, zone->uz_ppera * UMA_SLAB_SIZE, &flags, wait); if (mem == NULL) { ZONE_LOCK(zone); return (NULL); } } else { uma_slab_t tmps; if (zone->uz_ppera > 1) panic("UMA: Attemping to allocate multiple pages before vm has started.\n"); if (zone->uz_flags & UMA_ZFLAG_MALLOC) panic("Mallocing before uma_startup2 has been called.\n"); if (uma_boot_free == 0) panic("UMA: Ran out of pre init pages, increase UMA_BOOT_PAGES\n"); tmps = LIST_FIRST(&uma_boot_pages); LIST_REMOVE(tmps, us_link); uma_boot_free--; mem = tmps->us_data; flags = tmps->us_flags; } /* Point the slab into the allocated memory */ if (!(zone->uz_flags & UMA_ZFLAG_OFFPAGE)) slab = (uma_slab_t )(mem + zone->uz_pgoff); if (zone->uz_flags & UMA_ZFLAG_MALLOC) for (i = 0; i < zone->uz_ppera; i++) vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab); slab->us_zone = zone; slab->us_data = mem; /* * This is intended to spread data out across cache lines. * * This code doesn't seem to work properly on x86, and on alpha * it makes absolutely no performance difference. I'm sure it could * use some tuning, but sun makes outrageous claims about it's * performance. */ #if 0 if (zone->uz_cachemax) { slab->us_data += zone->uz_cacheoff; zone->uz_cacheoff += UMA_CACHE_INC; if (zone->uz_cacheoff > zone->uz_cachemax) zone->uz_cacheoff = 0; } #endif slab->us_freecount = zone->uz_ipers; slab->us_firstfree = 0; slab->us_flags = flags; for (i = 0; i < zone->uz_ipers; i++) slab->us_freelist[i] = i+1; if (zone->uz_init) for (i = 0; i < zone->uz_ipers; i++) zone->uz_init(slab->us_data + (zone->uz_rsize * i), zone->uz_size); ZONE_LOCK(zone); if (zone->uz_flags & UMA_ZFLAG_HASH) UMA_HASH_INSERT(&zone->uz_hash, slab, mem); zone->uz_pages += zone->uz_ppera; zone->uz_free += zone->uz_ipers; return (slab); } /* * Allocates a number of pages from the system * * Arguments: * zone Unused * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * page_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait) { void *p; /* Returned page */ *pflag = UMA_SLAB_KMEM; p = (void *) kmem_malloc(kmem_map, bytes, wait); return (p); } /* * Allocates a number of pages from within an object * * Arguments: * zone Unused * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. * */ static void * obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { vm_object_t object; vm_offset_t retkva, zkva; vm_page_t p; int pages, startpages; object = zone->uz_obj; retkva = 0; /* * This looks a little weird since we're getting one page at a time */ VM_OBJECT_LOCK(object); p = TAILQ_LAST(&object->memq, pglist); pages = p != NULL ? p->pindex + 1 : 0; startpages = pages; zkva = zone->uz_kva + pages * PAGE_SIZE; for (; bytes > 0; bytes -= PAGE_SIZE) { p = vm_page_alloc(object, pages, VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED); if (p == NULL) { if (pages != startpages) pmap_qremove(retkva, pages - startpages); while (pages != startpages) { pages--; p = TAILQ_LAST(&object->memq, pglist); vm_page_lock_queues(); vm_page_unwire(p, 0); vm_page_free(p); vm_page_unlock_queues(); } retkva = 0; goto done; } pmap_qenter(zkva, &p, 1); if (retkva == 0) retkva = zkva; zkva += PAGE_SIZE; pages += 1; } done: VM_OBJECT_UNLOCK(object); *flags = UMA_SLAB_PRIV; return ((void *)retkva); } /* * Frees a number of pages to the system * * Arguments: * mem A pointer to the memory to be freed * size The size of the memory being freed * flags The original p->us_flags field * * Returns: * Nothing * */ static void page_free(void *mem, int size, u_int8_t flags) { vm_map_t map; if (flags & UMA_SLAB_KMEM) map = kmem_map; else panic("UMA: page_free used with invalid flags %d\n", flags); kmem_free(map, (vm_offset_t)mem, size); } /* * Zero fill initializer * * Arguments/Returns follow uma_init specifications * */ static void zero_init(void *mem, int size) { bzero(mem, size); } /* * Finish creating a small uma zone. This calculates ipers, and the zone size. * * Arguments * zone The zone we should initialize * * Returns * Nothing */ static void zone_small_init(uma_zone_t zone) { int rsize; int memused; int ipers; rsize = zone->uz_size; if (rsize < UMA_SMALLEST_UNIT) rsize = UMA_SMALLEST_UNIT; if (rsize & zone->uz_align) rsize = (rsize & ~zone->uz_align) + (zone->uz_align + 1); zone->uz_rsize = rsize; rsize += 1; /* Account for the byte of linkage */ zone->uz_ipers = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) / rsize; zone->uz_ppera = 1; KASSERT(zone->uz_ipers != 0, ("zone_small_init: ipers is 0, uh-oh!")); memused = zone->uz_ipers * zone->uz_rsize; /* Can we do any better? */ if ((UMA_SLAB_SIZE - memused) >= UMA_MAX_WASTE) { /* * We can't do this if we're internal or if we've been * asked to not go to the VM for buckets. If we do this we * may end up going to the VM (kmem_map) for slabs which we * do not want to do if we're UMA_ZFLAG_CACHEONLY as a * result of UMA_ZONE_VM, which clearly forbids it. */ if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) || (zone->uz_flags & UMA_ZFLAG_CACHEONLY)) return; ipers = UMA_SLAB_SIZE / zone->uz_rsize; if (ipers > zone->uz_ipers) { zone->uz_flags |= UMA_ZFLAG_OFFPAGE; if ((zone->uz_flags & UMA_ZFLAG_MALLOC) == 0) zone->uz_flags |= UMA_ZFLAG_HASH; zone->uz_ipers = ipers; } } } /* * Finish creating a large (> UMA_SLAB_SIZE) uma zone. Just give in and do * OFFPAGE for now. When I can allow for more dynamic slab sizes this will be * more complicated. * * Arguments * zone The zone we should initialize * * Returns * Nothing */ static void zone_large_init(uma_zone_t zone) { int pages; KASSERT((zone->uz_flags & UMA_ZFLAG_CACHEONLY) == 0, ("zone_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY zone")); pages = zone->uz_size / UMA_SLAB_SIZE; /* Account for remainder */ if ((pages * UMA_SLAB_SIZE) < zone->uz_size) pages++; zone->uz_ppera = pages; zone->uz_ipers = 1; zone->uz_flags |= UMA_ZFLAG_OFFPAGE; if ((zone->uz_flags & UMA_ZFLAG_MALLOC) == 0) zone->uz_flags |= UMA_ZFLAG_HASH; zone->uz_rsize = zone->uz_size; } /* * Zone header ctor. This initializes all fields, locks, etc. And inserts * the zone onto the global zone list. * * Arguments/Returns follow uma_ctor specifications * udata Actually uma_zcreat_args * */ static void zone_ctor(void *mem, int size, void *udata) { struct uma_zctor_args *arg = udata; uma_zone_t zone = mem; int privlc; bzero(zone, size); zone->uz_name = arg->name; zone->uz_size = arg->size; zone->uz_ctor = arg->ctor; zone->uz_dtor = arg->dtor; zone->uz_init = arg->uminit; zone->uz_fini = arg->fini; zone->uz_align = arg->align; zone->uz_free = 0; zone->uz_pages = 0; zone->uz_flags = 0; zone->uz_allocf = page_alloc; zone->uz_freef = page_free; if (arg->flags & UMA_ZONE_ZINIT) zone->uz_init = zero_init; if (arg->flags & UMA_ZONE_INTERNAL) zone->uz_flags |= UMA_ZFLAG_INTERNAL; if (arg->flags & UMA_ZONE_MALLOC) zone->uz_flags |= UMA_ZFLAG_MALLOC; if (arg->flags & UMA_ZONE_NOFREE) zone->uz_flags |= UMA_ZFLAG_NOFREE; if (arg->flags & UMA_ZONE_VM) zone->uz_flags |= UMA_ZFLAG_CACHEONLY; /* * XXX: * The +1 byte added to uz_size is to account for the byte of * linkage that is added to the size in zone_small_init(). If * we don't account for this here then we may end up in * zone_small_init() with a calculated 'ipers' of 0. */ if ((zone->uz_size+1) > (UMA_SLAB_SIZE - sizeof(struct uma_slab))) zone_large_init(zone); else zone_small_init(zone); #ifdef UMA_MD_SMALL_ALLOC if (zone->uz_ppera == 1) { zone->uz_allocf = uma_small_alloc; zone->uz_freef = uma_small_free; } #endif /* UMA_MD_SMALL_ALLOC */ if (arg->flags & UMA_ZONE_MTXCLASS) privlc = 1; else privlc = 0; /* * If we're putting the slab header in the actual page we need to * figure out where in each page it goes. This calculates a right * justified offset into the memory on an ALIGN_PTR boundary. */ if (!(zone->uz_flags & UMA_ZFLAG_OFFPAGE)) { int totsize; int waste; /* Size of the slab struct and free list */ totsize = sizeof(struct uma_slab) + zone->uz_ipers; if (totsize & UMA_ALIGN_PTR) totsize = (totsize & ~UMA_ALIGN_PTR) + (UMA_ALIGN_PTR + 1); zone->uz_pgoff = UMA_SLAB_SIZE - totsize; waste = zone->uz_pgoff; waste -= (zone->uz_ipers * zone->uz_rsize); /* * This calculates how much space we have for cache line size * optimizations. It works by offseting each slab slightly. * Currently it breaks on x86, and so it is disabled. */ if (zone->uz_align < UMA_CACHE_INC && waste > UMA_CACHE_INC) { zone->uz_cachemax = waste - UMA_CACHE_INC; zone->uz_cacheoff = 0; } totsize = zone->uz_pgoff + sizeof(struct uma_slab) + zone->uz_ipers; /* I don't think it's possible, but I'll make sure anyway */ if (totsize > UMA_SLAB_SIZE) { printf("zone %s ipers %d rsize %d size %d\n", zone->uz_name, zone->uz_ipers, zone->uz_rsize, zone->uz_size); panic("UMA slab won't fit.\n"); } } if (zone->uz_flags & UMA_ZFLAG_HASH) hash_alloc(&zone->uz_hash); #ifdef UMA_DEBUG printf("%s(%p) size = %d ipers = %d ppera = %d pgoff = %d\n", zone->uz_name, zone, zone->uz_size, zone->uz_ipers, zone->uz_ppera, zone->uz_pgoff); #endif ZONE_LOCK_INIT(zone, privlc); mtx_lock(&uma_mtx); LIST_INSERT_HEAD(&uma_zones, zone, uz_link); mtx_unlock(&uma_mtx); /* * Some internal zones don't have room allocated for the per cpu * caches. If we're internal, bail out here. */ if (zone->uz_flags & UMA_ZFLAG_INTERNAL) return; if (zone->uz_ipers < UMA_BUCKET_SIZE) zone->uz_count = zone->uz_ipers - 1; else zone->uz_count = UMA_BUCKET_SIZE - 1; } /* * Zone header dtor. This frees all data, destroys locks, frees the hash table * and removes the zone from the global list. * * Arguments/Returns follow uma_dtor specifications * udata unused */ static void zone_dtor(void *arg, int size, void *udata) { uma_zone_t zone; zone = (uma_zone_t)arg; mtx_lock(&uma_mtx); LIST_REMOVE(zone, uz_link); zone_drain_common(zone, 1); mtx_unlock(&uma_mtx); ZONE_LOCK(zone); if (zone->uz_free != 0) printf("Zone %s was not empty (%d items). Lost %d pages of memory.\n", zone->uz_name, zone->uz_free, zone->uz_pages); ZONE_UNLOCK(zone); if ((zone->uz_flags & UMA_ZFLAG_OFFPAGE) != 0) hash_free(&zone->uz_hash); ZONE_LOCK_FINI(zone); } /* * Traverses every zone in the system and calls a callback * * Arguments: * zfunc A pointer to a function which accepts a zone * as an argument. * * Returns: * Nothing */ static void zone_foreach(void (*zfunc)(uma_zone_t)) { uma_zone_t zone; mtx_lock(&uma_mtx); LIST_FOREACH(zone, &uma_zones, uz_link) { zfunc(zone); } mtx_unlock(&uma_mtx); } /* Public functions */ /* See uma.h */ void uma_startup(void *bootmem) { struct uma_zctor_args args; uma_slab_t slab; int slabsize; int i; #ifdef UMA_DEBUG printf("Creating uma zone headers zone.\n"); #endif #ifdef SMP maxcpu = mp_maxid + 1; #else maxcpu = 1; #endif #ifdef UMA_DEBUG printf("Max cpu = %d, mp_maxid = %d\n", maxcpu, mp_maxid); Debugger("stop"); #endif mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF); /* "manually" Create the initial zone */ args.name = "UMA Zones"; args.size = sizeof(struct uma_zone) + (sizeof(struct uma_cache) * (maxcpu - 1)); args.ctor = zone_ctor; args.dtor = zone_dtor; args.uminit = zero_init; args.fini = NULL; args.align = 32 - 1; args.flags = UMA_ZONE_INTERNAL; /* The initial zone has no Per cpu queues so it's smaller */ zone_ctor(zones, sizeof(struct uma_zone), &args); /* Initialize the pcpu cache lock set once and for all */ for (i = 0; i < maxcpu; i++) CPU_LOCK_INIT(i); #ifdef UMA_DEBUG printf("Filling boot free list.\n"); #endif for (i = 0; i < UMA_BOOT_PAGES; i++) { slab = (uma_slab_t)((u_int8_t *)bootmem + (i * UMA_SLAB_SIZE)); slab->us_data = (u_int8_t *)slab; slab->us_flags = UMA_SLAB_BOOT; LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link); uma_boot_free++; } #ifdef UMA_DEBUG printf("Creating slab zone.\n"); #endif /* * This is the max number of free list items we'll have with * offpage slabs. */ slabsize = UMA_SLAB_SIZE - sizeof(struct uma_slab); slabsize /= UMA_MAX_WASTE; slabsize++; /* In case there it's rounded */ slabsize += sizeof(struct uma_slab); /* Now make a zone for slab headers */ slabzone = uma_zcreate("UMA Slabs", slabsize, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_INTERNAL); hashzone = uma_zcreate("UMA Hash", sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_INTERNAL); bucketzone = uma_zcreate("UMA Buckets", sizeof(struct uma_bucket), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_INTERNAL); #ifdef UMA_MD_SMALL_ALLOC booted = 1; #endif #ifdef UMA_DEBUG printf("UMA startup complete.\n"); #endif } /* see uma.h */ void uma_startup2(void) { booted = 1; bucket_enable(); #ifdef UMA_DEBUG printf("UMA startup2 complete.\n"); #endif } /* * Initialize our callout handle * */ static void uma_startup3(void) { #ifdef UMA_DEBUG printf("Starting callout.\n"); #endif callout_init(&uma_callout, 0); callout_reset(&uma_callout, UMA_WORKING_TIME * hz, uma_timeout, NULL); #ifdef UMA_DEBUG printf("UMA startup3 complete.\n"); #endif } /* See uma.h */ uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor, uma_init uminit, uma_fini fini, int align, u_int16_t flags) { struct uma_zctor_args args; /* This stuff is essential for the zone ctor */ args.name = name; args.size = size; args.ctor = ctor; args.dtor = dtor; args.uminit = uminit; args.fini = fini; args.align = align; args.flags = flags; return (uma_zalloc_internal(zones, &args, M_WAITOK)); } /* See uma.h */ void uma_zdestroy(uma_zone_t zone) { uma_zfree_internal(zones, zone, NULL, 0); } /* See uma.h */ void * uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) { void *item; uma_cache_t cache; uma_bucket_t bucket; int cpu; /* This is the fast path allocation */ #ifdef UMA_DEBUG_ALLOC_1 printf("Allocating one item from %s(%p)\n", zone->uz_name, zone); #endif #ifdef INVARIANTS /* * To make sure that WAITOK or NOWAIT is set, but not more than * one, and check against the API botches that are common. * The uma code implies M_WAITOK if M_NOWAIT is not set, so * we default to waiting if none of the flags is set. */ cpu = flags & (M_WAITOK | M_NOWAIT | M_DONTWAIT | M_TRYWAIT); if (cpu != M_NOWAIT && cpu != M_WAITOK) { static struct timeval lasterr; static int curerr, once; if (once == 0 && ppsratecheck(&lasterr, &curerr, 1)) { printf("Bad uma_zalloc flags: %x\n", cpu); backtrace(); once++; } } #endif if (!(flags & M_NOWAIT)) { KASSERT(curthread->td_intr_nesting_level == 0, ("malloc(M_WAITOK) in interrupt context")); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "malloc() of \"%s\"", zone->uz_name); } zalloc_restart: cpu = PCPU_GET(cpuid); CPU_LOCK(cpu); cache = &zone->uz_cpu[cpu]; zalloc_start: bucket = cache->uc_allocbucket; if (bucket) { if (bucket->ub_ptr > -1) { item = bucket->ub_bucket[bucket->ub_ptr]; #ifdef INVARIANTS bucket->ub_bucket[bucket->ub_ptr] = NULL; #endif bucket->ub_ptr--; KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled.")); cache->uc_allocs++; #ifdef INVARIANTS ZONE_LOCK(zone); uma_dbg_alloc(zone, NULL, item); ZONE_UNLOCK(zone); #endif CPU_UNLOCK(cpu); if (zone->uz_ctor) zone->uz_ctor(item, zone->uz_size, udata); if (flags & M_ZERO) bzero(item, zone->uz_size); return (item); } else if (cache->uc_freebucket) { /* * We have run out of items in our allocbucket. * See if we can switch with our free bucket. */ if (cache->uc_freebucket->ub_ptr > -1) { uma_bucket_t swap; #ifdef UMA_DEBUG_ALLOC printf("uma_zalloc: Swapping empty with alloc.\n"); #endif swap = cache->uc_freebucket; cache->uc_freebucket = cache->uc_allocbucket; cache->uc_allocbucket = swap; goto zalloc_start; } } } ZONE_LOCK(zone); /* Since we have locked the zone we may as well send back our stats */ zone->uz_allocs += cache->uc_allocs; cache->uc_allocs = 0; /* Our old one is now a free bucket */ if (cache->uc_allocbucket) { KASSERT(cache->uc_allocbucket->ub_ptr == -1, ("uma_zalloc_arg: Freeing a non free bucket.")); LIST_INSERT_HEAD(&zone->uz_free_bucket, cache->uc_allocbucket, ub_link); cache->uc_allocbucket = NULL; } /* Check the free list for a new alloc bucket */ if ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) { KASSERT(bucket->ub_ptr != -1, ("uma_zalloc_arg: Returning an empty bucket.")); LIST_REMOVE(bucket, ub_link); cache->uc_allocbucket = bucket; ZONE_UNLOCK(zone); goto zalloc_start; } /* We are no longer associated with this cpu!!! */ CPU_UNLOCK(cpu); /* Bump up our uz_count so we get here less */ if (zone->uz_count < UMA_BUCKET_SIZE - 1) zone->uz_count++; /* * Now lets just fill a bucket and put it on the free list. If that * works we'll restart the allocation from the begining. */ if (uma_zalloc_bucket(zone, flags)) { ZONE_UNLOCK(zone); goto zalloc_restart; } ZONE_UNLOCK(zone); /* * We may not be able to get a bucket so return an actual item. */ #ifdef UMA_DEBUG printf("uma_zalloc_arg: Bucketzone returned NULL\n"); #endif return (uma_zalloc_internal(zone, udata, flags)); } static uma_slab_t uma_zone_slab(uma_zone_t zone, int flags) { uma_slab_t slab; /* * This is to prevent us from recursively trying to allocate * buckets. The problem is that if an allocation forces us to * grab a new bucket we will call page_alloc, which will go off * and cause the vm to allocate vm_map_entries. If we need new * buckets there too we will recurse in kmem_alloc and bad * things happen. So instead we return a NULL bucket, and make * the code that allocates buckets smart enough to deal with it */ if (zone == bucketzone && zone->uz_recurse != 0) return (NULL); slab = NULL; for (;;) { /* * Find a slab with some space. Prefer slabs that are partially * used over those that are totally full. This helps to reduce * fragmentation. */ if (zone->uz_free != 0) { if (!LIST_EMPTY(&zone->uz_part_slab)) { slab = LIST_FIRST(&zone->uz_part_slab); } else { slab = LIST_FIRST(&zone->uz_free_slab); LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link); } return (slab); } /* * M_NOVM means don't ask at all! */ if (flags & M_NOVM) break; if (zone->uz_maxpages && zone->uz_pages >= zone->uz_maxpages) { zone->uz_flags |= UMA_ZFLAG_FULL; if (flags & M_NOWAIT) break; else msleep(zone, &zone->uz_lock, PVM, "zonelimit", 0); continue; } zone->uz_recurse++; slab = slab_zalloc(zone, flags); zone->uz_recurse--; /* * If we got a slab here it's safe to mark it partially used * and return. We assume that the caller is going to remove * at least one item. */ if (slab) { LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link); return (slab); } /* * We might not have been able to get a slab but another cpu * could have while we were unlocked. Check again before we * fail. */ if (flags & M_NOWAIT) flags |= M_NOVM; } return (slab); } static void * uma_slab_alloc(uma_zone_t zone, uma_slab_t slab) { void *item; u_int8_t freei; freei = slab->us_firstfree; slab->us_firstfree = slab->us_freelist[freei]; item = slab->us_data + (zone->uz_rsize * freei); slab->us_freecount--; zone->uz_free--; #ifdef INVARIANTS uma_dbg_alloc(zone, slab, item); #endif /* Move this slab to the full list */ if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&zone->uz_full_slab, slab, us_link); } return (item); } static int uma_zalloc_bucket(uma_zone_t zone, int flags) { uma_bucket_t bucket; uma_slab_t slab; /* * Try this zone's free list first so we don't allocate extra buckets. */ if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) { KASSERT(bucket->ub_ptr == -1, ("uma_zalloc_bucket: Bucket on free list is not empty.")); LIST_REMOVE(bucket, ub_link); } else { int bflags; bflags = flags; if (zone->uz_flags & UMA_ZFLAG_CACHEONLY) bflags |= M_NOVM; ZONE_UNLOCK(zone); bucket = uma_zalloc_internal(bucketzone, NULL, bflags); ZONE_LOCK(zone); if (bucket != NULL) { #ifdef INVARIANTS bzero(bucket, bucketzone->uz_size); #endif bucket->ub_ptr = -1; } } if (bucket == NULL) return (0); #ifdef SMP /* * This code is here to limit the number of simultaneous bucket fills * for any given zone to the number of per cpu caches in this zone. This * is done so that we don't allocate more memory than we really need. */ if (zone->uz_fills >= mp_ncpus) goto done; #endif zone->uz_fills++; /* Try to keep the buckets totally full */ while (bucket->ub_ptr < zone->uz_count && (slab = uma_zone_slab(zone, flags)) != NULL) { while (slab->us_freecount && bucket->ub_ptr < zone->uz_count) { bucket->ub_bucket[++bucket->ub_ptr] = uma_slab_alloc(zone, slab); } /* Don't block on the next fill */ flags |= M_NOWAIT; } zone->uz_fills--; if (bucket->ub_ptr != -1) { LIST_INSERT_HEAD(&zone->uz_full_bucket, bucket, ub_link); return (1); } #ifdef SMP done: #endif uma_zfree_internal(bucketzone, bucket, NULL, 0); return (0); } /* * Allocates an item for an internal zone * * Arguments * zone The zone to alloc for. * udata The data to be passed to the constructor. * flags M_WAITOK, M_NOWAIT, M_ZERO. * * Returns * NULL if there is no memory and M_NOWAIT is set * An item if successful */ static void * uma_zalloc_internal(uma_zone_t zone, void *udata, int flags) { uma_slab_t slab; void *item; item = NULL; /* * This is to stop us from allocating per cpu buckets while we're * running out of UMA_BOOT_PAGES. Otherwise, we would exhaust the * boot pages. */ if (bucketdisable && zone == bucketzone) return (NULL); #ifdef UMA_DEBUG_ALLOC printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone); #endif ZONE_LOCK(zone); slab = uma_zone_slab(zone, flags); if (slab == NULL) { ZONE_UNLOCK(zone); return (NULL); } item = uma_slab_alloc(zone, slab); ZONE_UNLOCK(zone); if (zone->uz_ctor != NULL) zone->uz_ctor(item, zone->uz_size, udata); if (flags & M_ZERO) bzero(item, zone->uz_size); return (item); } /* See uma.h */ void uma_zfree_arg(uma_zone_t zone, void *item, void *udata) { uma_cache_t cache; uma_bucket_t bucket; int bflags; int cpu; int skip; /* This is the fast path free */ skip = 0; #ifdef UMA_DEBUG_ALLOC_1 printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone); #endif /* * The race here is acceptable. If we miss it we'll just have to wait * a little longer for the limits to be reset. */ if (zone->uz_flags & UMA_ZFLAG_FULL) goto zfree_internal; if (zone->uz_dtor) { zone->uz_dtor(item, zone->uz_size, udata); skip = 1; } zfree_restart: cpu = PCPU_GET(cpuid); CPU_LOCK(cpu); cache = &zone->uz_cpu[cpu]; zfree_start: bucket = cache->uc_freebucket; if (bucket) { /* * Do we have room in our bucket? It is OK for this uz count * check to be slightly out of sync. */ if (bucket->ub_ptr < zone->uz_count) { bucket->ub_ptr++; KASSERT(bucket->ub_bucket[bucket->ub_ptr] == NULL, ("uma_zfree: Freeing to non free bucket index.")); bucket->ub_bucket[bucket->ub_ptr] = item; #ifdef INVARIANTS ZONE_LOCK(zone); if (zone->uz_flags & UMA_ZFLAG_MALLOC) uma_dbg_free(zone, udata, item); else uma_dbg_free(zone, NULL, item); ZONE_UNLOCK(zone); #endif CPU_UNLOCK(cpu); return; } else if (cache->uc_allocbucket) { #ifdef UMA_DEBUG_ALLOC printf("uma_zfree: Swapping buckets.\n"); #endif /* * We have run out of space in our freebucket. * See if we can switch with our alloc bucket. */ if (cache->uc_allocbucket->ub_ptr < cache->uc_freebucket->ub_ptr) { uma_bucket_t swap; swap = cache->uc_freebucket; cache->uc_freebucket = cache->uc_allocbucket; cache->uc_allocbucket = swap; goto zfree_start; } } } /* * We can get here for two reasons: * * 1) The buckets are NULL * 2) The alloc and free buckets are both somewhat full. * */ ZONE_LOCK(zone); bucket = cache->uc_freebucket; cache->uc_freebucket = NULL; /* Can we throw this on the zone full list? */ if (bucket != NULL) { #ifdef UMA_DEBUG_ALLOC printf("uma_zfree: Putting old bucket on the free list.\n"); #endif /* ub_ptr is pointing to the last free item */ KASSERT(bucket->ub_ptr != -1, ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n")); LIST_INSERT_HEAD(&zone->uz_full_bucket, bucket, ub_link); } if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) { LIST_REMOVE(bucket, ub_link); ZONE_UNLOCK(zone); cache->uc_freebucket = bucket; goto zfree_start; } /* We're done with this CPU now */ CPU_UNLOCK(cpu); /* And the zone.. */ ZONE_UNLOCK(zone); #ifdef UMA_DEBUG_ALLOC printf("uma_zfree: Allocating new free bucket.\n"); #endif bflags = M_NOWAIT; if (zone->uz_flags & UMA_ZFLAG_CACHEONLY) bflags |= M_NOVM; #ifdef INVARIANTS bflags |= M_ZERO; #endif bucket = uma_zalloc_internal(bucketzone, NULL, bflags); if (bucket) { bucket->ub_ptr = -1; ZONE_LOCK(zone); LIST_INSERT_HEAD(&zone->uz_free_bucket, bucket, ub_link); ZONE_UNLOCK(zone); goto zfree_restart; } /* * If nothing else caught this, we'll just do an internal free. */ zfree_internal: #ifdef INVARIANTS /* * If we need to skip the dtor and the uma_dbg_free in uma_zfree_internal * because we've already called the dtor above, but we ended up here, then * we need to make sure that we take care of the uma_dbg_free immediately. */ if (skip) { ZONE_LOCK(zone); if (zone->uz_flags & UMA_ZFLAG_MALLOC) uma_dbg_free(zone, udata, item); else uma_dbg_free(zone, NULL, item); ZONE_UNLOCK(zone); } #endif uma_zfree_internal(zone, item, udata, skip); return; } /* * Frees an item to an INTERNAL zone or allocates a free bucket * * Arguments: * zone The zone to free to * item The item we're freeing * udata User supplied data for the dtor * skip Skip the dtor, it was done in uma_zfree_arg */ static void uma_zfree_internal(uma_zone_t zone, void *item, void *udata, int skip) { uma_slab_t slab; u_int8_t *mem; u_int8_t freei; if (!skip && zone->uz_dtor) zone->uz_dtor(item, zone->uz_size, udata); ZONE_LOCK(zone); if (!(zone->uz_flags & UMA_ZFLAG_MALLOC)) { mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK)); if (zone->uz_flags & UMA_ZFLAG_HASH) slab = hash_sfind(&zone->uz_hash, mem); else { mem += zone->uz_pgoff; slab = (uma_slab_t)mem; } } else { slab = (uma_slab_t)udata; } /* Do we need to remove from any lists? */ if (slab->us_freecount+1 == zone->uz_ipers) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&zone->uz_free_slab, slab, us_link); } else if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link); } /* Slab management stuff */ freei = ((unsigned long)item - (unsigned long)slab->us_data) / zone->uz_rsize; #ifdef INVARIANTS if (!skip) uma_dbg_free(zone, slab, item); #endif slab->us_freelist[freei] = slab->us_firstfree; slab->us_firstfree = freei; slab->us_freecount++; /* Zone statistics */ zone->uz_free++; if (zone->uz_flags & UMA_ZFLAG_FULL) { if (zone->uz_pages < zone->uz_maxpages) zone->uz_flags &= ~UMA_ZFLAG_FULL; /* We can handle one more allocation */ wakeup_one(zone); } ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_max(uma_zone_t zone, int nitems) { ZONE_LOCK(zone); if (zone->uz_ppera > 1) zone->uz_maxpages = nitems * zone->uz_ppera; else zone->uz_maxpages = nitems / zone->uz_ipers; if (zone->uz_maxpages * zone->uz_ipers < nitems) zone->uz_maxpages++; ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_freef(uma_zone_t zone, uma_free freef) { ZONE_LOCK(zone); zone->uz_freef = freef; ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf) { ZONE_LOCK(zone); zone->uz_flags |= UMA_ZFLAG_PRIVALLOC; zone->uz_allocf = allocf; ZONE_UNLOCK(zone); } /* See uma.h */ int uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count) { int pages; vm_offset_t kva; mtx_lock(&Giant); pages = count / zone->uz_ipers; if (pages * zone->uz_ipers < count) pages++; kva = kmem_alloc_pageable(kernel_map, pages * UMA_SLAB_SIZE); if (kva == 0) { mtx_unlock(&Giant); return (0); } if (obj == NULL) obj = vm_object_allocate(OBJT_DEFAULT, pages); else { VM_OBJECT_LOCK_INIT(obj); _vm_object_allocate(OBJT_DEFAULT, pages, obj); } ZONE_LOCK(zone); zone->uz_kva = kva; zone->uz_obj = obj; zone->uz_maxpages = pages; zone->uz_allocf = obj_alloc; zone->uz_flags |= UMA_ZFLAG_NOFREE | UMA_ZFLAG_PRIVALLOC; ZONE_UNLOCK(zone); mtx_unlock(&Giant); return (1); } /* See uma.h */ void uma_prealloc(uma_zone_t zone, int items) { int slabs; uma_slab_t slab; ZONE_LOCK(zone); slabs = items / zone->uz_ipers; if (slabs * zone->uz_ipers < items) slabs++; while (slabs > 0) { slab = slab_zalloc(zone, M_WAITOK); LIST_INSERT_HEAD(&zone->uz_free_slab, slab, us_link); slabs--; } ZONE_UNLOCK(zone); } /* See uma.h */ void uma_reclaim(void) { /* * You might think that the delay below would improve performance since * the allocator will give away memory that it may ask for immediately. * Really, it makes things worse, since cpu cycles are so much cheaper * than disk activity. */ #if 0 static struct timeval tv = {0}; struct timeval now; getmicrouptime(&now); if (now.tv_sec > tv.tv_sec + 30) tv = now; else return; #endif #ifdef UMA_DEBUG printf("UMA: vm asked us to release pages!\n"); #endif bucket_enable(); zone_foreach(zone_drain); /* * Some slabs may have been freed but this zone will be visited early * we visit again so that we can free pages that are empty once other * zones are drained. We have to do the same for buckets. */ zone_drain(slabzone); zone_drain(bucketzone); } void * uma_large_malloc(int size, int wait) { void *mem; uma_slab_t slab; u_int8_t flags; slab = uma_zalloc_internal(slabzone, NULL, wait); if (slab == NULL) return (NULL); mem = page_alloc(NULL, size, &flags, wait); if (mem) { vsetslab((vm_offset_t)mem, slab); slab->us_data = mem; slab->us_flags = flags | UMA_SLAB_MALLOC; slab->us_size = size; } else { uma_zfree_internal(slabzone, slab, NULL, 0); } return (mem); } void uma_large_free(uma_slab_t slab) { vsetobj((vm_offset_t)slab->us_data, kmem_object); /* * XXX: We get a lock order reversal if we don't have Giant: * vm_map_remove (locks system map) -> vm_map_delete -> * vm_map_entry_unwire -> vm_fault_unwire -> mtx_lock(&Giant) */ if (!mtx_owned(&Giant)) { mtx_lock(&Giant); page_free(slab->us_data, slab->us_size, slab->us_flags); mtx_unlock(&Giant); } else page_free(slab->us_data, slab->us_size, slab->us_flags); uma_zfree_internal(slabzone, slab, NULL, 0); } void uma_print_stats(void) { zone_foreach(uma_print_zone); } void uma_print_zone(uma_zone_t zone) { printf("%s(%p) size %d(%d) flags %d ipers %d ppera %d out %d free %d\n", zone->uz_name, zone, zone->uz_size, zone->uz_rsize, zone->uz_flags, zone->uz_ipers, zone->uz_ppera, (zone->uz_ipers * zone->uz_pages) - zone->uz_free, zone->uz_free); } /* * Sysctl handler for vm.zone * * stolen from vm_zone.c */ static int sysctl_vm_zone(SYSCTL_HANDLER_ARGS) { int error, len, cnt; const int linesize = 128; /* conservative */ int totalfree; char *tmpbuf, *offset; uma_zone_t z; char *p; int cpu; int cachefree; uma_bucket_t bucket; uma_cache_t cache; cnt = 0; mtx_lock(&uma_mtx); LIST_FOREACH(z, &uma_zones, uz_link) cnt++; mtx_unlock(&uma_mtx); MALLOC(tmpbuf, char *, (cnt == 0 ? 1 : cnt) * linesize, M_TEMP, M_WAITOK); len = snprintf(tmpbuf, linesize, "\nITEM SIZE LIMIT USED FREE REQUESTS\n\n"); if (cnt == 0) tmpbuf[len - 1] = '\0'; error = SYSCTL_OUT(req, tmpbuf, cnt == 0 ? len-1 : len); if (error || cnt == 0) goto out; offset = tmpbuf; mtx_lock(&uma_mtx); LIST_FOREACH(z, &uma_zones, uz_link) { if (cnt == 0) /* list may have changed size */ break; for (cpu = 0; cpu < maxcpu; cpu++) { if (CPU_ABSENT(cpu)) continue; CPU_LOCK(cpu); } ZONE_LOCK(z); cachefree = 0; for (cpu = 0; cpu < maxcpu; cpu++) { if (CPU_ABSENT(cpu)) continue; cache = &z->uz_cpu[cpu]; if (cache->uc_allocbucket != NULL) cachefree += cache->uc_allocbucket->ub_ptr + 1; if (cache->uc_freebucket != NULL) cachefree += cache->uc_freebucket->ub_ptr + 1; CPU_UNLOCK(cpu); } /* * The "UMA Zones" zone (master zone) does not have pcpu * caches allocated for it, so the above computation is entirely * bogus. Re-set cachefree to 0 in that case. */ if (z == zones) cachefree = 0; LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link) { cachefree += bucket->ub_ptr + 1; } totalfree = z->uz_free + cachefree; len = snprintf(offset, linesize, "%-12.12s %6.6u, %8.8u, %6.6u, %6.6u, %8.8llu\n", z->uz_name, z->uz_size, z->uz_maxpages * z->uz_ipers, (z->uz_ipers * (z->uz_pages / z->uz_ppera)) - totalfree, totalfree, (unsigned long long)z->uz_allocs); ZONE_UNLOCK(z); for (p = offset + 12; p > offset && *p == ' '; --p) /* nothing */ ; p[1] = ':'; cnt--; offset += len; } mtx_unlock(&uma_mtx); *offset++ = '\0'; error = SYSCTL_OUT(req, tmpbuf, offset - tmpbuf); out: FREE(tmpbuf, M_TEMP); return (error); }