From 099a0e588cbe1bbc56a565bf57d722621b47a866 Mon Sep 17 00:00:00 2001
From: Bosko Milekic
Date: Mon, 31 May 2004 21:46:06 +0000
Subject: [PATCH] Bring in mbuma to replace mballoc.

mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.

Extensions to UMA worth noting:
  - Better layering between slab <-> zone caches; introduce a Keg
    structure which splits the slab cache off from the zone structure
    and allows multiple zones to be stacked on top of a single Keg
    (a single type of slab cache); perhaps we should look into defining
    a subset API on top of the Keg for special use by malloc(9), for
    example.
  - UMA_ZONE_REFCNT zones can now be added, and reference counters are
    automagically allocated for them at the end of the associated slab
    structures.  uma_find_refcnt() does a kextract to fetch the slab
    struct reference from the underlying page and looks up the
    corresponding refcnt.

mbuma things worth noting:
  - Integrates mbuf & cluster allocations with extended UMA and
    provides caches for commonly-allocated items; defines several zones
    (two primary, one secondary) and two kegs.
  - Changes certain code paths that used to do m_get() + m_clget() to
    instead just use m_getcl() and try to take advantage of the newly
    defined secondary Packet zone (see the sketch at the end of this
    message).
  - netstat(1) and systat(1) are quickly hacked up to do basic stat
    reporting, but additional stats work needs to be done once some
    other details within UMA have been taken care of and it becomes
    clearer how stats will work within the modified framework.

From the user perspective, one implication is that the NMBCLUSTERS
compile-time option is no longer used.  The maximum number of clusters
is still capped according to maxusers, but it can be made unlimited by
setting the kern.ipc.nmbclusters boot-time tunable to zero.  Work
should be done to write an appropriate sysctl handler allowing dynamic
tuning of kern.ipc.nmbclusters at runtime.

Additional things worth noting/known issues (READ):
  - One report of the 'ips' (ServeRAID) driver acting really slowly in
    conjunction with mbuma.  Need more data.  The latest report is that
    ips performs equally poorly with and without mbuma.
  - A Giant leak in the NFS code sometimes occurs; I can't reproduce it
    but am currently analyzing it.  brueffer is able to reproduce it,
    but THIS IS NOT an mbuma-specific problem and it currently occurs
    even WITHOUT mbuma.
  - Issues in network locking: there is at least one code path in the
    rip code where one or more locks are acquired and we end up in
    m_prepend() with M_WAITOK, which causes WITNESS to whine from
    within UMA.  Current temporary solution: force all UMA allocations
    to be M_NOWAIT from within UMA for now to avoid deadlocks, unless
    WITNESS is defined and we can determine with certainty that we're
    not holding any locks when we're M_WAITOK.
  - I've seen at least one weird socketbuffer empty-but-mbuf-still-
    attached panic.  I don't believe this to be related to mbuma, but
    please keep your eyes open, turn on debugging, and capture crash
    dumps.

This change removes more code than it adds.

A paper detailing the change and considering various performance
issues is available; it was presented at BSDCan2004:

http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf

Please read the paper for Future Work and implementation details, as
well as credits.

Testing and Debugging: rwatson, brueffer, Ketrien I. Saihr-Kesenchedra, ...
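For illustration only (not part of this commit), here is a minimal
sketch of the m_get() + m_clget() -> m_getcl() conversion referred to
above.  The function names are made up for the example, and the error
handling follows the conventional mbuf(9) pattern:

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/mbuf.h>

	/* Old pattern: two allocations, two trips into the caches. */
	static struct mbuf *
	alloc_pkt_old(void)
	{
		struct mbuf *m;

		m = m_gethdr(M_DONTWAIT, MT_DATA);
		if (m == NULL)
			return (NULL);
		m_clget(m, M_DONTWAIT);		/* attach a 2K cluster */
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			return (NULL);
		}
		return (m);
	}

	/* New pattern: one call, served from the Packet secondary zone. */
	static struct mbuf *
	alloc_pkt_new(void)
	{

		return (m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR));
	}

Note that with this change the cluster limit is governed solely by the
kern.ipc.nmbclusters loader tunable (set it to 0 to remove the cap)
rather than the NMBCLUSTERS kernel option.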
Reviewed by: Lots of people (for different parts) --- sys/conf/files | 2 +- sys/i386/i386/vm_machdep.c | 7 + sys/kern/kern_malloc.c | 27 +- sys/kern/kern_mbuf.c | 385 ++++++++ sys/kern/subr_mbuf.c | 1548 ------------------------------ sys/kern/uipc_mbuf.c | 235 ++++- sys/kern/uipc_mbuf2.c | 40 +- sys/kern/uipc_sockbuf.c | 13 +- sys/kern/uipc_socket.c | 95 +- sys/kern/uipc_socket2.c | 13 +- sys/kern/uipc_syscalls.c | 16 + sys/sparc64/sparc64/vm_machdep.c | 7 + sys/sys/mbuf.h | 203 ++-- sys/vm/uma.h | 78 +- sys/vm/uma_core.c | 966 +++++++++++++------ sys/vm/uma_dbg.c | 34 +- sys/vm/uma_int.h | 173 ++-- sys/vm/vm_kern.c | 10 - usr.bin/netstat/main.c | 8 - usr.bin/netstat/mbuf.c | 196 +--- usr.bin/netstat/netstat.1 | 4 - usr.bin/netstat/netstat.h | 1 - usr.bin/systat/mbufs.c | 53 +- 23 files changed, 1739 insertions(+), 2375 deletions(-) create mode 100644 sys/kern/kern_mbuf.c delete mode 100644 sys/kern/subr_mbuf.c diff --git a/sys/conf/files b/sys/conf/files index c2d7e7ecd47b..0d48a92d164e 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1075,6 +1075,7 @@ kern/kern_lock.c standard kern/kern_lockf.c standard kern/kern_mac.c standard kern/kern_malloc.c standard +kern/kern_mbuf.c standard kern/kern_mib.c standard kern/kern_module.c standard kern/kern_mutex.c standard @@ -1116,7 +1117,6 @@ kern/subr_hints.c standard kern/subr_kobj.c standard kern/subr_log.c standard kern/subr_mbpool.c optional libmbpool -kern/subr_mbuf.c standard kern/subr_mchain.c optional libmchain kern/subr_module.c standard kern/subr_msgbuf.c standard diff --git a/sys/i386/i386/vm_machdep.c b/sys/i386/i386/vm_machdep.c index 50fd06ea7f3e..9a2f9e3300c5 100644 --- a/sys/i386/i386/vm_machdep.c +++ b/sys/i386/i386/vm_machdep.c @@ -95,6 +95,10 @@ __FBSDID("$FreeBSD$"); #include #endif +#ifndef NSFBUFS +#define NSFBUFS (512 + maxusers * 16) +#endif + static void cpu_reset_real(void); #ifdef SMP static void cpu_reset_proxy(void); @@ -584,6 +588,9 @@ sf_buf_init(void *arg) vm_offset_t sf_base; int i; + nsfbufs = NSFBUFS; + TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs); + sf_buf_active = hashinit(nsfbufs, M_TEMP, &sf_buf_hashmask); TAILQ_INIT(&sf_buf_freelist); sf_base = kmem_alloc_nofault(kernel_map, nsfbufs * PAGE_SIZE); diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c index c92e70f89d4f..4bc3348b92ab 100644 --- a/sys/kern/kern_malloc.c +++ b/sys/kern/kern_malloc.c @@ -191,6 +191,7 @@ malloc(size, type, flags) int indx; caddr_t va; uma_zone_t zone; + uma_keg_t keg; #ifdef DIAGNOSTIC unsigned long osize = size; #endif @@ -235,6 +236,7 @@ malloc(size, type, flags) size = (size & ~KMEM_ZMASK) + KMEM_ZBASE; indx = kmemsize[size >> KMEM_ZSHIFT]; zone = kmemzones[indx].kz_zone; + keg = zone->uz_keg; #ifdef MALLOC_PROFILE krequests[size >> KMEM_ZSHIFT]++; #endif @@ -244,10 +246,11 @@ malloc(size, type, flags) goto out; ksp->ks_size |= 1 << indx; - size = zone->uz_size; + size = keg->uk_size; } else { size = roundup(size, PAGE_SIZE); zone = NULL; + keg = NULL; va = uma_large_malloc(size, flags); mtx_lock(&ksp->ks_mtx); if (va == NULL) @@ -309,7 +312,7 @@ free(addr, type) #ifdef INVARIANTS struct malloc_type **mtp = addr; #endif - size = slab->us_zone->uz_size; + size = slab->us_keg->uk_size; #ifdef INVARIANTS /* * Cache a pointer to the malloc_type that most recently freed @@ -325,7 +328,7 @@ free(addr, type) sizeof(struct malloc_type *); *mtp = type; #endif - uma_zfree_arg(slab->us_zone, addr, slab); + uma_zfree_arg(LIST_FIRST(&slab->us_keg->uk_zones), addr, slab); } else { size = slab->us_size; uma_large_free(slab); @@ 
-364,8 +367,8 @@ realloc(addr, size, type, flags) ("realloc: address %p out of range", (void *)addr)); /* Get the size of the original block */ - if (slab->us_zone) - alloc = slab->us_zone->uz_size; + if (slab->us_keg) + alloc = slab->us_keg->uk_size; else alloc = slab->us_size; @@ -410,7 +413,6 @@ kmeminit(dummy) void *dummy; { u_int8_t indx; - u_long npg; u_long mem_size; int i; @@ -428,7 +430,7 @@ kmeminit(dummy) * Note that the kmem_map is also used by the zone allocator, * so make sure that there is enough space. */ - vm_kmem_size = VM_KMEM_SIZE; + vm_kmem_size = VM_KMEM_SIZE + nmbclusters * PAGE_SIZE; mem_size = cnt.v_page_count; #if defined(VM_KMEM_SIZE_SCALE) @@ -462,17 +464,8 @@ kmeminit(dummy) */ init_param3(vm_kmem_size / PAGE_SIZE); - /* - * In mbuf_init(), we set up submaps for mbufs and clusters, in which - * case we rounddown() (nmbufs * MSIZE) and (nmbclusters * MCLBYTES), - * respectively. Mathematically, this means that what we do here may - * amount to slightly more address space than we need for the submaps, - * but it never hurts to have an extra page in kmem_map. - */ - npg = (nmbufs*MSIZE + nmbclusters*MCLBYTES + vm_kmem_size) / PAGE_SIZE; - kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase, - (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * PAGE_SIZE)); + (vm_offset_t *)&kmemlimit, vm_kmem_size); kmem_map->system_map = 1; uma_startup2(); diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c new file mode 100644 index 000000000000..2bec5adf4a1f --- /dev/null +++ b/sys/kern/kern_mbuf.c @@ -0,0 +1,385 @@ +/*- + * Copyright (c) 2004 + * Bosko Milekic . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of contributors may be + * used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_mac.h" +#include "opt_param.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA + * Zones. + * + * Mbuf Clusters (2K, contiguous) are allocated from the Cluster + * Zone. 
The Zone can be capped at kern.ipc.nmbclusters, if the + * administrator so desires. + * + * Mbufs are allocated from a UMA Master Zone called the Mbuf + * Zone. + * + * Additionally, FreeBSD provides a Packet Zone, which it + * configures as a Secondary Zone to the Mbuf Master Zone, + * thus sharing backend Slab kegs with the Mbuf Master Zone. + * + * Thus common-case allocations and locking are simplified: + * + * m_clget() m_getcl() + * | | + * | .------------>[(Packet Cache)] m_get(), m_gethdr() + * | | [ Packet ] | + * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ] + * [ Cluster Zone ] [ Zone ] [ Mbuf Master Zone ] + * | \________ | + * [ Cluster Keg ] \ / + * | [ Mbuf Keg ] + * [ Cluster Slabs ] | + * | [ Mbuf Slabs ] + * \____________(VM)_________________/ + */ + +int nmbclusters; +struct mbstat mbstat; + +static void +tunable_mbinit(void *dummy) +{ + + /* This has to be done before VM init. */ + nmbclusters = 1024 + maxusers * 64; + TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); +} +SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL); + +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RW, &nmbclusters, 0, + "Maximum number of mbuf clusters allowed"); +SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat, + "Mbuf general information and statistics"); + +/* + * Zones from which we allocate. + */ +uma_zone_t zone_mbuf; +uma_zone_t zone_clust; +uma_zone_t zone_pack; + +/* + * Local prototypes. + */ +static void mb_ctor_mbuf(void *, int, void *); +static void mb_ctor_clust(void *, int, void *); +static void mb_ctor_pack(void *, int, void *); +static void mb_dtor_mbuf(void *, int, void *); +static void mb_dtor_clust(void *, int, void *); /* XXX */ +static void mb_dtor_pack(void *, int, void *); /* XXX */ +static void mb_init_pack(void *, int); +static void mb_fini_pack(void *, int); + +static void mb_reclaim(void *); +static void mbuf_init(void *); + +/* + * Initialize FreeBSD Network buffer allocation. + */ +SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL) +static void +mbuf_init(void *dummy) +{ + + /* + * Configure UMA zones for Mbufs, Clusters, and Packets. + */ + zone_mbuf = uma_zcreate("Mbuf", MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MAXBUCKET); + zone_clust = uma_zcreate("MbufClust", MCLBYTES, mb_ctor_clust, + mb_dtor_clust, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_REFCNT); + if (nmbclusters > 0) + uma_zone_set_max(zone_clust, nmbclusters); + zone_pack = uma_zsecond_create("Packet", mb_ctor_pack, mb_dtor_pack, + mb_init_pack, mb_fini_pack, zone_mbuf); + + /* uma_prealloc() goes here */ + + /* + * Hook event handler for low-memory situation, used to + * drain protocols and push data back to the caches (UMA + * later pushes it back to VM). + */ + EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL, + EVENTHANDLER_PRI_FIRST); + + /* + * [Re]set counters and local statistics knobs. + * XXX Some of these should go and be replaced, but UMA stat + * gathering needs to be revised. + */ + mbstat.m_mbufs = 0; + mbstat.m_mclusts = 0; + mbstat.m_drain = 0; + mbstat.m_msize = MSIZE; + mbstat.m_mclbytes = MCLBYTES; + mbstat.m_minclsize = MINCLSIZE; + mbstat.m_mlen = MLEN; + mbstat.m_mhlen = MHLEN; + mbstat.m_numtypes = MT_NTYPES; + + mbstat.m_mcfail = mbstat.m_mpfail = 0; + mbstat.sf_iocnt = 0; + mbstat.sf_allocwait = mbstat.sf_allocfail = 0; +} + +/* + * Constructor for Mbuf master zone. 
+ * + * The 'arg' pointer points to a mb_args structure which + * contains call-specific information required to support the + * mbuf allocation API. + */ +static void +mb_ctor_mbuf(void *mem, int size, void *arg) +{ + struct mbuf *m; + struct mb_args *args; + int flags; + int how; + short type; + + m = (struct mbuf *)mem; + args = (struct mb_args *)arg; + flags = args->flags; + how = args->how; + type = args->type; + + m->m_type = type; + m->m_next = NULL; + m->m_nextpkt = NULL; + if (flags & M_PKTHDR) { + m->m_data = m->m_pktdat; + m->m_flags = M_PKTHDR; + m->m_pkthdr.rcvif = NULL; + m->m_pkthdr.csum_flags = 0; + SLIST_INIT(&m->m_pkthdr.tags); +#ifdef MAC + /* If the label init fails, fail the alloc */ + if (mac_init_mbuf(m, how) != 0) { + m_free(m); +/* XXX*/ panic("mb_ctor_mbuf(): can't deal with failure!"); +/* return 0; */ + } +#endif + } else { + m->m_data = m->m_dat; + m->m_flags = 0; + } + mbstat.m_mbufs += 1; /* XXX */ +/* return 1; +*/ +} + +/* + * The Mbuf master zone and Packet secondary zone destructor. + */ +static void +mb_dtor_mbuf(void *mem, int size, void *arg) +{ + struct mbuf *m; + + m = (struct mbuf *)mem; + if ((m->m_flags & M_PKTHDR) != 0) + m_tag_delete_chain(m, NULL); + mbstat.m_mbufs -= 1; /* XXX */ +} + +/* XXX Only because of stats */ +static void +mb_dtor_pack(void *mem, int size, void *arg) +{ + struct mbuf *m; + + m = (struct mbuf *)mem; + if ((m->m_flags & M_PKTHDR) != 0) + m_tag_delete_chain(m, NULL); + mbstat.m_mbufs -= 1; /* XXX */ + mbstat.m_mclusts -= 1; /* XXX */ +} + +/* + * The Cluster zone constructor. + * + * Here the 'arg' pointer points to the Mbuf which we + * are configuring cluster storage for. + */ +static void +mb_ctor_clust(void *mem, int size, void *arg) +{ + struct mbuf *m; + + m = (struct mbuf *)arg; + m->m_ext.ext_buf = (caddr_t)mem; + m->m_data = m->m_ext.ext_buf; + m->m_flags |= M_EXT; + m->m_ext.ext_free = NULL; + m->m_ext.ext_args = NULL; + m->m_ext.ext_size = MCLBYTES; + m->m_ext.ext_type = EXT_CLUSTER; + m->m_ext.ref_cnt = (u_int *)uma_find_refcnt(zone_clust, + m->m_ext.ext_buf); + *(m->m_ext.ref_cnt) = 1; + mbstat.m_mclusts += 1; /* XXX */ +/* return 1; +*/ +} + +/* XXX */ +static void +mb_dtor_clust(void *mem, int size, void *arg) +{ + mbstat.m_mclusts -= 1; /* XXX */ +} + +/* + * The Packet secondary zone's init routine, executed on the + * object's transition from keg slab to zone cache. + */ +static void +mb_init_pack(void *mem, int size) +{ + struct mbuf *m; + + m = (struct mbuf *)mem; + m->m_ext.ext_buf = NULL; + uma_zalloc_arg(zone_clust, m, M_NOWAIT); + if (m->m_ext.ext_buf == NULL) /* XXX */ + panic("mb_init_pack(): Can't deal with failure yet."); + mbstat.m_mclusts -= 1; /* XXX */ +} + +/* + * The Packet secondary zone's fini routine, executed on the + * object's transition from zone cache to keg slab. + */ +static void +mb_fini_pack(void *mem, int size) +{ + struct mbuf *m; + + m = (struct mbuf *)mem; + uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL); + m->m_ext.ext_buf = NULL; + mbstat.m_mclusts += 1; /* XXX */ +} + +/* + * The "packet" keg constructor. 
+ */ +static void +mb_ctor_pack(void *mem, int size, void *arg) +{ + struct mbuf *m; + struct mb_args *args; + int flags, how; + short type; + + m = (struct mbuf *)mem; + args = (struct mb_args *)arg; + flags = args->flags; + type = args->type; + how = args->how; + + m->m_type = type; + m->m_next = NULL; + m->m_data = m->m_ext.ext_buf; + m->m_flags = flags|M_EXT; + m->m_ext.ext_free = NULL; + m->m_ext.ext_args = NULL; + m->m_ext.ext_size = MCLBYTES; + m->m_ext.ext_type = EXT_PACKET; + *(m->m_ext.ref_cnt) = 1; + + if (flags & M_PKTHDR) { + m->m_nextpkt = NULL; + m->m_pkthdr.rcvif = NULL; + m->m_pkthdr.csum_flags = 0; + SLIST_INIT(&m->m_pkthdr.tags); +#ifdef MAC + /* If the label init fails, fail the alloc */ + if (mac_init_mbuf(m, how) != 0) { + m_free(m); +/* XXX*/ panic("mb_ctor_pack(): can't deal with failure!"); +/* return 0; */ + } +#endif + } + mbstat.m_mbufs += 1; /* XXX */ + mbstat.m_mclusts += 1; /* XXX */ +/* return 1; +*/ +} + +/* + * This is the protocol drain routine. + * + * No locks should be held when this is called. The drain routines have to + * presently acquire some locks which raises the possibility of lock order + * reversal. + */ +static void +mb_reclaim(void *junk) +{ + struct domain *dp; + struct protosw *pr; + + WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, + "mb_reclaim()"); + + mbstat.m_drain++; + for (dp = domains; dp != NULL; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_drain != NULL) + (*pr->pr_drain)(); +} diff --git a/sys/kern/subr_mbuf.c b/sys/kern/subr_mbuf.c deleted file mode 100644 index d84ef313681c..000000000000 --- a/sys/kern/subr_mbuf.c +++ /dev/null @@ -1,1548 +0,0 @@ -/*- - * Copyright (c) 2001, 2002, 2003 - * Bosko Milekic . All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include -__FBSDID("$FreeBSD$"); - -#include "opt_mac.h" -#include "opt_param.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -/* - * mb_alloc: network buffer allocator - * - * XXX: currently, the "low watermark" sysctl is marked read-only as its - * effects are not completely implemented. To be fixed soon. - */ - -/* - * Maximum number of PCPU containers. If you know what you're doing you could - * explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your - * system during compilation, and thus prevent kernel structure bloat. - * - * SMP and non-SMP kernels clearly have a different number of possible CPUs, - * but because we cannot assume a dense array of CPUs, we always allocate - * and traverse PCPU containers up to NCPU amount and merely check for - * CPU availability. - */ -#ifdef MBALLOC_NCPU -#define NCPU MBALLOC_NCPU -#else -#define NCPU MAXCPU -#endif - -/*- - * The mbuf allocator is based on Alfred Perlstein's - * "memcache" proof-of-concept allocator which was itself based on - * several well-known SMP-friendly allocators. - * - * The mb_alloc mbuf allocator is a special when compared to other - * general-purpose allocators. Some things to take note of: - * - * Mbufs and mbuf clusters are two different objects. Sometimes we - * will allocate a single mbuf, other times a single cluster, - * other times both. Further, we may sometimes wish to allocate a - * whole chain of mbufs with clusters. This allocator will perform - * the common case of each scenario in one function call (this - * includes constructing or destructing the object) while only - * locking/unlocking the cache once, if it can get away with it. - * The caches consist of pure mbufs and pure clusters; that is - * there are no 'zones' containing mbufs with already pre-hooked - * clusters. Since we can allocate both objects atomically anyway, - * we don't bother fragmenting our caches for any particular 'scenarios.' - * - * We allocate from seperate sub-maps of kmem_map, thus imposing - * an ultimate upper-limit on the number of allocatable clusters - * and mbufs and also, since the clusters all come from a - * virtually contiguous region, we can keep reference counters - * for them and "allocate" them purely by indexing into a - * dense refcount vector. - * - * We call out to protocol drain routines (which can be hooked - * into us) when we're low on space. - * - * The mbuf allocator keeps all objects that it allocates in mb_buckets. - * The buckets keep a number of objects (an object can be an mbuf or an - * mbuf cluster) and facilitate moving larger sets of contiguous objects - * from the per-CPU caches to the global cache. The buckets also have - * the added advantage that objects, when migrated from cache to cache, - * are migrated in chunks that keep contiguous objects together, - * minimizing TLB pollution. - * - * The buckets are kept on singly-linked lists called "containers." A container - * is protected by a mutex in order to ensure consistency. The mutex - * itself is allocated separately and attached to the container at boot time, - * thus allowing for certain containers to share the same lock. Per-CPU - * containers for mbufs and mbuf clusters all share the same per-CPU - * lock whereas the global cache containers for these objects share one - * global lock. 
- */ -struct mb_bucket { - SLIST_ENTRY(mb_bucket) mb_blist; - int mb_owner; - int mb_numfree; - void *mb_free[0]; -}; - -struct mb_container { - SLIST_HEAD(mc_buckethd, mb_bucket) mc_bhead; - struct mtx *mc_lock; - int mc_numowner; - u_int mc_starved; - long *mc_types; - u_long *mc_objcount; - u_long *mc_numbucks; -}; - -struct mb_gen_list { - struct mb_container mb_cont; - struct cv mgl_mstarved; -}; - -struct mb_pcpu_list { - struct mb_container mb_cont; -}; - -/* - * Boot-time configurable object counts that will determine the maximum - * number of permitted objects in the mbuf and mcluster cases. In the - * ext counter (nmbcnt) case, it's just an indicator serving to scale - * kmem_map size properly - in other words, we may be allowed to allocate - * more than nmbcnt counters, whereas we will never be allowed to allocate - * more than nmbufs mbufs or nmbclusters mclusters. - * As for nsfbufs, it is used to indicate how many sendfile(2) buffers will be - * allocatable by the sfbuf allocator (found in uipc_syscalls.c) - */ -#ifndef NMBCLUSTERS -#define NMBCLUSTERS (1024 + maxusers * 64) -#endif -#ifndef NMBUFS -#define NMBUFS (nmbclusters * 2) -#endif -#ifndef NSFBUFS -#define NSFBUFS (512 + maxusers * 16) -#endif -#ifndef NMBCNTS -#define NMBCNTS (nmbclusters + nsfbufs) -#endif -int nmbufs; -int nmbclusters; -int nmbcnt; -int nsfbufs; -int nsfbufspeak; -int nsfbufsused; - -/* - * Sizes of objects per bucket. There are this size's worth of mbufs - * or clusters in each bucket. Please keep these a power-of-2. - */ -#define MBUF_BUCK_SZ (PAGE_SIZE * 2) -#define CLUST_BUCK_SZ (PAGE_SIZE * 4) - -/* - * Perform sanity checks of tunables declared above. - */ -static void -tunable_mbinit(void *dummy) -{ - - /* - * This has to be done before VM init. - */ - nmbclusters = NMBCLUSTERS; - TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); - nmbufs = NMBUFS; - TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); - nsfbufs = NSFBUFS; - TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs); - nmbcnt = NMBCNTS; - TUNABLE_INT_FETCH("kern.ipc.nmbcnt", &nmbcnt); - /* Sanity checks */ - if (nmbufs < nmbclusters * 2) - nmbufs = nmbclusters * 2; - if (nmbcnt < nmbclusters + nsfbufs) - nmbcnt = nmbclusters + nsfbufs; -} -SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL); - -/* - * The freelist structures and mutex locks. The number statically declared - * here depends on the number of CPUs. - * - * We set up in such a way that all the objects (mbufs, clusters) - * share the same mutex lock. It has been established that we do not benefit - * from different locks for different objects, so we use the same lock, - * regardless of object type. This also allows us to do optimised - * multi-object allocations without dropping the lock in between. - */ -struct mb_lstmngr { - struct mb_gen_list *ml_genlist; - struct mb_pcpu_list *ml_cntlst[NCPU]; - struct mb_bucket **ml_btable; - vm_map_t ml_map; - vm_offset_t ml_mapbase; - vm_offset_t ml_maptop; - int ml_mapfull; - u_int ml_objsize; - u_int ml_objbucks; - u_int *ml_wmhigh; - u_int *ml_wmlow; -}; -static struct mb_lstmngr mb_list_mbuf, mb_list_clust; -static struct mtx mbuf_gen, mbuf_pcpu[NCPU]; -static u_int *cl_refcntmap; - -/* - * Local macros for internal allocator structure manipulations. 
- */ -#ifdef SMP -#define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[PCPU_GET(cpuid)] -#else -#define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[0] -#endif - -#define MB_GET_GEN_LIST(mb_lst) (mb_lst)->ml_genlist - -#define MB_LOCK_CONT(mb_cnt) mtx_lock((mb_cnt)->mb_cont.mc_lock) - -#define MB_UNLOCK_CONT(mb_cnt) mtx_unlock((mb_cnt)->mb_cont.mc_lock) - -#define MB_GET_PCPU_LIST_NUM(mb_lst, num) \ - (mb_lst)->ml_cntlst[(num)] - -#define MB_BUCKET_INDX(mb_obj, mb_lst) \ - (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / \ - ((mb_lst)->ml_objbucks * (mb_lst)->ml_objsize)) - -#define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst) \ -{ \ - struct mc_buckethd *_mchd = &((mb_lst)->mb_cont.mc_bhead); \ - \ - (mb_bckt)->mb_numfree--; \ - (mb_objp) = (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)]; \ - (*((mb_lst)->mb_cont.mc_objcount))--; \ - if ((mb_bckt)->mb_numfree == 0) { \ - SLIST_REMOVE_HEAD(_mchd, mb_blist); \ - SLIST_NEXT((mb_bckt), mb_blist) = NULL; \ - (mb_bckt)->mb_owner |= MB_BUCKET_FREE; \ - } \ -} - -#define MB_PUT_OBJECT(mb_objp, mb_bckt, mb_lst) \ - (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)] = (mb_objp); \ - (mb_bckt)->mb_numfree++; \ - (*((mb_lst)->mb_cont.mc_objcount))++; - -#define MB_MBTYPES_INC(mb_cnt, mb_type, mb_num) \ - if ((mb_type) != MT_NOTMBUF) \ - (*((mb_cnt)->mb_cont.mc_types + (mb_type))) += (mb_num) - -#define MB_MBTYPES_DEC(mb_cnt, mb_type, mb_num) \ - if ((mb_type) != MT_NOTMBUF) \ - (*((mb_cnt)->mb_cont.mc_types + (mb_type))) -= (mb_num) - -/* - * Ownership of buckets/containers is represented by integers. The PCPU - * lists range from 0 to NCPU-1. We need a free numerical id for the general - * list (we use NCPU). We also need a non-conflicting free bit to indicate - * that the bucket is free and removed from a container, while not losing - * the bucket's originating container id. We use the highest bit - * for the free marker. - */ -#define MB_GENLIST_OWNER (NCPU) -#define MB_BUCKET_FREE (1 << (sizeof(int) * 8 - 1)) - -/* Statistics structures for allocator (per-CPU and general). */ -static struct mbpstat mb_statpcpu[NCPU + 1]; -struct mbstat mbstat; - -/* Sleep time for wait code (in ticks). */ -static int mbuf_wait = 64; - -static u_int mbuf_hiwm = 512; /* High wm on # of mbufs per cache */ -static u_int mbuf_lowm = 128; /* Low wm on # of mbufs per cache */ -static u_int clust_hiwm = 128; /* High wm on # of clusters per cache */ -static u_int clust_lowm = 16; /* Low wm on # of clusters per cache */ - -/* - * Objects exported by sysctl(8). 
- */ -SYSCTL_DECL(_kern_ipc); -SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RDTUN, &nmbclusters, 0, - "Maximum number of mbuf clusters available"); -SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RDTUN, &nmbufs, 0, - "Maximum number of mbufs available"); -SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RDTUN, &nmbcnt, 0, - "Number used to scale kmem_map to ensure sufficient space for counters"); -SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0, - "Maximum number of sendfile(2) sf_bufs available"); -SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0, - "Number of sendfile(2) sf_bufs at peak usage"); -SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0, - "Number of sendfile(2) sf_bufs in use"); -SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0, - "Sleep time of mbuf subsystem wait allocations during exhaustion"); -SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_hiwm, CTLFLAG_RW, &mbuf_hiwm, 0, - "Upper limit of number of mbufs allowed in each cache"); -SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_lowm, CTLFLAG_RD, &mbuf_lowm, 0, - "Lower limit of number of mbufs allowed in each cache"); -SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_hiwm, CTLFLAG_RW, &clust_hiwm, 0, - "Upper limit of number of mbuf clusters allowed in each cache"); -SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_lowm, CTLFLAG_RD, &clust_lowm, 0, - "Lower limit of number of mbuf clusters allowed in each cache"); -SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat, - "Mbuf general information and statistics"); -SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu, - sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics"); - -/* - * Prototypes of local allocator routines. - */ -static void *mb_alloc_wait(struct mb_lstmngr *, short); -static struct mb_bucket *mb_pop_cont(struct mb_lstmngr *, int, - struct mb_pcpu_list *); -static void mb_reclaim(void); -static void mbuf_init(void *); - -/* - * Initial allocation numbers. Each parameter represents the number of buckets - * of each object that will be placed initially in each PCPU container for - * said object. - */ -#define NMB_MBUF_INIT 2 -#define NMB_CLUST_INIT 8 - -/* - * Internal flags that allow for cache locks to remain "persistent" across - * allocation and free calls. They may be used in combination. - */ -#define MBP_PERSIST 0x1 /* Return with lock still held. */ -#define MBP_PERSISTENT 0x2 /* Cache lock is already held coming in. */ - -/* - * Initialize the mbuf subsystem. - * - * We sub-divide the kmem_map into several submaps; this way, we don't have - * to worry about artificially limiting the number of mbuf or mbuf cluster - * allocations, due to fear of one type of allocation "stealing" address - * space initially reserved for another. - * - * Set up both the general containers and all the PCPU containers. Populate - * the PCPU containers with initial numbers. - */ -MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures"); -SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL) -static void -mbuf_init(void *dummy) -{ - struct mb_pcpu_list *pcpu_cnt; - vm_size_t mb_map_size; - int i, j; - - /* - * Set up all the submaps, for each type of object that we deal - * with in this allocator. 
- */ - mb_map_size = (vm_size_t)(nmbufs * MSIZE); - mb_map_size = rounddown(mb_map_size, MBUF_BUCK_SZ); - mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size / - MBUF_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT); - if (mb_list_mbuf.ml_btable == NULL) - goto bad; - mb_list_mbuf.ml_map = kmem_suballoc(kmem_map,&(mb_list_mbuf.ml_mapbase), - &(mb_list_mbuf.ml_maptop), mb_map_size); - mb_list_mbuf.ml_map->system_map = 1; - mb_list_mbuf.ml_mapfull = 0; - mb_list_mbuf.ml_objsize = MSIZE; - mb_list_mbuf.ml_objbucks = MBUF_BUCK_SZ / mb_list_mbuf.ml_objsize; - mb_list_mbuf.ml_wmhigh = &mbuf_hiwm; - mb_list_mbuf.ml_wmlow = &mbuf_lowm; - - mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES); - mb_map_size = rounddown(mb_map_size, CLUST_BUCK_SZ); - mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size / - CLUST_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT); - if (mb_list_clust.ml_btable == NULL) - goto bad; - mb_list_clust.ml_map = kmem_suballoc(kmem_map, - &(mb_list_clust.ml_mapbase), &(mb_list_clust.ml_maptop), - mb_map_size); - mb_list_clust.ml_map->system_map = 1; - mb_list_clust.ml_mapfull = 0; - mb_list_clust.ml_objsize = MCLBYTES; - mb_list_clust.ml_objbucks = CLUST_BUCK_SZ / mb_list_clust.ml_objsize; - mb_list_clust.ml_wmhigh = &clust_hiwm; - mb_list_clust.ml_wmlow = &clust_lowm; - - /* - * Allocate required general (global) containers for each object type. - */ - mb_list_mbuf.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF, - M_NOWAIT); - mb_list_clust.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF, - M_NOWAIT); - if ((mb_list_mbuf.ml_genlist == NULL) || - (mb_list_clust.ml_genlist == NULL)) - goto bad; - - /* - * Initialize condition variables and general container mutex locks. - */ - mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", NULL, MTX_DEF); - cv_init(&(mb_list_mbuf.ml_genlist->mgl_mstarved), "mbuf pool starved"); - cv_init(&(mb_list_clust.ml_genlist->mgl_mstarved), - "mcluster pool starved"); - mb_list_mbuf.ml_genlist->mb_cont.mc_lock = - mb_list_clust.ml_genlist->mb_cont.mc_lock = &mbuf_gen; - - /* - * Set up the general containers for each object. - */ - mb_list_mbuf.ml_genlist->mb_cont.mc_numowner = - mb_list_clust.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER; - mb_list_mbuf.ml_genlist->mb_cont.mc_starved = - mb_list_clust.ml_genlist->mb_cont.mc_starved = 0; - mb_list_mbuf.ml_genlist->mb_cont.mc_objcount = - &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree); - mb_list_clust.ml_genlist->mb_cont.mc_objcount = - &(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree); - mb_list_mbuf.ml_genlist->mb_cont.mc_numbucks = - &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbbucks); - mb_list_clust.ml_genlist->mb_cont.mc_numbucks = - &(mb_statpcpu[MB_GENLIST_OWNER].mb_clbucks); - mb_list_mbuf.ml_genlist->mb_cont.mc_types = - &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbtypes[0]); - mb_list_clust.ml_genlist->mb_cont.mc_types = NULL; - SLIST_INIT(&(mb_list_mbuf.ml_genlist->mb_cont.mc_bhead)); - SLIST_INIT(&(mb_list_clust.ml_genlist->mb_cont.mc_bhead)); - - /* - * Allocate all the required counters for clusters. This makes - * cluster allocations/deallocations much faster. - */ - cl_refcntmap = malloc(nmbclusters * sizeof(u_int), M_MBUF, M_NOWAIT); - if (cl_refcntmap == NULL) - goto bad; - - /* - * Initialize general mbuf statistics. 
- */ - mbstat.m_msize = mb_list_mbuf.ml_objsize; - mbstat.m_mclbytes = mb_list_clust.ml_objsize; - mbstat.m_minclsize = MINCLSIZE; - mbstat.m_mlen = MLEN; - mbstat.m_mhlen = MHLEN; - mbstat.m_numtypes = MT_NTYPES; - mbstat.m_mbperbuck = mb_list_mbuf.ml_objbucks; - mbstat.m_clperbuck = mb_list_clust.ml_objbucks; - - /* - * Allocate and initialize PCPU containers. - */ - for (i = 0; i < NCPU; i++) { - if (CPU_ABSENT(i)) { - mb_statpcpu[i].mb_active = 0; - continue; - } - - mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list), - M_MBUF, M_NOWAIT); - mb_list_clust.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list), - M_MBUF, M_NOWAIT); - if ((mb_list_mbuf.ml_cntlst[i] == NULL) || - (mb_list_clust.ml_cntlst[i] == NULL)) - goto bad; - - mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", NULL, MTX_DEF); - mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_lock = - mb_list_clust.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i]; - - mb_statpcpu[i].mb_active = 1; - mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numowner = - mb_list_clust.ml_cntlst[i]->mb_cont.mc_numowner = i; - mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_starved = - mb_list_clust.ml_cntlst[i]->mb_cont.mc_starved = 0; - mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_objcount = - &(mb_statpcpu[i].mb_mbfree); - mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount = - &(mb_statpcpu[i].mb_clfree); - mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numbucks = - &(mb_statpcpu[i].mb_mbbucks); - mb_list_clust.ml_cntlst[i]->mb_cont.mc_numbucks = - &(mb_statpcpu[i].mb_clbucks); - mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_types = - &(mb_statpcpu[i].mb_mbtypes[0]); - mb_list_clust.ml_cntlst[i]->mb_cont.mc_types = NULL; - - SLIST_INIT(&(mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_bhead)); - SLIST_INIT(&(mb_list_clust.ml_cntlst[i]->mb_cont.mc_bhead)); - - /* - * Perform initial allocations. - */ - pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i); - MB_LOCK_CONT(pcpu_cnt); - for (j = 0; j < NMB_MBUF_INIT; j++) { - if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt) - == NULL) - goto bad; - } - MB_UNLOCK_CONT(pcpu_cnt); - - pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i); - MB_LOCK_CONT(pcpu_cnt); - for (j = 0; j < NMB_CLUST_INIT; j++) { - if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt) - == NULL) - goto bad; - } - MB_UNLOCK_CONT(pcpu_cnt); - } - - return; -bad: - panic("mbuf_init(): failed to initialize mbuf subsystem!"); -} - -/* - * Populate a given mbuf PCPU container with a bucket full of fresh new - * buffers. Return a pointer to the new bucket (already in the container if - * successful), or return NULL on failure. - * - * LOCKING NOTES: - * PCPU container lock must be held when this is called. - * The lock is dropped here so that we can cleanly call the underlying VM - * code. If we fail, we return with no locks held. If we succeed (i.e., return - * non-NULL), we return with the PCPU lock held, ready for allocation from - * the returned bucket. - */ -static struct mb_bucket * -mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst) -{ - struct mb_bucket *bucket; - caddr_t p; - int i; - - MB_UNLOCK_CONT(cnt_lst); - /* - * If our object's (finite) map is starved now (i.e., no more address - * space), bail out now. 
- */ - if (mb_list->ml_mapfull) - return (NULL); - - bucket = malloc(sizeof(struct mb_bucket) + - mb_list->ml_objbucks * sizeof(void *), M_MBUF, MBTOM(how)); - if (bucket == NULL) - return (NULL); - - p = (caddr_t)kmem_malloc(mb_list->ml_map, mb_list->ml_objsize * - mb_list->ml_objbucks, MBTOM(how)); - if (p == NULL) { - free(bucket, M_MBUF); - if (how == M_TRYWAIT) - mb_list->ml_mapfull = 1; - return (NULL); - } - - bucket->mb_numfree = 0; - mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket; - for (i = 0; i < mb_list->ml_objbucks; i++) { - bucket->mb_free[i] = p; - bucket->mb_numfree++; - p += mb_list->ml_objsize; - } - - MB_LOCK_CONT(cnt_lst); - bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; - SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist); - (*(cnt_lst->mb_cont.mc_numbucks))++; - *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree; - - return (bucket); -} - -/* - * Allocate a network buffer. - * The general case is very easy. Complications only arise if our PCPU - * container is empty. Things get worse if the PCPU container is empty, - * the general container is empty, and we've run out of address space - * in our map; then we try to block if we're willing to (M_TRYWAIT). - */ -static -void * -mb_alloc(struct mb_lstmngr *mb_list, int how, short type, short persist, - int *pers_list) -{ - static int last_report; - struct mb_pcpu_list *cnt_lst; - struct mb_bucket *bucket; - void *m; - -#ifdef INVARIANTS - int flags; - - flags = how & (M_WAITOK | M_NOWAIT | M_DONTWAIT | M_TRYWAIT); - if (flags != M_DONTWAIT && flags != M_TRYWAIT) { - static struct timeval lasterr; - static int curerr; - if (ppsratecheck(&lasterr, &curerr, 1)) { - printf("Bad mbuf alloc flags: %x\n", flags); - backtrace(); - how = M_TRYWAIT; - } - } -#endif - - m = NULL; - if ((persist & MBP_PERSISTENT) != 0) { - /* - * If we're a "persistent" call, then the per-CPU #(pers_list) - * cache lock is already held, and we just need to refer to - * the correct cache descriptor. - */ - cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, *pers_list); - } else { - cnt_lst = MB_GET_PCPU_LIST(mb_list); - MB_LOCK_CONT(cnt_lst); - } - - if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) { - /* - * This is the easy allocation case. We just grab an object - * from a bucket in the PCPU container. At worst, we - * have just emptied the bucket and so we remove it - * from the container. - */ - MB_GET_OBJECT(m, bucket, cnt_lst); - MB_MBTYPES_INC(cnt_lst, type, 1); - - /* If asked to persist, do not drop the lock. */ - if ((persist & MBP_PERSIST) == 0) - MB_UNLOCK_CONT(cnt_lst); - else - *pers_list = cnt_lst->mb_cont.mc_numowner; - } else { - struct mb_gen_list *gen_list; - - /* - * This is the less-common more difficult case. We must - * first verify if the general list has anything for us - * and if that also fails, we must allocate a page from - * the map and create a new bucket to place in our PCPU - * container (already locked). If the map is starved then - * we're really in for trouble, as we have to wait on - * the general container's condition variable. - */ - gen_list = MB_GET_GEN_LIST(mb_list); - MB_LOCK_CONT(gen_list); - - if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) - != NULL) { - /* - * Give ownership of the bucket to our CPU's - * container, but only actually put the bucket - * in the container if it doesn't become free - * upon removing an mbuf from it. 
- */ - SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead), - mb_blist); - bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; - (*(gen_list->mb_cont.mc_numbucks))--; - (*(cnt_lst->mb_cont.mc_numbucks))++; - *(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree; - bucket->mb_numfree--; - m = bucket->mb_free[(bucket->mb_numfree)]; - if (bucket->mb_numfree == 0) { - SLIST_NEXT(bucket, mb_blist) = NULL; - bucket->mb_owner |= MB_BUCKET_FREE; - } else { - SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), - bucket, mb_blist); - *(cnt_lst->mb_cont.mc_objcount) += - bucket->mb_numfree; - } - MB_UNLOCK_CONT(gen_list); - MB_MBTYPES_INC(cnt_lst, type, 1); - - /* If asked to persist, do not drop the lock. */ - if ((persist & MBP_PERSIST) == 0) - MB_UNLOCK_CONT(cnt_lst); - else - *pers_list = cnt_lst->mb_cont.mc_numowner; - } else { - /* - * We'll have to allocate a new page. - */ - MB_UNLOCK_CONT(gen_list); - bucket = mb_pop_cont(mb_list, how, cnt_lst); - if (bucket != NULL) { - MB_GET_OBJECT(m, bucket, cnt_lst); - MB_MBTYPES_INC(cnt_lst, type, 1); - - /* If asked to persist, do not drop the lock. */ - if ((persist & MBP_PERSIST) == 0) - MB_UNLOCK_CONT(cnt_lst); - else - *pers_list=cnt_lst->mb_cont.mc_numowner; - } else { - if (how == M_TRYWAIT) { - /* - * Absolute worst-case scenario. - * We block if we're willing to, but - * only after trying to steal from - * other lists. - */ - m = mb_alloc_wait(mb_list, type); - } else { - /* XXX: No consistency. */ - mbstat.m_drops++; - - if (ticks < last_report || - (ticks - last_report) >= hz) { - last_report = ticks; - printf( -"All mbufs or mbuf clusters exhausted, please see tuning(7).\n"); - } - - } - if (m != NULL && (persist & MBP_PERSIST) != 0) { - cnt_lst = MB_GET_PCPU_LIST(mb_list); - MB_LOCK_CONT(cnt_lst); - *pers_list=cnt_lst->mb_cont.mc_numowner; - } - } - } - } - - return (m); -} - -/* - * This is the worst-case scenario called only if we're allocating with - * M_TRYWAIT. We first drain all the protocols, then try to find an mbuf - * by looking in every PCPU container. If we're still unsuccesful, we - * try the general container one last time and possibly block on our - * starved cv. - */ -static void * -mb_alloc_wait(struct mb_lstmngr *mb_list, short type) -{ - struct mb_pcpu_list *cnt_lst; - struct mb_gen_list *gen_list; - struct mb_bucket *bucket; - void *m; - int i, cv_ret; - - /* - * Try to reclaim mbuf-related objects (mbufs, clusters). - */ - mb_reclaim(); - - /* - * Cycle all the PCPU containers. Increment starved counts if found - * empty. - */ - for (i = 0; i < NCPU; i++) { - if (CPU_ABSENT(i)) - continue; - cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i); - MB_LOCK_CONT(cnt_lst); - - /* - * If container is non-empty, get a single object from it. - * If empty, increment starved count. - */ - if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != - NULL) { - MB_GET_OBJECT(m, bucket, cnt_lst); - MB_MBTYPES_INC(cnt_lst, type, 1); - MB_UNLOCK_CONT(cnt_lst); - mbstat.m_wait++; /* XXX: No consistency. */ - return (m); - } else - cnt_lst->mb_cont.mc_starved++; - - MB_UNLOCK_CONT(cnt_lst); - } - - /* - * We're still here, so that means it's time to get the general - * container lock, check it one more time (now that mb_reclaim() - * has been called) and if we still get nothing, block on the cv. 
- */ - gen_list = MB_GET_GEN_LIST(mb_list); - MB_LOCK_CONT(gen_list); - if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) { - MB_GET_OBJECT(m, bucket, gen_list); - MB_MBTYPES_INC(gen_list, type, 1); - MB_UNLOCK_CONT(gen_list); - mbstat.m_wait++; /* XXX: No consistency. */ - return (m); - } - - gen_list->mb_cont.mc_starved++; - cv_ret = cv_timedwait(&(gen_list->mgl_mstarved), - gen_list->mb_cont.mc_lock, mbuf_wait); - gen_list->mb_cont.mc_starved--; - - if ((cv_ret == 0) && - ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL)) { - MB_GET_OBJECT(m, bucket, gen_list); - MB_MBTYPES_INC(gen_list, type, 1); - mbstat.m_wait++; /* XXX: No consistency. */ - } else { - mbstat.m_drops++; /* XXX: No consistency. */ - m = NULL; - } - - MB_UNLOCK_CONT(gen_list); - - return (m); -} - -/*- - * Free an object to its rightful container. - * In the very general case, this operation is really very easy. - * Complications arise primarily if: - * (a) We've hit the high limit on number of free objects allowed in - * our PCPU container. - * (b) We're in a critical situation where our container has been - * marked 'starved' and we need to issue wakeups on the starved - * condition variable. - * (c) Minor (odd) cases: our bucket has migrated while we were - * waiting for the lock; our bucket is in the general container; - * our bucket is empty. - */ -static -void -mb_free(struct mb_lstmngr *mb_list, void *m, short type, short persist, - int *pers_list) -{ - struct mb_pcpu_list *cnt_lst; - struct mb_gen_list *gen_list; - struct mb_bucket *bucket; - u_int owner; - - bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)]; - - /* - * Make sure that if after we lock the bucket's present container the - * bucket has migrated, that we drop the lock and get the new one. - */ -retry_lock: - owner = bucket->mb_owner & ~MB_BUCKET_FREE; - switch (owner) { - case MB_GENLIST_OWNER: - gen_list = MB_GET_GEN_LIST(mb_list); - if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) { - if (*pers_list != MB_GENLIST_OWNER) { - cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, - *pers_list); - MB_UNLOCK_CONT(cnt_lst); - MB_LOCK_CONT(gen_list); - } - } else { - MB_LOCK_CONT(gen_list); - } - if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) { - MB_UNLOCK_CONT(gen_list); - *pers_list = -1; - goto retry_lock; - } - - /* - * If we're intended for the general container, this is - * real easy: no migrating required. The only `bogon' - * is that we're now contending with all the threads - * dealing with the general list, but this is expected. 
- */ - MB_PUT_OBJECT(m, bucket, gen_list); - MB_MBTYPES_DEC(gen_list, type, 1); - if (bucket->mb_owner & MB_BUCKET_FREE) { - SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead), - bucket, mb_blist); - bucket->mb_owner = MB_GENLIST_OWNER; - } - if (gen_list->mb_cont.mc_starved > 0) - cv_signal(&(gen_list->mgl_mstarved)); - if ((persist & MBP_PERSIST) == 0) - MB_UNLOCK_CONT(gen_list); - else - *pers_list = MB_GENLIST_OWNER; - break; - - default: - cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner); - if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) { - if (*pers_list == MB_GENLIST_OWNER) { - gen_list = MB_GET_GEN_LIST(mb_list); - MB_UNLOCK_CONT(gen_list); - MB_LOCK_CONT(cnt_lst); - } else { - cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, - *pers_list); - owner = *pers_list; - } - } else { - MB_LOCK_CONT(cnt_lst); - } - if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) { - MB_UNLOCK_CONT(cnt_lst); - *pers_list = -1; - goto retry_lock; - } - - MB_PUT_OBJECT(m, bucket, cnt_lst); - MB_MBTYPES_DEC(cnt_lst, type, 1); - if ((*(cnt_lst->mb_cont.mc_objcount) > *(mb_list->ml_wmhigh)) || - (cnt_lst->mb_cont.mc_starved > 0)) { - /* - * We've hit the high limit of allowed numbers of mbufs - * on this PCPU list or we've been flagged that we need - * to transfer a bucket over to the general cache. - * We must now migrate a bucket over to the general - * container. - */ - gen_list = MB_GET_GEN_LIST(mb_list); - MB_LOCK_CONT(gen_list); - if ((bucket->mb_owner & MB_BUCKET_FREE) == 0) { - bucket = - SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead)); - SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.mc_bhead), - mb_blist); - } - SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead), - bucket, mb_blist); - bucket->mb_owner = MB_GENLIST_OWNER; - *(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree; - *(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree; - (*(cnt_lst->mb_cont.mc_numbucks))--; - (*(gen_list->mb_cont.mc_numbucks))++; - - /* - * While we're at it, transfer some of the mbtypes - * "count load" onto the general list's mbtypes - * array, seeing as how we're moving the bucket - * there now, meaning that the freeing of objects - * there will now decrement the _general list's_ - * mbtypes counters, and no longer our PCPU list's - * mbtypes counters. We do this for the type presently - * being freed in an effort to keep the mbtypes - * counters approximately balanced across all lists. - */ - MB_MBTYPES_DEC(cnt_lst, type, - mb_list->ml_objbucks - bucket->mb_numfree); - MB_MBTYPES_INC(gen_list, type, - mb_list->ml_objbucks - bucket->mb_numfree); - - if (cnt_lst->mb_cont.mc_starved > 0) { - /* - * Determine whether or not to keep - * transferring buckets to the general list - * or whether we've transferred enough already. - * The thread that is blocked may end up waking - * up in the meantime, but transferring an - * extra bucket in a constrained situation - * is not so bad, as we're likely to need - * it soon anyway. 
- */ - if (gen_list->mb_cont.mc_starved > 0) { - cnt_lst->mb_cont.mc_starved--; - cv_signal(&(gen_list->mgl_mstarved)); - } else - cnt_lst->mb_cont.mc_starved = 0; - } - MB_UNLOCK_CONT(gen_list); - if ((persist & MBP_PERSIST) == 0) - MB_UNLOCK_CONT(cnt_lst); - else - *pers_list = owner; - break; - } - - if (bucket->mb_owner & MB_BUCKET_FREE) { - SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), - bucket, mb_blist); - bucket->mb_owner = cnt_lst->mb_cont.mc_numowner; - } - - if ((persist & MBP_PERSIST) == 0) - MB_UNLOCK_CONT(cnt_lst); - else - *pers_list = owner; - break; - } -} - -/* - * Drain protocols in hopes to free up some resources. - * - * LOCKING NOTES: - * No locks should be held when this is called. The drain routines have to - * presently acquire some locks which raises the possibility of lock order - * violation if we're holding any mutex if that mutex is acquired in reverse - * order relative to one of the locks in the drain routines. - */ -static void -mb_reclaim(void) -{ - struct domain *dp; - struct protosw *pr; - - WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, - "mb_reclaim()"); - - mbstat.m_drain++; /* XXX: No consistency. */ - - for (dp = domains; dp != NULL; dp = dp->dom_next) - for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) - if (pr->pr_drain != NULL) - (*pr->pr_drain)(); -} - -/****************************************************************************** - * Internal setup macros. - */ - -#define _mb_setup(m, type) do { \ - (m)->m_type = (type); \ - (m)->m_next = NULL; \ - (m)->m_nextpkt = NULL; \ - (m)->m_data = (m)->m_dat; \ - (m)->m_flags = 0; \ -} while (0) - -#define _mbhdr_setup(m, type) do { \ - (m)->m_type = (type); \ - (m)->m_next = NULL; \ - (m)->m_nextpkt = NULL; \ - (m)->m_data = (m)->m_pktdat; \ - (m)->m_flags = M_PKTHDR; \ - (m)->m_pkthdr.rcvif = NULL; \ - (m)->m_pkthdr.csum_flags = 0; \ - SLIST_INIT(&(m)->m_pkthdr.tags); \ -} while (0) - -#define _mcl_setup(m) do { \ - (m)->m_data = (m)->m_ext.ext_buf; \ - (m)->m_flags |= M_EXT; \ - (m)->m_ext.ext_free = NULL; \ - (m)->m_ext.ext_args = NULL; \ - (m)->m_ext.ext_size = MCLBYTES; \ - (m)->m_ext.ext_type = EXT_CLUSTER; \ -} while (0) - -#define _mext_init_ref(m, ref) do { \ - (m)->m_ext.ref_cnt = ((ref) == NULL) ? \ - malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)(ref); \ - if ((m)->m_ext.ref_cnt != NULL) { \ - *((m)->m_ext.ref_cnt) = 0; \ - MEXT_ADD_REF((m)); \ - } \ -} while (0) - -#define cl2ref(cl) \ - (((uintptr_t)(cl) - (uintptr_t)mb_list_clust.ml_mapbase) >> MCLSHIFT) - -#define _mext_dealloc_ref(m) \ - if ((m)->m_ext.ext_type != EXT_EXTREF) \ - free((m)->m_ext.ref_cnt, M_MBUF) - -/****************************************************************************** - * Internal routines. - * - * Because mb_alloc() and mb_free() are inlines (to keep the common - * cases down to a maximum of one function call), below are a few - * routines used only internally for the sole purpose of making certain - * functions smaller. - * - * - _mext_free(): frees associated storage when the ref. count is - * exactly one and we're freeing. - * - * - _mgetm_internal(): common "persistent-lock" routine that allocates - * an mbuf and a cluster in one shot, but where the lock is already - * held coming in (which is what makes it different from the exported - * m_getcl()). The lock is dropped when done. This is used by m_getm() - * and, therefore, is very m_getm()-specific. 
- */ -static struct mbuf *_mgetm_internal(int, short, short, int); - -void -_mext_free(struct mbuf *mb) -{ - - if (mb->m_ext.ext_type == EXT_CLUSTER) { - mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF, - 0, NULL); - } else { - (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args); - _mext_dealloc_ref(mb); - } -} - -static struct mbuf * -_mgetm_internal(int how, short type, short persist, int cchnum) -{ - struct mbuf *mb; - - mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, persist,&cchnum); - if (mb == NULL) - return NULL; - _mb_setup(mb, type); - - if ((persist & MBP_PERSIST) != 0) { - mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, - how, MT_NOTMBUF, MBP_PERSISTENT, &cchnum); - if (mb->m_ext.ext_buf == NULL) { - (void)m_free(mb); - mb = NULL; - } - _mcl_setup(mb); - _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); - } - return (mb); -} - -/****************************************************************************** - * Exported buffer allocation and de-allocation routines. - */ - -/* - * Allocate and return a single (normal) mbuf. NULL is returned on failure. - * - * Arguments: - * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks - * if really starved for memory. M_DONTWAIT to never block. - * - type: the type of the mbuf being allocated. - */ -struct mbuf * -m_get(int how, short type) -{ - struct mbuf *mb; - - mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); - if (mb != NULL) - _mb_setup(mb, type); - return (mb); -} - -/* - * Allocate a given length worth of mbufs and/or clusters (whatever fits - * best) and return a pointer to the top of the allocated chain. If an - * existing mbuf chain is provided, then we will append the new chain - * to the existing one but still return the top of the newly allocated - * chain. NULL is returned on failure, in which case the [optional] - * provided chain is left untouched, and any memory already allocated - * is freed. - * - * Arguments: - * - m: existing chain to which to append new chain (optional). - * - len: total length of data to append, either in mbufs or clusters - * (we allocate whatever combination yields the best fit). - * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks - * if really starved for memory. M_DONTWAIT to never block. - * - type: the type of the mbuf being allocated. - */ -struct mbuf * -m_getm(struct mbuf *m, int len, int how, short type) -{ - struct mbuf *mb, *top, *cur, *mtail; - int num, rem, cchnum; - short persist; - int i; - - KASSERT(len >= 0, ("m_getm(): len is < 0")); - - /* If m != NULL, we will append to the end of that chain. */ - if (m != NULL) - for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next); - else - mtail = NULL; - - /* - * In the best-case scenario (which should be the common case - * unless we're in a starvation situation), we will be able to - * go through the allocation of all the desired mbufs and clusters - * here without dropping our per-CPU cache lock in between. - */ - num = len / MCLBYTES; - rem = len % MCLBYTES; - persist = 0; - cchnum = -1; - top = cur = NULL; - for (i = 0; i < num; i++) { - mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, - MBP_PERSIST | persist, &cchnum); - if (mb == NULL) - goto failed; - _mb_setup(mb, type); - mb->m_len = 0; - - persist = (i != (num - 1) || rem > 0) ? 
MBP_PERSIST : 0; - mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, - how, MT_NOTMBUF, persist | MBP_PERSISTENT, &cchnum); - if (mb->m_ext.ext_buf == NULL) { - (void)m_free(mb); - goto failed; - } - _mcl_setup(mb); - _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); - persist = MBP_PERSISTENT; - - if (cur == NULL) - top = cur = mb; - else - cur = (cur->m_next = mb); - } - if (rem > 0) { - if (cchnum >= 0) { - persist = MBP_PERSISTENT; - persist |= (rem > MINCLSIZE) ? MBP_PERSIST : 0; - mb = _mgetm_internal(how, type, persist, cchnum); - if (mb == NULL) - goto failed; - } else if (rem > MINCLSIZE) { - mb = m_getcl(how, type, 0); - } else { - mb = m_get(how, type); - } - if (mb != NULL) { - mb->m_len = 0; - if (cur == NULL) - top = mb; - else - cur->m_next = mb; - } else - goto failed; - } - - if (mtail != NULL) - mtail->m_next = top; - return top; -failed: - if (top != NULL) - m_freem(top); - return NULL; -} - -/* - * Allocate and return a single M_PKTHDR mbuf. NULL is returned on failure. - * - * Arguments: - * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks - * if really starved for memory. M_DONTWAIT to never block. - * - type: the type of the mbuf being allocated. - */ -struct mbuf * -m_gethdr(int how, short type) -{ - struct mbuf *mb; - - mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); - if (mb != NULL) { - _mbhdr_setup(mb, type); -#ifdef MAC - if (mac_init_mbuf(mb, MBTOM(how)) != 0) { - m_free(mb); - return (NULL); - } -#endif - } - return (mb); -} - -/* - * Allocate and return a single (normal) pre-zero'd mbuf. NULL is - * returned on failure. - * - * Arguments: - * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks - * if really starved for memory. M_DONTWAIT to never block. - * - type: the type of the mbuf being allocated. - */ -struct mbuf * -m_get_clrd(int how, short type) -{ - struct mbuf *mb; - - mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); - if (mb != NULL) { - _mb_setup(mb, type); - bzero(mtod(mb, caddr_t), MLEN); - } - return (mb); -} - -/* - * Allocate and return a single M_PKTHDR pre-zero'd mbuf. NULL is - * returned on failure. - * - * Arguments: - * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks - * if really starved for memory. M_DONTWAIT to never block. - * - type: the type of the mbuf being allocated. - */ -struct mbuf * -m_gethdr_clrd(int how, short type) -{ - struct mbuf *mb; - - mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL); - if (mb != NULL) { - _mbhdr_setup(mb, type); -#ifdef MAC - if (mac_init_mbuf(mb, MBTOM(how)) != 0) { - m_free(mb); - return (NULL); - } -#endif - bzero(mtod(mb, caddr_t), MHLEN); - } - return (mb); -} - -/* - * Free a single mbuf and any associated storage that it may have attached - * to it. The associated storage may not be immediately freed if its - * reference count is above 1. Returns the next mbuf in the chain following - * the mbuf being freed. - * - * Arguments: - * - mb: the mbuf to free. 
- */ -struct mbuf * -m_free(struct mbuf *mb) -{ - struct mbuf *nb; - int cchnum; - short persist = 0; - -#ifdef INVARIANTS - if (mb->m_flags & M_FREELIST) - panic("m_free detected a mbuf double-free"); - mb->m_flags |= M_FREELIST; -#endif - if ((mb->m_flags & M_PKTHDR) != 0) - m_tag_delete_chain(mb, NULL); - nb = mb->m_next; - if ((mb->m_flags & M_EXT) != 0) { - MEXT_REM_REF(mb); - if (atomic_cmpset_int(mb->m_ext.ref_cnt, 0, 1)) { - if (mb->m_ext.ext_type == EXT_CLUSTER) { - mb_free(&mb_list_clust, - (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF, - MBP_PERSIST, &cchnum); - persist = MBP_PERSISTENT; - } else { - (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, - mb->m_ext.ext_args); - _mext_dealloc_ref(mb); - persist = 0; - } - } - } - mb_free(&mb_list_mbuf, mb, mb->m_type, persist, &cchnum); - return (nb); -} - -/* - * Free an entire chain of mbufs and associated external buffers, if - * applicable. Right now, we only optimize a little so that the cache - * lock may be held across a single mbuf+cluster free. Hopefully, - * we'll eventually be holding the lock across more than merely two - * consecutive frees but right now this is hard to implement because of - * things like _mext_dealloc_ref (may do a free()) and atomic ops in the - * loop. - * - * - mb: the mbuf chain to free. - */ -void -m_freem(struct mbuf *mb) -{ - - while (mb != NULL) - mb = m_free(mb); -} - -/* - * Fetch an mbuf with a cluster attached to it. If one of the - * allocations fails, the entire allocation fails. This routine is - * the preferred way of fetching both the mbuf and cluster together, - * as it avoids having to unlock/relock between allocations. Returns - * NULL on failure. - * - * Arguments: - * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks - * if really starved for memory. M_DONTWAIT to never block. - * - type: the type of the mbuf being allocated. - * - flags: any flags to pass to the mbuf being allocated; if this includes - * the M_PKTHDR bit, then the mbuf is configured as a M_PKTHDR mbuf. - */ -struct mbuf * -m_getcl(int how, short type, int flags) -{ - struct mbuf *mb; - int cchnum; - - mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, - MBP_PERSIST, &cchnum); - if (mb == NULL) - return NULL; - mb->m_type = type; - mb->m_next = NULL; - mb->m_flags = flags; - if ((flags & M_PKTHDR) != 0) { - mb->m_nextpkt = NULL; - mb->m_pkthdr.rcvif = NULL; - mb->m_pkthdr.csum_flags = 0; - SLIST_INIT(&mb->m_pkthdr.tags); - } - - mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how, - MT_NOTMBUF, MBP_PERSISTENT, &cchnum); - if (mb->m_ext.ext_buf == NULL) { - (void)m_free(mb); - mb = NULL; - } else { - _mcl_setup(mb); - _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); -#ifdef MAC - if (flags & M_PKTHDR) { - if (mac_init_mbuf(mb, MBTOM(how)) != 0) { - m_free(mb); - return (NULL); - } - } -#endif - } - return (mb); -} - -/* - * Fetch a single mbuf cluster and attach it to an existing mbuf. If - * successfull, configures the provided mbuf to have mbuf->m_ext.ext_buf - * pointing to the cluster, and sets the M_EXT bit in the mbuf's flags. - * The M_EXT bit is not set on failure. - * - * Arguments: - * - mb: the existing mbuf to which to attach the allocated cluster. - * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks - * if really starved for memory. M_DONTWAIT to never block. 
- */ -void -m_clget(struct mbuf *mb, int how) -{ - - mb->m_ext.ext_buf= (caddr_t)mb_alloc(&mb_list_clust,how,MT_NOTMBUF, - 0, NULL); - if (mb->m_ext.ext_buf != NULL) { - _mcl_setup(mb); - _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]); - } -} - -/* - * Configure a provided mbuf to refer to the provided external storage - * buffer and setup a reference count for said buffer. If the setting - * up of the reference count fails, the M_EXT bit will not be set. If - * successfull, the M_EXT bit is set in the mbuf's flags. - * - * Arguments: - * - mb: the existing mbuf to which to attach the provided buffer. - * - buf: the address of the provided external storage buffer. - * - size: the size of the provided buffer. - * - freef: a pointer to a routine that is responsible for freeing the - * provided external storage buffer. - * - args: a pointer to an argument structure (of any type) to be passed - * to the provided freef routine (may be NULL). - * - flags: any other flags to be passed to the provided mbuf. - * - type: the type that the external storage buffer should be labeled with. - */ -void -m_extadd(struct mbuf *mb, caddr_t buf, u_int size, - void (*freef)(void *, void *), void *args, int flags, int type) -{ - u_int *ref_cnt = NULL; - - if (type == EXT_CLUSTER) - ref_cnt = &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]; - else if (type == EXT_EXTREF) - ref_cnt = mb->m_ext.ref_cnt; - _mext_init_ref(mb, ref_cnt); - if (mb->m_ext.ref_cnt != NULL) { - mb->m_flags |= (M_EXT | flags); - mb->m_ext.ext_buf = buf; - mb->m_data = mb->m_ext.ext_buf; - mb->m_ext.ext_size = size; - mb->m_ext.ext_free = freef; - mb->m_ext.ext_args = args; - mb->m_ext.ext_type = type; - } -} - -/* - * Change type of provided mbuf. This is a relatively expensive operation - * (due to the cost of statistics manipulations) and should be avoided, where - * possible. - * - * Arguments: - * - mb: the provided mbuf for which the type needs to be changed. - * - new_type: the new type to change the mbuf to. - */ -void -m_chtype(struct mbuf *mb, short new_type) -{ - struct mb_gen_list *gen_list; - - gen_list = MB_GET_GEN_LIST(&mb_list_mbuf); - MB_LOCK_CONT(gen_list); - MB_MBTYPES_DEC(gen_list, mb->m_type, 1); - MB_MBTYPES_INC(gen_list, new_type, 1); - MB_UNLOCK_CONT(gen_list); - mb->m_type = new_type; -} diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c index 5815faef9baf..e14aba12c310 100644 --- a/sys/kern/uipc_mbuf.c +++ b/sys/kern/uipc_mbuf.c @@ -85,6 +85,161 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW, &m_defragrandomfailures, 0, ""); #endif +/* + * Malloc-type for external ext_buf ref counts. + */ +MALLOC_DEFINE(M_MBUF, "mbextcnt", "mbuf external ref counts"); + +/* + * Allocate a given length worth of mbufs and/or clusters (whatever fits + * best) and return a pointer to the top of the allocated chain. If an + * existing mbuf chain is provided, then we will append the new chain + * to the existing one but still return the top of the newly allocated + * chain. + */ +struct mbuf * +m_getm(struct mbuf *m, int len, int how, short type) +{ + struct mbuf *mb, *top, *cur, *mtail; + int num, rem; + int i; + + KASSERT(len >= 0, ("m_getm(): len is < 0")); + + /* If m != NULL, we will append to the end of that chain. */ + if (m != NULL) + for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next); + else + mtail = NULL; + + /* + * Calculate how many mbufs+clusters ("packets") we need and how much + * leftover there is after that and allocate the first mbuf+cluster + * if required. 
+ */ + num = len / MCLBYTES; + rem = len % MCLBYTES; + top = cur = NULL; + if (num > 0) { + if ((top = cur = m_getcl(how, type, 0)) == NULL) + goto failed; + } + num--; + top->m_len = 0; + + for (i = 0; i < num; i++) { + mb = m_getcl(how, type, 0); + if (mb == NULL) + goto failed; + mb->m_len = 0; + cur = (cur->m_next = mb); + } + if (rem > 0) { + mb = (rem > MINCLSIZE) ? + m_getcl(how, type, 0) : m_get(how, type); + if (mb == NULL) + goto failed; + mb->m_len = 0; + if (cur == NULL) + top = mb; + else + cur->m_next = mb; + } + + if (mtail != NULL) + mtail->m_next = top; + return top; +failed: + if (top != NULL) + m_freem(top); + return NULL; +} + +/* + * Free an entire chain of mbufs and associated external buffers, if + * applicable. + */ +void +m_freem(struct mbuf *mb) +{ + + while (mb != NULL) + mb = m_free(mb); +} + +/*- + * Configure a provided mbuf to refer to the provided external storage + * buffer and setup a reference count for said buffer. If the setting + * up of the reference count fails, the M_EXT bit will not be set. If + * successfull, the M_EXT bit is set in the mbuf's flags. + * + * Arguments: + * mb The existing mbuf to which to attach the provided buffer. + * buf The address of the provided external storage buffer. + * size The size of the provided buffer. + * freef A pointer to a routine that is responsible for freeing the + * provided external storage buffer. + * args A pointer to an argument structure (of any type) to be passed + * to the provided freef routine (may be NULL). + * flags Any other flags to be passed to the provided mbuf. + * type The type that the external storage buffer should be + * labeled with. + * + * Returns: + * Nothing. + */ +void +m_extadd(struct mbuf *mb, caddr_t buf, u_int size, + void (*freef)(void *, void *), void *args, int flags, int type) +{ + u_int *ref_cnt = NULL; + + /* XXX Shouldn't be adding EXT_CLUSTER with this API */ + if (type == EXT_CLUSTER) + ref_cnt = (u_int *)uma_find_refcnt(zone_clust, + mb->m_ext.ext_buf); + else if (type == EXT_EXTREF) + ref_cnt = mb->m_ext.ref_cnt; + mb->m_ext.ref_cnt = (ref_cnt == NULL) ? + malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)ref_cnt; + if (mb->m_ext.ref_cnt != NULL) { + *(mb->m_ext.ref_cnt) = 1; + mb->m_flags |= (M_EXT | flags); + mb->m_ext.ext_buf = buf; + mb->m_data = mb->m_ext.ext_buf; + mb->m_ext.ext_size = size; + mb->m_ext.ext_free = freef; + mb->m_ext.ext_args = args; + mb->m_ext.ext_type = type; + } +} + +/* + * Non-directly-exported function to clean up after mbufs with M_EXT + * storage attached to them if the reference count hits 0. + */ +void +mb_free_ext(struct mbuf *m) +{ + + MEXT_REM_REF(m); + if (atomic_cmpset_int(m->m_ext.ref_cnt, 0, 1)) { + if (m->m_ext.ext_type == EXT_PACKET) { + uma_zfree(zone_pack, m); + return; + } else if (m->m_ext.ext_type == EXT_CLUSTER) { + uma_zfree(zone_clust, m->m_ext.ext_buf); + m->m_ext.ext_buf = NULL; + } else { + (*(m->m_ext.ext_free))(m->m_ext.ext_buf, + m->m_ext.ext_args); + if (m->m_ext.ext_type != EXT_EXTREF) + free(m->m_ext.ref_cnt, M_MBUF); + } + } + uma_zfree(zone_mbuf, m); +} + /* * "Move" mbuf pkthdr from "from" to "to". * "from" must have M_PKTHDR set, and "to" must be empty. 
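
Reading aid (illustrative only, not part of the diff): the hunks that follow convert m_dup(), m_devget(), m_pulldown() and sbcreatecontrol() from the old two-step MGET()/MCLGET() idiom to a single m_getcl() call backed by zone_pack. The sketch below contrasts the two idioms in kernel context; the function names example_alloc_old() and example_alloc_new() are invented for illustration and exist nowhere in the tree.

/* Illustrative sketch only; assumes a FreeBSD kernel compilation environment. */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>

static struct mbuf *
example_alloc_old(void)
{
        struct mbuf *m;

        /* Pre-mbuma idiom: allocate the mbuf, then bolt a cluster onto it. */
        MGETHDR(m, M_DONTWAIT, MT_DATA);
        if (m == NULL)
                return (NULL);
        MCLGET(m, M_DONTWAIT);
        if ((m->m_flags & M_EXT) == 0) {
                /* Cluster allocation failed; give the mbuf back. */
                m_free(m);
                return (NULL);
        }
        return (m);
}

static struct mbuf *
example_alloc_new(void)
{

        /* mbuma idiom: fetch mbuf and cluster in one shot from zone_pack. */
        return (m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR));
}

The second form is what the converted code paths below use; it obtains the mbuf with its cluster already attached from the secondary Packet zone (items of type EXT_PACKET), instead of visiting the mbuf and cluster caches separately.
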
@@ -364,22 +519,22 @@ m_dup(struct mbuf *m, int how) struct mbuf *n; /* Get the next new mbuf */ - MGET(n, how, m->m_type); + if (remain >= MINCLSIZE) { + n = m_getcl(how, m->m_type, 0); + nsize = MCLBYTES; + } else { + n = m_get(how, m->m_type); + nsize = MLEN; + } if (n == NULL) goto nospace; - if (top == NULL) { /* first one, must be PKTHDR */ - if (!m_dup_pkthdr(n, m, how)) - goto nospace; - nsize = MHLEN; - } else /* not the first one */ - nsize = MLEN; - if (remain >= MINCLSIZE) { - MCLGET(n, how); - if ((n->m_flags & M_EXT) == 0) { - (void)m_free(n); + + if (top == NULL) { /* First one, must be PKTHDR */ + if (!m_dup_pkthdr(n, m, how)) { + m_free(n); goto nospace; } - nsize = MCLBYTES; + nsize = MHLEN; } n->m_len = 0; @@ -651,39 +806,42 @@ m_devget(char *buf, int totlen, int off, struct ifnet *ifp, void (*copy)(char *from, caddr_t to, u_int len)) { struct mbuf *m; - struct mbuf *top = 0, **mp = ⊤ + struct mbuf *top = NULL, **mp = ⊤ int len; if (off < 0 || off > MHLEN) return (NULL); - MGETHDR(m, M_DONTWAIT, MT_DATA); - if (m == NULL) - return (NULL); - m->m_pkthdr.rcvif = ifp; - m->m_pkthdr.len = totlen; - len = MHLEN; - while (totlen > 0) { - if (top) { - MGET(m, M_DONTWAIT, MT_DATA); + if (top == NULL) { /* First one, must be PKTHDR */ + if (totlen + off >= MINCLSIZE) { + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + len = MCLBYTES; + } else { + m = m_gethdr(M_DONTWAIT, MT_DATA); + len = MHLEN; + + /* Place initial small packet/header at end of mbuf */ + if (m && totlen + off + max_linkhdr <= MLEN) { + m->m_data += max_linkhdr; + len -= max_linkhdr; + } + } + if (m == NULL) + return NULL; + m->m_pkthdr.rcvif = ifp; + m->m_pkthdr.len = totlen; + } else { + if (totlen + off >= MINCLSIZE) { + m = m_getcl(M_DONTWAIT, MT_DATA, 0); + len = MCLBYTES; + } else { + m = m_get(M_DONTWAIT, MT_DATA); + len = MLEN; + } if (m == NULL) { m_freem(top); - return (NULL); - } - len = MLEN; - } - if (totlen + off >= MINCLSIZE) { - MCLGET(m, M_DONTWAIT); - if (m->m_flags & M_EXT) - len = MCLBYTES; - } else { - /* - * Place initial small packet/header at end of mbuf. - */ - if (top == NULL && totlen + off + max_linkhdr <= len) { - m->m_data += max_linkhdr; - len -= max_linkhdr; + return NULL; } } if (off) { @@ -722,9 +880,10 @@ m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp) off -= mlen; totlen += mlen; if (m->m_next == NULL) { - n = m_get_clrd(M_DONTWAIT, m->m_type); + n = m_get(M_DONTWAIT, m->m_type); if (n == NULL) goto out; + bzero(mtod(n, caddr_t), MLEN); n->m_len = min(MLEN, len + off); m->m_next = n; } diff --git a/sys/kern/uipc_mbuf2.c b/sys/kern/uipc_mbuf2.c index 0d11aaccc662..ff7944d58084 100644 --- a/sys/kern/uipc_mbuf2.c +++ b/sys/kern/uipc_mbuf2.c @@ -230,14 +230,10 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) * now, we need to do the hard way. don't m_copy as there's no room * on both end. 
*/ - MGET(o, M_DONTWAIT, m->m_type); - if (o && len > MLEN) { - MCLGET(o, M_DONTWAIT); - if ((o->m_flags & M_EXT) == 0) { - m_free(o); - o = NULL; - } - } + if (len > MLEN) + o = m_getcl(M_DONTWAIT, m->m_type, 0); + else + o = m_get(M_DONTWAIT, m->m_type); if (!o) { m_freem(m); return NULL; /* ENOBUFS */ @@ -274,29 +270,27 @@ static struct mbuf * m_dup1(struct mbuf *m, int off, int len, int wait) { struct mbuf *n; - int l; int copyhdr; if (len > MCLBYTES) return NULL; - if (off == 0 && (m->m_flags & M_PKTHDR) != 0) { + if (off == 0 && (m->m_flags & M_PKTHDR) != 0) copyhdr = 1; - MGETHDR(n, wait, m->m_type); - l = MHLEN; - } else { + else copyhdr = 0; - MGET(n, wait, m->m_type); - l = MLEN; - } - if (n && len > l) { - MCLGET(n, wait); - if ((n->m_flags & M_EXT) == 0) { - m_free(n); - n = NULL; - } + if (len >= MINCLSIZE) { + if (copyhdr == 1) + n = m_getcl(wait, m->m_type, M_PKTHDR); + else + n = m_getcl(wait, m->m_type, 0); + } else { + if (copyhdr == 1) + n = m_gethdr(wait, m->m_type); + else + n = m_get(wait, m->m_type); } if (!n) - return NULL; + return NULL; /* ENOBUFS */ if (copyhdr && !m_dup_pkthdr(n, m, wait)) { m_free(n); diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c index 3ab8f3a0e7cd..a404d698740c 100644 --- a/sys/kern/uipc_sockbuf.c +++ b/sys/kern/uipc_sockbuf.c @@ -959,15 +959,12 @@ sbcreatecontrol(p, size, type, level) if (CMSG_SPACE((u_int)size) > MCLBYTES) return ((struct mbuf *) NULL); - if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) + if (CMSG_SPACE((u_int)size > MLEN)) + m = m_getcl(M_DONTWAIT, MT_CONTROL, 0); + else + m = m_get(M_DONTWAIT, MT_CONTROL); + if (m == NULL) return ((struct mbuf *) NULL); - if (CMSG_SPACE((u_int)size) > MLEN) { - MCLGET(m, M_DONTWAIT); - if ((m->m_flags & M_EXT) == 0) { - m_free(m); - return ((struct mbuf *) NULL); - } - } cp = mtod(m, struct cmsghdr *); m->m_len = 0; KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m), diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index e07f4eff676b..6735e494a976 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -527,8 +527,8 @@ sosend(so, addr, uio, top, control, flags, td) { struct mbuf **mp; struct mbuf *m; - long space, len, resid; - int clen = 0, error, s, dontroute, mlen; + long space, len = 0, resid; + int clen = 0, error, s, dontroute; int atomic = sosendallatonce(so) || top; #ifdef ZERO_COPY_SOCKETS int cow_send; @@ -624,25 +624,23 @@ sosend(so, addr, uio, top, control, flags, td) #ifdef ZERO_COPY_SOCKETS cow_send = 0; #endif /* ZERO_COPY_SOCKETS */ - if (top == 0) { - MGETHDR(m, M_TRYWAIT, MT_DATA); - if (m == NULL) { - error = ENOBUFS; - goto release; - } - mlen = MHLEN; - m->m_pkthdr.len = 0; - m->m_pkthdr.rcvif = (struct ifnet *)0; - } else { - MGET(m, M_TRYWAIT, MT_DATA); - if (m == NULL) { - error = ENOBUFS; - goto release; - } - mlen = MLEN; - } if (resid >= MINCLSIZE) { #ifdef ZERO_COPY_SOCKETS + if (top == NULL) { + MGETHDR(m, M_TRYWAIT, MT_DATA); + if (m == NULL) { + error = ENOBUFS; + goto release; + } + m->m_pkthdr.len = 0; + m->m_pkthdr.rcvif = (struct ifnet *)0; + } else { + MGET(m, M_TRYWAIT, MT_DATA); + if (m == NULL) { + error = ENOBUFS; + goto release; + } + } if (so_zero_copy_send && resid>=PAGE_SIZE && space>=PAGE_SIZE && @@ -654,29 +652,48 @@ sosend(so, addr, uio, top, control, flags, td) cow_send = socow_setup(m, uio); } } - if (!cow_send){ -#endif /* ZERO_COPY_SOCKETS */ - MCLGET(m, M_TRYWAIT); - if ((m->m_flags & M_EXT) == 0) - goto nopages; - mlen = MCLBYTES; - len = min(min(mlen, resid), space); - } else { -#ifdef 
ZERO_COPY_SOCKETS + if (!cow_send) { + MCLGET(m, M_TRYWAIT); + if ((m->m_flags & M_EXT) == 0) { + m_free(m); + m = NULL; + } else { + len = min(min(MCLBYTES, resid), space); + } + } else len = PAGE_SIZE; - } - - } else { +#else /* ZERO_COPY_SOCKETS */ + if (top == NULL) { + m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR); + m->m_pkthdr.len = 0; + m->m_pkthdr.rcvif = (struct ifnet *)0; + } else + m = m_getcl(M_TRYWAIT, MT_DATA, 0); + len = min(min(MCLBYTES, resid), space); #endif /* ZERO_COPY_SOCKETS */ -nopages: - len = min(min(mlen, resid), space); - /* - * For datagram protocols, leave room - * for protocol headers in first mbuf. - */ - if (atomic && top == 0 && len < mlen) - MH_ALIGN(m, len); + } else { + if (top == NULL) { + m = m_gethdr(M_TRYWAIT, MT_DATA); + m->m_pkthdr.len = 0; + m->m_pkthdr.rcvif = (struct ifnet *)0; + + len = min(min(MHLEN, resid), space); + /* + * For datagram protocols, leave room + * for protocol headers in first mbuf. + */ + if (atomic && m && len < MHLEN) + MH_ALIGN(m, len); + } else { + m = m_get(M_TRYWAIT, MT_DATA); + len = min(min(MLEN, resid), space); + } } + if (m == NULL) { + error = ENOBUFS; + goto release; + } + space -= len; #ifdef ZERO_COPY_SOCKETS if (cow_send) diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c index 3ab8f3a0e7cd..a404d698740c 100644 --- a/sys/kern/uipc_socket2.c +++ b/sys/kern/uipc_socket2.c @@ -959,15 +959,12 @@ sbcreatecontrol(p, size, type, level) if (CMSG_SPACE((u_int)size) > MCLBYTES) return ((struct mbuf *) NULL); - if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) + if (CMSG_SPACE((u_int)size > MLEN)) + m = m_getcl(M_DONTWAIT, MT_CONTROL, 0); + else + m = m_get(M_DONTWAIT, MT_CONTROL); + if (m == NULL) return ((struct mbuf *) NULL); - if (CMSG_SPACE((u_int)size) > MLEN) { - MCLGET(m, M_DONTWAIT); - if ((m->m_flags & M_EXT) == 0) { - m_free(m); - return ((struct mbuf *) NULL); - } - } cp = mtod(m, struct cmsghdr *); m->m_len = 0; KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m), diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index 1b886f54f959..978c30ebfde7 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #ifdef KTRACE @@ -84,6 +85,21 @@ static int getsockname1(struct thread *td, struct getsockname_args *uap, static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat); +/* + * NSFBUFS-related variables and associated sysctls + */ +int nsfbufs; +int nsfbufspeak; +int nsfbufsused; + +SYSCTL_DECL(_kern_ipc); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0, + "Maximum number of sendfile(2) sf_bufs available"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0, + "Number of sendfile(2) sf_bufs at peak usage"); +SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0, + "Number of sendfile(2) sf_bufs in use"); + /* * System call interface to the socket abstraction. 
*/ diff --git a/sys/sparc64/sparc64/vm_machdep.c b/sys/sparc64/sparc64/vm_machdep.c index fe263f167fe6..4a34567dee0f 100644 --- a/sys/sparc64/sparc64/vm_machdep.c +++ b/sys/sparc64/sparc64/vm_machdep.c @@ -86,6 +86,10 @@ #include #include +#ifndef NSFBUFS +#define NSFBUFS (512 + maxusers * 16) +#endif + static void sf_buf_init(void *arg); SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL) @@ -351,6 +355,9 @@ sf_buf_init(void *arg) vm_offset_t sf_base; int i; + nsfbufs = NSFBUFS; + TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs); + mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF); SLIST_INIT(&sf_freelist.sf_head); sf_base = kmem_alloc_nofault(kernel_map, nsfbufs * PAGE_SIZE); diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h index d86c57c11528..2170599f5bc6 100644 --- a/sys/sys/mbuf.h +++ b/sys/sys/mbuf.h @@ -10,7 +10,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * @@ -33,7 +33,12 @@ #ifndef _SYS_MBUF_H_ #define _SYS_MBUF_H_ +/* XXX: These includes suck. Sorry! */ #include +#ifdef _KERNEL +#include +#include +#endif /* * Mbufs are of a single size, MSIZE (sys/param.h), which @@ -57,6 +62,16 @@ */ #define mtod(m, t) ((t)((m)->m_data)) #define dtom(x) ((struct mbuf *)((intptr_t)(x) & ~(MSIZE-1))) + +/* + * Argument structure passed to UMA routines during mbuf and packet + * allocations. + */ +struct mb_args { + int flags; /* Flags for mbuf being allocated */ + int how; /* How to allocate: M_WAITOK or M_DONTWAIT */ + short type; /* Type of mbuf being allocated */ +}; #endif /* _KERNEL */ /* @@ -167,6 +182,7 @@ struct mbuf { */ #define EXT_CLUSTER 1 /* mbuf cluster */ #define EXT_SFBUF 2 /* sendfile(2)'s sf_bufs */ +#define EXT_PACKET 3 /* came out of Packet zone */ #define EXT_NET_DRV 100 /* custom ext_buf provided by net driver(s) */ #define EXT_MOD_TYPE 200 /* custom module's ext_buf type */ #define EXT_DISPOSABLE 300 /* can throw this buffer away w/page flipping */ @@ -222,29 +238,13 @@ struct mbuf { #define MT_OOBDATA 15 /* expedited data */ #define MT_NTYPES 16 /* number of mbuf types for mbtypes[] */ -/* - * Mbuf and cluster allocation statistics PCPU structure. - */ -struct mbpstat { - u_long mb_mbfree; - u_long mb_mbbucks; - u_long mb_clfree; - u_long mb_clbucks; - long mb_mbtypes[MT_NTYPES]; - short mb_active; -}; - /* * General mbuf allocator statistics structure. - * XXX: Modifications of these are not protected by any mutex locks nor by - * any atomic() manipulations. As a result, we may occasionally lose - * a count or two. Luckily, not all of these fields are modified at all - * and remain static, and those that are manipulated are only manipulated - * in failure situations, which do not occur (hopefully) very often. 
*/ struct mbstat { - u_long m_drops; /* times failed to allocate */ - u_long m_wait; /* times succesfully returned from wait */ + u_long m_mbufs; /* XXX */ + u_long m_mclusts; /* XXX */ + u_long m_drain; /* times drained protocols for space */ u_long m_mcfail; /* XXX: times m_copym failed */ u_long m_mpfail; /* XXX: times m_pullup failed */ @@ -253,10 +253,10 @@ struct mbstat { u_long m_minclsize; /* min length of data to allocate a cluster */ u_long m_mlen; /* length of data in an mbuf */ u_long m_mhlen; /* length of data in a header mbuf */ - u_int m_mbperbuck; /* number of mbufs per "bucket" */ - u_int m_clperbuck; /* number of clusters per "bucket" */ - /* Number of mbtypes (gives # elems in mbpstat's mb_mbtypes[] array: */ + + /* Number of mbtypes (gives # elems in mbtypes[] array: */ short m_numtypes; + /* XXX: Sendfile stats should eventually move to their own struct */ u_long sf_iocnt; /* times sendfile had to do disk I/O */ u_long sf_allocfail; /* times sfbuf allocation failed */ @@ -265,14 +265,23 @@ struct mbstat { /* * Flags specifying how an allocation should be made. - * M_DONTWAIT means "don't block if nothing is available" whereas - * M_TRYWAIT means "block for mbuf_wait ticks at most if nothing is - * available." + * + * The flag to use is as follows: + * - M_DONTWAIT or M_NOWAIT from an interrupt handler to not block allocation. + * - M_WAIT or M_WAITOK or M_TRYWAIT from wherever it is safe to block. + * + * M_DONTWAIT/M_NOWAIT means that we will not block the thread explicitly + * and if we cannot allocate immediately we may return NULL, + * whereas M_WAIT/M_WAITOK/M_TRYWAIT means that if we cannot allocate + * resources we will block until they are available, and thus never + * return NULL. + * + * XXX Eventually just phase this out to use M_WAITOK/M_NOWAIT. */ -#define M_DONTWAIT 0x4 /* don't conflict with M_NOWAIT */ -#define M_TRYWAIT 0x8 /* or M_WAITOK */ -#define M_WAIT M_TRYWAIT /* XXX: deprecated */ -#define MBTOM(how) ((how) & M_TRYWAIT ? 
M_WAITOK : M_NOWAIT) +#define MBTOM(how) (how) +#define M_DONTWAIT M_NOWAIT +#define M_TRYWAIT M_WAITOK +#define M_WAIT M_WAITOK #ifdef _KERNEL /*- @@ -295,36 +304,121 @@ struct mbstat { #define MEXT_ADD_REF(m) atomic_add_int((m)->m_ext.ref_cnt, 1) +/* + * Network buffer allocation API + * + * The rest of it is defined in kern/subr_mbuf.c + */ + +extern uma_zone_t zone_mbuf; +extern uma_zone_t zone_clust; +extern uma_zone_t zone_pack; + +static __inline struct mbuf *m_get(int how, short type); +static __inline struct mbuf *m_gethdr(int how, short type); +static __inline struct mbuf *m_getcl(int how, short type, int flags); +static __inline struct mbuf *m_getclr(int how, short type); /* XXX */ +static __inline struct mbuf *m_free(struct mbuf *m); +static __inline void m_clget(struct mbuf *m, int how); +static __inline void m_chtype(struct mbuf *m, short new_type); +void mb_free_ext(struct mbuf *); + +static __inline +struct mbuf * +m_get(int how, short type) +{ + struct mb_args args; + + args.flags = 0; + args.how = how; + args.type = type; + return (uma_zalloc_arg(zone_mbuf, &args, how)); +} + +/* XXX This should be depracated, very little use */ +static __inline +struct mbuf * +m_getclr(int how, short type) +{ + struct mbuf *m; + struct mb_args args; + + args.flags = 0; + args.how = how; + args.type = type; + m = uma_zalloc_arg(zone_mbuf, &args, how); + if (m != NULL) + bzero(m->m_data, MLEN); + return m; +} + +static __inline +struct mbuf * +m_gethdr(int how, short type) +{ + struct mb_args args; + + args.flags = M_PKTHDR; + args.how = how; + args.type = type; + return (uma_zalloc_arg(zone_mbuf, &args, how)); +} + +static __inline +struct mbuf * +m_getcl(int how, short type, int flags) +{ + struct mb_args args; + + args.flags = flags; + args.how = how; + args.type = type; + return (uma_zalloc_arg(zone_pack, &args, how)); +} + +static __inline +struct mbuf * +m_free(struct mbuf *m) +{ + struct mbuf *n = m->m_next; + +#ifdef INVARIANTS + m->m_flags |= M_FREELIST; +#endif + if (m->m_flags & M_EXT) + mb_free_ext(m); + else + uma_zfree(zone_mbuf, m); + return n; +} + +static __inline +void +m_clget(struct mbuf *m, int how) +{ + m->m_ext.ext_buf = NULL; + uma_zalloc_arg(zone_clust, m, how); +} + +static __inline +void +m_chtype(struct mbuf *m, short new_type) +{ + m->m_type = new_type; +} + /* * mbuf, cluster, and external object allocation macros * (for compatibility purposes). */ /* NB: M_COPY_PKTHDR is deprecated. Use M_MOVE_PKTHDR or m_dup_pktdr. */ #define M_MOVE_PKTHDR(to, from) m_move_pkthdr((to), (from)) -#define m_getclr(how, type) m_get_clrd((how), (type)) #define MGET(m, how, type) ((m) = m_get((how), (type))) #define MGETHDR(m, how, type) ((m) = m_gethdr((how), (type))) #define MCLGET(m, how) m_clget((m), (how)) #define MEXTADD(m, buf, size, free, args, flags, type) \ m_extadd((m), (caddr_t)(buf), (size), (free), (args), (flags), (type)) -/* - * MEXTFREE(m): disassociate (and possibly free) an external object from (m). - * - * If the atomic_cmpset_int() returns 0, then we effectively do nothing - * in terms of "cleaning up" (freeing the ext buf and ref. counter) as - * this means that either there are still references, or another thread - * is taking care of the clean-up. 
- */ -#define MEXTFREE(m) do { \ - struct mbuf *_mb = (m); \ - \ - MEXT_REM_REF(_mb); \ - if (atomic_cmpset_int(_mb->m_ext.ref_cnt, 0, 1)) \ - _mext_free(_mb); \ - _mb->m_flags &= ~M_EXT; \ -} while (0) - /* * Evaluate TRUE if it's safe to write to the mbuf m's data region (this * can be both the local data payload, or an external buffer area, @@ -425,18 +519,13 @@ extern int max_linkhdr; /* Largest link-level header */ extern int max_protohdr; /* Largest protocol header */ extern struct mbstat mbstat; /* General mbuf stats/infos */ extern int nmbclusters; /* Maximum number of clusters */ -extern int nmbcnt; /* Scale kmem_map for counter space */ -extern int nmbufs; /* Maximum number of mbufs */ struct uio; -void _mext_free(struct mbuf *); void m_adj(struct mbuf *, int); int m_apply(struct mbuf *, int, int, int (*)(void *, void *, u_int), void *); void m_cat(struct mbuf *, struct mbuf *); -void m_chtype(struct mbuf *, short); -void m_clget(struct mbuf *, int); void m_extadd(struct mbuf *, caddr_t, u_int, void (*)(void *, void *), void *, int, int); void m_copyback(struct mbuf *, int, int, c_caddr_t); @@ -451,13 +540,7 @@ struct mbuf *m_dup(struct mbuf *, int); int m_dup_pkthdr(struct mbuf *, struct mbuf *, int); u_int m_fixhdr(struct mbuf *); struct mbuf *m_fragment(struct mbuf *, int, int); -struct mbuf *m_free(struct mbuf *); void m_freem(struct mbuf *); -struct mbuf *m_get(int, short); -struct mbuf *m_get_clrd(int, short); -struct mbuf *m_getcl(int, short, int); -struct mbuf *m_gethdr(int, short); -struct mbuf *m_gethdr_clrd(int, short); struct mbuf *m_getm(struct mbuf *, int, int, short); struct mbuf *m_getptr(struct mbuf *, int, int *); u_int m_length(struct mbuf *, struct mbuf **); @@ -470,7 +553,7 @@ struct mbuf *m_split(struct mbuf *, int, int); struct mbuf *m_uiotombuf(struct uio *, int, int); /*- - * Packets may have annotations attached by affixing a list + * Network packets may have annotations attached by affixing a list * of "packet tags" to the pkthdr structure. Packet tags are * dynamically allocated semi-opaque data structures that have * a fixed header (struct m_tag) that specifies the size of the diff --git a/sys/vm/uma.h b/sys/vm/uma.h index 4de1efadf0ad..0d34ca375e55 100644 --- a/sys/vm/uma.h +++ b/sys/vm/uma.h @@ -43,7 +43,7 @@ /* Types and type defs */ -struct uma_zone; +struct uma_zone; /* Opaque type used as a handle to the zone */ typedef struct uma_zone * uma_zone_t; @@ -157,11 +157,45 @@ typedef void (*uma_fini)(void *mem, int size); * A pointer to a structure which is intended to be opaque to users of * the interface. The value may be null if the wait flag is not set. */ - uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor, uma_init uminit, uma_fini fini, int align, u_int16_t flags); +/* + * Create a secondary uma zone + * + * Arguments: + * name The text name of the zone for debugging and stats, this memory + * should not be freed until the zone has been deallocated. + * ctor The constructor that is called when the object is allocated + * dtor The destructor that is called when the object is freed. + * zinit An initializer that sets up the initial state of the memory + * as the object passes from the Keg's slab to the Zone's cache. + * zfini A discard function that undoes initialization done by init + * as the object passes from the Zone's cache to the Keg's slab. + * + * ctor/dtor/zinit/zfini may all be null, see notes above. 
+ * Note that the zinit and zfini specified here are NOT + * exactly the same as the init/fini specified to uma_zcreate() + * when creating a master zone. These zinit/zfini are called + * on the TRANSITION from keg to zone (and vice-versa). Once + * these are set, the primary zone may alter its init/fini + * (which are called when the object passes from VM to keg) + * using uma_zone_set_init/fini()) as well as its own + * zinit/zfini (unset by default for master zone) with + * uma_zone_set_zinit/zfini() (note subtle 'z' prefix). + * + * align A bitmask that corisponds to the requested alignment + * eg 4 would be 0x3 + * flags A set of parameters that control the behavior of the zone + * + * Returns: + * A pointer to a structure which is intended to be opaque to users of + * the interface. The value may be null if the wait flag is not set. + */ +uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, + uma_init zinit, uma_fini zfini, uma_zone_t master); + /* * Definitions for uma_zcreate flags * @@ -185,6 +219,9 @@ uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor, * Use a hash table instead of caching * information in the vm_page. */ +#define UMA_ZONE_SECONDARY 0x0200 /* Zone is a Secondary Zone */ +#define UMA_ZONE_REFCNT 0x0400 /* Allocate refcnts in slabs */ +#define UMA_ZONE_MAXBUCKET 0x0800 /* Use largest buckets */ /* Definitions for align */ #define UMA_ALIGN_PTR (sizeof(void *) - 1) /* Alignment fit for ptr */ @@ -201,7 +238,6 @@ uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor, * zone The zone we want to destroy. * */ - void uma_zdestroy(uma_zone_t zone); /* @@ -375,6 +411,28 @@ int uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int size); */ void uma_zone_set_max(uma_zone_t zone, int nitems); +/* + * The following two routines (uma_zone_set_init/fini) + * are used to set the backend init/fini pair which acts on an + * object as it becomes allocated and is placed in a slab within + * the specified zone's backing keg. These should probably not + * be changed once allocations have already begun and only + * immediately upon zone creation. + */ +void uma_zone_set_init(uma_zone_t zone, uma_init uminit); +void uma_zone_set_fini(uma_zone_t zone, uma_fini fini); + +/* + * The following two routines (uma_zone_set_zinit/zfini) are + * used to set the zinit/zfini pair which acts on an object as + * it passes from the backing Keg's slab cache to the + * specified Zone's bucket cache. These should probably not + * be changed once allocations have already begun and + * only immediately upon zone creation. + */ +void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit); +void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini); + /* * Replaces the standard page_alloc or obj_alloc functions for this zone * @@ -430,5 +488,19 @@ void uma_zone_set_freef(uma_zone_t zone, uma_free freef); */ void uma_prealloc(uma_zone_t zone, int itemcnt); +/* + * Used to lookup the reference counter allocated for an item + * from a UMA_ZONE_REFCNT zone. For UMA_ZONE_REFCNT zones, + * reference counters are allocated for items and stored in + * the underlying slab header. + * + * Arguments: + * zone The UMA_ZONE_REFCNT zone to which the item belongs. + * item The address of the item for which we want a refcnt. + * + * Returns: + * A pointer to a u_int32_t reference counter. 
+ */ +u_int32_t *uma_find_refcnt(uma_zone_t zone, void *item); #endif diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c index f6935407372a..82d60c6daa10 100644 --- a/sys/vm/uma_core.c +++ b/sys/vm/uma_core.c @@ -84,15 +84,19 @@ __FBSDID("$FreeBSD$"); #include /* - * This is the zone from which all zones are spawned. The idea is that even - * the zone heads are allocated from the allocator, so we use the bss section - * to bootstrap us. + * This is the zone and keg from which all zones are spawned. The idea is that + * even the zone & keg heads are allocated from the allocator, so we use the + * bss section to bootstrap us. */ -static struct uma_zone masterzone; -static uma_zone_t zones = &masterzone; +static struct uma_keg masterkeg; +static struct uma_zone masterzone_k; +static struct uma_zone masterzone_z; +static uma_zone_t kegs = &masterzone_k; +static uma_zone_t zones = &masterzone_z; /* This is the zone from which all of uma_slab_t's are allocated. */ static uma_zone_t slabzone; +static uma_zone_t slabrefzone; /* With refcounters (for UMA_ZONE_REFCNT) */ /* * The initial hash tables come out of this zone so they can be allocated @@ -107,10 +111,10 @@ static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets"); */ static int bucketdisable = 1; -/* Linked list of all zones in the system */ -static LIST_HEAD(,uma_zone) uma_zones = LIST_HEAD_INITIALIZER(&uma_zones); +/* Linked list of all kegs in the system */ +static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(&uma_kegs); -/* This mutex protects the zone list */ +/* This mutex protects the keg list */ static struct mtx uma_mtx; /* These are the pcpu cache locks */ @@ -144,6 +148,16 @@ struct uma_zctor_args { uma_dtor dtor; uma_init uminit; uma_fini fini; + uma_keg_t keg; + int align; + u_int16_t flags; +}; + +struct uma_kctor_args { + uma_zone_t zone; + size_t size; + uma_init uminit; + uma_fini fini; int align; u_int16_t flags; }; @@ -179,6 +193,8 @@ static uma_slab_t slab_zalloc(uma_zone_t, int); static void cache_drain(uma_zone_t); static void bucket_drain(uma_zone_t, uma_bucket_t); static void bucket_cache_drain(uma_zone_t zone); +static void keg_ctor(void *, int, void *); +static void keg_dtor(void *, int, void *); static void zone_ctor(void *, int, void *); static void zone_dtor(void *, int, void *); static void zero_init(void *, int); @@ -202,6 +218,8 @@ static int uma_zalloc_bucket(uma_zone_t zone, int flags); static uma_slab_t uma_zone_slab(uma_zone_t zone, int flags); static void *uma_slab_alloc(uma_zone_t zone, uma_slab_t slab); static void zone_drain(uma_zone_t); +static void uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, + uma_fini fini, int align, u_int16_t flags); void uma_print_zone(uma_zone_t); void uma_print_stats(void); @@ -328,10 +346,12 @@ uma_timeout(void *unused) static void zone_timeout(uma_zone_t zone) { + uma_keg_t keg; uma_cache_t cache; u_int64_t alloc; int cpu; + keg = zone->uz_keg; alloc = 0; /* @@ -344,7 +364,7 @@ zone_timeout(uma_zone_t zone) * to lock and do it here instead so that the statistics don't get too * far out of sync. */ - if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) { + if (!(keg->uk_flags & UMA_ZFLAG_INTERNAL)) { for (cpu = 0; cpu <= mp_maxid; cpu++) { if (CPU_ABSENT(cpu)) continue; @@ -369,8 +389,8 @@ zone_timeout(uma_zone_t zone) * may be a little aggressive. Should I allow for two collisions max? 
*/ - if (zone->uz_flags & UMA_ZONE_HASH && - zone->uz_pages / zone->uz_ppera >= zone->uz_hash.uh_hashsize) { + if (keg->uk_flags & UMA_ZONE_HASH && + keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) { struct uma_hash newhash; struct uma_hash oldhash; int ret; @@ -381,14 +401,14 @@ zone_timeout(uma_zone_t zone) * I have to do everything in stages and check for * races. */ - newhash = zone->uz_hash; + newhash = keg->uk_hash; ZONE_UNLOCK(zone); ret = hash_alloc(&newhash); ZONE_LOCK(zone); if (ret) { - if (hash_expand(&zone->uz_hash, &newhash)) { - oldhash = zone->uz_hash; - zone->uz_hash = newhash; + if (hash_expand(&keg->uk_hash, &newhash)) { + oldhash = keg->uk_hash; + keg->uk_hash = newhash; } else oldhash = newhash; @@ -530,7 +550,7 @@ bucket_drain(uma_zone_t zone, uma_bucket_t bucket) mzone = 0; /* We have to lookup the slab again for malloc.. */ - if (zone->uz_flags & UMA_ZONE_MALLOC) + if (zone->uz_keg->uk_flags & UMA_ZONE_MALLOC) mzone = 1; while (bucket->ub_cnt > 0) { @@ -636,29 +656,32 @@ static void zone_drain(uma_zone_t zone) { struct slabhead freeslabs = {}; + uma_keg_t keg; uma_slab_t slab; uma_slab_t n; u_int8_t flags; u_int8_t *mem; int i; + keg = zone->uz_keg; + /* - * We don't want to take pages from staticly allocated zones at this + * We don't want to take pages from statically allocated zones at this * time */ - if (zone->uz_flags & UMA_ZONE_NOFREE || zone->uz_freef == NULL) + if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL) return; ZONE_LOCK(zone); #ifdef UMA_DEBUG - printf("%s free items: %u\n", zone->uz_name, zone->uz_free); + printf("%s free items: %u\n", zone->uz_name, keg->uk_free); #endif bucket_cache_drain(zone); - if (zone->uz_free == 0) + if (keg->uk_free == 0) goto finished; - slab = LIST_FIRST(&zone->uz_free_slab); + slab = LIST_FIRST(&keg->uk_free_slab); while (slab) { n = LIST_NEXT(slab, us_link); @@ -669,11 +692,11 @@ zone_drain(uma_zone_t zone) } LIST_REMOVE(slab, us_link); - zone->uz_pages -= zone->uz_ppera; - zone->uz_free -= zone->uz_ipers; + keg->uk_pages -= keg->uk_ppera; + keg->uk_free -= keg->uk_ipers; - if (zone->uz_flags & UMA_ZONE_HASH) - UMA_HASH_REMOVE(&zone->uz_hash, slab, slab->us_data); + if (keg->uk_flags & UMA_ZONE_HASH) + UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data); SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink); @@ -684,34 +707,34 @@ zone_drain(uma_zone_t zone) while ((slab = SLIST_FIRST(&freeslabs)) != NULL) { SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink); - if (zone->uz_fini) - for (i = 0; i < zone->uz_ipers; i++) - zone->uz_fini( - slab->us_data + (zone->uz_rsize * i), - zone->uz_size); + if (keg->uk_fini) + for (i = 0; i < keg->uk_ipers; i++) + keg->uk_fini( + slab->us_data + (keg->uk_rsize * i), + keg->uk_size); flags = slab->us_flags; mem = slab->us_data; - if (zone->uz_flags & UMA_ZONE_OFFPAGE) - uma_zfree_internal(slabzone, slab, NULL, 0); - if (zone->uz_flags & UMA_ZONE_MALLOC) { + if ((keg->uk_flags & UMA_ZONE_MALLOC) || + (keg->uk_flags & UMA_ZONE_REFCNT)) { vm_object_t obj; if (flags & UMA_SLAB_KMEM) obj = kmem_object; else obj = NULL; - for (i = 0; i < zone->uz_ppera; i++) + for (i = 0; i < keg->uk_ppera; i++) vsetobj((vm_offset_t)mem + (i * PAGE_SIZE), obj); } + if (keg->uk_flags & UMA_ZONE_OFFPAGE) + uma_zfree_internal(keg->uk_slabzone, slab, NULL, 0); #ifdef UMA_DEBUG printf("%s: Returning %d bytes.\n", - zone->uz_name, UMA_SLAB_SIZE * zone->uz_ppera); + zone->uz_name, UMA_SLAB_SIZE * keg->uk_ppera); #endif - zone->uz_freef(mem, UMA_SLAB_SIZE * zone->uz_ppera, flags); + 
keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags); } - } /* @@ -728,20 +751,23 @@ zone_drain(uma_zone_t zone) static uma_slab_t slab_zalloc(uma_zone_t zone, int wait) { - uma_slab_t slab; /* Starting slab */ + uma_slabrefcnt_t slabref; + uma_slab_t slab; + uma_keg_t keg; u_int8_t *mem; u_int8_t flags; int i; slab = NULL; + keg = zone->uz_keg; #ifdef UMA_DEBUG printf("slab_zalloc: Allocating a new slab for %s\n", zone->uz_name); #endif ZONE_UNLOCK(zone); - if (zone->uz_flags & UMA_ZONE_OFFPAGE) { - slab = uma_zalloc_internal(slabzone, NULL, wait); + if (keg->uk_flags & UMA_ZONE_OFFPAGE) { + slab = uma_zalloc_internal(keg->uk_slabzone, NULL, wait); if (slab == NULL) { ZONE_LOCK(zone); return NULL; @@ -755,12 +781,12 @@ slab_zalloc(uma_zone_t zone, int wait) * Malloced items are zeroed in uma_zalloc. */ - if ((zone->uz_flags & UMA_ZONE_MALLOC) == 0) + if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) wait |= M_ZERO; else wait &= ~M_ZERO; - mem = zone->uz_allocf(zone, zone->uz_ppera * UMA_SLAB_SIZE, + mem = keg->uk_allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE, &flags, wait); if (mem == NULL) { ZONE_LOCK(zone); @@ -768,32 +794,39 @@ slab_zalloc(uma_zone_t zone, int wait) } /* Point the slab into the allocated memory */ - if (!(zone->uz_flags & UMA_ZONE_OFFPAGE)) - slab = (uma_slab_t )(mem + zone->uz_pgoff); + if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) + slab = (uma_slab_t )(mem + keg->uk_pgoff); - if (zone->uz_flags & UMA_ZONE_MALLOC) - for (i = 0; i < zone->uz_ppera; i++) + if ((keg->uk_flags & UMA_ZONE_MALLOC) || + (keg->uk_flags & UMA_ZONE_REFCNT)) + for (i = 0; i < keg->uk_ppera; i++) vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab); - slab->us_zone = zone; + slab->us_keg = keg; slab->us_data = mem; - slab->us_freecount = zone->uz_ipers; + slab->us_freecount = keg->uk_ipers; slab->us_firstfree = 0; slab->us_flags = flags; - for (i = 0; i < zone->uz_ipers; i++) - slab->us_freelist[i] = i+1; + for (i = 0; i < keg->uk_ipers; i++) + slab->us_freelist[i].us_item = i+1; - if (zone->uz_init) - for (i = 0; i < zone->uz_ipers; i++) - zone->uz_init(slab->us_data + (zone->uz_rsize * i), - zone->uz_size); + if (keg->uk_flags & UMA_ZONE_REFCNT) { + slabref = (uma_slabrefcnt_t)slab; + for (i = 0; i < keg->uk_ipers; i++) + slabref->us_freelist[i].us_refcnt = 0; + } + + if (keg->uk_init) + for (i = 0; i < keg->uk_ipers; i++) + keg->uk_init(slab->us_data + (keg->uk_rsize * i), + keg->uk_size); ZONE_LOCK(zone); - if (zone->uz_flags & UMA_ZONE_HASH) - UMA_HASH_INSERT(&zone->uz_hash, slab, mem); + if (keg->uk_flags & UMA_ZONE_HASH) + UMA_HASH_INSERT(&keg->uk_hash, slab, mem); - zone->uz_pages += zone->uz_ppera; - zone->uz_free += zone->uz_ipers; + keg->uk_pages += keg->uk_ppera; + keg->uk_free += keg->uk_ipers; return (slab); } @@ -806,6 +839,10 @@ slab_zalloc(uma_zone_t zone, int wait) static void * startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait) { + uma_keg_t keg; + + keg = zone->uz_keg; + /* * Check our small startup cache to see if it has pages remaining. */ @@ -827,11 +864,11 @@ startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait) * Now that we've booted reset these users to their real allocator. 
*/ #ifdef UMA_MD_SMALL_ALLOC - zone->uz_allocf = uma_small_alloc; + keg->uk_allocf = uma_small_alloc; #else - zone->uz_allocf = page_alloc; + keg->uk_allocf = page_alloc; #endif - return zone->uz_allocf(zone, bytes, pflag, wait); + return keg->uk_allocf(zone, bytes, pflag, wait); } /* @@ -877,7 +914,7 @@ obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) vm_page_t p; int pages, startpages; - object = zone->uz_obj; + object = zone->uz_keg->uk_obj; retkva = 0; /* @@ -887,7 +924,7 @@ obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) p = TAILQ_LAST(&object->memq, pglist); pages = p != NULL ? p->pindex + 1 : 0; startpages = pages; - zkva = zone->uz_kva + pages * PAGE_SIZE; + zkva = zone->uz_keg->uk_kva + pages * PAGE_SIZE; for (; bytes > 0; bytes -= PAGE_SIZE) { p = vm_page_alloc(object, pages, VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED); @@ -965,29 +1002,33 @@ zero_init(void *mem, int size) static void zone_small_init(uma_zone_t zone) { + uma_keg_t keg; int rsize; int memused; int ipers; - rsize = zone->uz_size; + keg = zone->uz_keg; + KASSERT(keg != NULL, ("Keg is null in zone_small_init")); + rsize = keg->uk_size; if (rsize < UMA_SMALLEST_UNIT) rsize = UMA_SMALLEST_UNIT; - if (rsize & zone->uz_align) - rsize = (rsize & ~zone->uz_align) + (zone->uz_align + 1); + if (rsize & keg->uk_align) + rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1); - zone->uz_rsize = rsize; + keg->uk_rsize = rsize; rsize += 1; /* Account for the byte of linkage */ - zone->uz_ipers = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) / rsize; - zone->uz_ppera = 1; + keg->uk_ipers = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) / rsize; + keg->uk_ppera = 1; - KASSERT(zone->uz_ipers != 0, ("zone_small_init: ipers is 0, uh-oh!")); - memused = zone->uz_ipers * zone->uz_rsize; + KASSERT(keg->uk_ipers != 0, ("zone_small_init: ipers is 0, uh-oh!")); + memused = keg->uk_ipers * keg->uk_rsize; /* Can we do any better? */ - if ((UMA_SLAB_SIZE - memused) >= UMA_MAX_WASTE) { + if ((keg->uk_flags & UMA_ZONE_REFCNT) || + ((UMA_SLAB_SIZE - memused) >= UMA_MAX_WASTE)) { /* * We can't do this if we're internal or if we've been * asked to not go to the VM for buckets. If we do this we @@ -995,15 +1036,16 @@ zone_small_init(uma_zone_t zone) * do not want to do if we're UMA_ZFLAG_CACHEONLY as a * result of UMA_ZONE_VM, which clearly forbids it. 
*/ - if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) || - (zone->uz_flags & UMA_ZFLAG_CACHEONLY)) + if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) || + (keg->uk_flags & UMA_ZFLAG_CACHEONLY)) return; - ipers = UMA_SLAB_SIZE / zone->uz_rsize; - if (ipers > zone->uz_ipers) { - zone->uz_flags |= UMA_ZONE_OFFPAGE; - if ((zone->uz_flags & UMA_ZONE_MALLOC) == 0) - zone->uz_flags |= UMA_ZONE_HASH; - zone->uz_ipers = ipers; + ipers = UMA_SLAB_SIZE / keg->uk_rsize; + if ((keg->uk_flags & UMA_ZONE_REFCNT) || + (ipers > keg->uk_ipers)) { + keg->uk_flags |= UMA_ZONE_OFFPAGE; + if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) + keg->uk_flags |= UMA_ZONE_HASH; + keg->uk_ipers = ipers; } } } @@ -1022,33 +1064,156 @@ zone_small_init(uma_zone_t zone) static void zone_large_init(uma_zone_t zone) { + uma_keg_t keg; int pages; - KASSERT((zone->uz_flags & UMA_ZFLAG_CACHEONLY) == 0, + keg = zone->uz_keg; + + KASSERT(keg != NULL, ("Keg is null in zone_large_init")); + KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0, ("zone_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY zone")); - pages = zone->uz_size / UMA_SLAB_SIZE; + pages = keg->uk_size / UMA_SLAB_SIZE; /* Account for remainder */ - if ((pages * UMA_SLAB_SIZE) < zone->uz_size) + if ((pages * UMA_SLAB_SIZE) < keg->uk_size) pages++; - zone->uz_ppera = pages; - zone->uz_ipers = 1; + keg->uk_ppera = pages; + keg->uk_ipers = 1; - zone->uz_flags |= UMA_ZONE_OFFPAGE; - if ((zone->uz_flags & UMA_ZONE_MALLOC) == 0) - zone->uz_flags |= UMA_ZONE_HASH; + keg->uk_flags |= UMA_ZONE_OFFPAGE; + if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) + keg->uk_flags |= UMA_ZONE_HASH; - zone->uz_rsize = zone->uz_size; + keg->uk_rsize = keg->uk_size; } /* - * Zone header ctor. This initializes all fields, locks, etc. And inserts - * the zone onto the global zone list. + * Keg header ctor. This initializes all fields, locks, etc. And inserts + * the keg onto the global keg list. * * Arguments/Returns follow uma_ctor specifications - * udata Actually uma_zcreat_args + * udata Actually uma_kctor_args + */ +static void +keg_ctor(void *mem, int size, void *udata) +{ + struct uma_kctor_args *arg = udata; + uma_keg_t keg = mem; + uma_zone_t zone; + + bzero(keg, size); + keg->uk_size = arg->size; + keg->uk_init = arg->uminit; + keg->uk_fini = arg->fini; + keg->uk_align = arg->align; + keg->uk_free = 0; + keg->uk_pages = 0; + keg->uk_flags = arg->flags; + keg->uk_allocf = page_alloc; + keg->uk_freef = page_free; + keg->uk_recurse = 0; + keg->uk_slabzone = NULL; + + /* + * The master zone is passed to us at keg-creation time. + */ + zone = arg->zone; + zone->uz_keg = keg; + + if (arg->flags & UMA_ZONE_VM) + keg->uk_flags |= UMA_ZFLAG_CACHEONLY; + + if (arg->flags & UMA_ZONE_ZINIT) + keg->uk_init = zero_init; + + /* + * The +1 byte added to uk_size is to account for the byte of + * linkage that is added to the size in zone_small_init(). If + * we don't account for this here then we may end up in + * zone_small_init() with a calculated 'ipers' of 0. + */ + if ((keg->uk_size+1) > (UMA_SLAB_SIZE - sizeof(struct uma_slab))) + zone_large_init(zone); + else + zone_small_init(zone); + + if (keg->uk_flags & UMA_ZONE_REFCNT) + keg->uk_slabzone = slabrefzone; + else if (keg->uk_flags & UMA_ZONE_OFFPAGE) + keg->uk_slabzone = slabzone; + + /* + * If we haven't booted yet we need allocations to go through the + * startup cache until the vm is ready. 
+ */ + if (keg->uk_ppera == 1) { +#ifdef UMA_MD_SMALL_ALLOC + keg->uk_allocf = uma_small_alloc; + keg->uk_freef = uma_small_free; +#endif + if (booted == 0) + keg->uk_allocf = startup_alloc; + } + + /* + * Initialize keg's lock (shared among zones) through + * Master zone + */ + zone->uz_lock = &keg->uk_lock; + if (arg->flags & UMA_ZONE_MTXCLASS) + ZONE_LOCK_INIT(zone, 1); + else + ZONE_LOCK_INIT(zone, 0); + + /* + * If we're putting the slab header in the actual page we need to + * figure out where in each page it goes. This calculates a right + * justified offset into the memory on an ALIGN_PTR boundary. + */ + if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) { + int totsize; + + /* Size of the slab struct and free list */ + totsize = sizeof(struct uma_slab) + keg->uk_ipers; + if (totsize & UMA_ALIGN_PTR) + totsize = (totsize & ~UMA_ALIGN_PTR) + + (UMA_ALIGN_PTR + 1); + keg->uk_pgoff = UMA_SLAB_SIZE - totsize; + totsize = keg->uk_pgoff + sizeof(struct uma_slab) + + keg->uk_ipers; + /* I don't think it's possible, but I'll make sure anyway */ + if (totsize > UMA_SLAB_SIZE) { + printf("zone %s ipers %d rsize %d size %d\n", + zone->uz_name, keg->uk_ipers, keg->uk_rsize, + keg->uk_size); + panic("UMA slab won't fit.\n"); + } + } + + if (keg->uk_flags & UMA_ZONE_HASH) + hash_alloc(&keg->uk_hash); + +#ifdef UMA_DEBUG + printf("%s(%p) size = %d ipers = %d ppera = %d pgoff = %d\n", + zone->uz_name, zone, + keg->uk_size, keg->uk_ipers, + keg->uk_ppera, keg->uk_pgoff); +#endif + + LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link); + + mtx_lock(&uma_mtx); + LIST_INSERT_HEAD(&uma_kegs, keg, uk_link); + mtx_unlock(&uma_mtx); +} + +/* + * Zone header ctor. This initializes all fields, locks, etc. + * + * Arguments/Returns follow uma_ctor specifications + * udata Actually uma_zctor_args */ static void @@ -1056,145 +1221,141 @@ zone_ctor(void *mem, int size, void *udata) { struct uma_zctor_args *arg = udata; uma_zone_t zone = mem; - int privlc; + uma_zone_t z; + uma_keg_t keg; bzero(zone, size); zone->uz_name = arg->name; - zone->uz_size = arg->size; zone->uz_ctor = arg->ctor; zone->uz_dtor = arg->dtor; - zone->uz_init = arg->uminit; - zone->uz_fini = arg->fini; - zone->uz_align = arg->align; - zone->uz_free = 0; - zone->uz_pages = 0; - zone->uz_flags = arg->flags; - zone->uz_allocf = page_alloc; - zone->uz_freef = page_free; + zone->uz_init = NULL; + zone->uz_fini = NULL; + zone->uz_allocs = 0; + zone->uz_fills = zone->uz_count = 0; - if (arg->flags & UMA_ZONE_ZINIT) - zone->uz_init = zero_init; - - if (arg->flags & UMA_ZONE_VM) - zone->uz_flags |= UMA_ZFLAG_CACHEONLY; - - /* - * XXX: - * The +1 byte added to uz_size is to account for the byte of - * linkage that is added to the size in zone_small_init(). If - * we don't account for this here then we may end up in - * zone_small_init() with a calculated 'ipers' of 0. - */ - if ((zone->uz_size+1) > (UMA_SLAB_SIZE - sizeof(struct uma_slab))) - zone_large_init(zone); - else - zone_small_init(zone); - /* - * If we haven't booted yet we need allocations to go through the - * startup cache until the vm is ready. - */ - if (zone->uz_ppera == 1) { -#ifdef UMA_MD_SMALL_ALLOC - zone->uz_allocf = uma_small_alloc; - zone->uz_freef = uma_small_free; -#endif - if (booted == 0) - zone->uz_allocf = startup_alloc; - } - if (arg->flags & UMA_ZONE_MTXCLASS) - privlc = 1; - else - privlc = 0; - - /* - * If we're putting the slab header in the actual page we need to - * figure out where in each page it goes. 
This calculates a right - * justified offset into the memory on an ALIGN_PTR boundary. - */ - if (!(zone->uz_flags & UMA_ZONE_OFFPAGE)) { - int totsize; - - /* Size of the slab struct and free list */ - totsize = sizeof(struct uma_slab) + zone->uz_ipers; - if (totsize & UMA_ALIGN_PTR) - totsize = (totsize & ~UMA_ALIGN_PTR) + - (UMA_ALIGN_PTR + 1); - zone->uz_pgoff = UMA_SLAB_SIZE - totsize; - totsize = zone->uz_pgoff + sizeof(struct uma_slab) - + zone->uz_ipers; - /* I don't think it's possible, but I'll make sure anyway */ - if (totsize > UMA_SLAB_SIZE) { - printf("zone %s ipers %d rsize %d size %d\n", - zone->uz_name, zone->uz_ipers, zone->uz_rsize, - zone->uz_size); - panic("UMA slab won't fit.\n"); + if (arg->flags & UMA_ZONE_SECONDARY) { + KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg")); + keg = arg->keg; + zone->uz_keg = keg; + zone->uz_init = arg->uminit; + zone->uz_fini = arg->fini; + zone->uz_lock = &keg->uk_lock; + mtx_lock(&uma_mtx); + ZONE_LOCK(zone); + keg->uk_flags |= UMA_ZONE_SECONDARY; + LIST_FOREACH(z, &keg->uk_zones, uz_link) { + if (LIST_NEXT(z, uz_link) == NULL) { + LIST_INSERT_AFTER(z, zone, uz_link); + break; + } } + ZONE_UNLOCK(zone); + mtx_unlock(&uma_mtx); + } else if (arg->keg == NULL) { + uma_kcreate(zone, arg->size, arg->uminit, arg->fini, + arg->align, arg->flags); + } else { + struct uma_kctor_args karg; + + /* We should only be here from uma_startup() */ + karg.size = arg->size; + karg.uminit = arg->uminit; + karg.fini = arg->fini; + karg.align = arg->align; + karg.flags = arg->flags; + karg.zone = zone; + keg_ctor(arg->keg, sizeof(struct uma_keg), &karg); } - - if (zone->uz_flags & UMA_ZONE_HASH) - hash_alloc(&zone->uz_hash); - -#ifdef UMA_DEBUG - printf("%s(%p) size = %d ipers = %d ppera = %d pgoff = %d\n", - zone->uz_name, zone, - zone->uz_size, zone->uz_ipers, - zone->uz_ppera, zone->uz_pgoff); -#endif - ZONE_LOCK_INIT(zone, privlc); - - mtx_lock(&uma_mtx); - LIST_INSERT_HEAD(&uma_zones, zone, uz_link); - mtx_unlock(&uma_mtx); + keg = zone->uz_keg; + zone->uz_lock = &keg->uk_lock; /* * Some internal zones don't have room allocated for the per cpu * caches. If we're internal, bail out here. */ - if (zone->uz_flags & UMA_ZFLAG_INTERNAL) + if (keg->uk_flags & UMA_ZFLAG_INTERNAL) { + KASSERT((keg->uk_flags & UMA_ZONE_SECONDARY) == 0, + ("Secondary zone requested UMA_ZFLAG_INTERNAL")); return; + } - if (zone->uz_ipers <= BUCKET_MAX) - zone->uz_count = zone->uz_ipers; + if (keg->uk_flags & UMA_ZONE_MAXBUCKET) + zone->uz_count = BUCKET_MAX; + else if (keg->uk_ipers <= BUCKET_MAX) + zone->uz_count = keg->uk_ipers; else zone->uz_count = BUCKET_MAX; } /* - * Zone header dtor. This frees all data, destroys locks, frees the hash table - * and removes the zone from the global list. + * Keg header dtor. This frees all data, destroys locks, frees the hash + * table and removes the keg from the global list. * * Arguments/Returns follow uma_dtor specifications * udata unused */ +static void +keg_dtor(void *arg, int size, void *udata) +{ + uma_keg_t keg; + keg = (uma_keg_t)arg; + mtx_lock(&keg->uk_lock); + if (keg->uk_free != 0) { + printf("Freed UMA keg was not empty (%d items). " + " Lost %d pages of memory.\n", + keg->uk_free, keg->uk_pages); + } + mtx_unlock(&keg->uk_lock); + + if (keg->uk_flags & UMA_ZONE_HASH) + hash_free(&keg->uk_hash); + + mtx_destroy(&keg->uk_lock); +} + +/* + * Zone header dtor. 
+ * + * Arguments/Returns follow uma_dtor specifications + * udata unused + */ static void zone_dtor(void *arg, int size, void *udata) { uma_zone_t zone; + uma_keg_t keg; zone = (uma_zone_t)arg; + keg = zone->uz_keg; - if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) + if (!(keg->uk_flags & UMA_ZFLAG_INTERNAL)) cache_drain(zone); + mtx_lock(&uma_mtx); - LIST_REMOVE(zone, uz_link); zone_drain(zone); - mtx_unlock(&uma_mtx); - - ZONE_LOCK(zone); - if (zone->uz_free != 0) { - printf("Zone %s was not empty (%d items). " - " Lost %d pages of memory.\n", - zone->uz_name, zone->uz_free, zone->uz_pages); - uma_print_zone(zone); + if (keg->uk_flags & UMA_ZONE_SECONDARY) { + LIST_REMOVE(zone, uz_link); + /* + * XXX there are some races here where + * the zone can be drained but zone lock + * released and then refilled before we + * remove it... we dont care for now + */ + ZONE_LOCK(zone); + if (LIST_EMPTY(&keg->uk_zones)) + keg->uk_flags &= ~UMA_ZONE_SECONDARY; + ZONE_UNLOCK(zone); + mtx_unlock(&uma_mtx); + } else { + LIST_REMOVE(keg, uk_link); + LIST_REMOVE(zone, uz_link); + mtx_unlock(&uma_mtx); + uma_zfree_internal(kegs, keg, NULL, 0); } - - ZONE_UNLOCK(zone); - if (zone->uz_flags & UMA_ZONE_HASH) - hash_free(&zone->uz_hash); - - ZONE_LOCK_FINI(zone); + zone->uz_keg = NULL; } + /* * Traverses every zone in the system and calls a callback * @@ -1208,11 +1369,14 @@ zone_dtor(void *arg, int size, void *udata) static void zone_foreach(void (*zfunc)(uma_zone_t)) { + uma_keg_t keg; uma_zone_t zone; mtx_lock(&uma_mtx); - LIST_FOREACH(zone, &uma_zones, uz_link) - zfunc(zone); + LIST_FOREACH(keg, &uma_kegs, uk_link) { + LIST_FOREACH(zone, &keg->uk_zones, uz_link) + zfunc(zone); + } mtx_unlock(&uma_mtx); } @@ -1227,25 +1391,23 @@ uma_startup(void *bootmem) int i; #ifdef UMA_DEBUG - printf("Creating uma zone headers zone.\n"); + printf("Creating uma keg headers zone and keg.\n"); #endif mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF); - /* "manually" Create the initial zone */ - args.name = "UMA Zones"; - args.size = sizeof(struct uma_zone) + - (sizeof(struct uma_cache) * (mp_maxid + 1)); - args.ctor = zone_ctor; - args.dtor = zone_dtor; + + /* "manually" create the initial zone */ + args.name = "UMA Kegs"; + args.size = sizeof(struct uma_keg); + args.ctor = keg_ctor; + args.dtor = keg_dtor; args.uminit = zero_init; args.fini = NULL; + args.keg = &masterkeg; args.align = 32 - 1; args.flags = UMA_ZFLAG_INTERNAL; /* The initial zone has no Per cpu queues so it's smaller */ - zone_ctor(zones, sizeof(struct uma_zone), &args); + zone_ctor(kegs, sizeof(struct uma_zone), &args); - /* Initialize the pcpu cache lock set once and for all */ - for (i = 0; i <= mp_maxid; i++) - CPU_LOCK_INIT(i); #ifdef UMA_DEBUG printf("Filling boot free list.\n"); #endif @@ -1258,7 +1420,30 @@ uma_startup(void *bootmem) } #ifdef UMA_DEBUG - printf("Creating slab zone.\n"); + printf("Creating uma zone headers zone and keg.\n"); +#endif + args.name = "UMA Zones"; + args.size = sizeof(struct uma_zone) + + (sizeof(struct uma_cache) * (mp_maxid + 1)); + args.ctor = zone_ctor; + args.dtor = zone_dtor; + args.uminit = zero_init; + args.fini = NULL; + args.keg = NULL; + args.align = 32 - 1; + args.flags = UMA_ZFLAG_INTERNAL; + /* The initial zone has no Per cpu queues so it's smaller */ + zone_ctor(zones, sizeof(struct uma_zone), &args); + +#ifdef UMA_DEBUG + printf("Initializing pcpu cache locks.\n"); +#endif + /* Initialize the pcpu cache lock set once and for all */ + for (i = 0; i <= mp_maxid; i++) + CPU_LOCK_INIT(i); + +#ifdef UMA_DEBUG + 
printf("Creating slab and hash zones.\n"); #endif /* @@ -1276,6 +1461,20 @@ uma_startup(void *bootmem) NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); + /* + * We also create a zone for the bigger slabs with reference + * counts in them, to accomodate UMA_ZONE_REFCNT zones. + */ + slabsize = UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt); + slabsize /= UMA_MAX_WASTE; + slabsize++; + slabsize += 4 * slabsize; + slabsize += sizeof(struct uma_slab_refcnt); + slabrefzone = uma_zcreate("UMA RCntSlabs", + slabsize, + NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); + hashzone = uma_zcreate("UMA Hash", sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT, NULL, NULL, NULL, NULL, @@ -1321,6 +1520,21 @@ uma_startup3(void) #endif } +static void +uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, + int align, u_int16_t flags) +{ + struct uma_kctor_args args; + + args.size = size; + args.uminit = uminit; + args.fini = fini; + args.align = align; + args.flags = flags; + args.zone = zone; + zone = uma_zalloc_internal(kegs, &args, M_WAITOK); +} + /* See uma.h */ uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor, @@ -1338,6 +1552,27 @@ uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor, args.fini = fini; args.align = align; args.flags = flags; + args.keg = NULL; + + return (uma_zalloc_internal(zones, &args, M_WAITOK)); +} + +/* See uma.h */ +uma_zone_t +uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, + uma_init zinit, uma_fini zfini, uma_zone_t master) +{ + struct uma_zctor_args args; + + args.name = name; + args.size = master->uz_keg->uk_size; + args.ctor = ctor; + args.dtor = dtor; + args.uminit = zinit; + args.fini = zfini; + args.align = master->uz_keg->uk_align; + args.flags = master->uz_keg->uk_flags | UMA_ZONE_SECONDARY; + args.keg = master->uz_keg; return (uma_zalloc_internal(zones, &args, M_WAITOK)); } @@ -1357,35 +1592,25 @@ uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) uma_cache_t cache; uma_bucket_t bucket; int cpu; + int badness = 1; /* This is the fast path allocation */ #ifdef UMA_DEBUG_ALLOC_1 printf("Allocating one item from %s(%p)\n", zone->uz_name, zone); #endif -#ifdef INVARIANTS - /* - * To make sure that WAITOK or NOWAIT is set, but not more than - * one, and check against the API botches that are common. - * The uma code implies M_WAITOK if M_NOWAIT is not set, so - * we default to waiting if none of the flags is set. 
- */ - cpu = flags & (M_WAITOK | M_NOWAIT | M_DONTWAIT | M_TRYWAIT); - if (cpu != M_NOWAIT && cpu != M_WAITOK) { - static struct timeval lasterr; - static int curerr, once; - if (once == 0 && ppsratecheck(&lasterr, &curerr, 1)) { - printf("Bad uma_zalloc flags: %x\n", cpu); - backtrace(); - once++; - } - } -#endif if (!(flags & M_NOWAIT)) { KASSERT(curthread->td_intr_nesting_level == 0, ("malloc(M_WAITOK) in interrupt context")); - WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, - "malloc() of \"%s\"", zone->uz_name); +#ifdef WITNESS + badness = WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, + "malloc(M_WAITOK) of \"%s\", forcing M_NOWAIT", + zone->uz_name); +#endif + if (badness) { + flags &= ~M_WAITOK; + flags |= M_NOWAIT; + } } zalloc_restart: @@ -1413,9 +1638,9 @@ uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) #endif CPU_UNLOCK(cpu); if (zone->uz_ctor) - zone->uz_ctor(item, zone->uz_size, udata); + zone->uz_ctor(item,zone->uz_keg->uk_size,udata); if (flags & M_ZERO) - bzero(item, zone->uz_size); + bzero(item, zone->uz_keg->uk_size); return (item); } else if (cache->uc_freebucket) { /* @@ -1465,6 +1690,7 @@ uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) /* Bump up our uz_count so we get here less */ if (zone->uz_count < BUCKET_MAX) zone->uz_count++; + /* * Now lets just fill a bucket and put it on the free list. If that * works we'll restart the allocation from the begining. @@ -1488,6 +1714,9 @@ static uma_slab_t uma_zone_slab(uma_zone_t zone, int flags) { uma_slab_t slab; + uma_keg_t keg; + + keg = zone->uz_keg; /* * This is to prevent us from recursively trying to allocate @@ -1498,7 +1727,7 @@ uma_zone_slab(uma_zone_t zone, int flags) * things happen. So instead we return a NULL bucket, and make * the code that allocates buckets smart enough to deal with it */ - if (zone->uz_flags & UMA_ZFLAG_INTERNAL && zone->uz_recurse != 0) + if (keg->uk_flags & UMA_ZFLAG_INTERNAL && keg->uk_recurse != 0) return (NULL); slab = NULL; @@ -1509,14 +1738,14 @@ uma_zone_slab(uma_zone_t zone, int flags) * used over those that are totally full. This helps to reduce * fragmentation. */ - if (zone->uz_free != 0) { - if (!LIST_EMPTY(&zone->uz_part_slab)) { - slab = LIST_FIRST(&zone->uz_part_slab); + if (keg->uk_free != 0) { + if (!LIST_EMPTY(&keg->uk_part_slab)) { + slab = LIST_FIRST(&keg->uk_part_slab); } else { - slab = LIST_FIRST(&zone->uz_free_slab); + slab = LIST_FIRST(&keg->uk_free_slab); LIST_REMOVE(slab, us_link); - LIST_INSERT_HEAD(&zone->uz_part_slab, slab, - us_link); + LIST_INSERT_HEAD(&keg->uk_part_slab, slab, + us_link); } return (slab); } @@ -1527,27 +1756,28 @@ uma_zone_slab(uma_zone_t zone, int flags) if (flags & M_NOVM) break; - if (zone->uz_maxpages && - zone->uz_pages >= zone->uz_maxpages) { - zone->uz_flags |= UMA_ZFLAG_FULL; + if (keg->uk_maxpages && + keg->uk_pages >= keg->uk_maxpages) { + keg->uk_flags |= UMA_ZFLAG_FULL; if (flags & M_NOWAIT) break; else - msleep(zone, &zone->uz_lock, PVM, + msleep(keg, &keg->uk_lock, PVM, "zonelimit", 0); continue; } - zone->uz_recurse++; + keg->uk_recurse++; slab = slab_zalloc(zone, flags); - zone->uz_recurse--; + keg->uk_recurse--; + /* * If we got a slab here it's safe to mark it partially used * and return. We assume that the caller is going to remove * at least one item. 
*/ if (slab) { - LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link); + LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); return (slab); } /* @@ -1564,22 +1794,25 @@ uma_zone_slab(uma_zone_t zone, int flags) static void * uma_slab_alloc(uma_zone_t zone, uma_slab_t slab) { + uma_keg_t keg; void *item; u_int8_t freei; + keg = zone->uz_keg; + freei = slab->us_firstfree; - slab->us_firstfree = slab->us_freelist[freei]; - item = slab->us_data + (zone->uz_rsize * freei); + slab->us_firstfree = slab->us_freelist[freei].us_item; + item = slab->us_data + (keg->uk_rsize * freei); slab->us_freecount--; - zone->uz_free--; + keg->uk_free--; #ifdef INVARIANTS uma_dbg_alloc(zone, slab, item); #endif /* Move this slab to the full list */ if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); - LIST_INSERT_HEAD(&zone->uz_full_slab, slab, us_link); + LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link); } return (item); @@ -1590,6 +1823,7 @@ uma_zalloc_bucket(uma_zone_t zone, int flags) { uma_bucket_t bucket; uma_slab_t slab; + int16_t saved; int max; /* @@ -1603,7 +1837,7 @@ uma_zalloc_bucket(uma_zone_t zone, int flags) int bflags; bflags = (flags & ~M_ZERO); - if (zone->uz_flags & UMA_ZFLAG_CACHEONLY) + if (zone->uz_keg->uk_flags & UMA_ZFLAG_CACHEONLY) bflags |= M_NOVM; ZONE_UNLOCK(zone); @@ -1628,18 +1862,36 @@ uma_zalloc_bucket(uma_zone_t zone, int flags) max = MIN(bucket->ub_entries, zone->uz_count); /* Try to keep the buckets totally full */ + saved = bucket->ub_cnt; while (bucket->ub_cnt < max && (slab = uma_zone_slab(zone, flags)) != NULL) { while (slab->us_freecount && bucket->ub_cnt < max) { bucket->ub_bucket[bucket->ub_cnt++] = uma_slab_alloc(zone, slab); } + /* Don't block on the next fill */ flags |= M_NOWAIT; } - zone->uz_fills--; + /* + * We unlock here because we need to call the zone's init. + * It should be safe to unlock because the slab dealt with + * above is already on the appropriate list within the keg + * and the bucket we filled is not yet on any list, so we + * own it. + */ + if (zone->uz_init != NULL) { + int i; + ZONE_UNLOCK(zone); + for (i = saved; i < bucket->ub_cnt; i++) + zone->uz_init(bucket->ub_bucket[i], + zone->uz_keg->uk_size); + ZONE_LOCK(zone); + } + + zone->uz_fills--; if (bucket->ub_cnt != 0) { LIST_INSERT_HEAD(&zone->uz_full_bucket, bucket, ub_link); @@ -1668,10 +1920,12 @@ uma_zalloc_bucket(uma_zone_t zone, int flags) static void * uma_zalloc_internal(uma_zone_t zone, void *udata, int flags) { + uma_keg_t keg; uma_slab_t slab; void *item; item = NULL; + keg = zone->uz_keg; #ifdef UMA_DEBUG_ALLOC printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone); @@ -1688,10 +1942,18 @@ uma_zalloc_internal(uma_zone_t zone, void *udata, int flags) ZONE_UNLOCK(zone); + /* + * We have to call both the zone's init (not the keg's init) + * and the zone's ctor. This is because the item is going from + * a keg slab directly to the user, and the user is expecting it + * to be both zone-init'd as well as zone-ctor'd. 
+ */ + if (zone->uz_init != NULL) + zone->uz_init(item, keg->uk_size); if (zone->uz_ctor != NULL) - zone->uz_ctor(item, zone->uz_size, udata); + zone->uz_ctor(item, keg->uk_size, udata); if (flags & M_ZERO) - bzero(item, zone->uz_size); + bzero(item, keg->uk_size); return (item); } @@ -1700,6 +1962,7 @@ uma_zalloc_internal(uma_zone_t zone, void *udata, int flags) void uma_zfree_arg(uma_zone_t zone, void *item, void *udata) { + uma_keg_t keg; uma_cache_t cache; uma_bucket_t bucket; int bflags; @@ -1708,6 +1971,8 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata) /* This is the fast path free */ skip = 0; + keg = zone->uz_keg; + #ifdef UMA_DEBUG_ALLOC_1 printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone); #endif @@ -1716,11 +1981,11 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata) * a little longer for the limits to be reset. */ - if (zone->uz_flags & UMA_ZFLAG_FULL) + if (keg->uk_flags & UMA_ZFLAG_FULL) goto zfree_internal; if (zone->uz_dtor) { - zone->uz_dtor(item, zone->uz_size, udata); + zone->uz_dtor(item, keg->uk_size, udata); skip = 1; } @@ -1745,7 +2010,7 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata) bucket->ub_cnt++; #ifdef INVARIANTS ZONE_LOCK(zone); - if (zone->uz_flags & UMA_ZONE_MALLOC) + if (keg->uk_flags & UMA_ZONE_MALLOC) uma_dbg_free(zone, udata, item); else uma_dbg_free(zone, NULL, item); @@ -1810,7 +2075,7 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata) #endif bflags = M_NOWAIT; - if (zone->uz_flags & UMA_ZFLAG_CACHEONLY) + if (keg->uk_flags & UMA_ZFLAG_CACHEONLY) bflags |= M_NOVM; bucket = bucket_alloc(zone->uz_count, bflags); if (bucket) { @@ -1836,7 +2101,7 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata) */ if (skip) { ZONE_LOCK(zone); - if (zone->uz_flags & UMA_ZONE_MALLOC) + if (keg->uk_flags & UMA_ZONE_MALLOC) uma_dbg_free(zone, udata, item); else uma_dbg_free(zone, NULL, item); @@ -1846,7 +2111,6 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata) uma_zfree_internal(zone, item, udata, skip); return; - } /* @@ -1862,20 +2126,25 @@ static void uma_zfree_internal(uma_zone_t zone, void *item, void *udata, int skip) { uma_slab_t slab; + uma_keg_t keg; u_int8_t *mem; u_int8_t freei; + keg = zone->uz_keg; + if (!skip && zone->uz_dtor) - zone->uz_dtor(item, zone->uz_size, udata); + zone->uz_dtor(item, keg->uk_size, udata); + if (zone->uz_fini) + zone->uz_fini(item, keg->uk_size); ZONE_LOCK(zone); - if (!(zone->uz_flags & UMA_ZONE_MALLOC)) { + if (!(keg->uk_flags & UMA_ZONE_MALLOC)) { mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK)); - if (zone->uz_flags & UMA_ZONE_HASH) - slab = hash_sfind(&zone->uz_hash, mem); + if (keg->uk_flags & UMA_ZONE_HASH) + slab = hash_sfind(&keg->uk_hash, mem); else { - mem += zone->uz_pgoff; + mem += keg->uk_pgoff; slab = (uma_slab_t)mem; } } else { @@ -1883,36 +2152,36 @@ uma_zfree_internal(uma_zone_t zone, void *item, void *udata, int skip) } /* Do we need to remove from any lists? 
*/ - if (slab->us_freecount+1 == zone->uz_ipers) { + if (slab->us_freecount+1 == keg->uk_ipers) { LIST_REMOVE(slab, us_link); - LIST_INSERT_HEAD(&zone->uz_free_slab, slab, us_link); + LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link); } else if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); - LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link); + LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); } /* Slab management stuff */ freei = ((unsigned long)item - (unsigned long)slab->us_data) - / zone->uz_rsize; + / keg->uk_rsize; #ifdef INVARIANTS if (!skip) uma_dbg_free(zone, slab, item); #endif - slab->us_freelist[freei] = slab->us_firstfree; + slab->us_freelist[freei].us_item = slab->us_firstfree; slab->us_firstfree = freei; slab->us_freecount++; /* Zone statistics */ - zone->uz_free++; + keg->uk_free++; - if (zone->uz_flags & UMA_ZFLAG_FULL) { - if (zone->uz_pages < zone->uz_maxpages) - zone->uz_flags &= ~UMA_ZFLAG_FULL; + if (keg->uk_flags & UMA_ZFLAG_FULL) { + if (keg->uk_pages < keg->uk_maxpages) + keg->uk_flags &= ~UMA_ZFLAG_FULL; /* We can handle one more allocation */ - wakeup_one(zone); + wakeup_one(keg); } ZONE_UNLOCK(zone); @@ -1922,24 +2191,71 @@ uma_zfree_internal(uma_zone_t zone, void *item, void *udata, int skip) void uma_zone_set_max(uma_zone_t zone, int nitems) { + uma_keg_t keg; + + keg = zone->uz_keg; ZONE_LOCK(zone); - if (zone->uz_ppera > 1) - zone->uz_maxpages = nitems * zone->uz_ppera; + if (keg->uk_ppera > 1) + keg->uk_maxpages = nitems * keg->uk_ppera; else - zone->uz_maxpages = nitems / zone->uz_ipers; + keg->uk_maxpages = nitems / keg->uk_ipers; - if (zone->uz_maxpages * zone->uz_ipers < nitems) - zone->uz_maxpages++; + if (keg->uk_maxpages * keg->uk_ipers < nitems) + keg->uk_maxpages++; ZONE_UNLOCK(zone); } +/* See uma.h */ +void +uma_zone_set_init(uma_zone_t zone, uma_init uminit) +{ + ZONE_LOCK(zone); + KASSERT(zone->uz_keg->uk_pages == 0, + ("uma_zone_set_init on non-empty keg")); + zone->uz_keg->uk_init = uminit; + ZONE_UNLOCK(zone); +} + +/* See uma.h */ +void +uma_zone_set_fini(uma_zone_t zone, uma_fini fini) +{ + ZONE_LOCK(zone); + KASSERT(zone->uz_keg->uk_pages == 0, + ("uma_zone_set_fini on non-empty keg")); + zone->uz_keg->uk_fini = fini; + ZONE_UNLOCK(zone); +} + +/* See uma.h */ +void +uma_zone_set_zinit(uma_zone_t zone, uma_init zinit) +{ + ZONE_LOCK(zone); + KASSERT(zone->uz_keg->uk_pages == 0, + ("uma_zone_set_zinit on non-empty keg")); + zone->uz_init = zinit; + ZONE_UNLOCK(zone); +} + +/* See uma.h */ +void +uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini) +{ + ZONE_LOCK(zone); + KASSERT(zone->uz_keg->uk_pages == 0, + ("uma_zone_set_zfini on non-empty keg")); + zone->uz_fini = zfini; + ZONE_UNLOCK(zone); +} + /* See uma.h */ void uma_zone_set_freef(uma_zone_t zone, uma_free freef) { ZONE_LOCK(zone); - zone->uz_freef = freef; + zone->uz_keg->uk_freef = freef; ZONE_UNLOCK(zone); } @@ -1948,8 +2264,8 @@ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf) { ZONE_LOCK(zone); - zone->uz_flags |= UMA_ZFLAG_PRIVALLOC; - zone->uz_allocf = allocf; + zone->uz_keg->uk_flags |= UMA_ZFLAG_PRIVALLOC; + zone->uz_keg->uk_allocf = allocf; ZONE_UNLOCK(zone); } @@ -1957,12 +2273,14 @@ uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf) int uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count) { - int pages; + uma_keg_t keg; vm_offset_t kva; + int pages; - pages = count / zone->uz_ipers; + keg = zone->uz_keg; + pages = count / keg->uk_ipers; - if (pages * zone->uz_ipers < count) + if (pages * keg->uk_ipers < count) pages++; kva 
= kmem_alloc_pageable(kernel_map, pages * UMA_SLAB_SIZE); @@ -1978,11 +2296,11 @@ uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count) pages, obj); } ZONE_LOCK(zone); - zone->uz_kva = kva; - zone->uz_obj = obj; - zone->uz_maxpages = pages; - zone->uz_allocf = obj_alloc; - zone->uz_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC; + keg->uk_kva = kva; + keg->uk_obj = obj; + keg->uk_maxpages = pages; + keg->uk_allocf = obj_alloc; + keg->uk_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC; ZONE_UNLOCK(zone); return (1); } @@ -1993,19 +2311,40 @@ uma_prealloc(uma_zone_t zone, int items) { int slabs; uma_slab_t slab; + uma_keg_t keg; + keg = zone->uz_keg; ZONE_LOCK(zone); - slabs = items / zone->uz_ipers; - if (slabs * zone->uz_ipers < items) + slabs = items / keg->uk_ipers; + if (slabs * keg->uk_ipers < items) slabs++; while (slabs > 0) { slab = slab_zalloc(zone, M_WAITOK); - LIST_INSERT_HEAD(&zone->uz_free_slab, slab, us_link); + LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link); slabs--; } ZONE_UNLOCK(zone); } +/* See uma.h */ +u_int32_t * +uma_find_refcnt(uma_zone_t zone, void *item) +{ + uma_slabrefcnt_t slab; + uma_keg_t keg; + u_int32_t *refcnt; + int idx; + + keg = zone->uz_keg; + slab = (uma_slabrefcnt_t)vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK)); + KASSERT(slab != NULL, + ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT")); + idx = ((unsigned long)item - (unsigned long)slab->us_data) + / keg->uk_rsize; + refcnt = &(slab->us_freelist[idx].us_refcnt); + return refcnt; +} + /* See uma.h */ void uma_reclaim(void) @@ -2021,6 +2360,7 @@ uma_reclaim(void) * zones are drained. We have to do the same for buckets. */ zone_drain(slabzone); + zone_drain(slabrefzone); bucket_zone_drain(); } @@ -2044,7 +2384,6 @@ uma_large_malloc(int size, int wait) uma_zfree_internal(slabzone, slab, NULL, 0); } - return (mem); } @@ -2065,8 +2404,8 @@ uma_print_stats(void) static void slab_print(uma_slab_t slab) { - printf("slab: zone %p, data %p, freecount %d, firstfree %d\n", - slab->us_zone, slab->us_data, slab->us_freecount, + printf("slab: keg %p, data %p, freecount %d, firstfree %d\n", + slab->us_keg, slab->us_data, slab->us_freecount, slab->us_firstfree); } @@ -2084,21 +2423,23 @@ void uma_print_zone(uma_zone_t zone) { uma_cache_t cache; + uma_keg_t keg; uma_slab_t slab; int i; + keg = zone->uz_keg; printf("%s(%p) size %d(%d) flags %d ipers %d ppera %d out %d free %d\n", - zone->uz_name, zone, zone->uz_size, zone->uz_rsize, zone->uz_flags, - zone->uz_ipers, zone->uz_ppera, - (zone->uz_ipers * zone->uz_pages) - zone->uz_free, zone->uz_free); + zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags, + keg->uk_ipers, keg->uk_ppera, + (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free); printf("Part slabs:\n"); - LIST_FOREACH(slab, &zone->uz_part_slab, us_link) + LIST_FOREACH(slab, &keg->uk_part_slab, us_link) slab_print(slab); printf("Free slabs:\n"); - LIST_FOREACH(slab, &zone->uz_free_slab, us_link) + LIST_FOREACH(slab, &keg->uk_free_slab, us_link) slab_print(slab); printf("Full slabs:\n"); - LIST_FOREACH(slab, &zone->uz_full_slab, us_link) + LIST_FOREACH(slab, &keg->uk_full_slab, us_link) slab_print(slab); for (i = 0; i <= mp_maxid; i++) { if (CPU_ABSENT(i)) @@ -2122,6 +2463,7 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS) int totalfree; char *tmpbuf, *offset; uma_zone_t z; + uma_keg_t zk; char *p; int cpu; int cachefree; @@ -2130,8 +2472,10 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS) cnt = 0; mtx_lock(&uma_mtx); - LIST_FOREACH(z, &uma_zones, uz_link) - cnt++; + LIST_FOREACH(zk, 
&uma_kegs, uk_link) { + LIST_FOREACH(z, &zk->uk_zones, uz_link) + cnt++; + } mtx_unlock(&uma_mtx); MALLOC(tmpbuf, char *, (cnt == 0 ? 1 : cnt) * linesize, M_TEMP, M_WAITOK); @@ -2144,10 +2488,11 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS) goto out; offset = tmpbuf; mtx_lock(&uma_mtx); - LIST_FOREACH(z, &uma_zones, uz_link) { + LIST_FOREACH(zk, &uma_kegs, uk_link) { + LIST_FOREACH(z, &zk->uk_zones, uz_link) { if (cnt == 0) /* list may have changed size */ break; - if (!(z->uz_flags & UMA_ZFLAG_INTERNAL)) { + if (!(zk->uk_flags & UMA_ZFLAG_INTERNAL)) { for (cpu = 0; cpu <= mp_maxid; cpu++) { if (CPU_ABSENT(cpu)) continue; @@ -2156,7 +2501,7 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS) } ZONE_LOCK(z); cachefree = 0; - if (!(z->uz_flags & UMA_ZFLAG_INTERNAL)) { + if (!(zk->uk_flags & UMA_ZFLAG_INTERNAL)) { for (cpu = 0; cpu <= mp_maxid; cpu++) { if (CPU_ABSENT(cpu)) continue; @@ -2171,12 +2516,12 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS) LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link) { cachefree += bucket->ub_cnt; } - totalfree = z->uz_free + cachefree; + totalfree = zk->uk_free + cachefree; len = snprintf(offset, linesize, "%-12.12s %6.6u, %8.8u, %6.6u, %6.6u, %8.8llu\n", - z->uz_name, z->uz_size, - z->uz_maxpages * z->uz_ipers, - (z->uz_ipers * (z->uz_pages / z->uz_ppera)) - totalfree, + z->uz_name, zk->uk_size, + zk->uk_maxpages * zk->uk_ipers, + (zk->uk_ipers * (zk->uk_pages / zk->uk_ppera)) - totalfree, totalfree, (unsigned long long)z->uz_allocs); ZONE_UNLOCK(z); @@ -2185,6 +2530,7 @@ sysctl_vm_zone(SYSCTL_HANDLER_ARGS) p[1] = ':'; cnt--; offset += len; + } } mtx_unlock(&uma_mtx); *offset++ = '\0'; diff --git a/sys/vm/uma_dbg.c b/sys/vm/uma_dbg.c index 85d067d543d4..0f845cffa7da 100644 --- a/sys/vm/uma_dbg.c +++ b/sys/vm/uma_dbg.c @@ -192,15 +192,17 @@ static uma_slab_t uma_dbg_getslab(uma_zone_t zone, void *item) { uma_slab_t slab; + uma_keg_t keg; u_int8_t *mem; + keg = zone->uz_keg; mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK)); - if (zone->uz_flags & UMA_ZONE_MALLOC) { + if (keg->uk_flags & UMA_ZONE_MALLOC) { slab = vtoslab((vm_offset_t)mem); - } else if (zone->uz_flags & UMA_ZONE_HASH) { - slab = hash_sfind(&zone->uz_hash, mem); + } else if (keg->uk_flags & UMA_ZONE_HASH) { + slab = hash_sfind(&keg->uk_hash, mem); } else { - mem += zone->uz_pgoff; + mem += keg->uk_pgoff; slab = (uma_slab_t)mem; } @@ -215,8 +217,10 @@ uma_dbg_getslab(uma_zone_t zone, void *item) void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item) { + uma_keg_t keg; int freei; + keg = zone->uz_keg; if (slab == NULL) { slab = uma_dbg_getslab(zone, item); if (slab == NULL) @@ -225,9 +229,9 @@ uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item) } freei = ((unsigned long)item - (unsigned long)slab->us_data) - / zone->uz_rsize; + / keg->uk_rsize; - slab->us_freelist[freei] = 255; + slab->us_freelist[freei].us_item = 255; return; } @@ -241,8 +245,10 @@ uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item) void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item) { + uma_keg_t keg; int freei; + keg = zone->uz_keg; if (slab == NULL) { slab = uma_dbg_getslab(zone, item); if (slab == NULL) @@ -251,22 +257,22 @@ uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item) } freei = ((unsigned long)item - (unsigned long)slab->us_data) - / zone->uz_rsize; + / keg->uk_rsize; - if (freei >= zone->uz_ipers) + if (freei >= keg->uk_ipers) panic("zone: %s(%p) slab %p freelist %d out of range 0-%d\n", - zone->uz_name, zone, slab, freei, zone->uz_ipers-1); + zone->uz_name, zone, slab, freei, 
keg->uk_ipers-1); - if (((freei * zone->uz_rsize) + slab->us_data) != item) { + if (((freei * keg->uk_rsize) + slab->us_data) != item) { printf("zone: %s(%p) slab %p freed address %p unaligned.\n", zone->uz_name, zone, slab, item); panic("should be %p\n", - (freei * zone->uz_rsize) + slab->us_data); + (freei * keg->uk_rsize) + slab->us_data); } - if (slab->us_freelist[freei] != 255) { + if (slab->us_freelist[freei].us_item != 255) { printf("Slab at %p, freei %d = %d.\n", - slab, freei, slab->us_freelist[freei]); + slab, freei, slab->us_freelist[freei].us_item); panic("Duplicate free of item %p from zone %p(%s)\n", item, zone, zone->uz_name); } @@ -276,5 +282,5 @@ uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item) * Until then the count of valid slabs will make sure we don't * accidentally follow this and assume it's a valid index. */ - slab->us_freelist[freei] = 0; + slab->us_freelist[freei].us_item = 0; } diff --git a/sys/vm/uma_int.h b/sys/vm/uma_int.h index 35acfdead8a8..a4cbe5f8e1a8 100644 --- a/sys/vm/uma_int.h +++ b/sys/vm/uma_int.h @@ -35,10 +35,10 @@ /* * Here's a quick description of the relationship between the objects: * - * Zones contain lists of slabs which are stored in either the full bin, empty + * Kegs contain lists of slabs which are stored in either the full bin, empty * bin, or partially allocated bin, to reduce fragmentation. They also contain * the user supplied value for size, which is adjusted for alignment purposes - * and rsize is the result of that. The zone also stores information for + * and rsize is the result of that. The Keg also stores information for * managing a hash of page addresses that maps pages to uma_slab_t structures * for pages that don't have embedded uma_slab_t's. * @@ -67,6 +67,20 @@ * so at this time it may not make sense to optimize for it. This can, of * course, be solved with dynamic slab sizes. * + * Kegs may serve multiple Zones but by far most of the time they only serve + * one. When a Zone is created, a Keg is allocated and setup for it. While + * the backing Keg stores slabs, the Zone caches Buckets of items allocated + * from the slabs. Each Zone is equipped with an init/fini and ctor/dtor + * pair, as well as with its own set of small per-CPU caches, layered above + * the Zone's general Bucket cache. + * + * The PCPU caches are protected by their own locks, while the Zones backed + * by the same Keg all share a common Keg lock (to coalesce contention on + * the backing slabs). The backing Keg typically only serves one Zone but + * in the case of multiple Zones, one of the Zones is considered the + * Master Zone and all Zone-related stats from the Keg are done in the + * Master Zone. For an example of a Multi-Zone setup, refer to the + * Mbuf allocation code. */ /* @@ -134,28 +148,6 @@ SLIST_REMOVE(&(h)->uh_slab_hash[UMA_HASH((h), \ (mem))], (s), uma_slab, us_hlink); -/* Page management structure */ - -/* Sorry for the union, but space efficiency is important */ -struct uma_slab { - uma_zone_t us_zone; /* Zone we live in */ - union { - LIST_ENTRY(uma_slab) _us_link; /* slabs in zone */ - unsigned long _us_size; /* Size of allocation */ - } us_type; - SLIST_ENTRY(uma_slab) us_hlink; /* Link for hash table */ - u_int8_t *us_data; /* First item */ - u_int8_t us_flags; /* Page flags see uma.h */ - u_int8_t us_freecount; /* How many are free? 
*/ - u_int8_t us_firstfree; /* First free item index */ - u_int8_t us_freelist[1]; /* Free List (actually larger) */ -}; - -#define us_link us_type._us_link -#define us_size us_type._us_size - -typedef struct uma_slab * uma_slab_t; - /* Hash table for freed address -> slab translation */ SLIST_HEAD(slabhead, uma_slab); @@ -187,6 +179,97 @@ struct uma_cache { typedef struct uma_cache * uma_cache_t; +/* + * Keg management structure + * + * TODO: Optimize for cache line size + * + */ +struct uma_keg { + LIST_ENTRY(uma_keg) uk_link; /* List of all kegs */ + + struct mtx uk_lock; /* Lock for the keg */ + struct uma_hash uk_hash; + + LIST_HEAD(,uma_zone) uk_zones; /* Keg's zones */ + LIST_HEAD(,uma_slab) uk_part_slab; /* partially allocated slabs */ + LIST_HEAD(,uma_slab) uk_free_slab; /* empty slab list */ + LIST_HEAD(,uma_slab) uk_full_slab; /* full slabs */ + + u_int32_t uk_recurse; /* Allocation recursion count */ + u_int32_t uk_align; /* Alignment mask */ + u_int32_t uk_pages; /* Total page count */ + u_int32_t uk_free; /* Count of items free in slabs */ + u_int32_t uk_size; /* Requested size of each item */ + u_int32_t uk_rsize; /* Real size of each item */ + u_int32_t uk_maxpages; /* Maximum number of pages to alloc */ + + uma_init uk_init; /* Keg's init routine */ + uma_fini uk_fini; /* Keg's fini routine */ + uma_alloc uk_allocf; /* Allocation function */ + uma_free uk_freef; /* Free routine */ + + struct vm_object *uk_obj; /* Zone specific object */ + vm_offset_t uk_kva; /* Base kva for zones with objs */ + uma_zone_t uk_slabzone; /* Slab zone backing us, if OFFPAGE */ + + u_int16_t uk_pgoff; /* Offset to uma_slab struct */ + u_int16_t uk_ppera; /* pages per allocation from backend */ + u_int16_t uk_ipers; /* Items per slab */ + u_int16_t uk_flags; /* Internal flags */ +}; + +/* Simpler reference to uma_keg for internal use. */ +typedef struct uma_keg * uma_keg_t; + +/* Page management structure */ + +/* Sorry for the union, but space efficiency is important */ +struct uma_slab_head { + uma_keg_t us_keg; /* Keg we live in */ + union { + LIST_ENTRY(uma_slab) _us_link; /* slabs in zone */ + unsigned long _us_size; /* Size of allocation */ + } us_type; + SLIST_ENTRY(uma_slab) us_hlink; /* Link for hash table */ + u_int8_t *us_data; /* First item */ + u_int8_t us_flags; /* Page flags see uma.h */ + u_int8_t us_freecount; /* How many are free? */ + u_int8_t us_firstfree; /* First free item index */ +}; + +/* The standard slab structure */ +struct uma_slab { + struct uma_slab_head us_head; /* slab header data */ + struct { + u_int8_t us_item; + } us_freelist[1]; /* actual number bigger */ +}; + +/* + * The slab structure for UMA_ZONE_REFCNT zones for whose items we + * maintain reference counters in the slab for. 
+ */ +struct uma_slab_refcnt { + struct uma_slab_head us_head; /* slab header data */ + struct { + u_int8_t us_item; + u_int32_t us_refcnt; + } us_freelist[1]; /* actual number bigger */ +}; + +#define us_keg us_head.us_keg +#define us_link us_head.us_type._us_link +#define us_size us_head.us_type._us_size +#define us_hlink us_head.us_hlink +#define us_data us_head.us_data +#define us_flags us_head.us_flags +#define us_freecount us_head.us_freecount +#define us_firstfree us_head.us_firstfree + +typedef struct uma_slab * uma_slab_t; +typedef struct uma_slab_refcnt * uma_slabrefcnt_t; + /* * Zone management structure * @@ -195,42 +278,22 @@ typedef struct uma_cache * uma_cache_t; */ struct uma_zone { char *uz_name; /* Text name of the zone */ - LIST_ENTRY(uma_zone) uz_link; /* List of all zones */ - u_int32_t uz_align; /* Alignment mask */ - u_int32_t uz_pages; /* Total page count */ + struct mtx *uz_lock; /* Lock for the zone (keg's lock) */ + uma_keg_t uz_keg; /* Our underlying Keg */ -/* Used during alloc / free */ - struct mtx uz_lock; /* Lock for the zone */ - u_int32_t uz_free; /* Count of items free in slabs */ - u_int16_t uz_ipers; /* Items per slab */ - u_int16_t uz_flags; /* Internal flags */ - - LIST_HEAD(,uma_slab) uz_part_slab; /* partially allocated slabs */ - LIST_HEAD(,uma_slab) uz_free_slab; /* empty slab list */ - LIST_HEAD(,uma_slab) uz_full_slab; /* full slabs */ + LIST_ENTRY(uma_zone) uz_link; /* List of all zones in keg */ LIST_HEAD(,uma_bucket) uz_full_bucket; /* full buckets */ LIST_HEAD(,uma_bucket) uz_free_bucket; /* Buckets for frees */ - u_int32_t uz_size; /* Requested size of each item */ - u_int32_t uz_rsize; /* Real size of each item */ - - struct uma_hash uz_hash; - u_int16_t uz_pgoff; /* Offset to uma_slab struct */ - u_int16_t uz_ppera; /* pages per allocation from backend */ uma_ctor uz_ctor; /* Constructor for each allocation */ uma_dtor uz_dtor; /* Destructor */ - u_int64_t uz_allocs; /* Total number of allocations */ - uma_init uz_init; /* Initializer for each item */ uma_fini uz_fini; /* Discards memory */ - uma_alloc uz_allocf; /* Allocation function */ - uma_free uz_freef; /* Free routine */ - struct vm_object *uz_obj; /* Zone specific object */ - vm_offset_t uz_kva; /* Base kva for zones with objs */ - u_int32_t uz_maxpages; /* Maximum number of pages to alloc */ - int uz_recurse; /* Allocation recursion count */ + + u_int64_t uz_allocs; /* Total number of allocations */ uint16_t uz_fills; /* Outstanding bucket fills */ uint16_t uz_count; /* Highest value ub_ptr can have */ + /* * This HAS to be the last item because we adjust the zone size * based on NCPU and then allocate the space for the zones. 
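/*
 * Illustrative sketch only, not part of this patch: how a consumer
 * might stack a secondary zone on a master zone's keg, in the spirit
 * of the Keg/Zone layering described in the comments above.  Every
 * "foo" identifier and FOO_SIZE below is hypothetical; only the
 * uma_zcreate(), uma_zsecond_create(), uma_zalloc()/uma_zfree() and
 * uma_find_refcnt() interfaces from this change are assumed, and the
 * master zone is assumed to be UMA_ZONE_REFCNT so uma_find_refcnt()
 * can locate the per-item counter in the keg's slab.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <vm/uma.h>

#define FOO_SIZE        256             /* hypothetical item size */

static uma_zone_t foo_zone;             /* master: creates and owns the keg */
static uma_zone_t foo_cooked_zone;      /* secondary: shares the same keg */

/* Zone-level init: runs when an item moves from the keg into this zone. */
static void
foo_cooked_init(void *mem, int size)
{

        bzero(mem, size);
}

static void
foo_zones_init(void)
{

        /* Master zone; UMA_ZONE_REFCNT gives its keg refcounted slabs. */
        foo_zone = uma_zcreate("foo", FOO_SIZE, NULL, NULL, NULL, NULL,
            UMA_ALIGN_PTR, UMA_ZONE_REFCNT);

        /*
         * Secondary zone: same keg and keg lock as the master, but its
         * own bucket caches and its own init layered on top.
         */
        foo_cooked_zone = uma_zsecond_create("foo cooked", NULL, NULL,
            foo_cooked_init, NULL, foo_zone);
}

static void
foo_use_one(void)
{
        u_int32_t *refcnt;
        void *item;

        item = uma_zalloc(foo_cooked_zone, M_NOWAIT);
        if (item == NULL)
                return;
        /* The counter lives in the shared keg's slab, found via the page. */
        refcnt = uma_find_refcnt(foo_cooked_zone, item);
        *refcnt = 1;
        uma_zfree(foo_cooked_zone, item);
}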
@@ -256,16 +319,16 @@ void uma_large_free(uma_slab_t slab); #define ZONE_LOCK_INIT(z, lc) \ do { \ if ((lc)) \ - mtx_init(&(z)->uz_lock, (z)->uz_name, \ + mtx_init((z)->uz_lock, (z)->uz_name, \ (z)->uz_name, MTX_DEF | MTX_DUPOK); \ else \ - mtx_init(&(z)->uz_lock, (z)->uz_name, \ + mtx_init((z)->uz_lock, (z)->uz_name, \ "UMA zone", MTX_DEF | MTX_DUPOK); \ } while (0) -#define ZONE_LOCK_FINI(z) mtx_destroy(&(z)->uz_lock) -#define ZONE_LOCK(z) mtx_lock(&(z)->uz_lock) -#define ZONE_UNLOCK(z) mtx_unlock(&(z)->uz_lock) +#define ZONE_LOCK_FINI(z) mtx_destroy((z)->uz_lock) +#define ZONE_LOCK(z) mtx_lock((z)->uz_lock) +#define ZONE_UNLOCK(z) mtx_unlock((z)->uz_lock) #define CPU_LOCK_INIT(cpu) \ mtx_init(&uma_pcpu_mtx[(cpu)], "UMA pcpu", "UMA pcpu", \ diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index 3e21a991cb2f..f71785f57123 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -320,16 +320,6 @@ kmem_malloc(map, size, flags) vm_map_lock(map); if (vm_map_findspace(map, vm_map_min(map), size, &addr)) { vm_map_unlock(map); - if (map != kmem_map) { - static int last_report; /* when we did it (in ticks) */ - if (ticks < last_report || - (ticks - last_report) >= hz) { - last_report = ticks; - printf("Out of mbuf address space!\n"); - printf("Consider increasing NMBCLUSTERS\n"); - } - return (0); - } if ((flags & M_NOWAIT) == 0) panic("kmem_malloc(%ld): kmem_map too small: %ld total allocated", (long)size, (long)map->size); diff --git a/usr.bin/netstat/main.c b/usr.bin/netstat/main.c index ac9dd26dcc44..8992599d0891 100644 --- a/usr.bin/netstat/main.c +++ b/usr.bin/netstat/main.c @@ -256,7 +256,6 @@ static char *nlistf = NULL, *memf = NULL; int Aflag; /* show addresses of protocol control block */ int aflag; /* show all sockets (including servers) */ int bflag; /* show i/f total bytes in/out */ -int cflag; /* show mbuf cache information */ int dflag; /* show i/f dropped packets */ int gflag; /* show group (multicast) routing or stats */ int iflag; /* show interfaces */ @@ -297,9 +296,6 @@ main(int argc, char *argv[]) case 'b': bflag = 1; break; - case 'c': - cflag = 1; - break; case 'd': dflag = 1; break; @@ -425,10 +421,6 @@ main(int argc, char *argv[]) if (nlistf != NULL || memf != NULL) setgid(getgid()); - if (cflag && !mflag) { - (void)fprintf(stderr, "-c only valid with -m\n"); - usage(); - } if (mflag) { if (memf != NULL) { if (kread(0, 0, 0) == 0) diff --git a/usr.bin/netstat/mbuf.c b/usr.bin/netstat/mbuf.c index aa6a8d2853ec..98546c4a36f2 100644 --- a/usr.bin/netstat/mbuf.c +++ b/usr.bin/netstat/mbuf.c @@ -99,17 +99,12 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr, u_long mbhiaddr, u_long clhiaddr, u_long mbloaddr, u_long clloaddr, u_long cpusaddr __unused, u_long pgsaddr, u_long mbpaddr) { - int i, j, nmbufs, nmbclusters, page_size, num_objs; + int i, nmbclusters; int nsfbufs, nsfbufspeak, nsfbufsused; - u_int mbuf_hiwm, clust_hiwm, mbuf_lowm, clust_lowm; - u_long totspace[2], totused[2]; - u_long gentotnum, gentotfree, totnum, totfree; - u_long totmem, totmemalloced, totmemused; short nmbtypes; size_t mlen; long *mbtypes = NULL; struct mbstat *mbstat = NULL; - struct mbpstat **mbpstat = NULL; struct mbtypenames *mp; bool *seen = NULL; @@ -119,50 +114,12 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr, goto err; } - /* - * XXX: Unfortunately, for the time being, we have to fetch - * the total length of the per-CPU stats area via sysctl - * (regardless of whether we're looking at a core or not. 
- */ - if (sysctlbyname("kern.ipc.mb_statpcpu", NULL, &mlen, NULL, 0) < 0) { - warn("sysctl: retrieving mb_statpcpu len"); - goto err; - } - num_objs = (int)(mlen / sizeof(struct mbpstat)); - if ((mbpstat = calloc(num_objs, sizeof(struct mbpstat *))) == NULL) { - warn("calloc: cannot allocate memory for mbpstats pointers"); - goto err; - } - if ((mbpstat[0] = calloc(num_objs, sizeof(struct mbpstat))) == NULL) { - warn("calloc: cannot allocate memory for mbpstats"); - goto err; - } - if (mbaddr) { - if (kread(mbpaddr, (char *)mbpstat[0], mlen)) - goto err; if (kread(mbaddr, (char *)mbstat, sizeof mbstat)) goto err; if (kread(nmbcaddr, (char *)&nmbclusters, sizeof(int))) goto err; - if (kread(nmbufaddr, (char *)&nmbufs, sizeof(int))) - goto err; - if (kread(mbhiaddr, (char *)&mbuf_hiwm, sizeof(u_int))) - goto err; - if (kread(clhiaddr, (char *)&clust_hiwm, sizeof(u_int))) - goto err; - if (kread(mbloaddr, (char *)&mbuf_lowm, sizeof(u_int))) - goto err; - if (kread(clloaddr, (char *)&clust_lowm, sizeof(u_int))) - goto err; - if (kread(pgsaddr, (char *)&page_size, sizeof(int))) - goto err; } else { - if (sysctlbyname("kern.ipc.mb_statpcpu", mbpstat[0], &mlen, - NULL, 0) < 0) { - warn("sysctl: retrieving mb_statpcpu"); - goto err; - } mlen = sizeof *mbstat; if (sysctlbyname("kern.ipc.mbstat", mbstat, &mlen, NULL, 0) < 0) { @@ -175,43 +132,9 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr, warn("sysctl: retrieving nmbclusters"); goto err; } - mlen = sizeof(int); - if (sysctlbyname("kern.ipc.nmbufs", &nmbufs, &mlen, NULL, 0) - < 0) { - warn("sysctl: retrieving nmbufs"); - goto err; - } - mlen = sizeof(u_int); - if (sysctlbyname("kern.ipc.mbuf_hiwm", &mbuf_hiwm, &mlen, - NULL, 0) < 0) { - warn("sysctl: retrieving mbuf_hiwm"); - goto err; - } - mlen = sizeof(u_int); - if (sysctlbyname("kern.ipc.clust_hiwm", &clust_hiwm, &mlen, - NULL, 0) < 0) { - warn("sysctl: retrieving clust_hiwm"); - goto err; - } - mlen = sizeof(u_int); - if (sysctlbyname("kern.ipc.mbuf_lowm", &mbuf_lowm, &mlen, - NULL, 0) < 0) { - warn("sysctl: retrieving mbuf_lowm"); - goto err; - } - mlen = sizeof(u_int); - if (sysctlbyname("kern.ipc.clust_lowm", &clust_lowm, &mlen, - NULL, 0) < 0) { - warn("sysctl: retrieving clust_lowm"); - goto err; - } - mlen = sizeof(int); - if (sysctlbyname("hw.pagesize", &page_size, &mlen, NULL, 0) - < 0) { - warn("sysctl: retrieving hw.pagesize"); - goto err; - } } + if (mbstat->m_mbufs < 0) mbstat->m_mbufs = 0; /* XXX */ + if (mbstat->m_mclusts < 0) mbstat->m_mclusts = 0; /* XXX */ nmbtypes = mbstat->m_numtypes; if ((seen = calloc(nmbtypes, sizeof(*seen))) == NULL) { @@ -223,59 +146,13 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr, goto err; } - for (i = 0; i < num_objs; i++) - mbpstat[i] = mbpstat[0] + i; - #undef MSIZE #define MSIZE (mbstat->m_msize) #undef MCLBYTES #define MCLBYTES (mbstat->m_mclbytes) -#define GENLST (num_objs - 1) - totnum = mbpstat[GENLST]->mb_mbbucks * mbstat->m_mbperbuck; - totfree = mbpstat[GENLST]->mb_mbfree; - for (j = 1; j < nmbtypes; j++) - mbtypes[j] += mbpstat[GENLST]->mb_mbtypes[j]; - totspace[0] = mbpstat[GENLST]->mb_mbbucks * mbstat->m_mbperbuck * MSIZE; - for (i = 0; i < (num_objs - 1); i++) { - if (mbpstat[i]->mb_active == 0) - continue; - totspace[0] += mbpstat[i]->mb_mbbucks*mbstat->m_mbperbuck*MSIZE; - totnum += mbpstat[i]->mb_mbbucks * mbstat->m_mbperbuck; - totfree += mbpstat[i]->mb_mbfree; - for (j = 1; j < nmbtypes; j++) - mbtypes[j] += mbpstat[i]->mb_mbtypes[j]; - } - totused[0] = 
totnum - totfree; - if (cflag) { - printf("mbuf usage:\n" - "\tTotal:\t\t%lu/%lu/%d (in use/in pool/max)\n", - totused[0], totnum, nmbufs); - gentotnum = mbpstat[GENLST]->mb_mbbucks * mbstat->m_mbperbuck; - gentotfree = mbpstat[GENLST]->mb_mbfree; - printf("\tGEN cache:\t%lu/%lu (in use/in pool)\n", - gentotnum - gentotfree, gentotnum); - } else { - /* XXX: peak is now wrong. */ - printf("%lu/%lu/%d mbufs in use (current/peak/max):\n", - totused[0], totnum, nmbufs); - } + printf("%lu mbufs in use\n", mbstat->m_mbufs); - for (i = 0; cflag && i < (num_objs - 1); i++) { - if (mbpstat[i]->mb_active == 0) - continue; - printf("\tCPU #%d cache:\t%lu/%lu (in use/in pool)\n", - i, - (mbpstat[i]->mb_mbbucks * mbstat->m_mbperbuck - - mbpstat[i]->mb_mbfree), - (mbpstat[i]->mb_mbbucks * mbstat->m_mbperbuck)); - } - if (cflag) { - printf("\tMbuf cache high watermark: %d\n", mbuf_hiwm); -#ifdef NOTYET - printf("\tMbuf cache low watermark: %d\n", mbuf_lowm); -#endif - } for (mp = mbtypenames; mp->mt_name; mp++) { if (mbtypes[mp->mt_type]) { seen[mp->mt_type] = YES; @@ -288,53 +165,10 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr, printf("\t %lu mbufs allocated to \n", mbtypes[i], i); } - if (cflag) - printf("\t%.1f%% of mbuf map consumed\n", - totspace[0] * 100.0 / (nmbufs * MSIZE)); - totnum = mbpstat[GENLST]->mb_clbucks * mbstat->m_clperbuck; - totfree = mbpstat[GENLST]->mb_clfree; - totspace[1] = mbpstat[GENLST]->mb_clbucks*mbstat->m_clperbuck*MCLBYTES; - for (i = 0; i < (num_objs - 1); i++) { - if (mbpstat[i]->mb_active == 0) - continue; - totspace[1] += mbpstat[i]->mb_clbucks * mbstat->m_clperbuck - * MCLBYTES; - totnum += mbpstat[i]->mb_clbucks * mbstat->m_clperbuck; - totfree += mbpstat[i]->mb_clfree; - } - totused[1] = totnum - totfree; - if (cflag) { - printf("mbuf cluster usage:\n" - "\tTotal:\t\t%lu/%lu/%d (in use/in pool/max)\n", - totused[1], totnum, nmbclusters); - gentotnum = mbpstat[GENLST]->mb_clbucks * mbstat->m_clperbuck; - gentotfree = mbpstat[GENLST]->mb_clfree; - printf("\tGEN cache:\t%lu/%lu (in use/in pool)\n", - gentotnum - gentotfree, gentotnum); - } else { - /* XXX: peak is now wrong. 
*/ - printf("%lu/%lu/%d mbuf clusters in use (current/peak/max)\n", - totused[1], totnum, nmbclusters); - } - for (i = 0; cflag && i < (num_objs - 1); i++) { - if (mbpstat[i]->mb_active == 0) - continue; - printf("\tCPU #%d cache:\t%lu/%lu (in use/in pool)\n", - i, - (mbpstat[i]->mb_clbucks * mbstat->m_clperbuck - - mbpstat[i]->mb_clfree), - (mbpstat[i]->mb_clbucks * mbstat->m_clperbuck)); - } - if (cflag) { - printf("\tCluster cache high watermark: %d\n", clust_hiwm); -#ifdef NOTYET - printf("\tCluster cache low watermark: %d\n", clust_lowm); -#endif - } - if (cflag) - printf("\t%.1f%% of cluster map consumed\n", - totspace[1] * 100.0 / (nmbclusters * MCLBYTES)); + printf("%lu/%d mbuf clusters in use (current/max)\n", + mbstat->m_mclusts, nmbclusters); + mlen = sizeof(nsfbufs); if (!sysctlbyname("kern.ipc.nsfbufs", &nsfbufs, &mlen, NULL, 0) && !sysctlbyname("kern.ipc.nsfbufsused", &nsfbufsused, &mlen, NULL, @@ -344,15 +178,8 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr, printf("%d/%d/%d sfbufs in use (current/peak/max)\n", nsfbufsused, nsfbufspeak, nsfbufs); } - totmem = nmbufs * MSIZE + nmbclusters * MCLBYTES; - totmemalloced = totspace[0] + totspace[1]; - totmemused = totused[0] * MSIZE + totused[1] * MCLBYTES; - printf( - "%lu KBytes allocated to network (%.1f%% in use, %.1f%% wired)\n", - totmem / 1024, totmemused * 100.0 / totmem, - totmemalloced * 100.0 / totmem); - printf("%lu requests for memory denied\n", mbstat->m_drops); - printf("%lu requests for memory delayed\n", mbstat->m_wait); + printf("%lu KBytes allocated to network\n", (mbstat->m_mbufs * MSIZE + + mbstat->m_mclusts * MCLBYTES) / 1024); printf("%lu requests for sfbufs denied\n", mbstat->sf_allocfail); printf("%lu requests for sfbufs delayed\n", mbstat->sf_allocwait); printf("%lu requests for I/O initiated by sendfile\n", @@ -366,9 +193,4 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr, free(seen); if (mbstat != NULL) free(mbstat); - if (mbpstat != NULL) { - if (mbpstat[0] != NULL) - free(mbpstat[0]); - free(mbpstat); - } } diff --git a/usr.bin/netstat/netstat.1 b/usr.bin/netstat/netstat.1 index 45023fe8688c..32edfec3e537 100644 --- a/usr.bin/netstat/netstat.1 +++ b/usr.bin/netstat/netstat.1 @@ -181,7 +181,6 @@ or for a single .Bk -words .Nm .Fl m -.Op Fl c .Op Fl M Ar core .Op Fl N Ar system .Ek @@ -189,9 +188,6 @@ or for a single Show statistics recorded by the memory management routines .Pq Xr mbuf 9 . The network manages a private pool of memory buffers. -The -.Fl c -option shows per-CPU statistics for caching. 
.It Xo .Bk -words .Nm diff --git a/usr.bin/netstat/netstat.h b/usr.bin/netstat/netstat.h index c59b7e845ece..e2b3f291a6e5 100644 --- a/usr.bin/netstat/netstat.h +++ b/usr.bin/netstat/netstat.h @@ -39,7 +39,6 @@ extern int Aflag; /* show addresses of protocol control block */ extern int aflag; /* show all sockets (including servers) */ extern int bflag; /* show i/f total bytes in/out */ -extern int cflag; /* show mbuf cache information */ extern int dflag; /* show i/f dropped packets */ extern int gflag; /* show group (multicast) routing or stats */ extern int iflag; /* show interfaces */ diff --git a/usr.bin/systat/mbufs.c b/usr.bin/systat/mbufs.c index e1b665b22827..1193a3ea77fb 100644 --- a/usr.bin/systat/mbufs.c +++ b/usr.bin/systat/mbufs.c @@ -52,12 +52,9 @@ static const char sccsid[] = "@(#)mbufs.c 8.1 (Berkeley) 6/6/93"; #include "systat.h" #include "extern.h" -static struct mbpstat **mbpstat; static struct mbstat *mbstat; -static int num_objs; static long *m_mbtypes; static short nmbtypes; -#define GENLST (num_objs - 1) static struct mtnames { short mt_type; @@ -101,20 +98,11 @@ void showmbufs() { int i, j, max, idx; - u_long totfree; + u_long totmbufs; char buf[10]; const char *mtname; - totfree = mbpstat[GENLST]->mb_mbfree; - for (i = 1; i < nmbtypes; i++) - m_mbtypes[i] += mbpstat[GENLST]->mb_mbtypes[i]; - for (i = 0; i < GENLST; i++) { - if (mbpstat[i]->mb_active == 0) - continue; - totfree += mbpstat[i]->mb_mbfree; - for (j = 1; j < nmbtypes; j++) - m_mbtypes[j] += mbpstat[i]->mb_mbtypes[j]; - } + totmbufs = mbstat->m_mbufs; /* * Print totals for different mbuf types. @@ -159,16 +147,16 @@ showmbufs() /* * Print total number of free mbufs. */ - if (totfree > 0) { - mvwprintw(wnd, 1+j, 0, "%-10.10s", "free"); - if (totfree > 60) { - snprintf(buf, sizeof(buf), " %lu", totfree); - totfree = 60; - while(totfree--) + if (totmbufs > 0) { + mvwprintw(wnd, 1+j, 0, "%-10.10s", "Mbufs"); + if (totmbufs > 60) { + snprintf(buf, sizeof(buf), " %lu", totmbufs); + totmbufs = 60; + while(totmbufs--) waddch(wnd, 'X'); waddstr(wnd, buf); } else { - while(totfree--) + while(totmbufs--) waddch(wnd, 'X'); } wclrtoeol(wnd); @@ -198,23 +186,6 @@ initmbufs() return 0; } - if (sysctlbyname("kern.ipc.mb_statpcpu", NULL, &len, NULL, 0) < 0) { - error("sysctl getting mbpstat total size failed"); - return 0; - } - num_objs = (int)(len / sizeof(struct mbpstat)); - if ((mbpstat = calloc(num_objs, sizeof(struct mbpstat *))) == NULL) { - error("calloc mbpstat pointers failed"); - return 0; - } - if ((mbpstat[0] = calloc(num_objs, sizeof(struct mbpstat))) == NULL) { - error("calloc mbpstat structures failed"); - return 0; - } - - for (i = 0; i < num_objs; i++) - mbpstat[i] = mbpstat[0] + i; - return 1; } @@ -223,7 +194,7 @@ fetchmbufs() { size_t len; - len = num_objs * sizeof(struct mbpstat); - if (sysctlbyname("kern.ipc.mb_statpcpu", mbpstat[0], &len, NULL, 0) < 0) - printw("sysctl: mbpstat: %s", strerror(errno)); + len = sizeof *mbstat; + if (sysctlbyname("kern.ipc.mbstat", mbstat, &len, NULL, 0) < 0) + printw("sysctl: mbstat: %s", strerror(errno)); }
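/*
 * Illustrative userland sketch, not part of this patch: fetch the
 * simplified mbuf statistics the same way the reworked netstat(1) and
 * systat(1) code above does, i.e. one struct mbstat plus the
 * kern.ipc.nmbclusters limit, both read with sysctlbyname(3).  The
 * program name and its minimal error handling are my own choices.
 */
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
        struct mbstat mbstat;
        size_t mlen;
        int nmbclusters;

        mlen = sizeof(mbstat);
        if (sysctlbyname("kern.ipc.mbstat", &mbstat, &mlen, NULL, 0) < 0)
                err(1, "sysctl: retrieving kern.ipc.mbstat");
        mlen = sizeof(nmbclusters);
        if (sysctlbyname("kern.ipc.nmbclusters", &nmbclusters, &mlen,
            NULL, 0) < 0)
                err(1, "sysctl: retrieving kern.ipc.nmbclusters");

        printf("%lu mbufs in use\n", mbstat.m_mbufs);
        printf("%lu/%d mbuf clusters in use (current/max)\n",
            mbstat.m_mclusts, nmbclusters);
        printf("%lu KBytes allocated to network\n",
            (mbstat.m_mbufs * mbstat.m_msize +
            mbstat.m_mclusts * mbstat.m_mclbytes) / 1024);
        return (0);
}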