Bring in mbuma to replace mballoc.

mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.

Extensions to UMA worth noting:
  - Better layering between slab <-> zone caches; introduce
    a Keg structure, which splits the slab cache off from the
    zone structure and allows multiple zones to be stacked
    on top of a single Keg (a single type of slab cache);
    perhaps we should look into defining a subset API on
    top of the Keg for special use by malloc(9),
    for example.
  - UMA_ZONE_REFCNT zones can now be added, and reference
    counters are automagically allocated for them at the end
    of the associated slab structures.  uma_find_refcnt()
    does a kextract to fetch the slab struct reference from
    the underlying page, and looks up the corresponding
    refcnt.  (A usage sketch of both extensions follows
    this list.)
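
A minimal sketch of how a hypothetical consumer could use both
extensions (stacking a secondary zone on a master zone's Keg, and
looking up a per-item refcnt); the "foo" zone names, item sizes,
and foo_* functions are made up for illustration only and are not
part of this commit:

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/malloc.h>
    #include <vm/uma.h>

    static uma_zone_t foo_zone;     /* master zone, gets its own Keg */
    static uma_zone_t foo2_zone;    /* secondary zone, shares foo_zone's Keg */
    static uma_zone_t fooref_zone;  /* refcnts stored at the end of each slab */

    static void
    foo_zones_init(void)
    {
            foo_zone = uma_zcreate("foo", 256, NULL, NULL, NULL, NULL,
                UMA_ALIGN_PTR, 0);
            /* Same Keg (slab cache) as foo_zone, separate bucket caches. */
            foo2_zone = uma_zsecond_create("foo2", NULL, NULL, NULL, NULL,
                foo_zone);
            fooref_zone = uma_zcreate("fooref", 2048, NULL, NULL, NULL, NULL,
                UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
    }

    static void
    fooref_example(void)
    {
            void *item;
            u_int32_t *refcnt;

            item = uma_zalloc(fooref_zone, M_NOWAIT);
            if (item == NULL)
                    return;
            /* Fetch the counter UMA allocated for this item in its slab. */
            refcnt = uma_find_refcnt(fooref_zone, item);
            *refcnt = 1;
            uma_zfree(fooref_zone, item);
    }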

mbuma things worth noting:
  - integrates mbuf & cluster allocations with extended UMA
    and provides caches for commonly-allocated items; defines
    three zones (two primary, one secondary) and two kegs.
  - changes certain code paths that used to do
    m_get() + m_clget() to instead just use m_getcl(), taking
    advantage of the newly defined secondary Packet zone
    (see the before/after sketch following this list).
  - netstat(1) and systat(1) are quickly hacked up to do
    basic stat reporting, but additional stats work needs to
    be done once some other details within UMA have been
    taken care of and it becomes clearer how stats will work
    within the modified framework.
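
As a concrete example of the code-path change above, the old
two-step pattern and its replacement look roughly like this (a
minimal sketch, with the surrounding error handling trimmed):

    struct mbuf *m;

    /* Before: allocate an mbuf, then separately attach a cluster. */
    MGETHDR(m, M_DONTWAIT, MT_DATA);
    if (m != NULL) {
            MCLGET(m, M_DONTWAIT);
            if ((m->m_flags & M_EXT) == 0) {
                    m_free(m);
                    m = NULL;
            }
    }

    /* After: a single allocation served from the secondary Packet zone. */
    m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);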

From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used.  The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
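
For example, the cluster cap can be adjusted (or removed) at boot
time through the usual tunable mechanism, typically via
/boot/loader.conf; the values below are purely illustrative:

    # /boot/loader.conf
    kern.ipc.nmbclusters="0"        # 0 = leave the Cluster zone uncapped
    #kern.ipc.nmbclusters="32768"   # or cap it at 32768 clusters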

Additional things worth noting/known issues (READ):
   - One report of the 'ips' (ServeRAID) driver acting really
     slowly in conjunction with mbuma.  Need more data.
     Latest report is that ips sucks equally with and
     without mbuma.
   - A Giant leak in the NFS code sometimes occurs; I can't
     reproduce it but am currently analyzing it.  brueffer is
     able to reproduce it, but THIS IS NOT an mbuma-specific
     problem and it currently occurs even WITHOUT mbuma.
   - Issues in network locking: there is at least one
     code path in the rip code where one or more locks
     are acquired and we end up in m_prepend() with
     M_WAITOK, which causes WITNESS to whine from within
     UMA.  Current temporary solution: force all UMA
     allocations to be M_NOWAIT from within UMA for now
     to avoid deadlocks unless WITNESS is defined and we
     can determine with certainty that we're not holding
     any locks when we're M_WAITOK.
   - I've seen at least one weird socketbuffer empty-but-
     mbuf-still-attached panic.  I don't believe this
     to be related to mbuma but please keep your eyes
     open, turn on debugging, and capture crash dumps.

This change removes more code than it adds.

A paper is available detailing the change and considering
various performance issues; it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.

Testing and Debugging:
    rwatson,
    brueffer,
    Ketrien I. Saihr-Kesenchedra,
    ...
Reviewed by: Lots of people (for different parts)
Committed by: Bosko Milekic 2004-05-31 21:46:06 +00:00
commit 099a0e588c (parent 251b48a1bb)
Notes: svn2git 2020-12-20 02:59:44 +00:00
       svn path=/head/; revision=129906
23 changed files with 1739 additions and 2375 deletions


@ -1075,6 +1075,7 @@ kern/kern_lock.c standard
kern/kern_lockf.c standard
kern/kern_mac.c standard
kern/kern_malloc.c standard
kern/kern_mbuf.c standard
kern/kern_mib.c standard
kern/kern_module.c standard
kern/kern_mutex.c standard
@ -1116,7 +1117,6 @@ kern/subr_hints.c standard
kern/subr_kobj.c standard
kern/subr_log.c standard
kern/subr_mbpool.c optional libmbpool
kern/subr_mbuf.c standard
kern/subr_mchain.c optional libmchain
kern/subr_module.c standard
kern/subr_msgbuf.c standard


@ -95,6 +95,10 @@ __FBSDID("$FreeBSD$");
#include <i386/isa/isa.h>
#endif
#ifndef NSFBUFS
#define NSFBUFS (512 + maxusers * 16)
#endif
static void cpu_reset_real(void);
#ifdef SMP
static void cpu_reset_proxy(void);
@ -584,6 +588,9 @@ sf_buf_init(void *arg)
vm_offset_t sf_base;
int i;
nsfbufs = NSFBUFS;
TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
sf_buf_active = hashinit(nsfbufs, M_TEMP, &sf_buf_hashmask);
TAILQ_INIT(&sf_buf_freelist);
sf_base = kmem_alloc_nofault(kernel_map, nsfbufs * PAGE_SIZE);


@ -191,6 +191,7 @@ malloc(size, type, flags)
int indx;
caddr_t va;
uma_zone_t zone;
uma_keg_t keg;
#ifdef DIAGNOSTIC
unsigned long osize = size;
#endif
@ -235,6 +236,7 @@ malloc(size, type, flags)
size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
indx = kmemsize[size >> KMEM_ZSHIFT];
zone = kmemzones[indx].kz_zone;
keg = zone->uz_keg;
#ifdef MALLOC_PROFILE
krequests[size >> KMEM_ZSHIFT]++;
#endif
@ -244,10 +246,11 @@ malloc(size, type, flags)
goto out;
ksp->ks_size |= 1 << indx;
size = zone->uz_size;
size = keg->uk_size;
} else {
size = roundup(size, PAGE_SIZE);
zone = NULL;
keg = NULL;
va = uma_large_malloc(size, flags);
mtx_lock(&ksp->ks_mtx);
if (va == NULL)
@ -309,7 +312,7 @@ free(addr, type)
#ifdef INVARIANTS
struct malloc_type **mtp = addr;
#endif
size = slab->us_zone->uz_size;
size = slab->us_keg->uk_size;
#ifdef INVARIANTS
/*
* Cache a pointer to the malloc_type that most recently freed
@ -325,7 +328,7 @@ free(addr, type)
sizeof(struct malloc_type *);
*mtp = type;
#endif
uma_zfree_arg(slab->us_zone, addr, slab);
uma_zfree_arg(LIST_FIRST(&slab->us_keg->uk_zones), addr, slab);
} else {
size = slab->us_size;
uma_large_free(slab);
@ -364,8 +367,8 @@ realloc(addr, size, type, flags)
("realloc: address %p out of range", (void *)addr));
/* Get the size of the original block */
if (slab->us_zone)
alloc = slab->us_zone->uz_size;
if (slab->us_keg)
alloc = slab->us_keg->uk_size;
else
alloc = slab->us_size;
@ -410,7 +413,6 @@ kmeminit(dummy)
void *dummy;
{
u_int8_t indx;
u_long npg;
u_long mem_size;
int i;
@ -428,7 +430,7 @@ kmeminit(dummy)
* Note that the kmem_map is also used by the zone allocator,
* so make sure that there is enough space.
*/
vm_kmem_size = VM_KMEM_SIZE;
vm_kmem_size = VM_KMEM_SIZE + nmbclusters * PAGE_SIZE;
mem_size = cnt.v_page_count;
#if defined(VM_KMEM_SIZE_SCALE)
@ -462,17 +464,8 @@ kmeminit(dummy)
*/
init_param3(vm_kmem_size / PAGE_SIZE);
/*
* In mbuf_init(), we set up submaps for mbufs and clusters, in which
* case we rounddown() (nmbufs * MSIZE) and (nmbclusters * MCLBYTES),
* respectively. Mathematically, this means that what we do here may
* amount to slightly more address space than we need for the submaps,
* but it never hurts to have an extra page in kmem_map.
*/
npg = (nmbufs*MSIZE + nmbclusters*MCLBYTES + vm_kmem_size) / PAGE_SIZE;
kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase,
(vm_offset_t *)&kmemlimit, (vm_size_t)(npg * PAGE_SIZE));
(vm_offset_t *)&kmemlimit, vm_kmem_size);
kmem_map->system_map = 1;
uma_startup2();

sys/kern/kern_mbuf.c (new file, 385 lines)

@ -0,0 +1,385 @@
/*-
* Copyright (c) 2004
* Bosko Milekic <bmilekic@FreeBSD.org>.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the author nor the names of contributors may be
* used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_mac.h"
#include "opt_param.h"
#include <sys/param.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/uma.h>
/*
* In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
* Zones.
*
* Mbuf Clusters (2K, contiguous) are allocated from the Cluster
* Zone. The Zone can be capped at kern.ipc.nmbclusters, if the
* administrator so desires.
*
* Mbufs are allocated from a UMA Master Zone called the Mbuf
* Zone.
*
* Additionally, FreeBSD provides a Packet Zone, which it
* configures as a Secondary Zone to the Mbuf Master Zone,
* thus sharing backend Slab kegs with the Mbuf Master Zone.
*
* Thus common-case allocations and locking are simplified:
*
* m_clget() m_getcl()
* | |
* | .------------>[(Packet Cache)] m_get(), m_gethdr()
* | | [ Packet ] |
* [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ]
* [ Cluster Zone ] [ Zone ] [ Mbuf Master Zone ]
* | \________ |
* [ Cluster Keg ] \ /
* | [ Mbuf Keg ]
* [ Cluster Slabs ] |
* | [ Mbuf Slabs ]
* \____________(VM)_________________/
*/
int nmbclusters;
struct mbstat mbstat;
static void
tunable_mbinit(void *dummy)
{
/* This has to be done before VM init. */
nmbclusters = 1024 + maxusers * 64;
TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
}
SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
SYSCTL_DECL(_kern_ipc);
SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RW, &nmbclusters, 0,
"Maximum number of mbuf clusters allowed");
SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
"Mbuf general information and statistics");
/*
* Zones from which we allocate.
*/
uma_zone_t zone_mbuf;
uma_zone_t zone_clust;
uma_zone_t zone_pack;
/*
* Local prototypes.
*/
static void mb_ctor_mbuf(void *, int, void *);
static void mb_ctor_clust(void *, int, void *);
static void mb_ctor_pack(void *, int, void *);
static void mb_dtor_mbuf(void *, int, void *);
static void mb_dtor_clust(void *, int, void *); /* XXX */
static void mb_dtor_pack(void *, int, void *); /* XXX */
static void mb_init_pack(void *, int);
static void mb_fini_pack(void *, int);
static void mb_reclaim(void *);
static void mbuf_init(void *);
/*
* Initialize FreeBSD Network buffer allocation.
*/
SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL)
static void
mbuf_init(void *dummy)
{
/*
* Configure UMA zones for Mbufs, Clusters, and Packets.
*/
zone_mbuf = uma_zcreate("Mbuf", MSIZE, mb_ctor_mbuf, mb_dtor_mbuf,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MAXBUCKET);
zone_clust = uma_zcreate("MbufClust", MCLBYTES, mb_ctor_clust,
mb_dtor_clust, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
if (nmbclusters > 0)
uma_zone_set_max(zone_clust, nmbclusters);
zone_pack = uma_zsecond_create("Packet", mb_ctor_pack, mb_dtor_pack,
mb_init_pack, mb_fini_pack, zone_mbuf);
/* uma_prealloc() goes here */
/*
* Hook event handler for low-memory situation, used to
* drain protocols and push data back to the caches (UMA
* later pushes it back to VM).
*/
EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
EVENTHANDLER_PRI_FIRST);
/*
* [Re]set counters and local statistics knobs.
* XXX Some of these should go and be replaced, but UMA stat
* gathering needs to be revised.
*/
mbstat.m_mbufs = 0;
mbstat.m_mclusts = 0;
mbstat.m_drain = 0;
mbstat.m_msize = MSIZE;
mbstat.m_mclbytes = MCLBYTES;
mbstat.m_minclsize = MINCLSIZE;
mbstat.m_mlen = MLEN;
mbstat.m_mhlen = MHLEN;
mbstat.m_numtypes = MT_NTYPES;
mbstat.m_mcfail = mbstat.m_mpfail = 0;
mbstat.sf_iocnt = 0;
mbstat.sf_allocwait = mbstat.sf_allocfail = 0;
}
/*
* Constructor for Mbuf master zone.
*
* The 'arg' pointer points to a mb_args structure which
* contains call-specific information required to support the
* mbuf allocation API.
*/
static void
mb_ctor_mbuf(void *mem, int size, void *arg)
{
struct mbuf *m;
struct mb_args *args;
int flags;
int how;
short type;
m = (struct mbuf *)mem;
args = (struct mb_args *)arg;
flags = args->flags;
how = args->how;
type = args->type;
m->m_type = type;
m->m_next = NULL;
m->m_nextpkt = NULL;
if (flags & M_PKTHDR) {
m->m_data = m->m_pktdat;
m->m_flags = M_PKTHDR;
m->m_pkthdr.rcvif = NULL;
m->m_pkthdr.csum_flags = 0;
SLIST_INIT(&m->m_pkthdr.tags);
#ifdef MAC
/* If the label init fails, fail the alloc */
if (mac_init_mbuf(m, how) != 0) {
m_free(m);
/* XXX*/ panic("mb_ctor_mbuf(): can't deal with failure!");
/* return 0; */
}
#endif
} else {
m->m_data = m->m_dat;
m->m_flags = 0;
}
mbstat.m_mbufs += 1; /* XXX */
/* return 1;
*/
}
/*
* The Mbuf master zone and Packet secondary zone destructor.
*/
static void
mb_dtor_mbuf(void *mem, int size, void *arg)
{
struct mbuf *m;
m = (struct mbuf *)mem;
if ((m->m_flags & M_PKTHDR) != 0)
m_tag_delete_chain(m, NULL);
mbstat.m_mbufs -= 1; /* XXX */
}
/* XXX Only because of stats */
static void
mb_dtor_pack(void *mem, int size, void *arg)
{
struct mbuf *m;
m = (struct mbuf *)mem;
if ((m->m_flags & M_PKTHDR) != 0)
m_tag_delete_chain(m, NULL);
mbstat.m_mbufs -= 1; /* XXX */
mbstat.m_mclusts -= 1; /* XXX */
}
/*
* The Cluster zone constructor.
*
* Here the 'arg' pointer points to the Mbuf which we
* are configuring cluster storage for.
*/
static void
mb_ctor_clust(void *mem, int size, void *arg)
{
struct mbuf *m;
m = (struct mbuf *)arg;
m->m_ext.ext_buf = (caddr_t)mem;
m->m_data = m->m_ext.ext_buf;
m->m_flags |= M_EXT;
m->m_ext.ext_free = NULL;
m->m_ext.ext_args = NULL;
m->m_ext.ext_size = MCLBYTES;
m->m_ext.ext_type = EXT_CLUSTER;
m->m_ext.ref_cnt = (u_int *)uma_find_refcnt(zone_clust,
m->m_ext.ext_buf);
*(m->m_ext.ref_cnt) = 1;
mbstat.m_mclusts += 1; /* XXX */
/* return 1;
*/
}
/* XXX */
static void
mb_dtor_clust(void *mem, int size, void *arg)
{
mbstat.m_mclusts -= 1; /* XXX */
}
/*
* The Packet secondary zone's init routine, executed on the
* object's transition from keg slab to zone cache.
*/
static void
mb_init_pack(void *mem, int size)
{
struct mbuf *m;
m = (struct mbuf *)mem;
m->m_ext.ext_buf = NULL;
uma_zalloc_arg(zone_clust, m, M_NOWAIT);
if (m->m_ext.ext_buf == NULL) /* XXX */
panic("mb_init_pack(): Can't deal with failure yet.");
mbstat.m_mclusts -= 1; /* XXX */
}
/*
* The Packet secondary zone's fini routine, executed on the
* object's transition from zone cache to keg slab.
*/
static void
mb_fini_pack(void *mem, int size)
{
struct mbuf *m;
m = (struct mbuf *)mem;
uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
m->m_ext.ext_buf = NULL;
mbstat.m_mclusts += 1; /* XXX */
}
/*
* The "packet" keg constructor.
*/
static void
mb_ctor_pack(void *mem, int size, void *arg)
{
struct mbuf *m;
struct mb_args *args;
int flags, how;
short type;
m = (struct mbuf *)mem;
args = (struct mb_args *)arg;
flags = args->flags;
type = args->type;
how = args->how;
m->m_type = type;
m->m_next = NULL;
m->m_data = m->m_ext.ext_buf;
m->m_flags = flags|M_EXT;
m->m_ext.ext_free = NULL;
m->m_ext.ext_args = NULL;
m->m_ext.ext_size = MCLBYTES;
m->m_ext.ext_type = EXT_PACKET;
*(m->m_ext.ref_cnt) = 1;
if (flags & M_PKTHDR) {
m->m_nextpkt = NULL;
m->m_pkthdr.rcvif = NULL;
m->m_pkthdr.csum_flags = 0;
SLIST_INIT(&m->m_pkthdr.tags);
#ifdef MAC
/* If the label init fails, fail the alloc */
if (mac_init_mbuf(m, how) != 0) {
m_free(m);
/* XXX*/ panic("mb_ctor_pack(): can't deal with failure!");
/* return 0; */
}
#endif
}
mbstat.m_mbufs += 1; /* XXX */
mbstat.m_mclusts += 1; /* XXX */
/* return 1;
*/
}
/*
* This is the protocol drain routine.
*
* No locks should be held when this is called. The drain routines have to
* presently acquire some locks which raises the possibility of lock order
* reversal.
*/
static void
mb_reclaim(void *junk)
{
struct domain *dp;
struct protosw *pr;
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL,
"mb_reclaim()");
mbstat.m_drain++;
for (dp = domains; dp != NULL; dp = dp->dom_next)
for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
if (pr->pr_drain != NULL)
(*pr->pr_drain)();
}

File diff suppressed because it is too large


@ -85,6 +85,161 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
&m_defragrandomfailures, 0, "");
#endif
/*
* Malloc-type for external ext_buf ref counts.
*/
MALLOC_DEFINE(M_MBUF, "mbextcnt", "mbuf external ref counts");
/*
* Allocate a given length worth of mbufs and/or clusters (whatever fits
* best) and return a pointer to the top of the allocated chain. If an
* existing mbuf chain is provided, then we will append the new chain
* to the existing one but still return the top of the newly allocated
* chain.
*/
struct mbuf *
m_getm(struct mbuf *m, int len, int how, short type)
{
struct mbuf *mb, *top, *cur, *mtail;
int num, rem;
int i;
KASSERT(len >= 0, ("m_getm(): len is < 0"));
/* If m != NULL, we will append to the end of that chain. */
if (m != NULL)
for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
else
mtail = NULL;
/*
* Calculate how many mbufs+clusters ("packets") we need and how much
* leftover there is after that and allocate the first mbuf+cluster
* if required.
*/
num = len / MCLBYTES;
rem = len % MCLBYTES;
top = cur = NULL;
if (num > 0) {
if ((top = cur = m_getcl(how, type, 0)) == NULL)
goto failed;
}
num--;
top->m_len = 0;
for (i = 0; i < num; i++) {
mb = m_getcl(how, type, 0);
if (mb == NULL)
goto failed;
mb->m_len = 0;
cur = (cur->m_next = mb);
}
if (rem > 0) {
mb = (rem > MINCLSIZE) ?
m_getcl(how, type, 0) : m_get(how, type);
if (mb == NULL)
goto failed;
mb->m_len = 0;
if (cur == NULL)
top = mb;
else
cur->m_next = mb;
}
if (mtail != NULL)
mtail->m_next = top;
return top;
failed:
if (top != NULL)
m_freem(top);
return NULL;
}
/*
* Free an entire chain of mbufs and associated external buffers, if
* applicable.
*/
void
m_freem(struct mbuf *mb)
{
while (mb != NULL)
mb = m_free(mb);
}
/*-
* Configure a provided mbuf to refer to the provided external storage
* buffer and setup a reference count for said buffer. If the setting
* up of the reference count fails, the M_EXT bit will not be set. If
* successful, the M_EXT bit is set in the mbuf's flags.
*
* Arguments:
* mb The existing mbuf to which to attach the provided buffer.
* buf The address of the provided external storage buffer.
* size The size of the provided buffer.
* freef A pointer to a routine that is responsible for freeing the
* provided external storage buffer.
* args A pointer to an argument structure (of any type) to be passed
* to the provided freef routine (may be NULL).
* flags Any other flags to be passed to the provided mbuf.
* type The type that the external storage buffer should be
* labeled with.
*
* Returns:
* Nothing.
*/
void
m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
void (*freef)(void *, void *), void *args, int flags, int type)
{
u_int *ref_cnt = NULL;
/* XXX Shouldn't be adding EXT_CLUSTER with this API */
if (type == EXT_CLUSTER)
ref_cnt = (u_int *)uma_find_refcnt(zone_clust,
mb->m_ext.ext_buf);
else if (type == EXT_EXTREF)
ref_cnt = mb->m_ext.ref_cnt;
mb->m_ext.ref_cnt = (ref_cnt == NULL) ?
malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)ref_cnt;
if (mb->m_ext.ref_cnt != NULL) {
*(mb->m_ext.ref_cnt) = 1;
mb->m_flags |= (M_EXT | flags);
mb->m_ext.ext_buf = buf;
mb->m_data = mb->m_ext.ext_buf;
mb->m_ext.ext_size = size;
mb->m_ext.ext_free = freef;
mb->m_ext.ext_args = args;
mb->m_ext.ext_type = type;
}
}
/*
* Non-directly-exported function to clean up after mbufs with M_EXT
* storage attached to them if the reference count hits 0.
*/
void
mb_free_ext(struct mbuf *m)
{
MEXT_REM_REF(m);
if (atomic_cmpset_int(m->m_ext.ref_cnt, 0, 1)) {
if (m->m_ext.ext_type == EXT_PACKET) {
uma_zfree(zone_pack, m);
return;
} else if (m->m_ext.ext_type == EXT_CLUSTER) {
uma_zfree(zone_clust, m->m_ext.ext_buf);
m->m_ext.ext_buf = NULL;
} else {
(*(m->m_ext.ext_free))(m->m_ext.ext_buf,
m->m_ext.ext_args);
if (m->m_ext.ext_type != EXT_EXTREF)
free(m->m_ext.ref_cnt, M_MBUF);
}
}
uma_zfree(zone_mbuf, m);
}
/*
* "Move" mbuf pkthdr from "from" to "to".
* "from" must have M_PKTHDR set, and "to" must be empty.
@ -364,22 +519,22 @@ m_dup(struct mbuf *m, int how)
struct mbuf *n;
/* Get the next new mbuf */
MGET(n, how, m->m_type);
if (remain >= MINCLSIZE) {
n = m_getcl(how, m->m_type, 0);
nsize = MCLBYTES;
} else {
n = m_get(how, m->m_type);
nsize = MLEN;
}
if (n == NULL)
goto nospace;
if (top == NULL) { /* first one, must be PKTHDR */
if (!m_dup_pkthdr(n, m, how))
goto nospace;
nsize = MHLEN;
} else /* not the first one */
nsize = MLEN;
if (remain >= MINCLSIZE) {
MCLGET(n, how);
if ((n->m_flags & M_EXT) == 0) {
(void)m_free(n);
if (top == NULL) { /* First one, must be PKTHDR */
if (!m_dup_pkthdr(n, m, how)) {
m_free(n);
goto nospace;
}
nsize = MCLBYTES;
nsize = MHLEN;
}
n->m_len = 0;
@ -651,39 +806,42 @@ m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
void (*copy)(char *from, caddr_t to, u_int len))
{
struct mbuf *m;
struct mbuf *top = 0, **mp = &top;
struct mbuf *top = NULL, **mp = &top;
int len;
if (off < 0 || off > MHLEN)
return (NULL);
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m == NULL)
return (NULL);
m->m_pkthdr.rcvif = ifp;
m->m_pkthdr.len = totlen;
len = MHLEN;
while (totlen > 0) {
if (top) {
MGET(m, M_DONTWAIT, MT_DATA);
if (top == NULL) { /* First one, must be PKTHDR */
if (totlen + off >= MINCLSIZE) {
m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
len = MCLBYTES;
} else {
m = m_gethdr(M_DONTWAIT, MT_DATA);
len = MHLEN;
/* Place initial small packet/header at end of mbuf */
if (m && totlen + off + max_linkhdr <= MLEN) {
m->m_data += max_linkhdr;
len -= max_linkhdr;
}
}
if (m == NULL)
return NULL;
m->m_pkthdr.rcvif = ifp;
m->m_pkthdr.len = totlen;
} else {
if (totlen + off >= MINCLSIZE) {
m = m_getcl(M_DONTWAIT, MT_DATA, 0);
len = MCLBYTES;
} else {
m = m_get(M_DONTWAIT, MT_DATA);
len = MLEN;
}
if (m == NULL) {
m_freem(top);
return (NULL);
}
len = MLEN;
}
if (totlen + off >= MINCLSIZE) {
MCLGET(m, M_DONTWAIT);
if (m->m_flags & M_EXT)
len = MCLBYTES;
} else {
/*
* Place initial small packet/header at end of mbuf.
*/
if (top == NULL && totlen + off + max_linkhdr <= len) {
m->m_data += max_linkhdr;
len -= max_linkhdr;
return NULL;
}
}
if (off) {
@ -722,9 +880,10 @@ m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
off -= mlen;
totlen += mlen;
if (m->m_next == NULL) {
n = m_get_clrd(M_DONTWAIT, m->m_type);
n = m_get(M_DONTWAIT, m->m_type);
if (n == NULL)
goto out;
bzero(mtod(n, caddr_t), MLEN);
n->m_len = min(MLEN, len + off);
m->m_next = n;
}


@ -230,14 +230,10 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp)
* now, we need to do the hard way. don't m_copy as there's no room
* on both end.
*/
MGET(o, M_DONTWAIT, m->m_type);
if (o && len > MLEN) {
MCLGET(o, M_DONTWAIT);
if ((o->m_flags & M_EXT) == 0) {
m_free(o);
o = NULL;
}
}
if (len > MLEN)
o = m_getcl(M_DONTWAIT, m->m_type, 0);
else
o = m_get(M_DONTWAIT, m->m_type);
if (!o) {
m_freem(m);
return NULL; /* ENOBUFS */
@ -274,29 +270,27 @@ static struct mbuf *
m_dup1(struct mbuf *m, int off, int len, int wait)
{
struct mbuf *n;
int l;
int copyhdr;
if (len > MCLBYTES)
return NULL;
if (off == 0 && (m->m_flags & M_PKTHDR) != 0) {
if (off == 0 && (m->m_flags & M_PKTHDR) != 0)
copyhdr = 1;
MGETHDR(n, wait, m->m_type);
l = MHLEN;
} else {
else
copyhdr = 0;
MGET(n, wait, m->m_type);
l = MLEN;
}
if (n && len > l) {
MCLGET(n, wait);
if ((n->m_flags & M_EXT) == 0) {
m_free(n);
n = NULL;
}
if (len >= MINCLSIZE) {
if (copyhdr == 1)
n = m_getcl(wait, m->m_type, M_PKTHDR);
else
n = m_getcl(wait, m->m_type, 0);
} else {
if (copyhdr == 1)
n = m_gethdr(wait, m->m_type);
else
n = m_get(wait, m->m_type);
}
if (!n)
return NULL;
return NULL; /* ENOBUFS */
if (copyhdr && !m_dup_pkthdr(n, m, wait)) {
m_free(n);


@ -959,15 +959,12 @@ sbcreatecontrol(p, size, type, level)
if (CMSG_SPACE((u_int)size) > MCLBYTES)
return ((struct mbuf *) NULL);
if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
if (CMSG_SPACE((u_int)size > MLEN))
m = m_getcl(M_DONTWAIT, MT_CONTROL, 0);
else
m = m_get(M_DONTWAIT, MT_CONTROL);
if (m == NULL)
return ((struct mbuf *) NULL);
if (CMSG_SPACE((u_int)size) > MLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return ((struct mbuf *) NULL);
}
}
cp = mtod(m, struct cmsghdr *);
m->m_len = 0;
KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),


@ -527,8 +527,8 @@ sosend(so, addr, uio, top, control, flags, td)
{
struct mbuf **mp;
struct mbuf *m;
long space, len, resid;
int clen = 0, error, s, dontroute, mlen;
long space, len = 0, resid;
int clen = 0, error, s, dontroute;
int atomic = sosendallatonce(so) || top;
#ifdef ZERO_COPY_SOCKETS
int cow_send;
@ -624,25 +624,23 @@ sosend(so, addr, uio, top, control, flags, td)
#ifdef ZERO_COPY_SOCKETS
cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
if (top == 0) {
MGETHDR(m, M_TRYWAIT, MT_DATA);
if (m == NULL) {
error = ENOBUFS;
goto release;
}
mlen = MHLEN;
m->m_pkthdr.len = 0;
m->m_pkthdr.rcvif = (struct ifnet *)0;
} else {
MGET(m, M_TRYWAIT, MT_DATA);
if (m == NULL) {
error = ENOBUFS;
goto release;
}
mlen = MLEN;
}
if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
if (top == NULL) {
MGETHDR(m, M_TRYWAIT, MT_DATA);
if (m == NULL) {
error = ENOBUFS;
goto release;
}
m->m_pkthdr.len = 0;
m->m_pkthdr.rcvif = (struct ifnet *)0;
} else {
MGET(m, M_TRYWAIT, MT_DATA);
if (m == NULL) {
error = ENOBUFS;
goto release;
}
}
if (so_zero_copy_send &&
resid>=PAGE_SIZE &&
space>=PAGE_SIZE &&
@ -654,29 +652,48 @@ sosend(so, addr, uio, top, control, flags, td)
cow_send = socow_setup(m, uio);
}
}
if (!cow_send){
#endif /* ZERO_COPY_SOCKETS */
MCLGET(m, M_TRYWAIT);
if ((m->m_flags & M_EXT) == 0)
goto nopages;
mlen = MCLBYTES;
len = min(min(mlen, resid), space);
} else {
#ifdef ZERO_COPY_SOCKETS
if (!cow_send) {
MCLGET(m, M_TRYWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
m = NULL;
} else {
len = min(min(MCLBYTES, resid), space);
}
} else
len = PAGE_SIZE;
}
} else {
#else /* ZERO_COPY_SOCKETS */
if (top == NULL) {
m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
m->m_pkthdr.len = 0;
m->m_pkthdr.rcvif = (struct ifnet *)0;
} else
m = m_getcl(M_TRYWAIT, MT_DATA, 0);
len = min(min(MCLBYTES, resid), space);
#endif /* ZERO_COPY_SOCKETS */
nopages:
len = min(min(mlen, resid), space);
/*
* For datagram protocols, leave room
* for protocol headers in first mbuf.
*/
if (atomic && top == 0 && len < mlen)
MH_ALIGN(m, len);
} else {
if (top == NULL) {
m = m_gethdr(M_TRYWAIT, MT_DATA);
m->m_pkthdr.len = 0;
m->m_pkthdr.rcvif = (struct ifnet *)0;
len = min(min(MHLEN, resid), space);
/*
* For datagram protocols, leave room
* for protocol headers in first mbuf.
*/
if (atomic && m && len < MHLEN)
MH_ALIGN(m, len);
} else {
m = m_get(M_TRYWAIT, MT_DATA);
len = min(min(MLEN, resid), space);
}
}
if (m == NULL) {
error = ENOBUFS;
goto release;
}
space -= len;
#ifdef ZERO_COPY_SOCKETS
if (cow_send)


@ -959,15 +959,12 @@ sbcreatecontrol(p, size, type, level)
if (CMSG_SPACE((u_int)size) > MCLBYTES)
return ((struct mbuf *) NULL);
if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
if (CMSG_SPACE((u_int)size > MLEN))
m = m_getcl(M_DONTWAIT, MT_CONTROL, 0);
else
m = m_get(M_DONTWAIT, MT_CONTROL);
if (m == NULL)
return ((struct mbuf *) NULL);
if (CMSG_SPACE((u_int)size) > MLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return ((struct mbuf *) NULL);
}
}
cp = mtod(m, struct cmsghdr *);
m->m_len = 0;
KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),


@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$");
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#ifdef KTRACE
@ -84,6 +85,21 @@ static int getsockname1(struct thread *td, struct getsockname_args *uap,
static int getpeername1(struct thread *td, struct getpeername_args *uap,
int compat);
/*
* NSFBUFS-related variables and associated sysctls
*/
int nsfbufs;
int nsfbufspeak;
int nsfbufsused;
SYSCTL_DECL(_kern_ipc);
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
"Maximum number of sendfile(2) sf_bufs available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
"Number of sendfile(2) sf_bufs at peak usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
"Number of sendfile(2) sf_bufs in use");
/*
* System call interface to the socket abstraction.
*/


@ -86,6 +86,10 @@
#include <machine/tlb.h>
#include <machine/tstate.h>
#ifndef NSFBUFS
#define NSFBUFS (512 + maxusers * 16)
#endif
static void sf_buf_init(void *arg);
SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
@ -351,6 +355,9 @@ sf_buf_init(void *arg)
vm_offset_t sf_base;
int i;
nsfbufs = NSFBUFS;
TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF);
SLIST_INIT(&sf_freelist.sf_head);
sf_base = kmem_alloc_nofault(kernel_map, nsfbufs * PAGE_SIZE);


@ -10,7 +10,7 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
@ -33,7 +33,12 @@
#ifndef _SYS_MBUF_H_
#define _SYS_MBUF_H_
/* XXX: These includes suck. Sorry! */
#include <sys/queue.h>
#ifdef _KERNEL
#include <sys/systm.h>
#include <vm/uma.h>
#endif
/*
* Mbufs are of a single size, MSIZE (sys/param.h), which
@ -57,6 +62,16 @@
*/
#define mtod(m, t) ((t)((m)->m_data))
#define dtom(x) ((struct mbuf *)((intptr_t)(x) & ~(MSIZE-1)))
/*
* Argument structure passed to UMA routines during mbuf and packet
* allocations.
*/
struct mb_args {
int flags; /* Flags for mbuf being allocated */
int how; /* How to allocate: M_WAITOK or M_DONTWAIT */
short type; /* Type of mbuf being allocated */
};
#endif /* _KERNEL */
/*
@ -167,6 +182,7 @@ struct mbuf {
*/
#define EXT_CLUSTER 1 /* mbuf cluster */
#define EXT_SFBUF 2 /* sendfile(2)'s sf_bufs */
#define EXT_PACKET 3 /* came out of Packet zone */
#define EXT_NET_DRV 100 /* custom ext_buf provided by net driver(s) */
#define EXT_MOD_TYPE 200 /* custom module's ext_buf type */
#define EXT_DISPOSABLE 300 /* can throw this buffer away w/page flipping */
@ -222,29 +238,13 @@ struct mbuf {
#define MT_OOBDATA 15 /* expedited data */
#define MT_NTYPES 16 /* number of mbuf types for mbtypes[] */
/*
* Mbuf and cluster allocation statistics PCPU structure.
*/
struct mbpstat {
u_long mb_mbfree;
u_long mb_mbbucks;
u_long mb_clfree;
u_long mb_clbucks;
long mb_mbtypes[MT_NTYPES];
short mb_active;
};
/*
* General mbuf allocator statistics structure.
* XXX: Modifications of these are not protected by any mutex locks nor by
* any atomic() manipulations. As a result, we may occasionally lose
* a count or two. Luckily, not all of these fields are modified at all
* and remain static, and those that are manipulated are only manipulated
* in failure situations, which do not occur (hopefully) very often.
*/
struct mbstat {
u_long m_drops; /* times failed to allocate */
u_long m_wait; /* times succesfully returned from wait */
u_long m_mbufs; /* XXX */
u_long m_mclusts; /* XXX */
u_long m_drain; /* times drained protocols for space */
u_long m_mcfail; /* XXX: times m_copym failed */
u_long m_mpfail; /* XXX: times m_pullup failed */
@ -253,10 +253,10 @@ struct mbstat {
u_long m_minclsize; /* min length of data to allocate a cluster */
u_long m_mlen; /* length of data in an mbuf */
u_long m_mhlen; /* length of data in a header mbuf */
u_int m_mbperbuck; /* number of mbufs per "bucket" */
u_int m_clperbuck; /* number of clusters per "bucket" */
/* Number of mbtypes (gives # elems in mbpstat's mb_mbtypes[] array: */
/* Number of mbtypes (gives # elems in mbtypes[] array: */
short m_numtypes;
/* XXX: Sendfile stats should eventually move to their own struct */
u_long sf_iocnt; /* times sendfile had to do disk I/O */
u_long sf_allocfail; /* times sfbuf allocation failed */
@ -265,14 +265,23 @@ struct mbstat {
/*
* Flags specifying how an allocation should be made.
* M_DONTWAIT means "don't block if nothing is available" whereas
* M_TRYWAIT means "block for mbuf_wait ticks at most if nothing is
* available."
*
* The flag to use is as follows:
* - M_DONTWAIT or M_NOWAIT from an interrupt handler to not block allocation.
* - M_WAIT or M_WAITOK or M_TRYWAIT from wherever it is safe to block.
*
* M_DONTWAIT/M_NOWAIT means that we will not block the thread explicitly
* and if we cannot allocate immediately we may return NULL,
* whereas M_WAIT/M_WAITOK/M_TRYWAIT means that if we cannot allocate
* resources we will block until they are available, and thus never
* return NULL.
*
* XXX Eventually just phase this out to use M_WAITOK/M_NOWAIT.
*/
#define M_DONTWAIT 0x4 /* don't conflict with M_NOWAIT */
#define M_TRYWAIT 0x8 /* or M_WAITOK */
#define M_WAIT M_TRYWAIT /* XXX: deprecated */
#define MBTOM(how) ((how) & M_TRYWAIT ? M_WAITOK : M_NOWAIT)
#define MBTOM(how) (how)
#define M_DONTWAIT M_NOWAIT
#define M_TRYWAIT M_WAITOK
#define M_WAIT M_WAITOK
#ifdef _KERNEL
/*-
@ -295,36 +304,121 @@ struct mbstat {
#define MEXT_ADD_REF(m) atomic_add_int((m)->m_ext.ref_cnt, 1)
/*
* Network buffer allocation API
*
* The rest of it is defined in kern/subr_mbuf.c
*/
extern uma_zone_t zone_mbuf;
extern uma_zone_t zone_clust;
extern uma_zone_t zone_pack;
static __inline struct mbuf *m_get(int how, short type);
static __inline struct mbuf *m_gethdr(int how, short type);
static __inline struct mbuf *m_getcl(int how, short type, int flags);
static __inline struct mbuf *m_getclr(int how, short type); /* XXX */
static __inline struct mbuf *m_free(struct mbuf *m);
static __inline void m_clget(struct mbuf *m, int how);
static __inline void m_chtype(struct mbuf *m, short new_type);
void mb_free_ext(struct mbuf *);
static __inline
struct mbuf *
m_get(int how, short type)
{
struct mb_args args;
args.flags = 0;
args.how = how;
args.type = type;
return (uma_zalloc_arg(zone_mbuf, &args, how));
}
/* XXX This should be deprecated, very little use */
static __inline
struct mbuf *
m_getclr(int how, short type)
{
struct mbuf *m;
struct mb_args args;
args.flags = 0;
args.how = how;
args.type = type;
m = uma_zalloc_arg(zone_mbuf, &args, how);
if (m != NULL)
bzero(m->m_data, MLEN);
return m;
}
static __inline
struct mbuf *
m_gethdr(int how, short type)
{
struct mb_args args;
args.flags = M_PKTHDR;
args.how = how;
args.type = type;
return (uma_zalloc_arg(zone_mbuf, &args, how));
}
static __inline
struct mbuf *
m_getcl(int how, short type, int flags)
{
struct mb_args args;
args.flags = flags;
args.how = how;
args.type = type;
return (uma_zalloc_arg(zone_pack, &args, how));
}
static __inline
struct mbuf *
m_free(struct mbuf *m)
{
struct mbuf *n = m->m_next;
#ifdef INVARIANTS
m->m_flags |= M_FREELIST;
#endif
if (m->m_flags & M_EXT)
mb_free_ext(m);
else
uma_zfree(zone_mbuf, m);
return n;
}
static __inline
void
m_clget(struct mbuf *m, int how)
{
m->m_ext.ext_buf = NULL;
uma_zalloc_arg(zone_clust, m, how);
}
static __inline
void
m_chtype(struct mbuf *m, short new_type)
{
m->m_type = new_type;
}
/*
* mbuf, cluster, and external object allocation macros
* (for compatibility purposes).
*/
/* NB: M_COPY_PKTHDR is deprecated. Use M_MOVE_PKTHDR or m_dup_pkthdr. */
#define M_MOVE_PKTHDR(to, from) m_move_pkthdr((to), (from))
#define m_getclr(how, type) m_get_clrd((how), (type))
#define MGET(m, how, type) ((m) = m_get((how), (type)))
#define MGETHDR(m, how, type) ((m) = m_gethdr((how), (type)))
#define MCLGET(m, how) m_clget((m), (how))
#define MEXTADD(m, buf, size, free, args, flags, type) \
m_extadd((m), (caddr_t)(buf), (size), (free), (args), (flags), (type))
/*
* MEXTFREE(m): disassociate (and possibly free) an external object from (m).
*
* If the atomic_cmpset_int() returns 0, then we effectively do nothing
* in terms of "cleaning up" (freeing the ext buf and ref. counter) as
* this means that either there are still references, or another thread
* is taking care of the clean-up.
*/
#define MEXTFREE(m) do { \
struct mbuf *_mb = (m); \
\
MEXT_REM_REF(_mb); \
if (atomic_cmpset_int(_mb->m_ext.ref_cnt, 0, 1)) \
_mext_free(_mb); \
_mb->m_flags &= ~M_EXT; \
} while (0)
/*
* Evaluate TRUE if it's safe to write to the mbuf m's data region (this
* can be both the local data payload, or an external buffer area,
@ -425,18 +519,13 @@ extern int max_linkhdr; /* Largest link-level header */
extern int max_protohdr; /* Largest protocol header */
extern struct mbstat mbstat; /* General mbuf stats/infos */
extern int nmbclusters; /* Maximum number of clusters */
extern int nmbcnt; /* Scale kmem_map for counter space */
extern int nmbufs; /* Maximum number of mbufs */
struct uio;
void _mext_free(struct mbuf *);
void m_adj(struct mbuf *, int);
int m_apply(struct mbuf *, int, int,
int (*)(void *, void *, u_int), void *);
void m_cat(struct mbuf *, struct mbuf *);
void m_chtype(struct mbuf *, short);
void m_clget(struct mbuf *, int);
void m_extadd(struct mbuf *, caddr_t, u_int,
void (*)(void *, void *), void *, int, int);
void m_copyback(struct mbuf *, int, int, c_caddr_t);
@ -451,13 +540,7 @@ struct mbuf *m_dup(struct mbuf *, int);
int m_dup_pkthdr(struct mbuf *, struct mbuf *, int);
u_int m_fixhdr(struct mbuf *);
struct mbuf *m_fragment(struct mbuf *, int, int);
struct mbuf *m_free(struct mbuf *);
void m_freem(struct mbuf *);
struct mbuf *m_get(int, short);
struct mbuf *m_get_clrd(int, short);
struct mbuf *m_getcl(int, short, int);
struct mbuf *m_gethdr(int, short);
struct mbuf *m_gethdr_clrd(int, short);
struct mbuf *m_getm(struct mbuf *, int, int, short);
struct mbuf *m_getptr(struct mbuf *, int, int *);
u_int m_length(struct mbuf *, struct mbuf **);
@ -470,7 +553,7 @@ struct mbuf *m_split(struct mbuf *, int, int);
struct mbuf *m_uiotombuf(struct uio *, int, int);
/*-
* Packets may have annotations attached by affixing a list
* Network packets may have annotations attached by affixing a list
* of "packet tags" to the pkthdr structure. Packet tags are
* dynamically allocated semi-opaque data structures that have
* a fixed header (struct m_tag) that specifies the size of the


@ -43,7 +43,7 @@
/* Types and type defs */
struct uma_zone;
struct uma_zone;
/* Opaque type used as a handle to the zone */
typedef struct uma_zone * uma_zone_t;
@ -157,11 +157,45 @@ typedef void (*uma_fini)(void *mem, int size);
* A pointer to a structure which is intended to be opaque to users of
* the interface. The value may be null if the wait flag is not set.
*/
uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
uma_init uminit, uma_fini fini, int align,
u_int16_t flags);
/*
* Create a secondary uma zone
*
* Arguments:
* name The text name of the zone for debugging and stats, this memory
* should not be freed until the zone has been deallocated.
* ctor The constructor that is called when the object is allocated
* dtor The destructor that is called when the object is freed.
* zinit An initializer that sets up the initial state of the memory
* as the object passes from the Keg's slab to the Zone's cache.
* zfini A discard function that undoes initialization done by init
* as the object passes from the Zone's cache to the Keg's slab.
*
* ctor/dtor/zinit/zfini may all be null, see notes above.
* Note that the zinit and zfini specified here are NOT
* exactly the same as the init/fini specified to uma_zcreate()
* when creating a master zone. These zinit/zfini are called
* on the TRANSITION from keg to zone (and vice-versa). Once
* these are set, the primary zone may alter its init/fini
* (which are called when the object passes from VM to keg)
* using uma_zone_set_init/fini()) as well as its own
* zinit/zfini (unset by default for master zone) with
* uma_zone_set_zinit/zfini() (note subtle 'z' prefix).
*
* align A bitmask that corresponds to the requested alignment
* eg 4 would be 0x3
* flags A set of parameters that control the behavior of the zone
*
* Returns:
* A pointer to a structure which is intended to be opaque to users of
* the interface. The value may be null if the wait flag is not set.
*/
uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
uma_init zinit, uma_fini zfini, uma_zone_t master);
/*
* Definitions for uma_zcreate flags
*
@ -185,6 +219,9 @@ uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
* Use a hash table instead of caching
* information in the vm_page.
*/
#define UMA_ZONE_SECONDARY 0x0200 /* Zone is a Secondary Zone */
#define UMA_ZONE_REFCNT 0x0400 /* Allocate refcnts in slabs */
#define UMA_ZONE_MAXBUCKET 0x0800 /* Use largest buckets */
/* Definitions for align */
#define UMA_ALIGN_PTR (sizeof(void *) - 1) /* Alignment fit for ptr */
@ -201,7 +238,6 @@ uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
* zone The zone we want to destroy.
*
*/
void uma_zdestroy(uma_zone_t zone);
/*
@ -375,6 +411,28 @@ int uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int size);
*/
void uma_zone_set_max(uma_zone_t zone, int nitems);
/*
* The following two routines (uma_zone_set_init/fini)
* are used to set the backend init/fini pair which acts on an
* object as it becomes allocated and is placed in a slab within
* the specified zone's backing keg. These should probably not
* be changed once allocations have already begun and only
* immediately upon zone creation.
*/
void uma_zone_set_init(uma_zone_t zone, uma_init uminit);
void uma_zone_set_fini(uma_zone_t zone, uma_fini fini);
/*
* The following two routines (uma_zone_set_zinit/zfini) are
* used to set the zinit/zfini pair which acts on an object as
* it passes from the backing Keg's slab cache to the
* specified Zone's bucket cache. These should probably not
* be changed once allocations have already begun and
* only immediately upon zone creation.
*/
void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit);
void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini);
/*
* Replaces the standard page_alloc or obj_alloc functions for this zone
*
@ -430,5 +488,19 @@ void uma_zone_set_freef(uma_zone_t zone, uma_free freef);
*/
void uma_prealloc(uma_zone_t zone, int itemcnt);
/*
* Used to lookup the reference counter allocated for an item
* from a UMA_ZONE_REFCNT zone. For UMA_ZONE_REFCNT zones,
* reference counters are allocated for items and stored in
* the underlying slab header.
*
* Arguments:
* zone The UMA_ZONE_REFCNT zone to which the item belongs.
* item The address of the item for which we want a refcnt.
*
* Returns:
* A pointer to a u_int32_t reference counter.
*/
u_int32_t *uma_find_refcnt(uma_zone_t zone, void *item);
#endif

File diff suppressed because it is too large


@ -192,15 +192,17 @@ static uma_slab_t
uma_dbg_getslab(uma_zone_t zone, void *item)
{
uma_slab_t slab;
uma_keg_t keg;
u_int8_t *mem;
keg = zone->uz_keg;
mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
if (zone->uz_flags & UMA_ZONE_MALLOC) {
if (keg->uk_flags & UMA_ZONE_MALLOC) {
slab = vtoslab((vm_offset_t)mem);
} else if (zone->uz_flags & UMA_ZONE_HASH) {
slab = hash_sfind(&zone->uz_hash, mem);
} else if (keg->uk_flags & UMA_ZONE_HASH) {
slab = hash_sfind(&keg->uk_hash, mem);
} else {
mem += zone->uz_pgoff;
mem += keg->uk_pgoff;
slab = (uma_slab_t)mem;
}
@ -215,8 +217,10 @@ uma_dbg_getslab(uma_zone_t zone, void *item)
void
uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
{
uma_keg_t keg;
int freei;
keg = zone->uz_keg;
if (slab == NULL) {
slab = uma_dbg_getslab(zone, item);
if (slab == NULL)
@ -225,9 +229,9 @@ uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
}
freei = ((unsigned long)item - (unsigned long)slab->us_data)
/ zone->uz_rsize;
/ keg->uk_rsize;
slab->us_freelist[freei] = 255;
slab->us_freelist[freei].us_item = 255;
return;
}
@ -241,8 +245,10 @@ uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
void
uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
{
uma_keg_t keg;
int freei;
keg = zone->uz_keg;
if (slab == NULL) {
slab = uma_dbg_getslab(zone, item);
if (slab == NULL)
@ -251,22 +257,22 @@ uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
}
freei = ((unsigned long)item - (unsigned long)slab->us_data)
/ zone->uz_rsize;
/ keg->uk_rsize;
if (freei >= zone->uz_ipers)
if (freei >= keg->uk_ipers)
panic("zone: %s(%p) slab %p freelist %d out of range 0-%d\n",
zone->uz_name, zone, slab, freei, zone->uz_ipers-1);
zone->uz_name, zone, slab, freei, keg->uk_ipers-1);
if (((freei * zone->uz_rsize) + slab->us_data) != item) {
if (((freei * keg->uk_rsize) + slab->us_data) != item) {
printf("zone: %s(%p) slab %p freed address %p unaligned.\n",
zone->uz_name, zone, slab, item);
panic("should be %p\n",
(freei * zone->uz_rsize) + slab->us_data);
(freei * keg->uk_rsize) + slab->us_data);
}
if (slab->us_freelist[freei] != 255) {
if (slab->us_freelist[freei].us_item != 255) {
printf("Slab at %p, freei %d = %d.\n",
slab, freei, slab->us_freelist[freei]);
slab, freei, slab->us_freelist[freei].us_item);
panic("Duplicate free of item %p from zone %p(%s)\n",
item, zone, zone->uz_name);
}
@ -276,5 +282,5 @@ uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
* Until then the count of valid slabs will make sure we don't
* accidentally follow this and assume it's a valid index.
*/
slab->us_freelist[freei] = 0;
slab->us_freelist[freei].us_item = 0;
}


@ -35,10 +35,10 @@
/*
* Here's a quick description of the relationship between the objects:
*
* Zones contain lists of slabs which are stored in either the full bin, empty
* Kegs contain lists of slabs which are stored in either the full bin, empty
* bin, or partially allocated bin, to reduce fragmentation. They also contain
* the user supplied value for size, which is adjusted for alignment purposes
* and rsize is the result of that. The zone also stores information for
* and rsize is the result of that. The Keg also stores information for
* managing a hash of page addresses that maps pages to uma_slab_t structures
* for pages that don't have embedded uma_slab_t's.
*
@ -67,6 +67,20 @@
* so at this time it may not make sense to optimize for it. This can, of
* course, be solved with dynamic slab sizes.
*
* Kegs may serve multiple Zones but by far most of the time they only serve
* one. When a Zone is created, a Keg is allocated and setup for it. While
* the backing Keg stores slabs, the Zone caches Buckets of items allocated
* from the slabs. Each Zone is equipped with an init/fini and ctor/dtor
* pair, as well as with its own set of small per-CPU caches, layered above
* the Zone's general Bucket cache.
*
* The PCPU caches are protected by their own locks, while the Zones backed
* by the same Keg all share a common Keg lock (to coalesce contention on
* the backing slabs). The backing Keg typically only serves one Zone but
* in the case of multiple Zones, one of the Zones is considered the
* Master Zone and all Zone-related stats from the Keg are done in the
* Master Zone. For an example of a Multi-Zone setup, refer to the
* Mbuf allocation code.
*/
/*
@ -134,28 +148,6 @@
SLIST_REMOVE(&(h)->uh_slab_hash[UMA_HASH((h), \
(mem))], (s), uma_slab, us_hlink);
/* Page management structure */
/* Sorry for the union, but space efficiency is important */
struct uma_slab {
uma_zone_t us_zone; /* Zone we live in */
union {
LIST_ENTRY(uma_slab) _us_link; /* slabs in zone */
unsigned long _us_size; /* Size of allocation */
} us_type;
SLIST_ENTRY(uma_slab) us_hlink; /* Link for hash table */
u_int8_t *us_data; /* First item */
u_int8_t us_flags; /* Page flags see uma.h */
u_int8_t us_freecount; /* How many are free? */
u_int8_t us_firstfree; /* First free item index */
u_int8_t us_freelist[1]; /* Free List (actually larger) */
};
#define us_link us_type._us_link
#define us_size us_type._us_size
typedef struct uma_slab * uma_slab_t;
/* Hash table for freed address -> slab translation */
SLIST_HEAD(slabhead, uma_slab);
@ -187,6 +179,97 @@ struct uma_cache {
typedef struct uma_cache * uma_cache_t;
/*
* Keg management structure
*
* TODO: Optimize for cache line size
*
*/
struct uma_keg {
LIST_ENTRY(uma_keg) uk_link; /* List of all kegs */
struct mtx uk_lock; /* Lock for the keg */
struct uma_hash uk_hash;
LIST_HEAD(,uma_zone) uk_zones; /* Keg's zones */
LIST_HEAD(,uma_slab) uk_part_slab; /* partially allocated slabs */
LIST_HEAD(,uma_slab) uk_free_slab; /* empty slab list */
LIST_HEAD(,uma_slab) uk_full_slab; /* full slabs */
u_int32_t uk_recurse; /* Allocation recursion count */
u_int32_t uk_align; /* Alignment mask */
u_int32_t uk_pages; /* Total page count */
u_int32_t uk_free; /* Count of items free in slabs */
u_int32_t uk_size; /* Requested size of each item */
u_int32_t uk_rsize; /* Real size of each item */
u_int32_t uk_maxpages; /* Maximum number of pages to alloc */
uma_init uk_init; /* Keg's init routine */
uma_fini uk_fini; /* Keg's fini routine */
uma_alloc uk_allocf; /* Allocation function */
uma_free uk_freef; /* Free routine */
struct vm_object *uk_obj; /* Zone specific object */
vm_offset_t uk_kva; /* Base kva for zones with objs */
uma_zone_t uk_slabzone; /* Slab zone backing us, if OFFPAGE */
u_int16_t uk_pgoff; /* Offset to uma_slab struct */
u_int16_t uk_ppera; /* pages per allocation from backend */
u_int16_t uk_ipers; /* Items per slab */
u_int16_t uk_flags; /* Internal flags */
};
/* Simpler reference to uma_keg for internal use. */
typedef struct uma_keg * uma_keg_t;
/* Page management structure */
/* Sorry for the union, but space efficiency is important */
struct uma_slab_head {
uma_keg_t us_keg; /* Keg we live in */
union {
LIST_ENTRY(uma_slab) _us_link; /* slabs in zone */
unsigned long _us_size; /* Size of allocation */
} us_type;
SLIST_ENTRY(uma_slab) us_hlink; /* Link for hash table */
u_int8_t *us_data; /* First item */
u_int8_t us_flags; /* Page flags see uma.h */
u_int8_t us_freecount; /* How many are free? */
u_int8_t us_firstfree; /* First free item index */
};
/* The standard slab structure */
struct uma_slab {
struct uma_slab_head us_head; /* slab header data */
struct {
u_int8_t us_item;
} us_freelist[1]; /* actual number bigger */
};
/*
* The slab structure for UMA_ZONE_REFCNT zones for whose items we
* maintain reference counters in the slab for.
*/
struct uma_slab_refcnt {
struct uma_slab_head us_head; /* slab header data */
struct {
u_int8_t us_item;
u_int32_t us_refcnt;
} us_freelist[1]; /* actual number bigger */
};
#define us_keg us_head.us_keg
#define us_link us_head.us_type._us_link
#define us_size us_head.us_type._us_size
#define us_hlink us_head.us_hlink
#define us_data us_head.us_data
#define us_flags us_head.us_flags
#define us_freecount us_head.us_freecount
#define us_firstfree us_head.us_firstfree
typedef struct uma_slab * uma_slab_t;
typedef struct uma_slab_refcnt * uma_slabrefcnt_t;
/*
* Zone management structure
*
@ -195,42 +278,22 @@ typedef struct uma_cache * uma_cache_t;
*/
struct uma_zone {
char *uz_name; /* Text name of the zone */
LIST_ENTRY(uma_zone) uz_link; /* List of all zones */
u_int32_t uz_align; /* Alignment mask */
u_int32_t uz_pages; /* Total page count */
struct mtx *uz_lock; /* Lock for the zone (keg's lock) */
uma_keg_t uz_keg; /* Our underlying Keg */
/* Used during alloc / free */
struct mtx uz_lock; /* Lock for the zone */
u_int32_t uz_free; /* Count of items free in slabs */
u_int16_t uz_ipers; /* Items per slab */
u_int16_t uz_flags; /* Internal flags */
LIST_HEAD(,uma_slab) uz_part_slab; /* partially allocated slabs */
LIST_HEAD(,uma_slab) uz_free_slab; /* empty slab list */
LIST_HEAD(,uma_slab) uz_full_slab; /* full slabs */
LIST_ENTRY(uma_zone) uz_link; /* List of all zones in keg */
LIST_HEAD(,uma_bucket) uz_full_bucket; /* full buckets */
LIST_HEAD(,uma_bucket) uz_free_bucket; /* Buckets for frees */
u_int32_t uz_size; /* Requested size of each item */
u_int32_t uz_rsize; /* Real size of each item */
struct uma_hash uz_hash;
u_int16_t uz_pgoff; /* Offset to uma_slab struct */
u_int16_t uz_ppera; /* pages per allocation from backend */
uma_ctor uz_ctor; /* Constructor for each allocation */
uma_dtor uz_dtor; /* Destructor */
u_int64_t uz_allocs; /* Total number of allocations */
uma_init uz_init; /* Initializer for each item */
uma_fini uz_fini; /* Discards memory */
uma_alloc uz_allocf; /* Allocation function */
uma_free uz_freef; /* Free routine */
struct vm_object *uz_obj; /* Zone specific object */
vm_offset_t uz_kva; /* Base kva for zones with objs */
u_int32_t uz_maxpages; /* Maximum number of pages to alloc */
int uz_recurse; /* Allocation recursion count */
u_int64_t uz_allocs; /* Total number of allocations */
uint16_t uz_fills; /* Outstanding bucket fills */
uint16_t uz_count; /* Highest value ub_ptr can have */
/*
* This HAS to be the last item because we adjust the zone size
* based on NCPU and then allocate the space for the zones.
@ -256,16 +319,16 @@ void uma_large_free(uma_slab_t slab);
#define ZONE_LOCK_INIT(z, lc) \
do { \
if ((lc)) \
mtx_init(&(z)->uz_lock, (z)->uz_name, \
mtx_init((z)->uz_lock, (z)->uz_name, \
(z)->uz_name, MTX_DEF | MTX_DUPOK); \
else \
mtx_init(&(z)->uz_lock, (z)->uz_name, \
mtx_init((z)->uz_lock, (z)->uz_name, \
"UMA zone", MTX_DEF | MTX_DUPOK); \
} while (0)
#define ZONE_LOCK_FINI(z) mtx_destroy(&(z)->uz_lock)
#define ZONE_LOCK(z) mtx_lock(&(z)->uz_lock)
#define ZONE_UNLOCK(z) mtx_unlock(&(z)->uz_lock)
#define ZONE_LOCK_FINI(z) mtx_destroy((z)->uz_lock)
#define ZONE_LOCK(z) mtx_lock((z)->uz_lock)
#define ZONE_UNLOCK(z) mtx_unlock((z)->uz_lock)
#define CPU_LOCK_INIT(cpu) \
mtx_init(&uma_pcpu_mtx[(cpu)], "UMA pcpu", "UMA pcpu", \


@ -320,16 +320,6 @@ kmem_malloc(map, size, flags)
vm_map_lock(map);
if (vm_map_findspace(map, vm_map_min(map), size, &addr)) {
vm_map_unlock(map);
if (map != kmem_map) {
static int last_report; /* when we did it (in ticks) */
if (ticks < last_report ||
(ticks - last_report) >= hz) {
last_report = ticks;
printf("Out of mbuf address space!\n");
printf("Consider increasing NMBCLUSTERS\n");
}
return (0);
}
if ((flags & M_NOWAIT) == 0)
panic("kmem_malloc(%ld): kmem_map too small: %ld total allocated",
(long)size, (long)map->size);


@ -256,7 +256,6 @@ static char *nlistf = NULL, *memf = NULL;
int Aflag; /* show addresses of protocol control block */
int aflag; /* show all sockets (including servers) */
int bflag; /* show i/f total bytes in/out */
int cflag; /* show mbuf cache information */
int dflag; /* show i/f dropped packets */
int gflag; /* show group (multicast) routing or stats */
int iflag; /* show interfaces */
@ -297,9 +296,6 @@ main(int argc, char *argv[])
case 'b':
bflag = 1;
break;
case 'c':
cflag = 1;
break;
case 'd':
dflag = 1;
break;
@ -425,10 +421,6 @@ main(int argc, char *argv[])
if (nlistf != NULL || memf != NULL)
setgid(getgid());
if (cflag && !mflag) {
(void)fprintf(stderr, "-c only valid with -m\n");
usage();
}
if (mflag) {
if (memf != NULL) {
if (kread(0, 0, 0) == 0)


@ -99,17 +99,12 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
u_long mbhiaddr, u_long clhiaddr, u_long mbloaddr, u_long clloaddr,
u_long cpusaddr __unused, u_long pgsaddr, u_long mbpaddr)
{
int i, j, nmbufs, nmbclusters, page_size, num_objs;
int i, nmbclusters;
int nsfbufs, nsfbufspeak, nsfbufsused;
u_int mbuf_hiwm, clust_hiwm, mbuf_lowm, clust_lowm;
u_long totspace[2], totused[2];
u_long gentotnum, gentotfree, totnum, totfree;
u_long totmem, totmemalloced, totmemused;
short nmbtypes;
size_t mlen;
long *mbtypes = NULL;
struct mbstat *mbstat = NULL;
struct mbpstat **mbpstat = NULL;
struct mbtypenames *mp;
bool *seen = NULL;
@ -119,50 +114,12 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
goto err;
}
/*
* XXX: Unfortunately, for the time being, we have to fetch
* the total length of the per-CPU stats area via sysctl
* (regardless of whether we're looking at a core or not.
*/
if (sysctlbyname("kern.ipc.mb_statpcpu", NULL, &mlen, NULL, 0) < 0) {
warn("sysctl: retrieving mb_statpcpu len");
goto err;
}
num_objs = (int)(mlen / sizeof(struct mbpstat));
if ((mbpstat = calloc(num_objs, sizeof(struct mbpstat *))) == NULL) {
warn("calloc: cannot allocate memory for mbpstats pointers");
goto err;
}
if ((mbpstat[0] = calloc(num_objs, sizeof(struct mbpstat))) == NULL) {
warn("calloc: cannot allocate memory for mbpstats");
goto err;
}
if (mbaddr) {
if (kread(mbpaddr, (char *)mbpstat[0], mlen))
goto err;
if (kread(mbaddr, (char *)mbstat, sizeof mbstat))
goto err;
if (kread(nmbcaddr, (char *)&nmbclusters, sizeof(int)))
goto err;
if (kread(nmbufaddr, (char *)&nmbufs, sizeof(int)))
goto err;
if (kread(mbhiaddr, (char *)&mbuf_hiwm, sizeof(u_int)))
goto err;
if (kread(clhiaddr, (char *)&clust_hiwm, sizeof(u_int)))
goto err;
if (kread(mbloaddr, (char *)&mbuf_lowm, sizeof(u_int)))
goto err;
if (kread(clloaddr, (char *)&clust_lowm, sizeof(u_int)))
goto err;
if (kread(pgsaddr, (char *)&page_size, sizeof(int)))
goto err;
} else {
if (sysctlbyname("kern.ipc.mb_statpcpu", mbpstat[0], &mlen,
NULL, 0) < 0) {
warn("sysctl: retrieving mb_statpcpu");
goto err;
}
mlen = sizeof *mbstat;
if (sysctlbyname("kern.ipc.mbstat", mbstat, &mlen, NULL, 0)
< 0) {
@@ -175,43 +132,9 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
warn("sysctl: retrieving nmbclusters");
goto err;
}
mlen = sizeof(int);
if (sysctlbyname("kern.ipc.nmbufs", &nmbufs, &mlen, NULL, 0)
< 0) {
warn("sysctl: retrieving nmbufs");
goto err;
}
mlen = sizeof(u_int);
if (sysctlbyname("kern.ipc.mbuf_hiwm", &mbuf_hiwm, &mlen,
NULL, 0) < 0) {
warn("sysctl: retrieving mbuf_hiwm");
goto err;
}
mlen = sizeof(u_int);
if (sysctlbyname("kern.ipc.clust_hiwm", &clust_hiwm, &mlen,
NULL, 0) < 0) {
warn("sysctl: retrieving clust_hiwm");
goto err;
}
mlen = sizeof(u_int);
if (sysctlbyname("kern.ipc.mbuf_lowm", &mbuf_lowm, &mlen,
NULL, 0) < 0) {
warn("sysctl: retrieving mbuf_lowm");
goto err;
}
mlen = sizeof(u_int);
if (sysctlbyname("kern.ipc.clust_lowm", &clust_lowm, &mlen,
NULL, 0) < 0) {
warn("sysctl: retrieving clust_lowm");
goto err;
}
mlen = sizeof(int);
if (sysctlbyname("hw.pagesize", &page_size, &mlen, NULL, 0)
< 0) {
warn("sysctl: retrieving hw.pagesize");
goto err;
}
}
if (mbstat->m_mbufs < 0) mbstat->m_mbufs = 0; /* XXX */
if (mbstat->m_mclusts < 0) mbstat->m_mclusts = 0; /* XXX */
nmbtypes = mbstat->m_numtypes;
if ((seen = calloc(nmbtypes, sizeof(*seen))) == NULL) {
@@ -223,59 +146,13 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
goto err;
}
for (i = 0; i < num_objs; i++)
mbpstat[i] = mbpstat[0] + i;
#undef MSIZE
#define MSIZE (mbstat->m_msize)
#undef MCLBYTES
#define MCLBYTES (mbstat->m_mclbytes)
#define GENLST (num_objs - 1)
totnum = mbpstat[GENLST]->mb_mbbucks * mbstat->m_mbperbuck;
totfree = mbpstat[GENLST]->mb_mbfree;
for (j = 1; j < nmbtypes; j++)
mbtypes[j] += mbpstat[GENLST]->mb_mbtypes[j];
totspace[0] = mbpstat[GENLST]->mb_mbbucks * mbstat->m_mbperbuck * MSIZE;
for (i = 0; i < (num_objs - 1); i++) {
if (mbpstat[i]->mb_active == 0)
continue;
totspace[0] += mbpstat[i]->mb_mbbucks*mbstat->m_mbperbuck*MSIZE;
totnum += mbpstat[i]->mb_mbbucks * mbstat->m_mbperbuck;
totfree += mbpstat[i]->mb_mbfree;
for (j = 1; j < nmbtypes; j++)
mbtypes[j] += mbpstat[i]->mb_mbtypes[j];
}
totused[0] = totnum - totfree;
if (cflag) {
printf("mbuf usage:\n"
"\tTotal:\t\t%lu/%lu/%d (in use/in pool/max)\n",
totused[0], totnum, nmbufs);
gentotnum = mbpstat[GENLST]->mb_mbbucks * mbstat->m_mbperbuck;
gentotfree = mbpstat[GENLST]->mb_mbfree;
printf("\tGEN cache:\t%lu/%lu (in use/in pool)\n",
gentotnum - gentotfree, gentotnum);
} else {
/* XXX: peak is now wrong. */
printf("%lu/%lu/%d mbufs in use (current/peak/max):\n",
totused[0], totnum, nmbufs);
}
printf("%lu mbufs in use\n", mbstat->m_mbufs);
for (i = 0; cflag && i < (num_objs - 1); i++) {
if (mbpstat[i]->mb_active == 0)
continue;
printf("\tCPU #%d cache:\t%lu/%lu (in use/in pool)\n",
i,
(mbpstat[i]->mb_mbbucks * mbstat->m_mbperbuck -
mbpstat[i]->mb_mbfree),
(mbpstat[i]->mb_mbbucks * mbstat->m_mbperbuck));
}
if (cflag) {
printf("\tMbuf cache high watermark: %d\n", mbuf_hiwm);
#ifdef NOTYET
printf("\tMbuf cache low watermark: %d\n", mbuf_lowm);
#endif
}
for (mp = mbtypenames; mp->mt_name; mp++) {
if (mbtypes[mp->mt_type]) {
seen[mp->mt_type] = YES;
@@ -288,53 +165,10 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
printf("\t %lu mbufs allocated to <mbuf type: %d>\n",
mbtypes[i], i);
}
if (cflag)
printf("\t%.1f%% of mbuf map consumed\n",
totspace[0] * 100.0 / (nmbufs * MSIZE));
totnum = mbpstat[GENLST]->mb_clbucks * mbstat->m_clperbuck;
totfree = mbpstat[GENLST]->mb_clfree;
totspace[1] = mbpstat[GENLST]->mb_clbucks*mbstat->m_clperbuck*MCLBYTES;
for (i = 0; i < (num_objs - 1); i++) {
if (mbpstat[i]->mb_active == 0)
continue;
totspace[1] += mbpstat[i]->mb_clbucks * mbstat->m_clperbuck
* MCLBYTES;
totnum += mbpstat[i]->mb_clbucks * mbstat->m_clperbuck;
totfree += mbpstat[i]->mb_clfree;
}
totused[1] = totnum - totfree;
if (cflag) {
printf("mbuf cluster usage:\n"
"\tTotal:\t\t%lu/%lu/%d (in use/in pool/max)\n",
totused[1], totnum, nmbclusters);
gentotnum = mbpstat[GENLST]->mb_clbucks * mbstat->m_clperbuck;
gentotfree = mbpstat[GENLST]->mb_clfree;
printf("\tGEN cache:\t%lu/%lu (in use/in pool)\n",
gentotnum - gentotfree, gentotnum);
} else {
/* XXX: peak is now wrong. */
printf("%lu/%lu/%d mbuf clusters in use (current/peak/max)\n",
totused[1], totnum, nmbclusters);
}
for (i = 0; cflag && i < (num_objs - 1); i++) {
if (mbpstat[i]->mb_active == 0)
continue;
printf("\tCPU #%d cache:\t%lu/%lu (in use/in pool)\n",
i,
(mbpstat[i]->mb_clbucks * mbstat->m_clperbuck -
mbpstat[i]->mb_clfree),
(mbpstat[i]->mb_clbucks * mbstat->m_clperbuck));
}
if (cflag) {
printf("\tCluster cache high watermark: %d\n", clust_hiwm);
#ifdef NOTYET
printf("\tCluster cache low watermark: %d\n", clust_lowm);
#endif
}
if (cflag)
printf("\t%.1f%% of cluster map consumed\n",
totspace[1] * 100.0 / (nmbclusters * MCLBYTES));
printf("%lu/%d mbuf clusters in use (current/max)\n",
mbstat->m_mclusts, nmbclusters);
mlen = sizeof(nsfbufs);
if (!sysctlbyname("kern.ipc.nsfbufs", &nsfbufs, &mlen, NULL, 0) &&
!sysctlbyname("kern.ipc.nsfbufsused", &nsfbufsused, &mlen, NULL,
@@ -344,15 +178,8 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
printf("%d/%d/%d sfbufs in use (current/peak/max)\n",
nsfbufsused, nsfbufspeak, nsfbufs);
}
totmem = nmbufs * MSIZE + nmbclusters * MCLBYTES;
totmemalloced = totspace[0] + totspace[1];
totmemused = totused[0] * MSIZE + totused[1] * MCLBYTES;
printf(
"%lu KBytes allocated to network (%.1f%% in use, %.1f%% wired)\n",
totmem / 1024, totmemused * 100.0 / totmem,
totmemalloced * 100.0 / totmem);
printf("%lu requests for memory denied\n", mbstat->m_drops);
printf("%lu requests for memory delayed\n", mbstat->m_wait);
printf("%lu KBytes allocated to network\n", (mbstat->m_mbufs * MSIZE +
mbstat->m_mclusts * MCLBYTES) / 1024);
printf("%lu requests for sfbufs denied\n", mbstat->sf_allocfail);
printf("%lu requests for sfbufs delayed\n", mbstat->sf_allocwait);
printf("%lu requests for I/O initiated by sendfile\n",
@@ -366,9 +193,4 @@ mbpr(u_long mbaddr, u_long mbtaddr __unused, u_long nmbcaddr, u_long nmbufaddr,
free(seen);
if (mbstat != NULL)
free(mbstat);
if (mbpstat != NULL) {
if (mbpstat[0] != NULL)
free(mbpstat[0]);
free(mbpstat);
}
}

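[Illustration, not part of this commit: with the per-CPU mbpstat walk gone,
the statistics printed above come straight from the kern.ipc.mbstat and
kern.ipc.nmbclusters sysctls already used in the diff.  A minimal standalone
sketch of that fetch, assuming only the struct mbstat fields the diff itself
touches (m_mbufs, m_mclusts):]

#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
	struct mbstat mbstat;
	size_t mlen;
	int nmbclusters;

	/* Fetch the consolidated mbuf statistics structure. */
	mlen = sizeof(mbstat);
	if (sysctlbyname("kern.ipc.mbstat", &mbstat, &mlen, NULL, 0) < 0)
		err(1, "sysctl: retrieving mbstat");

	/* Fetch the cluster limit (boot-time tunable, 0 == unlimited). */
	mlen = sizeof(nmbclusters);
	if (sysctlbyname("kern.ipc.nmbclusters", &nmbclusters, &mlen,
	    NULL, 0) < 0)
		err(1, "sysctl: retrieving nmbclusters");

	printf("%lu mbufs in use\n", mbstat.m_mbufs);
	printf("%lu/%d mbuf clusters in use (current/max)\n",
	    mbstat.m_mclusts, nmbclusters);
	return (0);
}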

@@ -181,7 +181,6 @@ or for a single
.Bk -words
.Nm
.Fl m
.Op Fl c
.Op Fl M Ar core
.Op Fl N Ar system
.Ek
@@ -189,9 +188,6 @@ or for a single
Show statistics recorded by the memory management routines
.Pq Xr mbuf 9 .
The network manages a private pool of memory buffers.
The
.Fl c
option shows per-CPU statistics for caching.
.It Xo
.Bk -words
.Nm


@@ -39,7 +39,6 @@
extern int Aflag; /* show addresses of protocol control block */
extern int aflag; /* show all sockets (including servers) */
extern int bflag; /* show i/f total bytes in/out */
extern int cflag; /* show mbuf cache information */
extern int dflag; /* show i/f dropped packets */
extern int gflag; /* show group (multicast) routing or stats */
extern int iflag; /* show interfaces */


@@ -52,12 +52,9 @@ static const char sccsid[] = "@(#)mbufs.c 8.1 (Berkeley) 6/6/93";
#include "systat.h"
#include "extern.h"
static struct mbpstat **mbpstat;
static struct mbstat *mbstat;
static int num_objs;
static long *m_mbtypes;
static short nmbtypes;
#define GENLST (num_objs - 1)
static struct mtnames {
short mt_type;
@@ -101,20 +98,11 @@ void
showmbufs()
{
int i, j, max, idx;
u_long totfree;
u_long totmbufs;
char buf[10];
const char *mtname;
totfree = mbpstat[GENLST]->mb_mbfree;
for (i = 1; i < nmbtypes; i++)
m_mbtypes[i] += mbpstat[GENLST]->mb_mbtypes[i];
for (i = 0; i < GENLST; i++) {
if (mbpstat[i]->mb_active == 0)
continue;
totfree += mbpstat[i]->mb_mbfree;
for (j = 1; j < nmbtypes; j++)
m_mbtypes[j] += mbpstat[i]->mb_mbtypes[j];
}
totmbufs = mbstat->m_mbufs;
/*
* Print totals for different mbuf types.
@@ -159,16 +147,16 @@ showmbufs()
/*
* Print total number of free mbufs.
*/
if (totfree > 0) {
mvwprintw(wnd, 1+j, 0, "%-10.10s", "free");
if (totfree > 60) {
snprintf(buf, sizeof(buf), " %lu", totfree);
totfree = 60;
while(totfree--)
if (totmbufs > 0) {
mvwprintw(wnd, 1+j, 0, "%-10.10s", "Mbufs");
if (totmbufs > 60) {
snprintf(buf, sizeof(buf), " %lu", totmbufs);
totmbufs = 60;
while(totmbufs--)
waddch(wnd, 'X');
waddstr(wnd, buf);
} else {
while(totfree--)
while(totmbufs--)
waddch(wnd, 'X');
}
wclrtoeol(wnd);
@@ -198,23 +186,6 @@ initmbufs()
return 0;
}
if (sysctlbyname("kern.ipc.mb_statpcpu", NULL, &len, NULL, 0) < 0) {
error("sysctl getting mbpstat total size failed");
return 0;
}
num_objs = (int)(len / sizeof(struct mbpstat));
if ((mbpstat = calloc(num_objs, sizeof(struct mbpstat *))) == NULL) {
error("calloc mbpstat pointers failed");
return 0;
}
if ((mbpstat[0] = calloc(num_objs, sizeof(struct mbpstat))) == NULL) {
error("calloc mbpstat structures failed");
return 0;
}
for (i = 0; i < num_objs; i++)
mbpstat[i] = mbpstat[0] + i;
return 1;
}
@@ -223,7 +194,7 @@ fetchmbufs()
{
size_t len;
len = num_objs * sizeof(struct mbpstat);
if (sysctlbyname("kern.ipc.mb_statpcpu", mbpstat[0], &len, NULL, 0) < 0)
printw("sysctl: mbpstat: %s", strerror(errno));
len = sizeof *mbstat;
if (sysctlbyname("kern.ipc.mbstat", mbstat, &len, NULL, 0) < 0)
printw("sysctl: mbstat: %s", strerror(errno));
}