2005-01-07 02:29:27 +00:00
|
|
|
/*-
|
2017-11-27 15:23:17 +00:00
|
|
|
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
|
|
|
|
*
|
2019-11-28 07:49:25 +00:00
|
|
|
* Copyright (c) 2002-2019 Jeffrey Roberson <jeff@FreeBSD.org>
|
2005-07-16 09:51:52 +00:00
|
|
|
* Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
|
2006-10-26 12:55:32 +00:00
|
|
|
* Copyright (c) 2004-2006 Robert N. M. Watson
|
2005-07-16 09:51:52 +00:00
|
|
|
* All rights reserved.
|
2002-03-19 09:11:49 +00:00
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice unmodified, this list of conditions, and the following
|
|
|
|
* disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
|
|
|
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
|
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
|
|
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
|
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
|
|
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
|
|
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uma_core.c Implementation of the Universal Memory allocator
|
|
|
|
*
|
|
|
|
* This allocator is intended to replace the multitude of similar object caches
|
|
|
|
* in the standard FreeBSD kernel. The intent is to be flexible as well as
|
2016-05-02 20:16:29 +00:00
|
|
|
* efficient. A primary design goal is to return unused memory to the rest of
|
2004-01-30 16:26:29 +00:00
|
|
|
* the system. This will make the system as a whole more flexible due to the
|
2002-03-19 09:11:49 +00:00
|
|
|
* ability to move memory to subsystems which most need it instead of leaving
|
|
|
|
* pools of reserved memory unused.
|
|
|
|
*
|
|
|
|
* The basic ideas stem from similar slab/zone based allocators whose algorithms
|
|
|
|
* are well known.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* TODO:
|
|
|
|
* - Improve memory usage for large allocations
|
|
|
|
* - Investigate cache size adjustments
|
|
|
|
*/
|
|
|
|
|
2003-06-11 23:50:51 +00:00
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
2005-10-20 16:39:33 +00:00
|
|
|
#include "opt_ddb.h"
|
2002-03-19 09:11:49 +00:00
|
|
|
#include "opt_param.h"
|
2011-10-12 18:08:28 +00:00
|
|
|
#include "opt_vm.h"
|
2005-10-20 16:39:33 +00:00
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/systm.h>
|
2021-04-13 17:39:50 -04:00
|
|
|
#include <sys/asan.h>
|
2013-06-13 21:05:38 +00:00
|
|
|
#include <sys/bitset.h>
|
2018-10-24 16:41:47 +00:00
|
|
|
#include <sys/domainset.h>
|
2017-02-25 16:39:21 +00:00
|
|
|
#include <sys/eventhandler.h>
|
2002-03-19 09:11:49 +00:00
|
|
|
#include <sys/kernel.h>
|
|
|
|
#include <sys/types.h>
|
2018-01-02 04:35:56 +00:00
|
|
|
#include <sys/limits.h>
|
2002-03-19 09:11:49 +00:00
|
|
|
#include <sys/queue.h>
|
|
|
|
#include <sys/malloc.h>
|
2004-08-06 21:52:38 +00:00
|
|
|
#include <sys/ktr.h>
|
2002-03-19 09:11:49 +00:00
|
|
|
#include <sys/lock.h>
|
2021-08-10 17:15:03 -04:00
|
|
|
#include <sys/msan.h>
|
2002-03-19 09:11:49 +00:00
|
|
|
#include <sys/mutex.h>
|
2002-05-20 17:54:48 +00:00
|
|
|
#include <sys/proc.h>
|
This is the much-discussed major upgrade to the random(4) device, known to you all as /dev/random.
This code has had an extensive rewrite and a good series of reviews, both by the author and other parties. This means a lot of code has been simplified. Pluggable structures for high-rate entropy generators are available, and it is most definitely not the case that /dev/random can be driven by only a hardware source any more. This has been designed out of the device. Hardware sources are stirred into the CSPRNG (Yarrow, Fortuna) like any other entropy source. Pluggable modules may be written by third parties for additional sources.
The harvesting structures and consequently the locking have been simplified. Entropy harvesting is done in a more general way (the documentation for this will follow). There is some GREAT entropy to be had in the UMA allocator, but it is disabled for now as messing with that is likely to annoy many people.
The venerable (but effective) Yarrow algorithm, which is no longer supported by its authors now has an alternative, Fortuna. For now, Yarrow is retained as the default algorithm, but this may be changed using a kernel option. It is intended to make Fortuna the default algorithm for 11.0. Interested parties are encouraged to read ISBN 978-0-470-47424-2 "Cryptography Engineering" By Ferguson, Schneier and Kohno for Fortuna's gory details. Heck, read it anyway.
Many thanks to Arthur Mesh who did early grunt work, and who got caught in the crossfire rather more than he deserved to.
My thanks also to folks who helped me thresh this out on whiteboards and in the odd "Hallway track", or otherwise.
My Nomex pants are on. Let the feedback commence!
Reviewed by: trasz,des(partial),imp(partial?),rwatson(partial?)
Approved by: so(des)
2014-10-30 21:21:53 +00:00
|
|
|
#include <sys/random.h>
|
2013-03-09 02:32:23 +00:00
|
|
|
#include <sys/rwlock.h>
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user space buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
#include <sys/sbuf.h>
|
2013-11-19 10:51:46 +00:00
|
|
|
#include <sys/sched.h>
|
2020-01-04 03:04:46 +00:00
|
|
|
#include <sys/sleepqueue.h>
|
2002-03-19 09:11:49 +00:00
|
|
|
#include <sys/smp.h>
|
2020-01-31 00:49:51 +00:00
|
|
|
#include <sys/smr.h>
|
2021-08-10 17:15:03 -04:00
|
|
|
#include <sys/sysctl.h>
|
2016-02-03 23:30:17 +00:00
|
|
|
#include <sys/taskqueue.h>
|
2002-04-08 06:20:34 +00:00
|
|
|
#include <sys/vmmeter.h>
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
#include <vm/vm.h>
|
2020-10-14 22:51:40 +00:00
|
|
|
#include <vm/vm_param.h>
|
2018-10-24 16:41:47 +00:00
|
|
|
#include <vm/vm_domainset.h>
|
2002-03-19 09:11:49 +00:00
|
|
|
#include <vm/vm_object.h>
|
|
|
|
#include <vm/vm_page.h>
|
2013-02-26 23:35:27 +00:00
|
|
|
#include <vm/vm_pageout.h>
|
2018-01-12 23:25:05 +00:00
|
|
|
#include <vm/vm_phys.h>
|
2018-10-01 14:14:21 +00:00
|
|
|
#include <vm/vm_pagequeue.h>
|
2002-03-19 09:11:49 +00:00
|
|
|
#include <vm/vm_map.h>
|
|
|
|
#include <vm/vm_kern.h>
|
|
|
|
#include <vm/vm_extern.h>
|
2020-10-14 22:51:40 +00:00
|
|
|
#include <vm/vm_dumpset.h>
|
2002-03-19 09:11:49 +00:00
|
|
|
#include <vm/uma.h>
|
|
|
|
#include <vm/uma_int.h>
|
2002-05-02 02:08:48 +00:00
|
|
|
#include <vm/uma_dbg.h>
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2005-10-20 16:39:33 +00:00
|
|
|
#include <ddb/ddb.h>
|
|
|
|
|
2011-10-12 18:08:28 +00:00
|
|
|
#ifdef DEBUG_MEMGUARD
|
|
|
|
#include <vm/memguard.h>
|
|
|
|
#endif
|
|
|
|
|
2020-01-16 05:01:21 +00:00
|
|
|
#include <machine/md_var.h>
|
|
|
|
|
2020-01-31 00:49:51 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
#define UMA_ALWAYS_CTORDTOR 1
|
|
|
|
#else
|
|
|
|
#define UMA_ALWAYS_CTORDTOR 0
|
|
|
|
#endif
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/*
|
2018-01-12 23:25:05 +00:00
|
|
|
* This is the zone and keg from which all zones are spawned.
|
2002-03-19 09:11:49 +00:00
|
|
|
*/
|
2018-01-12 23:25:05 +00:00
|
|
|
static uma_zone_t kegs;
|
|
|
|
static uma_zone_t zones;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2020-03-07 15:37:23 +00:00
|
|
|
/*
|
|
|
|
* On INVARIANTS builds, the slab contains a second bitset of the same size,
|
|
|
|
* "dbg_bits", which is laid out immediately after us_free.
|
|
|
|
*/
|
|
|
|
#ifdef INVARIANTS
|
|
|
|
#define SLAB_BITSETS 2
|
|
|
|
#else
|
|
|
|
#define SLAB_BITSETS 1
|
|
|
|
#endif
|
|
|
|
|
2020-01-14 02:14:15 +00:00
|
|
|
/*
|
|
|
|
* These are the two zones from which all offpage uma_slab_ts are allocated.
|
|
|
|
*
|
|
|
|
* One zone is for slab headers that can represent a larger number of items,
|
|
|
|
* making the slabs themselves more efficient, and the other zone is for
|
|
|
|
* headers that are smaller and represent fewer items, making the headers more
|
|
|
|
* efficient.
|
|
|
|
*/
|
|
|
|
#define SLABZONE_SIZE(setsize) \
|
|
|
|
(sizeof(struct uma_hash_slab) + BITSET_SIZE(setsize) * SLAB_BITSETS)
|
|
|
|
#define SLABZONE0_SETSIZE (PAGE_SIZE / 16)
|
|
|
|
#define SLABZONE1_SETSIZE SLAB_MAX_SETSIZE
|
|
|
|
#define SLABZONE0_SIZE SLABZONE_SIZE(SLABZONE0_SETSIZE)
|
|
|
|
#define SLABZONE1_SIZE SLABZONE_SIZE(SLABZONE1_SETSIZE)
|
|
|
|
static uma_zone_t slabzones[2];
|
2002-03-19 09:11:49 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The initial hash tables come out of this zone so they can be allocated
|
|
|
|
* prior to malloc coming up.
|
|
|
|
*/
|
|
|
|
static uma_zone_t hashzone;
|
|
|
|
|
2007-02-11 20:13:52 +00:00
|
|
|
/* The boot-time adjusted value for cache line alignment. */
|
2011-03-21 09:40:01 +00:00
|
|
|
int uma_align_cache = 64 - 1;
|
2007-02-11 20:13:52 +00:00
|
|
|
|
2003-09-19 07:23:50 +00:00
|
|
|
static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
|
2019-11-28 00:19:09 +00:00
|
|
|
static MALLOC_DEFINE(M_UMA, "UMA", "UMA Misc");
|
2003-09-19 07:23:50 +00:00
|
|
|
|
2002-04-08 06:20:34 +00:00
|
|
|
/*
|
|
|
|
* Are we allowed to allocate buckets?
|
|
|
|
*/
|
|
|
|
static int bucketdisable = 1;
|
|
|
|
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
/* Linked list of all kegs in the system */
|
2009-12-28 22:56:30 +00:00
|
|
|
static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2013-11-28 19:20:49 +00:00
|
|
|
/* Linked list of all cache-only zones in the system */
|
|
|
|
static LIST_HEAD(,uma_zone) uma_cachezones =
|
|
|
|
LIST_HEAD_INITIALIZER(uma_cachezones);
|
|
|
|
|
2021-04-14 12:57:24 -04:00
|
|
|
/*
|
|
|
|
* Mutex for global lists: uma_kegs, uma_cachezones, and the per-keg list of
|
|
|
|
* zones.
|
|
|
|
*/
|
2017-09-06 20:28:18 +00:00
|
|
|
static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2021-04-14 12:57:24 -04:00
|
|
|
static struct sx uma_reclaim_lock;
|
|
|
|
|
2017-06-01 18:26:57 +00:00
|
|
|
/*
|
2020-01-16 05:01:21 +00:00
|
|
|
* First available virtual address for boot time allocations.
|
2017-06-01 18:26:57 +00:00
|
|
|
*/
|
2020-01-16 05:01:21 +00:00
|
|
|
static vm_offset_t bootstart;
|
|
|
|
static vm_offset_t bootmem;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2019-06-06 16:26:58 +00:00
|
|
|
/*
|
|
|
|
* kmem soft limit, initialized by uma_set_limit(). Ensure that early
|
|
|
|
* allocations don't trigger a wakeup of the reclaim thread.
|
|
|
|
*/
|
2019-11-29 03:14:10 +00:00
|
|
|
unsigned long uma_kmem_limit = LONG_MAX;
|
2019-06-06 16:26:58 +00:00
|
|
|
SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0,
|
|
|
|
"UMA kernel memory soft limit");
|
2019-11-29 03:14:10 +00:00
|
|
|
unsigned long uma_kmem_total;
|
2019-06-06 16:26:58 +00:00
|
|
|
SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0,
|
|
|
|
"UMA kernel memory usage");
|
2017-11-28 23:40:54 +00:00
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/* Is the VM done starting up? */
|
2020-01-09 19:17:42 +00:00
|
|
|
/*
 * File-local boot-stage state.  The enumerators are declared in ascending
 * order from BOOT_COLD to BOOT_SHUTDOWN, and the variable starts out as
 * BOOT_COLD.
 * NOTE(review): the code that advances this state is not visible in this
 * chunk; the per-stage meanings suggested by the names (KVA available,
 * per-CPU ready, running, shutting down) should be confirmed there.
 */
static enum {
|
|
|
|
BOOT_COLD,
|
2020-01-16 05:01:21 +00:00
|
|
|
BOOT_KVA,
|
2020-05-14 16:06:54 +00:00
|
|
|
BOOT_PCPU,
|
2020-01-09 19:17:42 +00:00
|
|
|
BOOT_RUNNING,
|
|
|
|
BOOT_SHUTDOWN,
|
|
|
|
} booted = BOOT_COLD;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2003-09-19 23:27:46 +00:00
|
|
|
/*
|
|
|
|
* This is the handle used to schedule events that need to happen
|
|
|
|
* outside of the allocation fast path.
|
|
|
|
*/
|
2002-03-19 09:11:49 +00:00
|
|
|
static struct callout uma_callout;
|
2003-09-19 23:27:46 +00:00
|
|
|
#define UMA_TIMEOUT 20 /* Seconds for callout interval. */
|
2002-03-19 09:11:49 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This structure is passed as the zone ctor arg so that I don't have to create
|
|
|
|
* a special allocation function just for zones.
|
|
|
|
*/
|
|
|
|
/*
 * Argument bundle handed to the zone constructor.  It carries:
 *   - name, size: a string and a size_t for the zone;
 *   - ctor, dtor, uminit, fini: callbacks of the uma_ctor, uma_dtor,
 *     uma_init and uma_fini types;
 *   - import, release, arg: callbacks of the uma_import and uma_release
 *     types plus an opaque pointer;
 *   - keg: a keg handle;
 *   - align, flags: an int and a 32-bit flag word.
 * NOTE(review): how each field is consumed (e.g. whether arg is passed to
 * import/release, and when keg may be NULL) is not visible in this chunk;
 * confirm against the zone constructor before relying on it.
 */
struct uma_zctor_args {
|
2012-10-26 17:51:05 +00:00
|
|
|
const char *name;
|
2002-05-02 07:36:30 +00:00
|
|
|
size_t size;
|
2002-03-19 09:11:49 +00:00
|
|
|
uma_ctor ctor;
|
|
|
|
uma_dtor dtor;
|
|
|
|
uma_init uminit;
|
|
|
|
uma_fini fini;
|
2013-06-17 03:43:47 +00:00
|
|
|
uma_import import;
|
|
|
|
uma_release release;
|
|
|
|
void *arg;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
uma_keg_t keg;
|
|
|
|
int align;
|
2013-04-09 17:43:48 +00:00
|
|
|
uint32_t flags;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Argument bundle for keg construction: a zone handle, an item size,
 * uma_init/uma_fini callbacks, and align/flags values.
 * NOTE(review): presumably passed as the user-data argument of keg_ctor(),
 * by analogy with uma_zctor_args; the consumer is not visible in this
 * chunk, so confirm there.
 */
struct uma_kctor_args {
|
|
|
|
uma_zone_t zone;
|
|
|
|
size_t size;
|
|
|
|
uma_init uminit;
|
|
|
|
uma_fini fini;
|
2002-03-19 09:11:49 +00:00
|
|
|
int align;
|
2013-04-09 17:43:48 +00:00
|
|
|
uint32_t flags;
|
2002-03-19 09:11:49 +00:00
|
|
|
};
|
|
|
|
|
2003-09-19 06:26:45 +00:00
|
|
|
/*
 * Describes one bucket size class: a zone handle, a name string, the number
 * of items a bucket of this class holds, and the largest per-item allocation
 * size the class is meant for.
 * NOTE(review): ubz_zone is NULL in the static bucket_zones[] table, so it
 * is presumably filled in at startup; confirm where it is assigned.
 */
struct uma_bucket_zone {
|
|
|
|
uma_zone_t ubz_zone;
|
2020-02-23 03:32:04 +00:00
|
|
|
const char *ubz_name;
|
2013-06-18 04:50:20 +00:00
|
|
|
int ubz_entries; /* Number of items it can hold. */
|
|
|
|
int ubz_maxsize; /* Maximum allocation size per-item. */
|
2003-09-19 06:26:45 +00:00
|
|
|
};
|
|
|
|
|
2013-06-18 04:50:20 +00:00
|
|
|
/*
|
|
|
|
* Compute the actual number of bucket entries to pack them in power
|
|
|
|
* of two sizes for more efficient space utilization.
|
|
|
|
*/
|
|
|
|
#define BUCKET_SIZE(n) \
|
|
|
|
(((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
|
|
|
|
|
2014-06-12 11:57:07 +00:00
|
|
|
#define BUCKET_MAX BUCKET_SIZE(256)
|
2003-09-19 06:26:45 +00:00
|
|
|
|
|
|
|
/*
 * Table of bucket size classes.  Entries are listed with increasing
 * ubz_entries and decreasing ubz_maxsize, every ubz_zone starts out NULL,
 * and the table ends with a sentinel whose name is NULL and whose entry
 * count is zero.
 */
struct uma_bucket_zone bucket_zones[] = {
|
2020-02-04 20:28:06 +00:00
|
|
|
/* Literal bucket sizes. */
|
|
|
|
{ NULL, "2 Bucket", 2, 4096 },
|
|
|
|
{ NULL, "4 Bucket", 4, 3072 },
|
|
|
|
{ NULL, "8 Bucket", 8, 2048 },
|
|
|
|
{ NULL, "16 Bucket", 16, 1024 },
|
|
|
|
/* Rounded down power of 2 sizes for efficiency. */
|
2013-06-18 04:50:20 +00:00
|
|
|
{ NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
|
|
|
|
{ NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
|
|
|
|
{ NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
|
2014-06-12 11:57:07 +00:00
|
|
|
{ NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
|
2003-09-19 06:26:45 +00:00
|
|
|
/* Sentinel: NULL name, zero entries; ubz_maxsize is implicitly zero. */
{ NULL, NULL, 0}
|
|
|
|
};
|
|
|
|
|
2005-07-15 23:34:39 +00:00
|
|
|
/*
|
|
|
|
* Flags and enumerations to be passed to internal functions.
|
|
|
|
*/
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
/*
 * Skip values passed to internal functions.  SKIP_NONE is zero, SKIP_CNT
 * is bit 0, and SKIP_DTOR and SKIP_FINI are bits 16 and 17.
 * NOTE(review): presumably SKIP_CNT suppresses item accounting while
 * SKIP_DTOR/SKIP_FINI suppress the dtor/fini callbacks on free; the
 * consumers are not visible in this chunk, so confirm how the values are
 * compared or masked there.
 */
enum zfreeskip {
|
|
|
|
SKIP_NONE = 0,
|
|
|
|
SKIP_CNT = 0x00000001,
|
|
|
|
SKIP_DTOR = 0x00010000,
|
|
|
|
SKIP_FINI = 0x00020000,
|
|
|
|
};
|
2004-08-02 00:18:36 +00:00
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/* Prototypes.. */
|
|
|
|
|
2020-01-16 05:01:21 +00:00
|
|
|
void uma_startup1(vm_offset_t);
|
Followup on r302393 by cperciva, improving calculation of boot pages required
for UMA startup.
o Introduce another stage of UMA startup, which is entered after
vm_page_startup() finishes. After this stage we don't yet enable buckets,
but we can ask VM for pages. Rename stages to meaningful names while here.
New list of stages: BOOT_COLD, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
BOOT_RUNNING.
Enabling page alloc earlier allows us to dramatically reduce number of
boot pages required. What is more important number of zones becomes
consistent across different machines, as no MD allocations are done before
the BOOT_PAGEALLOC stage. Now only UMA internal zones actually need to use
startup_alloc(), however that may change, so vm_page_startup() provides
its need for early zones as argument.
o Introduce uma_startup_count() function, to avoid code duplication. The
function calculates sizes of zones zone and kegs zone, and calculates how
many pages UMA will need to bootstrap.
It counts not only of zone structures, but also of kegs, slabs and hashes.
o Hide uma_startup_foo() declarations from public file.
o Provide several DIAGNOSTIC printfs on boot_pages usage.
o Bugfix: when calculating zone of zones size use (mp_maxid + 1) instead of
mp_ncpus. Use resulting number not only in the size argument to zone_ctor()
but also as args.size.
Reviewed by: imp, gallatin (earlier version)
Differential Revision: https://reviews.freebsd.org/D14054
2018-02-06 04:16:00 +00:00
|
|
|
void uma_startup2(void);
|
|
|
|
|
2018-01-12 23:25:05 +00:00
|
|
|
static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
|
|
|
|
static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
|
2018-07-06 02:06:03 +00:00
|
|
|
static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
|
2018-01-12 23:25:05 +00:00
|
|
|
static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
|
2020-02-04 22:40:11 +00:00
|
|
|
static void *contig_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
|
2015-04-01 12:42:26 +00:00
|
|
|
static void page_free(void *, vm_size_t, uint8_t);
|
2018-07-06 02:06:03 +00:00
|
|
|
static void pcpu_page_free(void *, vm_size_t, uint8_t);
|
2019-01-23 18:58:15 +00:00
|
|
|
static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
|
2003-09-19 23:27:46 +00:00
|
|
|
static void cache_drain(uma_zone_t);
|
2002-03-19 09:11:49 +00:00
|
|
|
static void bucket_drain(uma_zone_t, uma_bucket_t);
|
2021-04-14 12:57:24 -04:00
|
|
|
static void bucket_cache_reclaim(uma_zone_t zone, bool, int);
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
static bool bucket_cache_reclaim_domain(uma_zone_t, bool, bool, int);
|
2004-08-02 00:18:36 +00:00
|
|
|
static int keg_ctor(void *, int, void *, int);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
static void keg_dtor(void *, int, void *);
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it sees free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
static void keg_drain(uma_keg_t keg, int domain);
|
2004-08-02 00:18:36 +00:00
|
|
|
static int zone_ctor(void *, int, void *, int);
|
2002-04-08 04:48:58 +00:00
|
|
|
static void zone_dtor(void *, int, void *);
|
2020-01-31 00:49:51 +00:00
|
|
|
static inline void item_dtor(uma_zone_t zone, void *item, int size,
|
|
|
|
void *udata, enum zfreeskip skip);
|
2004-08-02 00:18:36 +00:00
|
|
|
static int zero_init(void *, int, int);
|
2020-02-19 18:48:46 +00:00
|
|
|
static void zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata,
|
|
|
|
int itemdomain, bool ws);
|
2019-11-28 00:19:09 +00:00
|
|
|
static void zone_foreach(void (*zfunc)(uma_zone_t, void *), void *);
|
2020-01-16 05:01:21 +00:00
|
|
|
static void zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *), void *);
|
2019-11-28 00:19:09 +00:00
|
|
|
static void zone_timeout(uma_zone_t zone, void *);
|
2019-06-06 23:57:28 +00:00
|
|
|
static int hash_alloc(struct uma_hash *, u_int);
|
2002-05-13 04:39:28 +00:00
|
|
|
static int hash_expand(struct uma_hash *, struct uma_hash *);
|
|
|
|
static void hash_free(struct uma_hash *hash);
|
2002-03-19 09:11:49 +00:00
|
|
|
static void uma_timeout(void *);
|
2020-01-09 19:17:42 +00:00
|
|
|
static void uma_shutdown(void);
|
2018-01-12 23:25:05 +00:00
|
|
|
static void *zone_alloc_item(uma_zone_t, void *, int, int);
|
2013-06-17 03:43:47 +00:00
|
|
|
static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
|
2020-01-04 03:04:46 +00:00
|
|
|
static int zone_alloc_limit(uma_zone_t zone, int count, int flags);
|
|
|
|
static void zone_free_limit(uma_zone_t zone, int count);
|
2002-04-08 06:20:34 +00:00
|
|
|
static void bucket_enable(void);
|
2003-09-19 06:26:45 +00:00
|
|
|
static void bucket_init(void);
|
2013-06-26 00:57:38 +00:00
|
|
|
static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
|
|
|
|
static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
|
2021-04-14 12:57:24 -04:00
|
|
|
static void bucket_zone_drain(int domain);
|
2019-11-26 22:17:02 +00:00
|
|
|
static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int);
|
2013-06-17 03:43:47 +00:00
|
|
|
static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item);
|
2021-04-13 17:39:50 -04:00
|
|
|
static size_t slab_sizeof(int nitems);
|
2009-01-25 09:11:24 +00:00
|
|
|
static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
|
2013-04-09 17:43:48 +00:00
|
|
|
uma_fini fini, int align, uint32_t flags);
|
2019-12-04 18:40:05 +00:00
|
|
|
static int zone_import(void *, void **, int, int, int);
|
|
|
|
static void zone_release(void *, void **, int);
|
2019-11-26 22:17:02 +00:00
|
|
|
static bool cache_alloc(uma_zone_t, uma_cache_t, void *, int);
|
2019-11-27 23:19:06 +00:00
|
|
|
static bool cache_free(uma_zone_t, uma_cache_t, void *, void *, int);
|
2002-10-24 07:59:03 +00:00
|
|
|
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user space buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
|
|
|
|
static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
|
2019-11-28 00:19:09 +00:00
|
|
|
static int sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS);
|
|
|
|
static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS);
|
2019-12-11 06:50:55 +00:00
|
|
|
static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS);
|
2019-12-13 09:32:09 +00:00
|
|
|
static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS);
|
2020-01-04 03:04:46 +00:00
|
|
|
static int sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS);
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2020-01-04 19:29:25 +00:00
|
|
|
static uint64_t uma_zone_get_allocs(uma_zone_t zone);
|
|
|
|
|
2020-02-26 14:26:36 +00:00
|
|
|
static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
|
2020-02-04 22:40:45 +00:00
|
|
|
"Memory allocation debugging");
|
|
|
|
|
2016-02-03 22:02:36 +00:00
|
|
|
#ifdef INVARIANTS
|
2020-01-04 19:29:25 +00:00
|
|
|
static uint64_t uma_keg_get_allocs(uma_keg_t zone);
|
2019-12-14 05:21:56 +00:00
|
|
|
static inline struct noslabbits *slab_dbg_bits(uma_slab_t slab, uma_keg_t keg);
|
|
|
|
|
2018-06-08 00:15:08 +00:00
|
|
|
static bool uma_dbg_kskip(uma_keg_t keg, void *mem);
|
|
|
|
static bool uma_dbg_zskip(uma_zone_t zone, void *mem);
|
2016-02-03 22:02:36 +00:00
|
|
|
static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
|
|
|
|
static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
|
2018-06-08 00:15:08 +00:00
|
|
|
|
|
|
|
static u_int dbg_divisor = 1;
|
|
|
|
SYSCTL_UINT(_vm_debug, OID_AUTO, divisor,
|
|
|
|
CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0,
|
|
|
|
"Debug & thrash every this item in memory allocator");
|
|
|
|
|
|
|
|
static counter_u64_t uma_dbg_cnt = EARLY_COUNTER;
|
|
|
|
static counter_u64_t uma_skip_cnt = EARLY_COUNTER;
|
|
|
|
SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
|
|
|
|
&uma_dbg_cnt, "memory items debugged");
|
|
|
|
SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD,
|
|
|
|
&uma_skip_cnt, "memory items skipped, not debugged");
|
2016-02-03 22:02:36 +00:00
|
|
|
#endif
|
|
|
|
|
2020-02-26 14:26:36 +00:00
|
|
|
SYSCTL_NODE(_vm, OID_AUTO, uma, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
|
|
|
|
"Universal Memory Allocator");
|
2019-11-28 04:15:16 +00:00
|
|
|
|
2020-01-12 05:08:57 +00:00
|
|
|
SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_INT,
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user space buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
|
|
|
|
|
2020-01-12 05:08:57 +00:00
|
|
|
SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_STRUCT,
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user space buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
|
|
|
|
|
2012-12-07 22:27:13 +00:00
|
|
|
static int zone_warnings = 1;
|
2014-06-28 03:56:17 +00:00
|
|
|
SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
|
2012-12-07 22:27:13 +00:00
|
|
|
"Warn when UMA zones becomes full");
|
|
|
|
|
2020-02-04 22:40:45 +00:00
|
|
|
static int multipage_slabs = 1;
|
|
|
|
TUNABLE_INT("vm.debug.uma_multipage_slabs", &multipage_slabs);
|
|
|
|
SYSCTL_INT(_vm_debug, OID_AUTO, uma_multipage_slabs,
|
|
|
|
CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &multipage_slabs, 0,
|
|
|
|
"UMA may choose larger slab sizes for better efficiency");
|
|
|
|
|
2020-01-14 02:14:15 +00:00
|
|
|
/*
|
|
|
|
* Select the slab zone for an offpage slab with the given maximum item count.
|
|
|
|
*/
|
|
|
|
static inline uma_zone_t
|
|
|
|
slabzone(int ipers)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (slabzones[ipers > SLABZONE0_SETSIZE]);
|
|
|
|
}
|
|
|
|
|
2002-04-08 06:20:34 +00:00
|
|
|
/*
|
|
|
|
* This routine checks to see whether or not it's safe to enable buckets.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
bucket_enable(void)
|
|
|
|
{
|
2019-12-13 09:32:03 +00:00
|
|
|
|
2020-01-16 05:01:21 +00:00
|
|
|
KASSERT(booted >= BOOT_KVA, ("Bucket enable before init"));
|
2012-05-23 18:56:29 +00:00
|
|
|
bucketdisable = vm_page_count_min();
|
2002-04-08 06:20:34 +00:00
|
|
|
}
|
|
|
|
|
2004-11-06 11:43:30 +00:00
|
|
|
/*
|
|
|
|
* Initialize bucket_zones, the array of zones of buckets of various sizes.
|
|
|
|
*
|
|
|
|
* For each zone, calculate the memory required for each bucket, consisting
|
2013-06-18 04:50:20 +00:00
|
|
|
* of the header and an array of pointers.
|
2004-11-06 11:43:30 +00:00
|
|
|
*/
|
2003-09-19 06:26:45 +00:00
|
|
|
static void
|
|
|
|
bucket_init(void)
|
|
|
|
{
|
|
|
|
struct uma_bucket_zone *ubz;
|
2013-06-18 04:50:20 +00:00
|
|
|
int size;
|
2003-09-19 06:26:45 +00:00
|
|
|
|
2015-04-20 16:48:21 +00:00
|
|
|
for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
|
2003-09-19 06:26:45 +00:00
|
|
|
size = roundup(sizeof(struct uma_bucket), sizeof(void *));
|
|
|
|
size += sizeof(void *) * ubz->ubz_entries;
|
|
|
|
ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
|
2009-01-25 09:11:24 +00:00
|
|
|
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
|
2020-01-04 18:48:13 +00:00
|
|
|
UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET |
|
|
|
|
UMA_ZONE_FIRSTTOUCH);
|
2003-09-19 06:26:45 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2004-11-06 11:43:30 +00:00
|
|
|
/*
|
|
|
|
* Given a desired number of entries for a bucket, return the zone from which
|
|
|
|
* to allocate the bucket.
|
|
|
|
*/
|
|
|
|
static struct uma_bucket_zone *
|
|
|
|
bucket_zone_lookup(int entries)
|
|
|
|
{
|
2013-06-18 04:50:20 +00:00
|
|
|
struct uma_bucket_zone *ubz;
|
2004-11-06 11:43:30 +00:00
|
|
|
|
2013-06-18 04:50:20 +00:00
|
|
|
for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
|
|
|
|
if (ubz->ubz_entries >= entries)
|
|
|
|
return (ubz);
|
|
|
|
ubz--;
|
|
|
|
return (ubz);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
bucket_select(int size)
|
|
|
|
{
|
|
|
|
struct uma_bucket_zone *ubz;
|
|
|
|
|
|
|
|
ubz = &bucket_zones[0];
|
|
|
|
if (size > ubz->ubz_maxsize)
|
|
|
|
return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
|
|
|
|
|
|
|
|
for (; ubz->ubz_entries != 0; ubz++)
|
|
|
|
if (ubz->ubz_maxsize < size)
|
|
|
|
break;
|
|
|
|
ubz--;
|
|
|
|
return (ubz->ubz_entries);
|
2004-11-06 11:43:30 +00:00
|
|
|
}
|
|
|
|
|
2003-09-19 06:26:45 +00:00
|
|
|
static uma_bucket_t
|
2013-06-26 00:57:38 +00:00
|
|
|
bucket_alloc(uma_zone_t zone, void *udata, int flags)
|
2003-09-19 06:26:45 +00:00
|
|
|
{
|
|
|
|
struct uma_bucket_zone *ubz;
|
|
|
|
uma_bucket_t bucket;
|
|
|
|
|
|
|
|
/*
|
2020-01-31 00:49:51 +00:00
|
|
|
* Don't allocate buckets early in boot.
|
2003-09-19 06:26:45 +00:00
|
|
|
*/
|
2020-01-31 00:49:51 +00:00
|
|
|
if (__predict_false(booted < BOOT_KVA))
|
2003-09-19 06:26:45 +00:00
|
|
|
return (NULL);
|
2020-01-16 05:01:21 +00:00
|
|
|
|
2013-06-26 00:57:38 +00:00
|
|
|
/*
|
|
|
|
* To limit bucket recursion we store the original zone flags
|
|
|
|
* in a cookie passed via zalloc_arg/zfree_arg. This allows the
|
|
|
|
* NOVM flag to persist even through deep recursions. We also
|
|
|
|
* store ZFLAG_BUCKET once we have recursed attempting to allocate
|
|
|
|
* a bucket for a bucket zone so we do not allow infinite bucket
|
|
|
|
* recursion. This cookie will even persist to frees of unused
|
|
|
|
* buckets via the allocation path or bucket allocations in the
|
|
|
|
* free path.
|
|
|
|
*/
|
|
|
|
if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
|
|
|
|
udata = (void *)(uintptr_t)zone->uz_flags;
|
2013-11-27 19:55:42 +00:00
|
|
|
else {
|
|
|
|
if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
|
|
|
|
return (NULL);
|
2013-06-26 00:57:38 +00:00
|
|
|
udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
|
2013-11-27 19:55:42 +00:00
|
|
|
}
|
2020-02-06 08:32:25 +00:00
|
|
|
if (((uintptr_t)udata & UMA_ZONE_VM) != 0)
|
2013-06-20 19:08:12 +00:00
|
|
|
flags |= M_NOVM;
|
2020-12-06 22:45:39 +00:00
|
|
|
ubz = bucket_zone_lookup(atomic_load_16(&zone->uz_bucket_size));
|
2014-06-12 11:36:22 +00:00
|
|
|
if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
|
|
|
|
ubz++;
|
2013-06-26 00:57:38 +00:00
|
|
|
bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
|
2003-09-19 06:26:45 +00:00
|
|
|
if (bucket) {
|
|
|
|
#ifdef INVARIANTS
|
|
|
|
bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
|
|
|
|
#endif
|
|
|
|
bucket->ub_cnt = 0;
|
2020-12-06 22:45:39 +00:00
|
|
|
bucket->ub_entries = min(ubz->ubz_entries,
|
|
|
|
zone->uz_bucket_size_max);
|
2020-01-31 00:49:51 +00:00
|
|
|
bucket->ub_seq = SMR_SEQ_INVALID;
|
|
|
|
CTR3(KTR_UMA, "bucket_alloc: zone %s(%p) allocated bucket %p",
|
|
|
|
zone->uz_name, zone, bucket);
|
2003-09-19 06:26:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return (bucket);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2013-06-26 00:57:38 +00:00
|
|
|
bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
|
2003-09-19 06:26:45 +00:00
|
|
|
{
|
|
|
|
struct uma_bucket_zone *ubz;
|
|
|
|
|
2020-02-19 18:48:46 +00:00
|
|
|
if (bucket->ub_cnt != 0)
|
|
|
|
bucket_drain(zone, bucket);
|
|
|
|
|
2013-06-18 04:50:20 +00:00
|
|
|
KASSERT(bucket->ub_cnt == 0,
|
|
|
|
("bucket_free: Freeing a non free bucket."));
|
2020-01-31 00:49:51 +00:00
|
|
|
KASSERT(bucket->ub_seq == SMR_SEQ_INVALID,
|
|
|
|
("bucket_free: Freeing an SMR bucket."));
|
2013-06-26 00:57:38 +00:00
|
|
|
if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
|
|
|
|
udata = (void *)(uintptr_t)zone->uz_flags;
|
2004-11-06 11:43:30 +00:00
|
|
|
ubz = bucket_zone_lookup(bucket->ub_entries);
|
2013-06-26 00:57:38 +00:00
|
|
|
uma_zfree_arg(ubz->ubz_zone, bucket, udata);
|
2003-09-19 06:26:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2021-04-14 12:57:24 -04:00
|
|
|
bucket_zone_drain(int domain)
|
2003-09-19 06:26:45 +00:00
|
|
|
{
|
|
|
|
struct uma_bucket_zone *ubz;
|
|
|
|
|
|
|
|
for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
|
2021-04-14 12:57:24 -04:00
|
|
|
uma_zone_reclaim_domain(ubz->ubz_zone, UMA_RECLAIM_DRAIN,
|
|
|
|
domain);
|
2003-09-19 06:26:45 +00:00
|
|
|
}
|
|
|
|
|
2021-04-13 17:39:50 -04:00
|
|
|
#ifdef KASAN
|
2021-05-05 17:05:46 -04:00
|
|
|
_Static_assert(UMA_SMALLEST_UNIT % KASAN_SHADOW_SCALE == 0,
|
|
|
|
"Base UMA allocation size not a multiple of the KASAN scale factor");
|
|
|
|
|
2021-04-13 17:39:50 -04:00
|
|
|
static void
|
|
|
|
kasan_mark_item_valid(uma_zone_t zone, void *item)
|
|
|
|
{
|
|
|
|
void *pcpu_item;
|
|
|
|
size_t sz, rsz;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if ((zone->uz_flags & UMA_ZONE_NOKASAN) != 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
sz = zone->uz_size;
|
|
|
|
rsz = roundup2(sz, KASAN_SHADOW_SCALE);
|
|
|
|
if ((zone->uz_flags & UMA_ZONE_PCPU) == 0) {
|
2021-07-09 20:38:21 -04:00
|
|
|
kasan_mark(item, sz, rsz, KASAN_GENERIC_REDZONE);
|
2021-04-13 17:39:50 -04:00
|
|
|
} else {
|
|
|
|
pcpu_item = zpcpu_base_to_offset(item);
|
|
|
|
for (i = 0; i <= mp_maxid; i++)
|
2021-07-09 20:38:21 -04:00
|
|
|
kasan_mark(zpcpu_get_cpu(pcpu_item, i), sz, rsz,
|
|
|
|
KASAN_GENERIC_REDZONE);
|
2021-04-13 17:39:50 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
kasan_mark_item_invalid(uma_zone_t zone, void *item)
|
|
|
|
{
|
|
|
|
void *pcpu_item;
|
|
|
|
size_t sz;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if ((zone->uz_flags & UMA_ZONE_NOKASAN) != 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
sz = roundup2(zone->uz_size, KASAN_SHADOW_SCALE);
|
|
|
|
if ((zone->uz_flags & UMA_ZONE_PCPU) == 0) {
|
|
|
|
kasan_mark(item, 0, sz, KASAN_UMA_FREED);
|
|
|
|
} else {
|
|
|
|
pcpu_item = zpcpu_base_to_offset(item);
|
|
|
|
for (i = 0; i <= mp_maxid; i++)
|
2021-07-09 20:38:21 -04:00
|
|
|
kasan_mark(zpcpu_get_cpu(pcpu_item, i), 0, sz,
|
|
|
|
KASAN_UMA_FREED);
|
2021-04-13 17:39:50 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
kasan_mark_slab_valid(uma_keg_t keg, void *mem)
|
|
|
|
{
|
|
|
|
size_t sz;
|
|
|
|
|
|
|
|
if ((keg->uk_flags & UMA_ZONE_NOKASAN) == 0) {
|
|
|
|
sz = keg->uk_ppera * PAGE_SIZE;
|
|
|
|
kasan_mark(mem, sz, sz, 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
kasan_mark_slab_invalid(uma_keg_t keg, void *mem)
|
|
|
|
{
|
|
|
|
size_t sz;
|
|
|
|
|
|
|
|
if ((keg->uk_flags & UMA_ZONE_NOKASAN) == 0) {
|
|
|
|
if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0)
|
|
|
|
sz = keg->uk_ppera * PAGE_SIZE;
|
|
|
|
else
|
|
|
|
sz = keg->uk_pgoff;
|
|
|
|
kasan_mark(mem, 0, sz, KASAN_UMA_FREED);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#else /* !KASAN */
|
|
|
|
static void
|
|
|
|
kasan_mark_item_valid(uma_zone_t zone __unused, void *item __unused)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
kasan_mark_item_invalid(uma_zone_t zone __unused, void *item __unused)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
kasan_mark_slab_valid(uma_keg_t keg __unused, void *mem __unused)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
kasan_mark_slab_invalid(uma_keg_t keg __unused, void *mem __unused)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif /* KASAN */
|
|
|
|
|
2021-08-10 17:15:03 -04:00
|
|
|
#ifdef KMSAN
|
|
|
|
static inline void
|
|
|
|
kmsan_mark_item_uninitialized(uma_zone_t zone, void *item)
|
|
|
|
{
|
|
|
|
void *pcpu_item;
|
|
|
|
size_t sz;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if ((zone->uz_flags &
|
|
|
|
(UMA_ZFLAG_CACHE | UMA_ZONE_SECONDARY | UMA_ZONE_MALLOC)) != 0) {
|
|
|
|
/*
|
|
|
|
* Cache zones should not be instrumented by default, as UMA
|
|
|
|
* does not have enough information to do so correctly.
|
|
|
|
* Consumers can mark items themselves if it makes sense to do
|
|
|
|
* so.
|
|
|
|
*
|
|
|
|
* Items from secondary zones are initialized by the parent
|
|
|
|
* zone and thus cannot safely be marked by UMA.
|
|
|
|
*
|
|
|
|
* malloc zones are handled directly by malloc(9) and friends,
|
|
|
|
* since they can provide more precise origin tracking.
|
|
|
|
*/
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (zone->uz_keg->uk_init != NULL) {
|
|
|
|
/*
|
|
|
|
* By definition, initialized items cannot be marked. The
|
|
|
|
* best we can do is mark items from these zones after they
|
|
|
|
* are freed to the keg.
|
|
|
|
*/
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
sz = zone->uz_size;
|
|
|
|
if ((zone->uz_flags & UMA_ZONE_PCPU) == 0) {
|
|
|
|
kmsan_orig(item, sz, KMSAN_TYPE_UMA, KMSAN_RET_ADDR);
|
|
|
|
kmsan_mark(item, sz, KMSAN_STATE_UNINIT);
|
|
|
|
} else {
|
|
|
|
pcpu_item = zpcpu_base_to_offset(item);
|
|
|
|
for (i = 0; i <= mp_maxid; i++) {
|
|
|
|
kmsan_orig(zpcpu_get_cpu(pcpu_item, i), sz,
|
|
|
|
KMSAN_TYPE_UMA, KMSAN_RET_ADDR);
|
|
|
|
kmsan_mark(zpcpu_get_cpu(pcpu_item, i), sz,
|
|
|
|
KMSAN_STATE_INITED);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#else /* !KMSAN */
|
|
|
|
static inline void
|
|
|
|
kmsan_mark_item_uninitialized(uma_zone_t zone __unused, void *item __unused)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif /* KMSAN */
|
|
|
|
|
2020-02-19 18:48:46 +00:00
|
|
|
/*
|
|
|
|
* Acquire the domain lock and record contention.
|
|
|
|
*/
|
|
|
|
static uma_zone_domain_t
|
|
|
|
zone_domain_lock(uma_zone_t zone, int domain)
|
|
|
|
{
|
|
|
|
uma_zone_domain_t zdom;
|
|
|
|
bool lockfail;
|
|
|
|
|
|
|
|
zdom = ZDOM_GET(zone, domain);
|
|
|
|
lockfail = false;
|
|
|
|
if (ZDOM_OWNED(zdom))
|
|
|
|
lockfail = true;
|
|
|
|
ZDOM_LOCK(zdom);
|
|
|
|
/* This is unsynchronized. The counter does not need to be precise. */
|
|
|
|
if (lockfail && zone->uz_bucket_size < zone->uz_bucket_size_max)
|
|
|
|
zone->uz_bucket_size++;
|
|
|
|
return (zdom);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2020-02-27 08:23:10 +00:00
|
|
|
* Search for the domain with the least cached items and return it if it
|
|
|
|
* is out of balance with the preferred domain.
|
2020-02-19 18:48:46 +00:00
|
|
|
*/
|
|
|
|
static __noinline int
|
|
|
|
zone_domain_lowest(uma_zone_t zone, int pref)
|
|
|
|
{
|
2020-02-27 08:23:10 +00:00
|
|
|
long least, nitems, prefitems;
|
2020-02-19 18:48:46 +00:00
|
|
|
int domain;
|
|
|
|
int i;
|
|
|
|
|
2020-02-27 08:23:10 +00:00
|
|
|
prefitems = least = LONG_MAX;
|
2020-02-19 18:48:46 +00:00
|
|
|
domain = 0;
|
|
|
|
for (i = 0; i < vm_ndomains; i++) {
|
|
|
|
nitems = ZDOM_GET(zone, i)->uzd_nitems;
|
|
|
|
if (nitems < least) {
|
|
|
|
domain = i;
|
|
|
|
least = nitems;
|
2020-02-27 08:23:10 +00:00
|
|
|
}
|
|
|
|
if (domain == pref)
|
|
|
|
prefitems = nitems;
|
2020-02-19 18:48:46 +00:00
|
|
|
}
|
2020-02-27 08:23:10 +00:00
|
|
|
if (prefitems < least * 2)
|
|
|
|
return (pref);
|
2020-02-19 18:48:46 +00:00
|
|
|
|
|
|
|
return (domain);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Search for the domain with the most cached items and return it or the
|
|
|
|
* preferred domain if it has enough to proceed.
|
|
|
|
*/
|
|
|
|
static __noinline int
|
|
|
|
zone_domain_highest(uma_zone_t zone, int pref)
|
|
|
|
{
|
|
|
|
long most, nitems;
|
|
|
|
int domain;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (ZDOM_GET(zone, pref)->uzd_nitems > BUCKET_MAX)
|
|
|
|
return (pref);
|
|
|
|
|
|
|
|
most = 0;
|
|
|
|
domain = 0;
|
|
|
|
for (i = 0; i < vm_ndomains; i++) {
|
|
|
|
nitems = ZDOM_GET(zone, i)->uzd_nitems;
|
|
|
|
if (nitems > most) {
|
|
|
|
domain = i;
|
|
|
|
most = nitems;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return (domain);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Set the maximum imax value.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zone_domain_imax_set(uma_zone_domain_t zdom, int nitems)
|
|
|
|
{
|
|
|
|
long old;
|
|
|
|
|
|
|
|
old = zdom->uzd_imax;
|
|
|
|
do {
|
|
|
|
if (old >= nitems)
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
return;
|
2020-02-19 18:48:46 +00:00
|
|
|
} while (atomic_fcmpset_long(&zdom->uzd_imax, &old, nitems) == 0);
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We are at new maximum, so do the last WSS update for the old
|
|
|
|
* bimin and prepare to measure next allocation batch.
|
|
|
|
*/
|
|
|
|
if (zdom->uzd_wss < old - zdom->uzd_bimin)
|
|
|
|
zdom->uzd_wss = old - zdom->uzd_bimin;
|
|
|
|
zdom->uzd_bimin = nitems;
|
2020-02-19 18:48:46 +00:00
|
|
|
}
|
|
|
|
|
2019-09-01 22:22:43 +00:00
|
|
|
/*
|
|
|
|
* Attempt to satisfy an allocation by retrieving a full bucket from one of the
|
2020-01-31 00:49:51 +00:00
|
|
|
* zone's caches. If a bucket is found the zone is not locked on return.
|
2019-09-01 22:22:43 +00:00
|
|
|
*/
|
2018-11-13 19:44:40 +00:00
|
|
|
static uma_bucket_t
|
2020-02-19 18:48:46 +00:00
|
|
|
zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, bool reclaim)
|
2018-11-13 19:44:40 +00:00
|
|
|
{
|
|
|
|
uma_bucket_t bucket;
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
long cnt;
|
2020-01-31 00:49:51 +00:00
|
|
|
int i;
|
|
|
|
bool dtor = false;
|
2018-11-13 19:44:40 +00:00
|
|
|
|
2020-02-19 18:48:46 +00:00
|
|
|
ZDOM_LOCK_ASSERT(zdom);
|
2018-11-13 19:44:40 +00:00
|
|
|
|
2020-02-04 02:41:24 +00:00
|
|
|
if ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) == NULL)
|
2020-01-31 00:49:51 +00:00
|
|
|
return (NULL);
|
|
|
|
|
2020-02-13 20:58:51 +00:00
|
|
|
/* SMR Buckets can not be re-used until readers expire. */
|
2020-01-31 00:49:51 +00:00
|
|
|
if ((zone->uz_flags & UMA_ZONE_SMR) != 0 &&
|
|
|
|
bucket->ub_seq != SMR_SEQ_INVALID) {
|
|
|
|
if (!smr_poll(zone->uz_smr, bucket->ub_seq, false))
|
|
|
|
return (NULL);
|
|
|
|
bucket->ub_seq = SMR_SEQ_INVALID;
|
2020-02-13 20:58:51 +00:00
|
|
|
dtor = (zone->uz_dtor != NULL) || UMA_ALWAYS_CTORDTOR;
|
2020-02-19 18:48:46 +00:00
|
|
|
if (STAILQ_NEXT(bucket, ub_link) != NULL)
|
|
|
|
zdom->uzd_seq = STAILQ_NEXT(bucket, ub_link)->ub_seq;
|
2018-11-13 19:44:40 +00:00
|
|
|
}
|
2020-02-04 02:41:24 +00:00
|
|
|
STAILQ_REMOVE_HEAD(&zdom->uzd_buckets, ub_link);
|
2020-10-02 19:04:29 +00:00
|
|
|
|
|
|
|
KASSERT(zdom->uzd_nitems >= bucket->ub_cnt,
|
|
|
|
("%s: item count underflow (%ld, %d)",
|
|
|
|
__func__, zdom->uzd_nitems, bucket->ub_cnt));
|
|
|
|
KASSERT(bucket->ub_cnt > 0,
|
|
|
|
("%s: empty bucket in bucket cache", __func__));
|
2020-01-31 00:49:51 +00:00
|
|
|
zdom->uzd_nitems -= bucket->ub_cnt;
|
2020-02-19 18:48:46 +00:00
|
|
|
|
|
|
|
if (reclaim) {
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
/*
|
|
|
|
* Shift the bounds of the current WSS interval to avoid
|
|
|
|
* perturbing the estimates.
|
|
|
|
*/
|
|
|
|
cnt = lmin(zdom->uzd_bimin, bucket->ub_cnt);
|
|
|
|
atomic_subtract_long(&zdom->uzd_imax, cnt);
|
|
|
|
zdom->uzd_bimin -= cnt;
|
2020-02-19 18:48:46 +00:00
|
|
|
zdom->uzd_imin -= lmin(zdom->uzd_imin, bucket->ub_cnt);
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
if (zdom->uzd_limin >= bucket->ub_cnt) {
|
|
|
|
zdom->uzd_limin -= bucket->ub_cnt;
|
|
|
|
} else {
|
|
|
|
zdom->uzd_limin = 0;
|
|
|
|
zdom->uzd_timin = 0;
|
|
|
|
}
|
|
|
|
} else if (zdom->uzd_bimin > zdom->uzd_nitems) {
|
|
|
|
zdom->uzd_bimin = zdom->uzd_nitems;
|
|
|
|
if (zdom->uzd_imin > zdom->uzd_nitems)
|
|
|
|
zdom->uzd_imin = zdom->uzd_nitems;
|
|
|
|
}
|
2020-02-19 18:48:46 +00:00
|
|
|
|
|
|
|
ZDOM_UNLOCK(zdom);
|
2020-01-31 00:49:51 +00:00
|
|
|
if (dtor)
|
|
|
|
for (i = 0; i < bucket->ub_cnt; i++)
|
|
|
|
item_dtor(zone, bucket->ub_bucket[i], zone->uz_size,
|
|
|
|
NULL, SKIP_NONE);
|
|
|
|
|
2018-11-13 19:44:40 +00:00
|
|
|
return (bucket);
|
|
|
|
}
|
|
|
|
|
2019-09-01 22:22:43 +00:00
|
|
|
/*
|
|
|
|
* Insert a full bucket into the specified cache. The "ws" parameter indicates
|
|
|
|
* whether the bucket's contents should be counted as part of the zone's working
|
2020-02-19 18:48:46 +00:00
|
|
|
* set. The bucket may be freed if it exceeds the bucket limit.
|
2019-09-01 22:22:43 +00:00
|
|
|
*/
|
2018-11-13 19:44:40 +00:00
|
|
|
static void
|
2020-02-19 18:48:46 +00:00
|
|
|
zone_put_bucket(uma_zone_t zone, int domain, uma_bucket_t bucket, void *udata,
|
2018-11-13 19:44:40 +00:00
|
|
|
const bool ws)
|
|
|
|
{
|
2020-02-19 18:48:46 +00:00
|
|
|
uma_zone_domain_t zdom;
|
|
|
|
|
|
|
|
/* We don't cache empty buckets. This can happen after a reclaim. */
|
|
|
|
if (bucket->ub_cnt == 0)
|
|
|
|
goto out;
|
|
|
|
zdom = zone_domain_lock(zone, domain);
|
2018-11-13 19:44:40 +00:00
|
|
|
|
2020-02-19 18:48:46 +00:00
|
|
|
/*
|
|
|
|
* Conditionally set the maximum number of items.
|
|
|
|
*/
|
2018-11-13 19:44:40 +00:00
|
|
|
zdom->uzd_nitems += bucket->ub_cnt;
|
2020-02-19 18:48:46 +00:00
|
|
|
if (__predict_true(zdom->uzd_nitems < zone->uz_bucket_max)) {
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
if (ws) {
|
2020-02-19 18:48:46 +00:00
|
|
|
zone_domain_imax_set(zdom, zdom->uzd_nitems);
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Shift the bounds of the current WSS interval to
|
|
|
|
* avoid perturbing the estimates.
|
|
|
|
*/
|
|
|
|
atomic_add_long(&zdom->uzd_imax, bucket->ub_cnt);
|
|
|
|
zdom->uzd_imin += bucket->ub_cnt;
|
|
|
|
zdom->uzd_bimin += bucket->ub_cnt;
|
|
|
|
zdom->uzd_limin += bucket->ub_cnt;
|
|
|
|
}
|
2020-02-19 18:48:46 +00:00
|
|
|
if (STAILQ_EMPTY(&zdom->uzd_buckets))
|
|
|
|
zdom->uzd_seq = bucket->ub_seq;
|
2020-10-02 19:04:09 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to promote reuse of recently used items. For items
|
|
|
|
* protected by SMR, try to defer reuse to minimize polling.
|
|
|
|
*/
|
|
|
|
if (bucket->ub_seq == SMR_SEQ_INVALID)
|
|
|
|
STAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
|
|
|
|
else
|
|
|
|
STAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link);
|
2020-02-19 18:48:46 +00:00
|
|
|
ZDOM_UNLOCK(zdom);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
zdom->uzd_nitems -= bucket->ub_cnt;
|
|
|
|
ZDOM_UNLOCK(zdom);
|
|
|
|
out:
|
|
|
|
bucket_free(zone, bucket, udata);
|
2018-11-13 19:44:40 +00:00
|
|
|
}
|
|
|
|
|
2019-12-25 20:50:53 +00:00
|
|
|
/* Pops an item out of a per-cpu cache bucket. */
|
|
|
|
static inline void *
|
|
|
|
cache_bucket_pop(uma_cache_t cache, uma_cache_bucket_t bucket)
|
|
|
|
{
|
|
|
|
void *item;
|
|
|
|
|
|
|
|
CRITICAL_ASSERT(curthread);
|
|
|
|
|
|
|
|
bucket->ucb_cnt--;
|
|
|
|
item = bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt];
|
|
|
|
#ifdef INVARIANTS
|
|
|
|
bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = NULL;
|
|
|
|
KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
|
|
|
|
#endif
|
|
|
|
cache->uc_allocs++;
|
|
|
|
|
|
|
|
return (item);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Pushes an item into a per-cpu cache bucket. */
|
|
|
|
static inline void
|
|
|
|
cache_bucket_push(uma_cache_t cache, uma_cache_bucket_t bucket, void *item)
|
|
|
|
{
|
|
|
|
|
|
|
|
CRITICAL_ASSERT(curthread);
|
|
|
|
KASSERT(bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] == NULL,
|
|
|
|
("uma_zfree: Freeing to non free bucket index."));
|
|
|
|
|
|
|
|
bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = item;
|
|
|
|
bucket->ucb_cnt++;
|
|
|
|
cache->uc_frees++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Unload a UMA bucket from a per-cpu cache.
|
|
|
|
*/
|
|
|
|
static inline uma_bucket_t
|
|
|
|
cache_bucket_unload(uma_cache_bucket_t bucket)
|
|
|
|
{
|
|
|
|
uma_bucket_t b;
|
|
|
|
|
|
|
|
b = bucket->ucb_bucket;
|
|
|
|
if (b != NULL) {
|
|
|
|
MPASS(b->ub_entries == bucket->ucb_entries);
|
|
|
|
b->ub_cnt = bucket->ucb_cnt;
|
|
|
|
bucket->ucb_bucket = NULL;
|
|
|
|
bucket->ucb_entries = bucket->ucb_cnt = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (b);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline uma_bucket_t
|
|
|
|
cache_bucket_unload_alloc(uma_cache_t cache)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (cache_bucket_unload(&cache->uc_allocbucket));
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline uma_bucket_t
|
|
|
|
cache_bucket_unload_free(uma_cache_t cache)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (cache_bucket_unload(&cache->uc_freebucket));
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline uma_bucket_t
|
|
|
|
cache_bucket_unload_cross(uma_cache_t cache)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (cache_bucket_unload(&cache->uc_crossbucket));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Load a bucket into a per-cpu cache bucket.
|
|
|
|
*/
|
|
|
|
static inline void
|
|
|
|
cache_bucket_load(uma_cache_bucket_t bucket, uma_bucket_t b)
|
|
|
|
{
|
|
|
|
|
|
|
|
CRITICAL_ASSERT(curthread);
|
|
|
|
MPASS(bucket->ucb_bucket == NULL);
|
2020-02-13 20:58:51 +00:00
|
|
|
MPASS(b->ub_seq == SMR_SEQ_INVALID);
|
2019-12-25 20:50:53 +00:00
|
|
|
|
|
|
|
bucket->ucb_bucket = b;
|
|
|
|
bucket->ucb_cnt = b->ub_cnt;
|
|
|
|
bucket->ucb_entries = b->ub_entries;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
cache_bucket_load_alloc(uma_cache_t cache, uma_bucket_t b)
|
|
|
|
{
|
|
|
|
|
|
|
|
cache_bucket_load(&cache->uc_allocbucket, b);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
cache_bucket_load_free(uma_cache_t cache, uma_bucket_t b)
|
|
|
|
{
|
|
|
|
|
|
|
|
cache_bucket_load(&cache->uc_freebucket, b);
|
|
|
|
}
|
|
|
|
|
2020-01-04 18:48:13 +00:00
|
|
|
#ifdef NUMA
/*
 * Load bucket "b" as the cross-domain bucket (uc_crossbucket) of the cache.
 * Only compiled when NUMA is defined.
 */
static inline void
cache_bucket_load_cross(uma_cache_t cache, uma_bucket_t b)
{
	uma_cache_bucket_t slot;

	slot = &cache->uc_crossbucket;
	cache_bucket_load(slot, b);
}
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Copy and preserve ucb_spare.
|
|
|
|
*/
|
|
|
|
static inline void
|
|
|
|
cache_bucket_copy(uma_cache_bucket_t b1, uma_cache_bucket_t b2)
|
|
|
|
{
|
|
|
|
|
|
|
|
b1->ucb_bucket = b2->ucb_bucket;
|
|
|
|
b1->ucb_entries = b2->ucb_entries;
|
|
|
|
b1->ucb_cnt = b2->ucb_cnt;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Swap two cache buckets.
|
|
|
|
*/
|
|
|
|
static inline void
|
|
|
|
cache_bucket_swap(uma_cache_bucket_t b1, uma_cache_bucket_t b2)
|
|
|
|
{
|
|
|
|
struct uma_cache_bucket b3;
|
|
|
|
|
|
|
|
CRITICAL_ASSERT(curthread);
|
|
|
|
|
|
|
|
cache_bucket_copy(&b3, b1);
|
|
|
|
cache_bucket_copy(b1, b2);
|
|
|
|
cache_bucket_copy(b2, &b3);
|
|
|
|
}
|
|
|
|
|
2020-02-19 18:48:46 +00:00
|
|
|
/*
|
|
|
|
* Attempt to fetch a bucket from a zone on behalf of the current cpu cache.
|
|
|
|
*/
|
|
|
|
static uma_bucket_t
|
|
|
|
cache_fetch_bucket(uma_zone_t zone, uma_cache_t cache, int domain)
|
|
|
|
{
|
|
|
|
uma_zone_domain_t zdom;
|
|
|
|
uma_bucket_t bucket;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Avoid the lock if possible.
|
|
|
|
*/
|
|
|
|
zdom = ZDOM_GET(zone, domain);
|
|
|
|
if (zdom->uzd_nitems == 0)
|
|
|
|
return (NULL);
|
|
|
|
|
|
|
|
if ((cache_uz_flags(cache) & UMA_ZONE_SMR) != 0 &&
|
|
|
|
!smr_poll(zone->uz_smr, zdom->uzd_seq, false))
|
|
|
|
return (NULL);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check the zone's cache of buckets.
|
|
|
|
*/
|
|
|
|
zdom = zone_domain_lock(zone, domain);
|
2020-10-02 19:04:29 +00:00
|
|
|
if ((bucket = zone_fetch_bucket(zone, zdom, false)) != NULL)
|
2020-02-19 18:48:46 +00:00
|
|
|
return (bucket);
|
|
|
|
ZDOM_UNLOCK(zdom);
|
|
|
|
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
2012-12-07 22:27:13 +00:00
|
|
|
static void
|
|
|
|
zone_log_warning(uma_zone_t zone)
|
|
|
|
{
|
|
|
|
static const struct timeval warninterval = { 300, 0 };
|
|
|
|
|
|
|
|
if (!zone_warnings || zone->uz_warning == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (ratecheck(&zone->uz_ratecheck, &warninterval))
|
|
|
|
printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
|
|
|
|
}
|
|
|
|
|
2015-12-20 02:05:33 +00:00
|
|
|
static inline void
|
|
|
|
zone_maxaction(uma_zone_t zone)
|
|
|
|
{
|
2016-02-03 23:30:17 +00:00
|
|
|
|
|
|
|
if (zone->uz_maxaction.ta_func != NULL)
|
|
|
|
taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
|
2015-12-20 02:05:33 +00:00
|
|
|
}
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/*
|
|
|
|
* Routine called by timeout which is used to fire off some time interval
|
2003-09-19 23:27:46 +00:00
|
|
|
* based calculations. (stats, hash size, etc.)
|
2002-03-19 09:11:49 +00:00
|
|
|
*
|
|
|
|
* Arguments:
|
|
|
|
* arg Unused
|
2004-01-30 16:26:29 +00:00
|
|
|
*
|
2002-03-19 09:11:49 +00:00
|
|
|
* Returns:
|
|
|
|
* Nothing
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
uma_timeout(void *unused)
|
|
|
|
{
|
2002-04-08 06:20:34 +00:00
|
|
|
bucket_enable();
|
2019-11-28 00:19:09 +00:00
|
|
|
zone_foreach(zone_timeout, NULL);
|
2002-03-19 09:11:49 +00:00
|
|
|
|
|
|
|
/* Reschedule this event */
|
2003-09-19 23:27:46 +00:00
|
|
|
callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
2018-11-13 19:44:40 +00:00
|
|
|
/*
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
* Update the working set size estimates for the zone's bucket cache.
|
|
|
|
* The constants chosen here are somewhat arbitrary.
|
2018-11-13 19:44:40 +00:00
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zone_domain_update_wss(uma_zone_domain_t zdom)
|
|
|
|
{
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
long m;
|
2018-11-13 19:44:40 +00:00
|
|
|
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
ZDOM_LOCK_ASSERT(zdom);
|
|
|
|
MPASS(zdom->uzd_imax >= zdom->uzd_nitems);
|
|
|
|
MPASS(zdom->uzd_nitems >= zdom->uzd_bimin);
|
|
|
|
MPASS(zdom->uzd_bimin >= zdom->uzd_imin);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Estimate WSS as modified moving average of biggest allocation
|
|
|
|
* batches for each period over few minutes (UMA_TIMEOUT of 20s).
|
|
|
|
*/
|
|
|
|
zdom->uzd_wss = lmax(zdom->uzd_wss * 3 / 4,
|
|
|
|
zdom->uzd_imax - zdom->uzd_bimin);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Estimate longtime minimum item count as a combination of recent
|
|
|
|
* minimum item count, adjusted by WSS for safety, and the modified
|
|
|
|
* moving average over the last several hours (UMA_TIMEOUT of 20s).
|
|
|
|
* timin measures time since limin tried to go negative, that means
|
|
|
|
* we were dangerously close to or got out of cache.
|
|
|
|
*/
|
|
|
|
m = zdom->uzd_imin - zdom->uzd_wss;
|
|
|
|
if (m >= 0) {
|
|
|
|
if (zdom->uzd_limin >= m)
|
|
|
|
zdom->uzd_limin = m;
|
|
|
|
else
|
|
|
|
zdom->uzd_limin = (m + zdom->uzd_limin * 255) / 256;
|
|
|
|
zdom->uzd_timin++;
|
|
|
|
} else {
|
|
|
|
zdom->uzd_limin = 0;
|
|
|
|
zdom->uzd_timin = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* To reduce period edge effects on WSS keep half of the imax. */
|
|
|
|
atomic_subtract_long(&zdom->uzd_imax,
|
|
|
|
(zdom->uzd_imax - zdom->uzd_nitems + 1) / 2);
|
|
|
|
zdom->uzd_imin = zdom->uzd_bimin = zdom->uzd_nitems;
|
2018-11-13 19:44:40 +00:00
|
|
|
}
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/*
|
2003-09-19 23:27:46 +00:00
|
|
|
* Routine to perform timeout driven calculations. This expands the
|
|
|
|
* hashes and does per cpu statistics aggregation.
|
2002-03-19 09:11:49 +00:00
|
|
|
*
|
2009-01-25 09:11:24 +00:00
|
|
|
* Returns nothing.
|
2002-03-19 09:11:49 +00:00
|
|
|
*/
|
|
|
|
static void
|
2019-11-28 00:19:09 +00:00
|
|
|
zone_timeout(uma_zone_t zone, void *unused)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
2019-11-10 09:25:19 +00:00
|
|
|
uma_keg_t keg;
|
2020-01-04 03:30:08 +00:00
|
|
|
u_int slabs, pages;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2020-01-09 02:03:03 +00:00
|
|
|
if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0)
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
goto trim;
|
2019-11-10 09:25:19 +00:00
|
|
|
|
|
|
|
keg = zone->uz_keg;
|
2020-01-04 03:30:08 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Hash zones are non-numa by definition so the first domain
|
|
|
|
* is the only one present.
|
|
|
|
*/
|
|
|
|
KEG_LOCK(keg, 0);
|
|
|
|
pages = keg->uk_domain[0].ud_pages;
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/*
|
2009-01-25 09:11:24 +00:00
|
|
|
* Expand the keg hash table.
|
2004-01-30 16:26:29 +00:00
|
|
|
*
|
2002-03-19 09:11:49 +00:00
|
|
|
* This is done if the number of slabs is larger than the hash size.
|
|
|
|
* What I'm trying to do here is completely reduce collisions. This
|
|
|
|
* may be a little aggressive. Should I allow for two collisions max?
|
|
|
|
*/
|
2020-01-04 03:30:08 +00:00
|
|
|
if ((slabs = pages / keg->uk_ppera) > keg->uk_hash.uh_hashsize) {
|
2002-09-18 08:26:30 +00:00
|
|
|
struct uma_hash newhash;
|
|
|
|
struct uma_hash oldhash;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/*
|
2004-01-30 16:26:29 +00:00
|
|
|
* This is so involved because allocating and freeing
|
2009-01-25 09:11:24 +00:00
|
|
|
* while the keg lock is held will lead to deadlock.
|
2002-09-18 08:26:30 +00:00
|
|
|
* I have to do everything in stages and check for
|
|
|
|
* races.
|
|
|
|
*/
|
2020-01-04 03:30:08 +00:00
|
|
|
KEG_UNLOCK(keg, 0);
|
2019-06-06 23:57:28 +00:00
|
|
|
ret = hash_alloc(&newhash, 1 << fls(slabs));
|
2020-01-04 03:30:08 +00:00
|
|
|
KEG_LOCK(keg, 0);
|
2002-09-18 08:26:30 +00:00
|
|
|
if (ret) {
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
if (hash_expand(&keg->uk_hash, &newhash)) {
|
|
|
|
oldhash = keg->uk_hash;
|
|
|
|
keg->uk_hash = newhash;
|
2002-09-18 08:26:30 +00:00
|
|
|
} else
|
|
|
|
oldhash = newhash;
|
2002-05-13 04:39:28 +00:00
|
|
|
|
2020-01-04 03:30:08 +00:00
|
|
|
KEG_UNLOCK(keg, 0);
|
2002-09-18 08:26:30 +00:00
|
|
|
hash_free(&oldhash);
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
goto trim;
|
2002-04-14 13:47:10 +00:00
|
|
|
}
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
2020-01-04 03:30:08 +00:00
|
|
|
KEG_UNLOCK(keg, 0);
|
2018-11-13 19:44:40 +00:00
|
|
|
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
trim:
|
|
|
|
/* Trim caches not used for a long time. */
|
|
|
|
for (int i = 0; i < vm_ndomains; i++) {
|
|
|
|
if (bucket_cache_reclaim_domain(zone, false, false, i) &&
|
|
|
|
(zone->uz_flags & UMA_ZFLAG_CACHE) == 0)
|
|
|
|
keg_drain(zone->uz_keg, i);
|
|
|
|
}
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
2002-04-14 13:47:10 +00:00
|
|
|
/*
|
|
|
|
* Allocate and zero fill the next sized hash table from the appropriate
|
|
|
|
* backing store.
|
|
|
|
*
|
|
|
|
* Arguments:
|
2002-05-13 04:39:28 +00:00
|
|
|
* hash A new hash structure with the old hash size in uh_hashsize
|
2002-04-14 13:47:10 +00:00
|
|
|
*
|
|
|
|
* Returns:
|
2016-05-02 20:16:29 +00:00
|
|
|
* 1 on success and 0 on failure.
|
2002-04-14 13:47:10 +00:00
|
|
|
*/
|
2002-09-28 17:15:38 +00:00
|
|
|
static int
|
2019-06-06 23:57:28 +00:00
|
|
|
hash_alloc(struct uma_hash *hash, u_int size)
|
2002-04-14 13:47:10 +00:00
|
|
|
{
|
2019-02-02 04:11:59 +00:00
|
|
|
size_t alloc;
|
2002-04-14 13:47:10 +00:00
|
|
|
|
2019-06-06 23:57:28 +00:00
|
|
|
KASSERT(powerof2(size), ("hash size must be power of 2"));
|
|
|
|
if (size > UMA_HASH_SIZE_INIT) {
|
|
|
|
hash->uh_hashsize = size;
|
2002-05-13 04:39:28 +00:00
|
|
|
alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
|
2019-12-08 01:15:06 +00:00
|
|
|
hash->uh_slab_hash = malloc(alloc, M_UMAHASH, M_NOWAIT);
|
2002-04-14 13:47:10 +00:00
|
|
|
} else {
|
2002-05-13 04:39:28 +00:00
|
|
|
alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
|
2009-01-25 09:11:24 +00:00
|
|
|
hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
|
2018-01-12 23:25:05 +00:00
|
|
|
UMA_ANYDOMAIN, M_WAITOK);
|
2002-05-13 04:39:28 +00:00
|
|
|
hash->uh_hashsize = UMA_HASH_SIZE_INIT;
|
|
|
|
}
|
|
|
|
if (hash->uh_slab_hash) {
|
|
|
|
bzero(hash->uh_slab_hash, alloc);
|
|
|
|
hash->uh_hashmask = hash->uh_hashsize - 1;
|
|
|
|
return (1);
|
2002-04-14 13:47:10 +00:00
|
|
|
}
|
|
|
|
|
2002-05-13 04:39:28 +00:00
|
|
|
return (0);
|
2002-04-14 13:47:10 +00:00
|
|
|
}
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/*
|
2003-09-19 22:31:45 +00:00
|
|
|
* Expands the hash table for HASH zones. This is done from zone_timeout
|
|
|
|
* to reduce collisions. This must not be done in the regular allocation
|
|
|
|
* path, otherwise, we can recurse on the vm while allocating pages.
|
2002-03-19 09:11:49 +00:00
|
|
|
*
|
|
|
|
* Arguments:
|
2004-01-30 16:26:29 +00:00
|
|
|
* oldhash The hash you want to expand
|
2002-05-13 04:39:28 +00:00
|
|
|
* newhash The hash structure for the new table
|
2002-03-19 09:11:49 +00:00
|
|
|
*
|
|
|
|
* Returns:
|
2004-01-30 16:26:29 +00:00
|
|
|
* Nothing
|
2002-03-19 09:11:49 +00:00
|
|
|
*
|
|
|
|
* Discussion:
|
|
|
|
*/
|
2002-05-13 04:39:28 +00:00
|
|
|
static int
|
|
|
|
hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
2019-12-08 01:15:06 +00:00
|
|
|
uma_hash_slab_t slab;
|
2019-02-12 04:33:05 +00:00
|
|
|
u_int hval;
|
|
|
|
u_int idx;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2002-05-13 04:39:28 +00:00
|
|
|
if (!newhash->uh_slab_hash)
|
|
|
|
return (0);
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2002-05-13 04:39:28 +00:00
|
|
|
if (oldhash->uh_hashsize >= newhash->uh_hashsize)
|
|
|
|
return (0);
|
2002-03-19 09:11:49 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* I need to investigate hash algorithms for resizing without a
|
|
|
|
* full rehash.
|
|
|
|
*/
|
|
|
|
|
2019-02-12 04:33:05 +00:00
|
|
|
for (idx = 0; idx < oldhash->uh_hashsize; idx++)
|
2019-12-08 01:15:06 +00:00
|
|
|
while (!LIST_EMPTY(&oldhash->uh_slab_hash[idx])) {
|
|
|
|
slab = LIST_FIRST(&oldhash->uh_slab_hash[idx]);
|
|
|
|
LIST_REMOVE(slab, uhs_hlink);
|
|
|
|
hval = UMA_HASH(newhash, slab->uhs_data);
|
|
|
|
LIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
|
|
|
|
slab, uhs_hlink);
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
2002-05-13 04:39:28 +00:00
|
|
|
return (1);
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
2002-04-14 13:47:10 +00:00
|
|
|
/*
|
|
|
|
* Free the hash bucket to the appropriate backing store.
|
|
|
|
*
|
|
|
|
* Arguments:
|
|
|
|
* slab_hash The hash bucket we're freeing
|
|
|
|
* hashsize The number of entries in that hash bucket
|
|
|
|
*
|
|
|
|
* Returns:
|
|
|
|
* Nothing
|
|
|
|
*/
|
2002-04-08 04:48:58 +00:00
|
|
|
static void
|
2002-05-13 04:39:28 +00:00
|
|
|
hash_free(struct uma_hash *hash)
|
2002-04-08 04:48:58 +00:00
|
|
|
{
|
2002-05-13 04:39:28 +00:00
|
|
|
if (hash->uh_slab_hash == NULL)
|
|
|
|
return;
|
|
|
|
if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
|
2013-06-17 03:43:47 +00:00
|
|
|
zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
|
2002-04-08 04:48:58 +00:00
|
|
|
else
|
2003-09-19 07:23:50 +00:00
|
|
|
free(hash->uh_slab_hash, M_UMAHASH);
|
2002-04-08 04:48:58 +00:00
|
|
|
}
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/*
|
|
|
|
* Frees all outstanding items in a bucket
|
|
|
|
*
|
|
|
|
* Arguments:
|
|
|
|
* zone The zone to free to, must be unlocked.
|
2020-01-04 03:04:46 +00:00
|
|
|
* bucket The free/alloc bucket with items.
|
2002-03-19 09:11:49 +00:00
|
|
|
*
|
|
|
|
* Returns:
|
|
|
|
* Nothing
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
|
|
|
|
{
|
2013-06-17 03:43:47 +00:00
|
|
|
int i;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2020-02-19 18:48:46 +00:00
|
|
|
if (bucket->ub_cnt == 0)
|
2002-03-19 09:11:49 +00:00
|
|
|
return;
|
|
|
|
|
2020-01-31 00:49:51 +00:00
|
|
|
if ((zone->uz_flags & UMA_ZONE_SMR) != 0 &&
|
|
|
|
bucket->ub_seq != SMR_SEQ_INVALID) {
|
|
|
|
smr_wait(zone->uz_smr, bucket->ub_seq);
|
2020-02-13 20:58:51 +00:00
|
|
|
bucket->ub_seq = SMR_SEQ_INVALID;
|
2020-01-31 00:49:51 +00:00
|
|
|
for (i = 0; i < bucket->ub_cnt; i++)
|
|
|
|
item_dtor(zone, bucket->ub_bucket[i],
|
|
|
|
zone->uz_size, NULL, SKIP_NONE);
|
|
|
|
}
|
2013-06-17 03:43:47 +00:00
|
|
|
if (zone->uz_fini)
|
2021-04-13 17:39:50 -04:00
|
|
|
for (i = 0; i < bucket->ub_cnt; i++) {
|
|
|
|
kasan_mark_item_valid(zone, bucket->ub_bucket[i]);
|
2013-06-17 03:43:47 +00:00
|
|
|
zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
|
2021-04-13 17:39:50 -04:00
|
|
|
kasan_mark_item_invalid(zone, bucket->ub_bucket[i]);
|
|
|
|
}
|
2013-06-17 03:43:47 +00:00
|
|
|
zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
|
2020-01-04 03:04:46 +00:00
|
|
|
if (zone->uz_max_items > 0)
|
|
|
|
zone_free_limit(zone, bucket->ub_cnt);
|
2020-01-31 00:49:51 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
bzero(bucket->ub_bucket, sizeof(void *) * bucket->ub_cnt);
|
|
|
|
#endif
|
2013-06-17 03:43:47 +00:00
|
|
|
bucket->ub_cnt = 0;
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Drains the per cpu caches for a zone.
|
|
|
|
*
|
2020-01-04 03:15:34 +00:00
|
|
|
* NOTE: This may only be called while the zone is being torn down, and not
|
Modify UMA to use critical sections to protect per-CPU caches, rather than
mutexes, which offers lower overhead on both UP and SMP. When allocating
from or freeing to the per-cpu cache, without INVARIANTS enabled, we now
no longer perform any mutex operations, which offers a 1%-3% performance
improvement in a variety of micro-benchmarks. We rely on critical
sections to prevent (a) preemption resulting in reentrant access to UMA on
a single CPU, and (b) migration of the thread during access. In the event
we need to go back to the zone for a new bucket, we release the critical
section to acquire the global zone mutex, and must re-acquire the critical
section and re-evaluate which cache we are accessing in case migration has
occured, or circumstances have changed in the current cache.
Per-CPU cache statistics are now gathered lock-free by the sysctl, which
can result in small races in statistics reporting for caches.
Reviewed by: bmilekic, jeff (somewhat)
Tested by: rwatson, kris, gnn, scottl, mike at sentex dot net, others
2005-04-29 18:56:36 +00:00
|
|
|
* during normal operation. This is necessary in order that we do not have
|
|
|
|
* to migrate CPUs to drain the per-CPU caches.
|
|
|
|
*
|
2002-03-19 09:11:49 +00:00
|
|
|
* Arguments:
|
2003-07-30 18:55:15 +00:00
|
|
|
* zone The zone to drain, must be unlocked.
|
2002-03-19 09:11:49 +00:00
|
|
|
*
|
|
|
|
* Returns:
|
|
|
|
* Nothing
|
|
|
|
*/
|
|
|
|
static void
|
2003-09-19 23:27:46 +00:00
|
|
|
cache_drain(uma_zone_t zone)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
|
|
|
uma_cache_t cache;
|
2019-12-25 20:50:53 +00:00
|
|
|
uma_bucket_t bucket;
|
2020-02-13 20:58:51 +00:00
|
|
|
smr_seq_t seq;
|
2002-03-19 09:11:49 +00:00
|
|
|
int cpu;
|
|
|
|
|
|
|
|
/*
|
Modify UMA to use critical sections to protect per-CPU caches, rather than
mutexes, which offers lower overhead on both UP and SMP. When allocating
from or freeing to the per-cpu cache, without INVARIANTS enabled, we now
no longer perform any mutex operations, which offers a 1%-3% performance
improvement in a variety of micro-benchmarks. We rely on critical
sections to prevent (a) preemption resulting in reentrant access to UMA on
a single CPU, and (b) migration of the thread during access. In the event
we need to go back to the zone for a new bucket, we release the critical
section to acquire the global zone mutex, and must re-acquire the critical
section and re-evaluate which cache we are accessing in case migration has
occured, or circumstances have changed in the current cache.
Per-CPU cache statistics are now gathered lock-free by the sysctl, which
can result in small races in statistics reporting for caches.
Reviewed by: bmilekic, jeff (somewhat)
Tested by: rwatson, kris, gnn, scottl, mike at sentex dot net, others
2005-04-29 18:56:36 +00:00
|
|
|
* XXX: It is safe to not lock the per-CPU caches, because we're
|
|
|
|
* tearing down the zone anyway. I.e., there will be no further use
|
|
|
|
* of the caches at this point.
|
|
|
|
*
|
|
|
|
* XXX: It would good to be able to assert that the zone is being
|
|
|
|
* torn down to prevent improper use of cache_drain().
|
2002-03-19 09:11:49 +00:00
|
|
|
*/
|
2020-02-13 20:58:51 +00:00
|
|
|
seq = SMR_SEQ_INVALID;
|
|
|
|
if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
|
2020-02-22 03:44:10 +00:00
|
|
|
seq = smr_advance(zone->uz_smr);
|
2010-06-11 18:46:34 +00:00
|
|
|
CPU_FOREACH(cpu) {
|
2002-03-19 09:11:49 +00:00
|
|
|
cache = &zone->uz_cpu[cpu];
|
2019-12-25 20:50:53 +00:00
|
|
|
bucket = cache_bucket_unload_alloc(cache);
|
2020-02-19 18:48:46 +00:00
|
|
|
if (bucket != NULL)
|
2019-12-25 20:50:53 +00:00
|
|
|
bucket_free(zone, bucket, NULL);
|
|
|
|
bucket = cache_bucket_unload_free(cache);
|
|
|
|
if (bucket != NULL) {
|
2020-02-13 20:58:51 +00:00
|
|
|
bucket->ub_seq = seq;
|
2019-12-25 20:50:53 +00:00
|
|
|
bucket_free(zone, bucket, NULL);
|
|
|
|
}
|
|
|
|
bucket = cache_bucket_unload_cross(cache);
|
|
|
|
if (bucket != NULL) {
|
2020-02-13 20:58:51 +00:00
|
|
|
bucket->ub_seq = seq;
|
2019-12-25 20:50:53 +00:00
|
|
|
bucket_free(zone, bucket, NULL);
|
|
|
|
}
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
2021-04-14 12:57:24 -04:00
|
|
|
bucket_cache_reclaim(zone, true, UMA_ANYDOMAIN);
|
2004-02-01 06:15:17 +00:00
|
|
|
}
|
|
|
|
|
2013-11-19 10:51:46 +00:00
|
|
|
static void
|
2019-11-28 00:19:09 +00:00
|
|
|
cache_shrink(uma_zone_t zone, void *unused)
|
2013-11-19 10:51:46 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
|
|
|
|
return;
|
|
|
|
|
2021-04-14 12:57:24 -04:00
|
|
|
ZONE_LOCK(zone);
|
2019-11-28 00:19:09 +00:00
|
|
|
zone->uz_bucket_size =
|
|
|
|
(zone->uz_bucket_size_min + zone->uz_bucket_size) / 2;
|
2021-04-14 12:57:24 -04:00
|
|
|
ZONE_UNLOCK(zone);
|
2013-11-19 10:51:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2019-11-28 00:19:09 +00:00
|
|
|
cache_drain_safe_cpu(uma_zone_t zone, void *unused)
|
2013-11-19 10:51:46 +00:00
|
|
|
{
|
|
|
|
uma_cache_t cache;
|
2019-08-06 21:50:34 +00:00
|
|
|
uma_bucket_t b1, b2, b3;
|
2018-01-12 23:25:05 +00:00
|
|
|
int domain;
|
2013-11-19 10:51:46 +00:00
|
|
|
|
|
|
|
if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
|
|
|
|
return;
|
|
|
|
|
2019-08-06 21:50:34 +00:00
|
|
|
b1 = b2 = b3 = NULL;
|
2013-11-19 10:51:46 +00:00
|
|
|
critical_enter();
|
|
|
|
cache = &zone->uz_cpu[curcpu];
|
2020-02-19 18:48:46 +00:00
|
|
|
domain = PCPU_GET(domain);
|
2019-12-25 20:50:53 +00:00
|
|
|
b1 = cache_bucket_unload_alloc(cache);
|
2020-01-31 00:49:51 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Don't flush SMR zone buckets. This leaves the zone without a
|
|
|
|
* bucket and forces every free to synchronize().
|
|
|
|
*/
|
2020-02-13 20:58:51 +00:00
|
|
|
if ((zone->uz_flags & UMA_ZONE_SMR) == 0) {
|
|
|
|
b2 = cache_bucket_unload_free(cache);
|
|
|
|
b3 = cache_bucket_unload_cross(cache);
|
|
|
|
}
|
|
|
|
critical_exit();
|
|
|
|
|
|
|
|
if (b1 != NULL)
|
2020-02-19 18:48:46 +00:00
|
|
|
zone_free_bucket(zone, b1, NULL, domain, false);
|
2020-02-13 20:58:51 +00:00
|
|
|
if (b2 != NULL)
|
2020-02-19 18:48:46 +00:00
|
|
|
zone_free_bucket(zone, b2, NULL, domain, false);
|
2020-02-13 20:58:51 +00:00
|
|
|
if (b3 != NULL) {
|
2020-02-19 18:48:46 +00:00
|
|
|
/* Adjust the domain so it goes to zone_free_cross. */
|
|
|
|
domain = (domain + 1) % vm_ndomains;
|
|
|
|
zone_free_bucket(zone, b3, NULL, domain, false);
|
2019-08-06 21:50:34 +00:00
|
|
|
}
|
2013-11-19 10:51:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Safely drain per-CPU caches of a zone(s) to alloc bucket.
|
|
|
|
* This is an expensive call because it needs to bind to all CPUs
|
|
|
|
* one by one and enter a critical section on each of them in order
|
|
|
|
* to safely access their cache buckets.
|
|
|
|
* Zone lock must not be held on call this function.
|
|
|
|
*/
|
|
|
|
static void
|
2019-09-01 22:22:43 +00:00
|
|
|
pcpu_cache_drain_safe(uma_zone_t zone)
|
2013-11-19 10:51:46 +00:00
|
|
|
{
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
/*
|
2020-01-04 03:15:34 +00:00
|
|
|
* Polite bucket sizes shrinking was not enough, shrink aggressively.
|
2013-11-19 10:51:46 +00:00
|
|
|
*/
|
|
|
|
if (zone)
|
2019-11-28 00:19:09 +00:00
|
|
|
cache_shrink(zone, NULL);
|
2013-11-19 10:51:46 +00:00
|
|
|
else
|
2019-11-28 00:19:09 +00:00
|
|
|
zone_foreach(cache_shrink, NULL);
|
2013-11-19 10:51:46 +00:00
|
|
|
|
|
|
|
CPU_FOREACH(cpu) {
|
|
|
|
thread_lock(curthread);
|
|
|
|
sched_bind(curthread, cpu);
|
|
|
|
thread_unlock(curthread);
|
|
|
|
|
|
|
|
if (zone)
|
2019-11-28 00:19:09 +00:00
|
|
|
cache_drain_safe_cpu(zone, NULL);
|
2013-11-19 10:51:46 +00:00
|
|
|
else
|
2019-11-28 00:19:09 +00:00
|
|
|
zone_foreach(cache_drain_safe_cpu, NULL);
|
2013-11-19 10:51:46 +00:00
|
|
|
}
|
|
|
|
thread_lock(curthread);
|
|
|
|
sched_unbind(curthread);
|
|
|
|
thread_unlock(curthread);
|
|
|
|
}
|
|
|
|
|
2004-02-01 06:15:17 +00:00
|
|
|
/*
|
2019-09-01 22:22:43 +00:00
|
|
|
* Reclaim cached buckets from a zone. All buckets are reclaimed if the caller
|
|
|
|
* requested a drain, otherwise the per-domain caches are trimmed to either
|
|
|
|
* estimated working set size.
|
2004-02-01 06:15:17 +00:00
|
|
|
*/
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
static bool
|
|
|
|
bucket_cache_reclaim_domain(uma_zone_t zone, bool drain, bool trim, int domain)
|
2004-02-01 06:15:17 +00:00
|
|
|
{
|
2018-01-12 23:25:05 +00:00
|
|
|
uma_zone_domain_t zdom;
|
2004-02-01 06:15:17 +00:00
|
|
|
uma_bucket_t bucket;
|
2020-02-19 18:48:46 +00:00
|
|
|
long target;
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
bool done = false;
|
2021-04-09 09:47:09 -04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The cross bucket is partially filled and not part of
|
|
|
|
* the item count. Reclaim it individually here.
|
|
|
|
*/
|
|
|
|
zdom = ZDOM_GET(zone, domain);
|
|
|
|
if ((zone->uz_flags & UMA_ZONE_SMR) == 0 || drain) {
|
|
|
|
ZONE_CROSS_LOCK(zone);
|
|
|
|
bucket = zdom->uzd_cross;
|
|
|
|
zdom->uzd_cross = NULL;
|
|
|
|
ZONE_CROSS_UNLOCK(zone);
|
|
|
|
if (bucket != NULL)
|
|
|
|
bucket_free(zone, bucket, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we were asked to drain the zone, we are done only once
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
* this bucket cache is empty. If trim, we reclaim items in
|
|
|
|
* excess of the zone's estimated working set size. Multiple
|
|
|
|
* consecutive calls will shrink the WSS and so reclaim more.
|
|
|
|
* If neither drain nor trim, then voluntarily reclaim 1/4
|
|
|
|
* (to reduce first spike) of items not used for a long time.
|
2021-04-09 09:47:09 -04:00
|
|
|
*/
|
|
|
|
ZDOM_LOCK(zdom);
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
zone_domain_update_wss(zdom);
|
|
|
|
if (drain)
|
|
|
|
target = 0;
|
|
|
|
else if (trim)
|
|
|
|
target = zdom->uzd_wss;
|
|
|
|
else if (zdom->uzd_timin > 900 / UMA_TIMEOUT)
|
|
|
|
target = zdom->uzd_nitems - zdom->uzd_limin / 4;
|
|
|
|
else {
|
|
|
|
ZDOM_UNLOCK(zdom);
|
|
|
|
return (done);
|
|
|
|
}
|
|
|
|
while ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) != NULL &&
|
|
|
|
zdom->uzd_nitems >= target + bucket->ub_cnt) {
|
2021-04-09 09:47:09 -04:00
|
|
|
bucket = zone_fetch_bucket(zone, zdom, true);
|
|
|
|
if (bucket == NULL)
|
|
|
|
break;
|
|
|
|
bucket_free(zone, bucket, NULL);
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
done = true;
|
2021-04-09 09:47:09 -04:00
|
|
|
ZDOM_LOCK(zdom);
|
|
|
|
}
|
|
|
|
ZDOM_UNLOCK(zdom);
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
return (done);
|
2021-04-09 09:47:09 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2021-04-14 12:57:24 -04:00
|
|
|
bucket_cache_reclaim(uma_zone_t zone, bool drain, int domain)
|
2021-04-09 09:47:09 -04:00
|
|
|
{
|
2018-01-12 23:25:05 +00:00
|
|
|
int i;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2020-02-19 18:48:46 +00:00
|
|
|
/*
|
|
|
|
* Shrink the zone bucket size to ensure that the per-CPU caches
|
|
|
|
* don't grow too large.
|
|
|
|
*/
|
|
|
|
if (zone->uz_bucket_size > zone->uz_bucket_size_min)
|
|
|
|
zone->uz_bucket_size--;
|
|
|
|
|
2021-04-14 12:57:24 -04:00
|
|
|
if (domain != UMA_ANYDOMAIN &&
|
|
|
|
(zone->uz_flags & UMA_ZONE_ROUNDROBIN) == 0) {
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
bucket_cache_reclaim_domain(zone, drain, true, domain);
|
2021-04-14 12:57:24 -04:00
|
|
|
} else {
|
|
|
|
for (i = 0; i < vm_ndomains; i++)
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
bucket_cache_reclaim_domain(zone, drain, true, i);
|
2021-04-14 12:57:24 -04:00
|
|
|
}
|
2013-06-18 04:50:20 +00:00
|
|
|
}
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2013-06-18 04:50:20 +00:00
|
|
|
static void
|
|
|
|
keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
|
|
|
|
{
|
|
|
|
uint8_t *mem;
|
2021-04-13 17:39:50 -04:00
|
|
|
size_t size;
|
2013-06-18 04:50:20 +00:00
|
|
|
int i;
|
|
|
|
uint8_t flags;
|
|
|
|
|
2017-06-01 18:36:52 +00:00
|
|
|
CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes",
|
|
|
|
keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera);
|
|
|
|
|
2019-12-08 01:15:06 +00:00
|
|
|
mem = slab_data(slab, keg);
|
2021-04-13 17:39:50 -04:00
|
|
|
size = PAGE_SIZE * keg->uk_ppera;
|
|
|
|
|
|
|
|
kasan_mark_slab_valid(keg, mem);
|
2013-06-18 04:50:20 +00:00
|
|
|
if (keg->uk_fini != NULL) {
|
2021-04-13 17:39:50 -04:00
|
|
|
for (i = start - 1; i > -1; i--)
|
2018-06-08 00:15:08 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
/*
|
|
|
|
* trash_fini implies that dtor was trash_dtor. trash_fini
|
|
|
|
* would check that memory hasn't been modified since free,
|
|
|
|
* which executed trash_dtor.
|
|
|
|
* That's why we need to run uma_dbg_kskip() check here,
|
|
|
|
* albeit we don't make skip check for other init/fini
|
|
|
|
* invocations.
|
|
|
|
*/
|
2019-12-08 01:15:06 +00:00
|
|
|
if (!uma_dbg_kskip(keg, slab_item(slab, keg, i)) ||
|
2018-06-08 00:15:08 +00:00
|
|
|
keg->uk_fini != trash_fini)
|
|
|
|
#endif
|
2019-12-08 01:15:06 +00:00
|
|
|
keg->uk_fini(slab_item(slab, keg, i), keg->uk_size);
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
2021-04-13 17:39:50 -04:00
|
|
|
flags = slab->us_flags;
|
|
|
|
if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) {
|
2020-01-14 02:14:15 +00:00
|
|
|
zone_free_item(slabzone(keg->uk_ipers), slab_tohashslab(slab),
|
|
|
|
NULL, SKIP_NONE);
|
2021-04-13 17:39:50 -04:00
|
|
|
}
|
|
|
|
keg->uk_freef(mem, size, flags);
|
|
|
|
uma_total_dec(size);
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
2020-10-19 16:57:40 +00:00
|
|
|
static void
|
|
|
|
keg_drain_domain(uma_keg_t keg, int domain)
|
|
|
|
{
|
|
|
|
struct slabhead freeslabs;
|
|
|
|
uma_domain_t dom;
|
|
|
|
uma_slab_t slab, tmp;
|
|
|
|
uint32_t i, stofree, stokeep, partial;
|
|
|
|
|
|
|
|
dom = &keg->uk_domain[domain];
|
|
|
|
LIST_INIT(&freeslabs);
|
|
|
|
|
|
|
|
CTR4(KTR_UMA, "keg_drain %s(%p) domain %d free items: %u",
|
2020-10-19 18:54:44 +00:00
|
|
|
keg->uk_name, keg, domain, dom->ud_free_items);
|
2020-10-19 16:57:40 +00:00
|
|
|
|
|
|
|
KEG_LOCK(keg, domain);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Are the free items in partially allocated slabs sufficient to meet
|
|
|
|
* the reserve? If not, compute the number of fully free slabs that must
|
|
|
|
* be kept.
|
|
|
|
*/
|
|
|
|
partial = dom->ud_free_items - dom->ud_free_slabs * keg->uk_ipers;
|
|
|
|
if (partial < keg->uk_reserve) {
|
|
|
|
stokeep = min(dom->ud_free_slabs,
|
|
|
|
howmany(keg->uk_reserve - partial, keg->uk_ipers));
|
|
|
|
} else {
|
|
|
|
stokeep = 0;
|
|
|
|
}
|
|
|
|
stofree = dom->ud_free_slabs - stokeep;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Partition the free slabs into two sets: those that must be kept in
|
|
|
|
* order to maintain the reserve, and those that may be released back to
|
|
|
|
* the system. Since one set may be much larger than the other,
|
|
|
|
* populate the smaller of the two sets and swap them if necessary.
|
|
|
|
*/
|
|
|
|
for (i = min(stofree, stokeep); i > 0; i--) {
|
|
|
|
slab = LIST_FIRST(&dom->ud_free_slab);
|
|
|
|
LIST_REMOVE(slab, us_link);
|
|
|
|
LIST_INSERT_HEAD(&freeslabs, slab, us_link);
|
|
|
|
}
|
|
|
|
if (stofree > stokeep)
|
|
|
|
LIST_SWAP(&freeslabs, &dom->ud_free_slab, uma_slab, us_link);
|
|
|
|
|
|
|
|
if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) {
|
|
|
|
LIST_FOREACH(slab, &freeslabs, us_link)
|
|
|
|
UMA_HASH_REMOVE(&keg->uk_hash, slab);
|
|
|
|
}
|
|
|
|
dom->ud_free_items -= stofree * keg->uk_ipers;
|
|
|
|
dom->ud_free_slabs -= stofree;
|
|
|
|
dom->ud_pages -= stofree * keg->uk_ppera;
|
|
|
|
KEG_UNLOCK(keg, domain);
|
|
|
|
|
|
|
|
LIST_FOREACH_SAFE(slab, &freeslabs, us_link, tmp)
|
|
|
|
keg_free_slab(keg, slab, keg->uk_ipers);
|
|
|
|
}
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/*
|
2009-01-25 09:11:24 +00:00
|
|
|
* Frees pages from a keg back to the system. This is done on demand from
|
2002-03-19 09:11:49 +00:00
|
|
|
* the pageout daemon.
|
|
|
|
*
|
2009-01-25 09:11:24 +00:00
|
|
|
* Returns nothing.
|
2002-03-19 09:11:49 +00:00
|
|
|
*/
|
2009-01-25 09:11:24 +00:00
|
|
|
static void
|
2021-04-14 12:57:24 -04:00
|
|
|
keg_drain(uma_keg_t keg, int domain)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
2020-10-19 16:57:40 +00:00
|
|
|
int i;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2020-10-19 16:57:40 +00:00
|
|
|
if ((keg->uk_flags & UMA_ZONE_NOFREE) != 0)
|
2002-03-19 09:11:49 +00:00
|
|
|
return;
|
2021-04-14 12:57:24 -04:00
|
|
|
if (domain != UMA_ANYDOMAIN) {
|
|
|
|
keg_drain_domain(keg, domain);
|
|
|
|
} else {
|
|
|
|
for (i = 0; i < vm_ndomains; i++)
|
|
|
|
keg_drain_domain(keg, i);
|
|
|
|
}
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
2009-01-25 09:11:24 +00:00
|
|
|
static void
|
2021-04-14 12:57:24 -04:00
|
|
|
zone_reclaim(uma_zone_t zone, int domain, int waitok, bool drain)
|
2009-01-25 09:11:24 +00:00
|
|
|
{
|
|
|
|
/*
|
2021-04-14 12:57:24 -04:00
|
|
|
* Count active reclaim operations in order to interlock with
|
|
|
|
* zone_dtor(), which removes the zone from global lists before
|
|
|
|
* attempting to reclaim items itself.
|
|
|
|
*
|
|
|
|
* The zone may be destroyed while sleeping, so only zone_dtor() should
|
|
|
|
* specify M_WAITOK.
|
2009-01-25 09:11:24 +00:00
|
|
|
*/
|
|
|
|
ZONE_LOCK(zone);
|
2021-04-14 12:57:24 -04:00
|
|
|
if (waitok == M_WAITOK) {
|
|
|
|
while (zone->uz_reclaimers > 0)
|
|
|
|
msleep(zone, ZONE_LOCKPTR(zone), PVM, "zonedrain", 1);
|
2009-01-25 09:11:24 +00:00
|
|
|
}
|
2021-04-14 12:57:24 -04:00
|
|
|
zone->uz_reclaimers++;
|
2009-01-25 09:11:24 +00:00
|
|
|
ZONE_UNLOCK(zone);
|
2021-04-14 12:57:24 -04:00
|
|
|
bucket_cache_reclaim(zone, drain, domain);
|
2019-09-01 22:22:43 +00:00
|
|
|
|
2019-11-10 09:25:19 +00:00
|
|
|
if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0)
|
2021-04-14 12:57:24 -04:00
|
|
|
keg_drain(zone->uz_keg, domain);
|
2009-01-25 09:11:24 +00:00
|
|
|
ZONE_LOCK(zone);
|
2021-04-14 12:57:24 -04:00
|
|
|
zone->uz_reclaimers--;
|
|
|
|
if (zone->uz_reclaimers == 0)
|
|
|
|
wakeup(zone);
|
2009-01-25 09:11:24 +00:00
|
|
|
ZONE_UNLOCK(zone);
|
|
|
|
}
|
|
|
|
|
2019-09-01 22:22:43 +00:00
|
|
|
static void
|
2021-04-14 12:57:24 -04:00
|
|
|
zone_drain(uma_zone_t zone, void *arg)
|
2009-01-25 09:11:24 +00:00
|
|
|
{
|
2021-04-14 12:57:24 -04:00
|
|
|
int domain;
|
2009-01-25 09:11:24 +00:00
|
|
|
|
2021-04-14 12:57:24 -04:00
|
|
|
domain = (int)(uintptr_t)arg;
|
|
|
|
zone_reclaim(zone, domain, M_NOWAIT, true);
|
2019-09-01 22:22:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2021-04-14 12:57:24 -04:00
|
|
|
zone_trim(uma_zone_t zone, void *arg)
|
2019-09-01 22:22:43 +00:00
|
|
|
{
|
2021-04-14 12:57:24 -04:00
|
|
|
int domain;
|
2019-09-01 22:22:43 +00:00
|
|
|
|
2021-04-14 12:57:24 -04:00
|
|
|
domain = (int)(uintptr_t)arg;
|
|
|
|
zone_reclaim(zone, domain, M_NOWAIT, false);
|
2009-01-25 09:11:24 +00:00
|
|
|
}
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/*
|
2020-01-04 03:30:08 +00:00
|
|
|
* Allocate a new slab for a keg and inserts it into the partial slab list.
|
|
|
|
* The keg should be unlocked on entry. If the allocation succeeds it will
|
|
|
|
* be locked on return.
|
2002-03-19 09:11:49 +00:00
|
|
|
*
|
|
|
|
* Arguments:
|
2019-01-23 18:58:15 +00:00
|
|
|
* flags Wait flags for the item initialization routine
|
|
|
|
* aflags Wait flags for the slab allocation
|
2002-03-19 09:11:49 +00:00
|
|
|
*
|
|
|
|
* Returns:
|
|
|
|
* The slab that was allocated or NULL if there is no memory and the
|
|
|
|
* caller specified M_NOWAIT.
|
|
|
|
*/
|
2004-01-30 16:26:29 +00:00
|
|
|
static uma_slab_t
|
2019-01-23 18:58:15 +00:00
|
|
|
keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags,
|
|
|
|
int aflags)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
2020-01-04 03:30:08 +00:00
|
|
|
uma_domain_t dom;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
uma_slab_t slab;
|
2017-11-28 23:40:54 +00:00
|
|
|
unsigned long size;
|
2013-04-09 17:43:48 +00:00
|
|
|
uint8_t *mem;
|
2019-01-23 18:58:15 +00:00
|
|
|
uint8_t sflags;
|
2002-03-19 09:11:49 +00:00
|
|
|
int i;
|
|
|
|
|
2018-01-12 23:25:05 +00:00
|
|
|
KASSERT(domain >= 0 && domain < vm_ndomains,
|
|
|
|
("keg_alloc_slab: domain %d out of range", domain));
|
2002-04-08 02:42:55 +00:00
|
|
|
|
2018-10-24 16:41:47 +00:00
|
|
|
slab = NULL;
|
|
|
|
mem = NULL;
|
2020-01-09 02:03:03 +00:00
|
|
|
if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) {
|
2020-01-14 02:14:15 +00:00
|
|
|
uma_hash_slab_t hslab;
|
|
|
|
hslab = zone_alloc_item(slabzone(keg->uk_ipers), NULL,
|
|
|
|
domain, aflags);
|
|
|
|
if (hslab == NULL)
|
2020-01-04 03:15:34 +00:00
|
|
|
goto fail;
|
2020-01-14 02:14:15 +00:00
|
|
|
slab = &hslab->uhs_slab;
|
2002-04-08 02:42:55 +00:00
|
|
|
}
|
|
|
|
|
2002-06-19 20:49:44 +00:00
|
|
|
/*
|
|
|
|
* This reproduces the old vm_zone behavior of zero filling pages the
|
|
|
|
* first time they are added to a zone.
|
|
|
|
*
|
|
|
|
* Malloced items are zeroed in uma_zalloc.
|
|
|
|
*/
|
|
|
|
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
|
2019-01-23 18:58:15 +00:00
|
|
|
aflags |= M_ZERO;
|
2002-06-19 20:49:44 +00:00
|
|
|
else
|
2019-01-23 18:58:15 +00:00
|
|
|
aflags &= ~M_ZERO;
|
2002-06-19 20:49:44 +00:00
|
|
|
|
2012-01-27 20:18:31 +00:00
|
|
|
if (keg->uk_flags & UMA_ZONE_NODUMP)
|
2019-01-23 18:58:15 +00:00
|
|
|
aflags |= M_NODUMP;
|
2012-01-27 20:18:31 +00:00
|
|
|
|
2009-01-25 09:11:24 +00:00
|
|
|
/* zone is passed for legacy reasons. */
|
2018-10-24 16:41:47 +00:00
|
|
|
size = keg->uk_ppera * PAGE_SIZE;
|
2021-04-13 17:39:50 -04:00
|
|
|
mem = keg->uk_allocf(zone, size, domain, &sflags, aflags);
|
2003-09-19 08:53:33 +00:00
|
|
|
if (mem == NULL) {
|
2020-01-09 02:03:03 +00:00
|
|
|
if (keg->uk_flags & UMA_ZFLAG_OFFPAGE)
|
2020-01-14 02:14:15 +00:00
|
|
|
zone_free_item(slabzone(keg->uk_ipers),
|
|
|
|
slab_tohashslab(slab), NULL, SKIP_NONE);
|
2020-01-04 03:15:34 +00:00
|
|
|
goto fail;
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
2017-11-28 23:40:54 +00:00
|
|
|
uma_total_inc(size);
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2020-01-04 03:30:08 +00:00
|
|
|
/* For HASH zones all pages go to the same uma_domain. */
|
2020-01-09 02:03:03 +00:00
|
|
|
if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0)
|
2020-01-04 03:30:08 +00:00
|
|
|
domain = 0;
|
|
|
|
|
2002-06-25 21:04:50 +00:00
|
|
|
/* Point the slab into the allocated memory */
|
2020-01-09 02:03:03 +00:00
|
|
|
if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE))
|
2021-04-13 17:39:50 -04:00
|
|
|
slab = (uma_slab_t)(mem + keg->uk_pgoff);
|
2019-12-08 01:15:06 +00:00
|
|
|
else
|
2020-01-14 02:14:15 +00:00
|
|
|
slab_tohashslab(slab)->uhs_data = mem;
|
2002-06-25 21:04:50 +00:00
|
|
|
|
2020-01-09 02:03:03 +00:00
|
|
|
if (keg->uk_flags & UMA_ZFLAG_VTOSLAB)
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
for (i = 0; i < keg->uk_ppera; i++)
|
2019-11-28 07:49:25 +00:00
|
|
|
vsetzoneslab((vm_offset_t)mem + (i * PAGE_SIZE),
|
|
|
|
zone, slab);
|
2002-03-19 09:11:49 +00:00
|
|
|
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
slab->us_freecount = keg->uk_ipers;
|
2019-01-23 18:58:15 +00:00
|
|
|
slab->us_flags = sflags;
|
2018-01-12 23:25:05 +00:00
|
|
|
slab->us_domain = domain;
|
2020-01-04 03:30:08 +00:00
|
|
|
|
2019-12-02 22:44:34 +00:00
|
|
|
BIT_FILL(keg->uk_ipers, &slab->us_free);
|
2013-06-13 21:05:38 +00:00
|
|
|
#ifdef INVARIANTS
|
2019-12-14 05:21:56 +00:00
|
|
|
BIT_ZERO(keg->uk_ipers, slab_dbg_bits(slab, keg));
|
2013-06-13 21:05:38 +00:00
|
|
|
#endif
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
|
2004-08-02 00:18:36 +00:00
|
|
|
if (keg->uk_init != NULL) {
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
for (i = 0; i < keg->uk_ipers; i++)
|
2019-12-08 01:15:06 +00:00
|
|
|
if (keg->uk_init(slab_item(slab, keg, i),
|
2019-01-23 18:58:15 +00:00
|
|
|
keg->uk_size, flags) != 0)
|
2004-08-02 00:18:36 +00:00
|
|
|
break;
|
|
|
|
if (i != keg->uk_ipers) {
|
2013-06-18 04:50:20 +00:00
|
|
|
keg_free_slab(keg, slab, i);
|
2020-01-04 03:15:34 +00:00
|
|
|
goto fail;
|
2004-08-02 00:18:36 +00:00
|
|
|
}
|
|
|
|
}
|
2021-04-13 17:39:50 -04:00
|
|
|
kasan_mark_slab_invalid(keg, mem);
|
2020-01-04 03:30:08 +00:00
|
|
|
KEG_LOCK(keg, domain);
|
2002-06-25 21:04:50 +00:00
|
|
|
|
2017-06-01 18:36:52 +00:00
|
|
|
CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)",
|
|
|
|
slab, keg->uk_name, keg);
|
|
|
|
|
2020-01-09 02:03:03 +00:00
|
|
|
if (keg->uk_flags & UMA_ZFLAG_HASH)
|
2018-10-24 16:41:47 +00:00
|
|
|
UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2020-01-04 03:30:08 +00:00
|
|
|
/*
|
|
|
|
* If we got a slab here it's safe to mark it partially used
|
|
|
|
* and return. We assume that the caller is going to remove
|
|
|
|
* at least one item.
|
|
|
|
*/
|
|
|
|
dom = &keg->uk_domain[domain];
|
|
|
|
LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
|
|
|
|
dom->ud_pages += keg->uk_ppera;
|
2020-02-11 20:06:33 +00:00
|
|
|
dom->ud_free_items += keg->uk_ipers;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
|
|
|
return (slab);
|
2020-01-04 03:15:34 +00:00
|
|
|
|
|
|
|
fail:
|
|
|
|
return (NULL);
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
2003-09-21 07:39:16 +00:00
|
|
|
/*
|
2021-02-22 18:21:49 -05:00
|
|
|
* This function is intended to be used early on in place of page_alloc(). It
|
|
|
|
* performs contiguous physical memory allocations and uses a bump allocator for
|
|
|
|
* KVA, so is usable before the kernel map is initialized.
|
2003-09-21 07:39:16 +00:00
|
|
|
*/
|
|
|
|
static void *
|
2018-01-12 23:25:05 +00:00
|
|
|
startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
|
|
|
|
int wait)
|
2003-09-21 07:39:16 +00:00
|
|
|
{
|
2020-01-16 05:01:21 +00:00
|
|
|
vm_paddr_t pa;
|
|
|
|
vm_page_t m;
|
2017-06-01 18:26:57 +00:00
|
|
|
void *mem;
|
|
|
|
int pages;
|
2020-01-16 05:01:21 +00:00
|
|
|
int i;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
|
2018-02-09 04:45:39 +00:00
|
|
|
pages = howmany(bytes, PAGE_SIZE);
|
|
|
|
KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__));
|
2020-01-16 05:01:21 +00:00
|
|
|
|
2018-02-09 04:45:39 +00:00
|
|
|
*pflag = UMA_SLAB_BOOT;
|
2020-01-16 05:01:21 +00:00
|
|
|
m = vm_page_alloc_contig_domain(NULL, 0, domain,
|
|
|
|
malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED, pages,
|
|
|
|
(vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT);
|
|
|
|
if (m == NULL)
|
|
|
|
return (NULL);
|
|
|
|
|
|
|
|
pa = VM_PAGE_TO_PHYS(m);
|
|
|
|
for (i = 0; i < pages; i++, pa += PAGE_SIZE) {
|
|
|
|
#if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \
|
|
|
|
defined(__riscv) || defined(__powerpc64__)
|
|
|
|
if ((wait & M_NODUMP) == 0)
|
|
|
|
dump_add_page(pa);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
/* Allocate KVA and indirectly advance bootmem. */
|
|
|
|
mem = (void *)pmap_map(&bootmem, m->phys_addr,
|
|
|
|
m->phys_addr + (pages * PAGE_SIZE), VM_PROT_READ | VM_PROT_WRITE);
|
|
|
|
if ((wait & M_ZERO) != 0)
|
|
|
|
bzero(mem, pages * PAGE_SIZE);
|
|
|
|
|
|
|
|
return (mem);
|
|
|
|
}
|
2018-02-09 04:45:39 +00:00
|
|
|
|
2020-01-16 05:01:21 +00:00
|
|
|
static void
|
|
|
|
startup_free(void *mem, vm_size_t bytes)
|
|
|
|
{
|
|
|
|
vm_offset_t va;
|
|
|
|
vm_page_t m;
|
|
|
|
|
|
|
|
va = (vm_offset_t)mem;
|
|
|
|
m = PHYS_TO_VM_PAGE(pmap_kextract(va));
|
2021-01-03 11:31:00 -05:00
|
|
|
|
|
|
|
/*
|
|
|
|
* startup_alloc() returns direct-mapped slabs on some platforms. Avoid
|
|
|
|
* unmapping ranges of the direct map.
|
|
|
|
*/
|
|
|
|
if (va >= bootstart && va + bytes <= bootmem)
|
|
|
|
pmap_remove(kernel_pmap, va, va + bytes);
|
2020-01-16 05:01:21 +00:00
|
|
|
for (; bytes != 0; bytes -= PAGE_SIZE, m++) {
|
|
|
|
#if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \
|
|
|
|
defined(__riscv) || defined(__powerpc64__)
|
|
|
|
dump_drop_page(VM_PAGE_TO_PHYS(m));
|
|
|
|
#endif
|
|
|
|
vm_page_unwire_noq(m);
|
|
|
|
vm_page_free(m);
|
|
|
|
}
|
2003-09-21 07:39:16 +00:00
|
|
|
}
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/*
|
|
|
|
* Allocates a number of pages from the system
|
|
|
|
*
|
|
|
|
* Arguments:
|
|
|
|
* bytes The number of bytes requested
|
|
|
|
* wait Shall we wait?
|
|
|
|
*
|
|
|
|
* Returns:
|
2004-01-30 16:26:29 +00:00
|
|
|
* A pointer to the alloced memory or possibly
|
2002-03-19 09:11:49 +00:00
|
|
|
* NULL if M_NOWAIT is set.
|
|
|
|
*/
|
|
|
|
static void *
|
2018-01-12 23:25:05 +00:00
|
|
|
page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
|
|
|
|
int wait)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
|
|
|
void *p; /* Returned page */
|
|
|
|
|
2017-11-28 23:40:54 +00:00
|
|
|
*pflag = UMA_SLAB_KERNEL;
|
2018-10-30 18:26:34 +00:00
|
|
|
p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait);
|
2004-01-30 16:26:29 +00:00
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
return (p);
|
|
|
|
}
|
|
|
|
|
2018-07-06 02:06:03 +00:00
|
|
|
static void *
|
|
|
|
pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
|
|
|
|
int wait)
|
|
|
|
{
|
|
|
|
struct pglist alloctail;
|
|
|
|
vm_offset_t addr, zkva;
|
|
|
|
int cpu, flags;
|
|
|
|
vm_page_t p, p_next;
|
|
|
|
#ifdef NUMA
|
|
|
|
struct pcpu *pc;
|
|
|
|
#endif
|
|
|
|
|
Fix pre-SI_SUB_CPU initialization of per-CPU counters.
r336020 introduced pcpu_page_alloc(), replacing page_alloc() as the
backend allocator for PCPU UMA zones. Unlike page_alloc(), it does
not honour malloc(9) flags such as M_ZERO or M_NODUMP, so fix that.
r336020 also changed counter(9) to initialize each counter using a
CPU_FOREACH() loop instead of an SMP rendezvous. Before SI_SUB_CPU,
smp_rendezvous() will only execute the callback on the current CPU
(i.e., CPU 0), so only one counter gets zeroed. The rest are zeroed
by virtue of the fact that UMA gratuitously zeroes slabs when importing
them into a zone.
Prior to SI_SUB_CPU, all_cpus is clear, so with r336020 we weren't
zeroing vm_cnt counters during boot: the CPU_FOREACH() loop had no
effect, and pcpu_page_alloc() didn't honour M_ZERO. Fix this by
iterating over the full range of CPU IDs when zeroing counters,
ignoring whether the corresponding bits in all_cpus are set.
Reported and tested by: pho (previous version)
Reviewed by: kib (previous version)
Differential Revision: https://reviews.freebsd.org/D16190
2018-07-10 00:18:12 +00:00
|
|
|
MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE);
|
2018-07-06 02:06:03 +00:00
|
|
|
|
Fix pre-SI_SUB_CPU initialization of per-CPU counters.
r336020 introduced pcpu_page_alloc(), replacing page_alloc() as the
backend allocator for PCPU UMA zones. Unlike page_alloc(), it does
not honour malloc(9) flags such as M_ZERO or M_NODUMP, so fix that.
r336020 also changed counter(9) to initialize each counter using a
CPU_FOREACH() loop instead of an SMP rendezvous. Before SI_SUB_CPU,
smp_rendezvous() will only execute the callback on the current CPU
(i.e., CPU 0), so only one counter gets zeroed. The rest are zeroed
by virtue of the fact that UMA gratuitously zeroes slabs when importing
them into a zone.
Prior to SI_SUB_CPU, all_cpus is clear, so with r336020 we weren't
zeroing vm_cnt counters during boot: the CPU_FOREACH() loop had no
effect, and pcpu_page_alloc() didn't honour M_ZERO. Fix this by
iterating over the full range of CPU IDs when zeroing counters,
ignoring whether the corresponding bits in all_cpus are set.
Reported and tested by: pho (previous version)
Reviewed by: kib (previous version)
Differential Revision: https://reviews.freebsd.org/D16190
2018-07-10 00:18:12 +00:00
|
|
|
TAILQ_INIT(&alloctail);
|
2018-07-06 02:06:03 +00:00
|
|
|
flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
|
Fix pre-SI_SUB_CPU initialization of per-CPU counters.
r336020 introduced pcpu_page_alloc(), replacing page_alloc() as the
backend allocator for PCPU UMA zones. Unlike page_alloc(), it does
not honour malloc(9) flags such as M_ZERO or M_NODUMP, so fix that.
r336020 also changed counter(9) to initialize each counter using a
CPU_FOREACH() loop instead of an SMP rendezvous. Before SI_SUB_CPU,
smp_rendezvous() will only execute the callback on the current CPU
(i.e., CPU 0), so only one counter gets zeroed. The rest are zeroed
by virtue of the fact that UMA gratuitously zeroes slabs when importing
them into a zone.
Prior to SI_SUB_CPU, all_cpus is clear, so with r336020 we weren't
zeroing vm_cnt counters during boot: the CPU_FOREACH() loop had no
effect, and pcpu_page_alloc() didn't honour M_ZERO. Fix this by
iterating over the full range of CPU IDs when zeroing counters,
ignoring whether the corresponding bits in all_cpus are set.
Reported and tested by: pho (previous version)
Reviewed by: kib (previous version)
Differential Revision: https://reviews.freebsd.org/D16190
2018-07-10 00:18:12 +00:00
|
|
|
malloc2vm_flags(wait);
|
|
|
|
*pflag = UMA_SLAB_KERNEL;
|
2018-07-06 02:06:03 +00:00
|
|
|
for (cpu = 0; cpu <= mp_maxid; cpu++) {
|
|
|
|
if (CPU_ABSENT(cpu)) {
|
|
|
|
p = vm_page_alloc(NULL, 0, flags);
|
|
|
|
} else {
|
|
|
|
#ifndef NUMA
|
|
|
|
p = vm_page_alloc(NULL, 0, flags);
|
|
|
|
#else
|
|
|
|
pc = pcpu_find(cpu);
|
2020-01-18 18:25:37 +00:00
|
|
|
if (__predict_false(VM_DOMAIN_EMPTY(pc->pc_domain)))
|
|
|
|
p = NULL;
|
|
|
|
else
|
|
|
|
p = vm_page_alloc_domain(NULL, 0,
|
|
|
|
pc->pc_domain, flags);
|
2018-07-06 02:06:03 +00:00
|
|
|
if (__predict_false(p == NULL))
|
|
|
|
p = vm_page_alloc(NULL, 0, flags);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
if (__predict_false(p == NULL))
|
|
|
|
goto fail;
|
|
|
|
TAILQ_INSERT_TAIL(&alloctail, p, listq);
|
|
|
|
}
|
|
|
|
if ((addr = kva_alloc(bytes)) == 0)
|
|
|
|
goto fail;
|
|
|
|
zkva = addr;
|
|
|
|
TAILQ_FOREACH(p, &alloctail, listq) {
|
|
|
|
pmap_qenter(zkva, &p, 1);
|
|
|
|
zkva += PAGE_SIZE;
|
|
|
|
}
|
|
|
|
return ((void*)addr);
|
2019-06-07 18:23:29 +00:00
|
|
|
fail:
|
2018-07-06 02:06:03 +00:00
|
|
|
TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
|
2019-06-07 18:23:29 +00:00
|
|
|
vm_page_unwire_noq(p);
|
2018-07-06 02:06:03 +00:00
|
|
|
vm_page_free(p);
|
|
|
|
}
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/*
|
|
|
|
* Allocates a number of pages from within an object
|
|
|
|
*
|
|
|
|
* Arguments:
|
|
|
|
* bytes The number of bytes requested
|
|
|
|
* wait Shall we wait?
|
|
|
|
*
|
|
|
|
* Returns:
|
2004-01-30 16:26:29 +00:00
|
|
|
* A pointer to the alloced memory or possibly
|
2002-03-19 09:11:49 +00:00
|
|
|
* NULL if M_NOWAIT is set.
|
|
|
|
*/
|
|
|
|
static void *
|
2018-01-12 23:25:05 +00:00
|
|
|
noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
|
|
|
|
int wait)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
2013-02-26 23:35:27 +00:00
|
|
|
TAILQ_HEAD(, vm_page) alloctail;
|
|
|
|
u_long npages;
|
2003-08-03 06:08:48 +00:00
|
|
|
vm_offset_t retkva, zkva;
|
2013-02-26 23:35:27 +00:00
|
|
|
vm_page_t p, p_next;
|
2009-01-25 09:11:24 +00:00
|
|
|
uma_keg_t keg;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2013-02-26 23:35:27 +00:00
|
|
|
TAILQ_INIT(&alloctail);
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
keg = zone->uz_keg;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2013-02-26 23:35:27 +00:00
|
|
|
npages = howmany(bytes, PAGE_SIZE);
|
|
|
|
while (npages > 0) {
|
2018-01-12 23:25:05 +00:00
|
|
|
p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT |
|
2017-11-08 02:39:37 +00:00
|
|
|
VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
|
2017-11-08 23:25:05 +00:00
|
|
|
((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
|
|
|
|
VM_ALLOC_NOWAIT));
|
2013-02-26 23:35:27 +00:00
|
|
|
if (p != NULL) {
|
|
|
|
/*
|
|
|
|
* Since the page does not belong to an object, its
|
|
|
|
* listq is unused.
|
|
|
|
*/
|
|
|
|
TAILQ_INSERT_TAIL(&alloctail, p, listq);
|
|
|
|
npages--;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Page allocation failed, free intermediate pages and
|
|
|
|
* exit.
|
|
|
|
*/
|
|
|
|
TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
|
2019-06-07 18:23:29 +00:00
|
|
|
vm_page_unwire_noq(p);
|
2013-02-26 23:35:27 +00:00
|
|
|
vm_page_free(p);
|
|
|
|
}
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
*flags = UMA_SLAB_PRIV;
|
|
|
|
zkva = keg->uk_kva +
|
|
|
|
atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
|
|
|
|
retkva = zkva;
|
|
|
|
TAILQ_FOREACH(p, &alloctail, listq) {
|
2003-08-03 06:08:48 +00:00
|
|
|
pmap_qenter(zkva, &p, 1);
|
|
|
|
zkva += PAGE_SIZE;
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return ((void *)retkva);
|
|
|
|
}
|
|
|
|
|
2020-02-04 22:40:11 +00:00
|
|
|
/*
|
|
|
|
* Allocate physically contiguous pages.
|
|
|
|
*/
|
|
|
|
static void *
|
|
|
|
contig_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
|
|
|
|
int wait)
|
|
|
|
{
|
|
|
|
|
|
|
|
*pflag = UMA_SLAB_KERNEL;
|
|
|
|
return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain),
|
|
|
|
bytes, wait, 0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT));
|
|
|
|
}
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/*
|
|
|
|
* Frees a number of pages to the system
|
2004-01-30 16:26:29 +00:00
|
|
|
*
|
2002-03-19 09:11:49 +00:00
|
|
|
* Arguments:
|
|
|
|
* mem A pointer to the memory to be freed
|
|
|
|
* size The size of the memory being freed
|
|
|
|
* flags The original p->us_flags field
|
|
|
|
*
|
|
|
|
* Returns:
|
|
|
|
* Nothing
|
|
|
|
*/
|
|
|
|
static void
|
2015-04-01 12:42:26 +00:00
|
|
|
page_free(void *mem, vm_size_t size, uint8_t flags)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
2002-06-19 20:49:44 +00:00
|
|
|
|
2020-01-16 05:01:21 +00:00
|
|
|
if ((flags & UMA_SLAB_BOOT) != 0) {
|
|
|
|
startup_free(mem, size);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2020-02-04 22:40:11 +00:00
|
|
|
KASSERT((flags & UMA_SLAB_KERNEL) != 0,
|
|
|
|
("UMA: page_free used with invalid flags %x", flags));
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2018-08-25 19:38:08 +00:00
|
|
|
kmem_free((vm_offset_t)mem, size);
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
2018-07-06 02:06:03 +00:00
|
|
|
/*
|
|
|
|
* Frees pcpu zone allocations
|
|
|
|
*
|
|
|
|
* Arguments:
|
|
|
|
* mem A pointer to the memory to be freed
|
|
|
|
* size The size of the memory being freed
|
|
|
|
* flags The original p->us_flags field
|
|
|
|
*
|
|
|
|
* Returns:
|
|
|
|
* Nothing
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
pcpu_page_free(void *mem, vm_size_t size, uint8_t flags)
|
|
|
|
{
|
|
|
|
vm_offset_t sva, curva;
|
|
|
|
vm_paddr_t paddr;
|
|
|
|
vm_page_t m;
|
|
|
|
|
|
|
|
MPASS(size == (mp_maxid+1)*PAGE_SIZE);
|
2020-02-04 22:39:58 +00:00
|
|
|
|
|
|
|
if ((flags & UMA_SLAB_BOOT) != 0) {
|
|
|
|
startup_free(mem, size);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2018-07-06 02:06:03 +00:00
|
|
|
sva = (vm_offset_t)mem;
|
|
|
|
for (curva = sva; curva < sva + size; curva += PAGE_SIZE) {
|
|
|
|
paddr = pmap_kextract(curva);
|
|
|
|
m = PHYS_TO_VM_PAGE(paddr);
|
2019-06-07 18:23:29 +00:00
|
|
|
vm_page_unwire_noq(m);
|
2018-07-06 02:06:03 +00:00
|
|
|
vm_page_free(m);
|
|
|
|
}
|
|
|
|
pmap_qremove(sva, size >> PAGE_SHIFT);
|
|
|
|
kva_free(sva, size);
|
|
|
|
}
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/*
 * Zero fill initializer
 *
 * Clears the first 'size' bytes of 'mem'.  The flags argument is ignored
 * and the call always reports success.
 *
 * Arguments/Returns follow uma_init specifications
 */
static int
zero_init(void *mem, int size, int flags)
{

	(void)flags;
	bzero(mem, size);

	return (0);
}
|
|
|
|
|
2019-12-14 05:21:56 +00:00
|
|
|
#ifdef INVARIANTS
/*
 * Locate the INVARIANTS-only bitset of a slab.  It is addressed as the
 * memory immediately following the us_free bitset, i.e.
 * BITSET_SIZE(keg->uk_ipers) bytes past its start.
 */
static struct noslabbits *
slab_dbg_bits(uma_slab_t slab, uma_keg_t keg)
{
	char *free_bits;

	free_bits = (char *)&slab->us_free;
	return ((void *)(free_bits + BITSET_SIZE(keg->uk_ipers)));
}
#endif
|
|
|
|
|
2019-12-02 22:44:34 +00:00
|
|
|
/*
|
|
|
|
* Actual size of embedded struct slab (!OFFPAGE).
|
|
|
|
*/
|
2020-03-07 15:37:23 +00:00
|
|
|
static size_t
|
2019-12-02 22:44:34 +00:00
|
|
|
slab_sizeof(int nitems)
|
|
|
|
{
|
|
|
|
size_t s;
|
|
|
|
|
2019-12-14 05:21:56 +00:00
|
|
|
s = sizeof(struct uma_slab) + BITSET_SIZE(nitems) * SLAB_BITSETS;
|
2019-12-02 22:44:34 +00:00
|
|
|
return (roundup(s, UMA_ALIGN_PTR + 1));
|
|
|
|
}
|
|
|
|
|
2020-01-09 02:03:17 +00:00
|
|
|
/*
 * Fixed-point fractions with UMA_FIXPT_SHIFT fractional bits.
 *
 *	UMA_FRAC_FIXPT(n, d)	n / d scaled by 2^UMA_FIXPT_SHIFT; computed in
 *				64 bits and truncated to uint32_t.
 *	UMA_FIXPT_PCT(f)	A fixed-point value as an integer percentage,
 *				rounded down.
 *	UMA_PCT_FIXPT(pct)	An integer percentage as a fixed-point value.
 *	UMA_MIN_EFF		(100 - UMA_MAX_WASTE) percent in fixed point.
 */
#define	UMA_FIXPT_SHIFT	31
#define	UMA_FRAC_FIXPT(n, d)						\
	((uint32_t)(((uint64_t)(n) << UMA_FIXPT_SHIFT) / (d)))
#define	UMA_FIXPT_PCT(f)						\
	((u_int)(((uint64_t)100 * (f)) >> UMA_FIXPT_SHIFT))
#define	UMA_PCT_FIXPT(pct)	UMA_FRAC_FIXPT((pct), 100)
#define	UMA_MIN_EFF	UMA_PCT_FIXPT(100 - UMA_MAX_WASTE)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Compute the number of items that will fit in a slab. If hdr is true, the
|
|
|
|
* item count may be limited to provide space in the slab for an inline slab
|
|
|
|
* header. Otherwise, all slab space will be provided for item storage.
|
|
|
|
*/
|
|
|
|
static u_int
|
|
|
|
slab_ipers_hdr(u_int size, u_int rsize, u_int slabsize, bool hdr)
|
|
|
|
{
|
|
|
|
u_int ipers;
|
|
|
|
u_int padpi;
|
|
|
|
|
|
|
|
/* The padding between items is not needed after the last item. */
|
|
|
|
padpi = rsize - size;
|
|
|
|
|
|
|
|
if (hdr) {
|
|
|
|
/*
|
|
|
|
* Start with the maximum item count and remove items until
|
|
|
|
* the slab header first alongside the allocatable memory.
|
|
|
|
*/
|
|
|
|
for (ipers = MIN(SLAB_MAX_SETSIZE,
|
|
|
|
(slabsize + padpi - slab_sizeof(1)) / rsize);
|
|
|
|
ipers > 0 &&
|
|
|
|
ipers * rsize - padpi + slab_sizeof(ipers) > slabsize;
|
|
|
|
ipers--)
|
|
|
|
continue;
|
|
|
|
} else {
|
|
|
|
ipers = MIN((slabsize + padpi) / rsize, SLAB_MAX_SETSIZE);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (ipers);
|
|
|
|
}
|
|
|
|
|
2020-02-04 22:40:34 +00:00
|
|
|
/*
 * One candidate slab layout, as filled in by keg_layout_one().
 */
struct keg_layout_result {
	u_int format;	/* Format flags, with UMA_ZFLAG_INTERNAL cleared. */
	u_int slabsize;	/* Slab size; one page larger for INTERNAL. */
	u_int ipers;	/* Items per slab, from slab_ipers_hdr(). */
	u_int eff;	/* ipers * rsize over total bytes, fixed point. */
};
|
|
|
|
|
|
|
|
static void
|
|
|
|
keg_layout_one(uma_keg_t keg, u_int rsize, u_int slabsize, u_int fmt,
|
|
|
|
struct keg_layout_result *kl)
|
|
|
|
{
|
|
|
|
u_int total;
|
|
|
|
|
|
|
|
kl->format = fmt;
|
|
|
|
kl->slabsize = slabsize;
|
|
|
|
|
|
|
|
/* Handle INTERNAL as inline with an extra page. */
|
|
|
|
if ((fmt & UMA_ZFLAG_INTERNAL) != 0) {
|
|
|
|
kl->format &= ~UMA_ZFLAG_INTERNAL;
|
|
|
|
kl->slabsize += PAGE_SIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
kl->ipers = slab_ipers_hdr(keg->uk_size, rsize, kl->slabsize,
|
|
|
|
(fmt & UMA_ZFLAG_OFFPAGE) == 0);
|
|
|
|
|
|
|
|
/* Account for memory used by an offpage slab header. */
|
|
|
|
total = kl->slabsize;
|
|
|
|
if ((fmt & UMA_ZFLAG_OFFPAGE) != 0)
|
|
|
|
total += slabzone(kl->ipers)->uz_keg->uk_rsize;
|
|
|
|
|
|
|
|
kl->eff = UMA_FRAC_FIXPT(kl->ipers * rsize, total);
|
|
|
|
}
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/*
|
2020-01-09 02:03:17 +00:00
|
|
|
* Determine the format of a uma keg. This determines where the slab header
|
|
|
|
* will be placed (inline or offpage) and calculates ipers, rsize, and ppera.
|
2002-03-19 09:11:49 +00:00
|
|
|
*
|
|
|
|
* Arguments
|
2009-01-25 09:11:24 +00:00
|
|
|
* keg The zone we should initialize
|
2002-03-19 09:11:49 +00:00
|
|
|
*
|
|
|
|
* Returns
|
|
|
|
* Nothing
|
|
|
|
*/
|
|
|
|
static void
|
2020-01-09 02:03:17 +00:00
|
|
|
keg_layout(uma_keg_t keg)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
2020-02-04 22:40:34 +00:00
|
|
|
struct keg_layout_result kl = {}, kl_tmp;
|
|
|
|
u_int fmts[2];
|
2020-01-09 02:03:17 +00:00
|
|
|
u_int alignsize;
|
2020-02-04 22:40:34 +00:00
|
|
|
u_int nfmt;
|
2020-01-09 02:03:17 +00:00
|
|
|
u_int pages;
|
2004-07-29 15:25:40 +00:00
|
|
|
u_int rsize;
|
2017-03-11 16:35:36 +00:00
|
|
|
u_int slabsize;
|
2020-02-04 22:40:34 +00:00
|
|
|
u_int i, j;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2020-01-09 02:03:17 +00:00
|
|
|
KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
|
|
|
|
(keg->uk_size <= UMA_PCPU_ALLOC_SIZE &&
|
|
|
|
(keg->uk_flags & UMA_ZONE_CACHESPREAD) == 0),
|
|
|
|
("%s: cannot configure for PCPU: keg=%s, size=%u, flags=0x%b",
|
|
|
|
__func__, keg->uk_name, keg->uk_size, keg->uk_flags,
|
|
|
|
PRINT_UMA_ZFLAGS));
|
2020-02-06 08:32:25 +00:00
|
|
|
KASSERT((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) == 0 ||
|
2020-01-09 02:03:17 +00:00
|
|
|
(keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0,
|
|
|
|
("%s: incompatible flags 0x%b", __func__, keg->uk_flags,
|
|
|
|
PRINT_UMA_ZFLAGS));
|
2013-07-23 11:16:40 +00:00
|
|
|
|
2020-01-09 02:03:17 +00:00
|
|
|
alignsize = keg->uk_align + 1;
|
2021-07-09 20:38:21 -04:00
|
|
|
#ifdef KASAN
|
|
|
|
/*
|
|
|
|
* ASAN requires that each allocation be aligned to the shadow map
|
|
|
|
* scale factor.
|
|
|
|
*/
|
|
|
|
if (alignsize < KASAN_SHADOW_SCALE)
|
|
|
|
alignsize = KASAN_SHADOW_SCALE;
|
|
|
|
#endif
|
2013-04-08 19:10:45 +00:00
|
|
|
|
2013-06-13 21:05:38 +00:00
|
|
|
/*
|
|
|
|
* Calculate the size of each allocation (rsize) according to
|
|
|
|
* alignment. If the requested size is smaller than we have
|
|
|
|
* allocation bits for we round it up.
|
|
|
|
*/
|
2020-01-14 02:14:15 +00:00
|
|
|
rsize = MAX(keg->uk_size, UMA_SMALLEST_UNIT);
|
2020-01-09 02:03:17 +00:00
|
|
|
rsize = roundup2(rsize, alignsize);
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2020-02-04 22:40:34 +00:00
|
|
|
if ((keg->uk_flags & UMA_ZONE_CACHESPREAD) != 0) {
|
2020-01-09 02:03:17 +00:00
|
|
|
/*
|
|
|
|
* We want one item to start on every align boundary in a page.
|
|
|
|
* To do this we will span pages. We will also extend the item
|
|
|
|
* by the size of align if it is an even multiple of align.
|
|
|
|
* Otherwise, it would fall on the same boundary every time.
|
|
|
|
*/
|
|
|
|
if ((rsize & alignsize) == 0)
|
|
|
|
rsize += alignsize;
|
|
|
|
slabsize = rsize * (PAGE_SIZE / alignsize);
|
|
|
|
slabsize = MIN(slabsize, rsize * SLAB_MAX_SETSIZE);
|
|
|
|
slabsize = MIN(slabsize, UMA_CACHESPREAD_MAX_SIZE);
|
2020-02-04 22:40:34 +00:00
|
|
|
slabsize = round_page(slabsize);
|
2020-01-09 02:03:17 +00:00
|
|
|
} else {
|
|
|
|
/*
|
2020-02-04 22:40:34 +00:00
|
|
|
* Start with a slab size of as many pages as it takes to
|
|
|
|
* represent a single item. We will try to fit as many
|
|
|
|
* additional items into the slab as possible.
|
2020-01-09 02:03:17 +00:00
|
|
|
*/
|
2020-02-04 22:40:34 +00:00
|
|
|
slabsize = round_page(keg->uk_size);
|
Handle a special case when a slab can fit only one allocation,
and zone has a large alignment. With alignment taken into
account uk_rsize will be greater than space in a slab. However,
since we have only one item per slab, it is always naturally
aligned.
Code that will panic before this change with 4k page:
z = uma_zcreate("test", 3984, NULL, NULL, NULL, NULL, 31, 0);
uma_zalloc(z, M_WAITOK);
A practical scenario to hit the panic is a machine with 56 CPUs
and 2 NUMA domains, which yields in zone size of 3984.
PR: 227116
MFC after: 2 weeks
2018-04-02 05:11:59 +00:00
|
|
|
}
|
2013-04-08 19:10:45 +00:00
|
|
|
|
2020-02-04 22:40:34 +00:00
|
|
|
/* Build a list of all of the available formats for this keg. */
|
|
|
|
nfmt = 0;
|
|
|
|
|
2020-01-09 02:03:17 +00:00
|
|
|
/* Evaluate an inline slab layout. */
|
|
|
|
if ((keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0)
|
2020-02-04 22:40:34 +00:00
|
|
|
fmts[nfmt++] = 0;
|
2020-01-09 02:03:17 +00:00
|
|
|
|
|
|
|
/* TODO: vm_page-embedded slab. */
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2004-07-29 15:25:40 +00:00
|
|
|
/*
|
|
|
|
* We can't do OFFPAGE if we're internal or if we've been
|
|
|
|
* asked to not go to the VM for buckets. If we do this we
|
2020-02-06 08:32:25 +00:00
|
|
|
* may end up going to the VM for slabs which we do not want
|
|
|
|
* to do if we're UMA_ZONE_VM, which clearly forbids it.
|
|
|
|
* In those cases, evaluate a pseudo-format called INTERNAL
|
|
|
|
* which has an inline slab header and one extra page to
|
|
|
|
* guarantee that it fits.
|
2020-02-04 22:40:34 +00:00
|
|
|
*
|
|
|
|
* Otherwise, see if using an OFFPAGE slab will improve our
|
|
|
|
* efficiency.
|
2004-07-29 15:25:40 +00:00
|
|
|
*/
|
2020-02-06 08:32:25 +00:00
|
|
|
if ((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) != 0)
|
2020-02-04 22:40:34 +00:00
|
|
|
fmts[nfmt++] = UMA_ZFLAG_INTERNAL;
|
|
|
|
else
|
|
|
|
fmts[nfmt++] = UMA_ZFLAG_OFFPAGE;
|
2004-07-29 15:25:40 +00:00
|
|
|
|
2013-06-13 21:05:38 +00:00
|
|
|
/*
|
2020-02-04 22:40:34 +00:00
|
|
|
* Choose a slab size and format which satisfy the minimum efficiency.
|
|
|
|
* Prefer the smallest slab size that meets the constraints.
|
2013-06-13 21:05:38 +00:00
|
|
|
*
|
2020-02-04 22:40:34 +00:00
|
|
|
* Start with a minimum slab size, to accommodate CACHESPREAD. Then,
|
|
|
|
* for small items (up to PAGE_SIZE), the iteration increment is one
|
|
|
|
* page; and for large items, the increment is one item.
|
2013-06-13 21:05:38 +00:00
|
|
|
*/
|
2020-02-04 22:40:34 +00:00
|
|
|
i = (slabsize + rsize - keg->uk_size) / MAX(PAGE_SIZE, rsize);
|
|
|
|
KASSERT(i >= 1, ("keg %s(%p) flags=0x%b slabsize=%u, rsize=%u, i=%u",
|
|
|
|
keg->uk_name, keg, keg->uk_flags, PRINT_UMA_ZFLAGS, slabsize,
|
|
|
|
rsize, i));
|
|
|
|
for ( ; ; i++) {
|
|
|
|
slabsize = (rsize <= PAGE_SIZE) ? ptoa(i) :
|
|
|
|
round_page(rsize * (i - 1) + keg->uk_size);
|
|
|
|
|
|
|
|
for (j = 0; j < nfmt; j++) {
|
|
|
|
/* Only if we have no viable format yet. */
|
|
|
|
if ((fmts[j] & UMA_ZFLAG_INTERNAL) != 0 &&
|
|
|
|
kl.ipers > 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
keg_layout_one(keg, rsize, slabsize, fmts[j], &kl_tmp);
|
|
|
|
if (kl_tmp.eff <= kl.eff)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
kl = kl_tmp;
|
|
|
|
|
|
|
|
CTR6(KTR_UMA, "keg %s layout: format %#x "
|
|
|
|
"(ipers %u * rsize %u) / slabsize %#x = %u%% eff",
|
|
|
|
keg->uk_name, kl.format, kl.ipers, rsize,
|
|
|
|
kl.slabsize, UMA_FIXPT_PCT(kl.eff));
|
|
|
|
|
|
|
|
/* Stop when we reach the minimum efficiency. */
|
|
|
|
if (kl.eff >= UMA_MIN_EFF)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2020-02-04 22:40:45 +00:00
|
|
|
if (kl.eff >= UMA_MIN_EFF || !multipage_slabs ||
|
2020-02-04 22:40:34 +00:00
|
|
|
slabsize >= SLAB_MAX_SETSIZE * rsize ||
|
|
|
|
(keg->uk_flags & (UMA_ZONE_PCPU | UMA_ZONE_CONTIG)) != 0)
|
|
|
|
break;
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
2013-04-08 19:10:45 +00:00
|
|
|
|
2020-02-04 22:40:34 +00:00
|
|
|
pages = atop(kl.slabsize);
|
|
|
|
if ((keg->uk_flags & UMA_ZONE_PCPU) != 0)
|
|
|
|
pages *= mp_maxid + 1;
|
|
|
|
|
|
|
|
keg->uk_rsize = rsize;
|
|
|
|
keg->uk_ipers = kl.ipers;
|
|
|
|
keg->uk_ppera = pages;
|
|
|
|
keg->uk_flags |= kl.format;
|
|
|
|
|
2020-01-09 02:03:17 +00:00
|
|
|
/*
|
|
|
|
* How do we find the slab header if it is offpage or if not all item
|
|
|
|
* start addresses are in the same page? We could solve the latter
|
|
|
|
* case with vaddr alignment, but we don't.
|
|
|
|
*/
|
2020-02-04 22:40:34 +00:00
|
|
|
if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0 ||
|
|
|
|
(keg->uk_ipers - 1) * rsize >= PAGE_SIZE) {
|
2020-01-09 02:03:03 +00:00
|
|
|
if ((keg->uk_flags & UMA_ZONE_NOTPAGE) != 0)
|
2020-02-04 22:40:34 +00:00
|
|
|
keg->uk_flags |= UMA_ZFLAG_HASH;
|
2020-01-09 02:03:03 +00:00
|
|
|
else
|
2020-02-04 22:40:34 +00:00
|
|
|
keg->uk_flags |= UMA_ZFLAG_VTOSLAB;
|
2020-01-09 02:03:03 +00:00
|
|
|
}
|
2020-02-04 22:40:34 +00:00
|
|
|
|
2020-01-14 02:13:46 +00:00
|
|
|
CTR6(KTR_UMA, "%s: keg=%s, flags=%#x, rsize=%u, ipers=%u, ppera=%u",
|
2020-02-04 22:40:34 +00:00
|
|
|
__func__, keg->uk_name, keg->uk_flags, rsize, keg->uk_ipers,
|
|
|
|
pages);
|
2020-01-09 02:03:17 +00:00
|
|
|
KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_MAX_SETSIZE,
|
|
|
|
("%s: keg=%s, flags=0x%b, rsize=%u, ipers=%u, ppera=%u", __func__,
|
2020-02-04 22:40:34 +00:00
|
|
|
keg->uk_name, keg->uk_flags, PRINT_UMA_ZFLAGS, rsize,
|
|
|
|
keg->uk_ipers, pages));
|
2009-01-25 09:11:24 +00:00
|
|
|
}
|
|
|
|
|
2004-01-30 16:26:29 +00:00
|
|
|
/*
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
 * Keg header ctor.  This initializes all fields, locks, etc., and inserts
|
|
|
|
* the keg onto the global keg list.
|
2002-03-19 09:11:49 +00:00
|
|
|
*
|
|
|
|
* Arguments/Returns follow uma_ctor specifications
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
* udata Actually uma_kctor_args
|
2002-03-19 09:11:49 +00:00
|
|
|
*/
|
2004-08-02 00:18:36 +00:00
|
|
|
static int
|
|
|
|
keg_ctor(void *mem, int size, void *udata, int flags)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
struct uma_kctor_args *arg = udata;
|
|
|
|
uma_keg_t keg = mem;
|
|
|
|
uma_zone_t zone;
|
2020-01-04 03:30:08 +00:00
|
|
|
int i;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
bzero(keg, size);
|
|
|
|
keg->uk_size = arg->size;
|
|
|
|
keg->uk_init = arg->uminit;
|
|
|
|
keg->uk_fini = arg->fini;
|
|
|
|
keg->uk_align = arg->align;
|
2013-06-26 00:57:38 +00:00
|
|
|
keg->uk_reserve = 0;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
keg->uk_flags = arg->flags;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2018-10-24 16:41:47 +00:00
|
|
|
/*
|
|
|
|
* We use a global round-robin policy by default. Zones with
|
2020-01-04 18:48:13 +00:00
|
|
|
* UMA_ZONE_FIRSTTOUCH set will use first-touch instead, in which
|
|
|
|
* case the iterator is never run.
|
2018-10-24 16:41:47 +00:00
|
|
|
*/
|
|
|
|
keg->uk_dr.dr_policy = DOMAINSET_RR();
|
|
|
|
keg->uk_dr.dr_iter = 0;
|
|
|
|
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
/*
|
2020-06-20 20:21:04 +00:00
|
|
|
* The primary zone is passed to us at keg-creation time.
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
*/
|
|
|
|
zone = arg->zone;
|
2009-01-25 09:11:24 +00:00
|
|
|
keg->uk_name = zone->uz_name;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
if (arg->flags & UMA_ZONE_ZINIT)
|
|
|
|
keg->uk_init = zero_init;
|
2002-06-17 22:02:41 +00:00
|
|
|
|
2016-03-01 00:33:32 +00:00
|
|
|
if (arg->flags & UMA_ZONE_MALLOC)
|
2020-01-09 02:03:03 +00:00
|
|
|
keg->uk_flags |= UMA_ZFLAG_VTOSLAB;
|
2009-01-25 09:11:24 +00:00
|
|
|
|
2020-01-09 02:03:03 +00:00
|
|
|
#ifndef SMP
|
|
|
|
keg->uk_flags &= ~UMA_ZONE_PCPU;
|
2013-04-08 19:10:45 +00:00
|
|
|
#endif
|
|
|
|
|
2020-01-09 02:03:17 +00:00
|
|
|
keg_layout(keg);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
|
2020-01-04 03:30:08 +00:00
|
|
|
/*
|
2020-02-19 18:48:46 +00:00
|
|
|
* Use a first-touch NUMA policy for kegs that pmap_extract() will
|
|
|
|
* work on. Use round-robin for everything else.
|
2020-01-04 18:48:13 +00:00
|
|
|
*
|
|
|
|
* Zones may override the default by specifying either.
|
2020-01-04 03:30:08 +00:00
|
|
|
*/
|
2020-01-04 18:48:13 +00:00
|
|
|
#ifdef NUMA
|
|
|
|
if ((keg->uk_flags &
|
2020-02-19 18:48:46 +00:00
|
|
|
(UMA_ZONE_ROUNDROBIN | UMA_ZFLAG_CACHE | UMA_ZONE_NOTPAGE)) == 0)
|
2020-01-04 18:48:13 +00:00
|
|
|
keg->uk_flags |= UMA_ZONE_FIRSTTOUCH;
|
|
|
|
else if ((keg->uk_flags & UMA_ZONE_FIRSTTOUCH) == 0)
|
|
|
|
keg->uk_flags |= UMA_ZONE_ROUNDROBIN;
|
2020-01-04 03:30:08 +00:00
|
|
|
#endif
|
|
|
|
|
2003-09-21 07:39:16 +00:00
|
|
|
/*
|
|
|
|
* If we haven't booted yet we need allocations to go through the
|
|
|
|
* startup cache until the vm is ready.
|
|
|
|
*/
|
|
|
|
#ifdef UMA_MD_SMALL_ALLOC
|
2020-01-16 05:01:21 +00:00
|
|
|
if (keg->uk_ppera == 1)
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
keg->uk_allocf = uma_small_alloc;
|
2020-01-16 05:01:21 +00:00
|
|
|
else
|
2017-06-08 21:33:19 +00:00
|
|
|
#endif
|
2020-01-16 05:01:21 +00:00
|
|
|
if (booted < BOOT_KVA)
|
|
|
|
keg->uk_allocf = startup_alloc;
|
2018-07-06 02:06:03 +00:00
|
|
|
else if (keg->uk_flags & UMA_ZONE_PCPU)
|
|
|
|
keg->uk_allocf = pcpu_page_alloc;
|
2020-02-04 22:40:11 +00:00
|
|
|
else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 && keg->uk_ppera > 1)
|
|
|
|
keg->uk_allocf = contig_alloc;
|
2017-06-08 21:33:19 +00:00
|
|
|
else
|
|
|
|
keg->uk_allocf = page_alloc;
|
|
|
|
#ifdef UMA_MD_SMALL_ALLOC
|
|
|
|
if (keg->uk_ppera == 1)
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
keg->uk_freef = uma_small_free;
|
2017-06-08 21:33:19 +00:00
|
|
|
else
|
2011-05-22 17:46:16 +00:00
|
|
|
#endif
|
2018-07-06 02:06:03 +00:00
|
|
|
if (keg->uk_flags & UMA_ZONE_PCPU)
|
|
|
|
keg->uk_freef = pcpu_page_free;
|
|
|
|
else
|
2017-06-08 21:33:19 +00:00
|
|
|
keg->uk_freef = page_free;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
|
|
|
|
/*
|
2020-01-04 03:30:08 +00:00
|
|
|
* Initialize keg's locks.
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
*/
|
2020-01-04 03:30:08 +00:00
|
|
|
for (i = 0; i < vm_ndomains; i++)
|
|
|
|
KEG_LOCK_INIT(keg, i, (arg->flags & UMA_ZONE_MTXCLASS));
|
2002-04-29 23:45:41 +00:00
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/*
|
|
|
|
* If we're putting the slab header in the actual page we need to
|
2019-12-02 22:44:34 +00:00
|
|
|
* figure out where in each page it goes. See slab_sizeof
|
|
|
|
* definition.
|
2002-03-19 09:11:49 +00:00
|
|
|
*/
|
2020-01-09 02:03:03 +00:00
|
|
|
if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE)) {
|
2019-12-02 22:44:34 +00:00
|
|
|
size_t shsize;
|
|
|
|
|
|
|
|
shsize = slab_sizeof(keg->uk_ipers);
|
|
|
|
keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - shsize;
|
2004-07-29 15:25:40 +00:00
|
|
|
/*
|
|
|
|
* The only way the following is possible is if with our
|
|
|
|
* UMA_ALIGN_PTR adjustments we are now bigger than
|
|
|
|
* UMA_SLAB_SIZE. I haven't checked whether this is
|
|
|
|
* mathematically possible for all cases, so we make
|
|
|
|
* sure here anyway.
|
|
|
|
*/
|
2019-12-02 22:44:34 +00:00
|
|
|
KASSERT(keg->uk_pgoff + shsize <= PAGE_SIZE * keg->uk_ppera,
|
2018-11-28 19:17:27 +00:00
|
|
|
("zone %s ipers %d rsize %d size %d slab won't fit",
|
|
|
|
zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size));
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
2020-01-09 02:03:03 +00:00
|
|
|
if (keg->uk_flags & UMA_ZFLAG_HASH)
|
2019-06-06 23:57:28 +00:00
|
|
|
hash_alloc(&keg->uk_hash, 0);
|
2002-09-18 08:26:30 +00:00
|
|
|
|
2020-01-14 02:13:46 +00:00
|
|
|
CTR3(KTR_UMA, "keg_ctor %p zone %s(%p)", keg, zone->uz_name, zone);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
|
|
|
|
LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2014-10-05 21:34:56 +00:00
|
|
|
rw_wlock(&uma_rwlock);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
|
2014-10-05 21:34:56 +00:00
|
|
|
rw_wunlock(&uma_rwlock);
|
2004-08-02 00:18:36 +00:00
|
|
|
return (0);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
}
|
|
|
|
|
2020-01-16 05:01:21 +00:00
|
|
|
static void
|
|
|
|
zone_kva_available(uma_zone_t zone, void *unused)
|
|
|
|
{
|
|
|
|
uma_keg_t keg;
|
|
|
|
|
|
|
|
if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0)
|
|
|
|
return;
|
|
|
|
KEG_GET(zone, keg);
|
2020-02-04 22:40:11 +00:00
|
|
|
|
|
|
|
if (keg->uk_allocf == startup_alloc) {
|
|
|
|
/* Switch to the real allocator. */
|
|
|
|
if (keg->uk_flags & UMA_ZONE_PCPU)
|
|
|
|
keg->uk_allocf = pcpu_page_alloc;
|
|
|
|
else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 &&
|
|
|
|
keg->uk_ppera > 1)
|
|
|
|
keg->uk_allocf = contig_alloc;
|
|
|
|
else
|
|
|
|
keg->uk_allocf = page_alloc;
|
|
|
|
}
|
2020-01-16 05:01:21 +00:00
|
|
|
}
|
|
|
|
|
2019-01-15 18:24:34 +00:00
|
|
|
static void
|
2019-11-28 00:19:09 +00:00
|
|
|
zone_alloc_counters(uma_zone_t zone, void *unused)
|
2019-01-15 18:24:34 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
zone->uz_allocs = counter_u64_alloc(M_WAITOK);
|
|
|
|
zone->uz_frees = counter_u64_alloc(M_WAITOK);
|
|
|
|
zone->uz_fails = counter_u64_alloc(M_WAITOK);
|
2020-02-19 18:48:46 +00:00
|
|
|
zone->uz_xdomain = counter_u64_alloc(M_WAITOK);
|
2019-01-15 18:24:34 +00:00
|
|
|
}
|
|
|
|
|
2019-11-28 00:19:09 +00:00
|
|
|
static void
|
|
|
|
zone_alloc_sysctl(uma_zone_t zone, void *unused)
|
|
|
|
{
|
|
|
|
uma_zone_domain_t zdom;
|
2020-01-04 03:30:08 +00:00
|
|
|
uma_domain_t dom;
|
2019-11-28 00:19:09 +00:00
|
|
|
uma_keg_t keg;
|
|
|
|
struct sysctl_oid *oid, *domainoid;
|
2019-12-08 01:55:23 +00:00
|
|
|
int domains, i, cnt;
|
2019-11-28 00:19:09 +00:00
|
|
|
static const char *nokeg = "cache zone";
|
|
|
|
char *c;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make a sysctl safe copy of the zone name by removing
|
|
|
|
* any special characters and handling dups by appending
|
|
|
|
* an index.
|
|
|
|
*/
|
|
|
|
if (zone->uz_namecnt != 0) {
|
2019-12-08 01:55:23 +00:00
|
|
|
/* Count the number of decimal digits and '_' separator. */
|
|
|
|
for (i = 1, cnt = zone->uz_namecnt; cnt != 0; i++)
|
|
|
|
cnt /= 10;
|
|
|
|
zone->uz_ctlname = malloc(strlen(zone->uz_name) + i + 1,
|
|
|
|
M_UMA, M_WAITOK);
|
2019-11-28 00:19:09 +00:00
|
|
|
sprintf(zone->uz_ctlname, "%s_%d", zone->uz_name,
|
|
|
|
zone->uz_namecnt);
|
|
|
|
} else
|
|
|
|
zone->uz_ctlname = strdup(zone->uz_name, M_UMA);
|
|
|
|
for (c = zone->uz_ctlname; *c != '\0'; c++)
|
|
|
|
if (strchr("./\\ -", *c) != NULL)
|
|
|
|
*c = '_';
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Basic parameters at the root.
|
|
|
|
*/
|
|
|
|
zone->uz_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm_uma),
|
2020-02-26 14:26:36 +00:00
|
|
|
OID_AUTO, zone->uz_ctlname, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
|
2019-11-28 00:19:09 +00:00
|
|
|
oid = zone->uz_oid;
|
|
|
|
SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"size", CTLFLAG_RD, &zone->uz_size, 0, "Allocation size");
|
2019-12-11 06:50:55 +00:00
|
|
|
SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"flags", CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_MPSAFE,
|
|
|
|
zone, 0, sysctl_handle_uma_zone_flags, "A",
|
2019-11-28 00:19:09 +00:00
|
|
|
"Allocator configuration flags");
|
|
|
|
SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"bucket_size", CTLFLAG_RD, &zone->uz_bucket_size, 0,
|
|
|
|
"Desired per-cpu cache size");
|
|
|
|
SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"bucket_size_max", CTLFLAG_RD, &zone->uz_bucket_size_max, 0,
|
|
|
|
"Maximum allowed per-cpu cache size");
|
|
|
|
|
|
|
|
/*
|
|
|
|
* keg if present.
|
|
|
|
*/
|
2020-01-09 02:03:03 +00:00
|
|
|
if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0)
|
2020-01-04 03:30:08 +00:00
|
|
|
domains = vm_ndomains;
|
|
|
|
else
|
|
|
|
domains = 1;
|
2019-11-28 00:19:09 +00:00
|
|
|
oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
|
2020-02-26 14:26:36 +00:00
|
|
|
"keg", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
|
2019-11-28 00:19:09 +00:00
|
|
|
keg = zone->uz_keg;
|
2019-12-08 01:55:23 +00:00
|
|
|
if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) {
|
2019-11-28 00:19:09 +00:00
|
|
|
SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"name", CTLFLAG_RD, keg->uk_name, "Keg name");
|
|
|
|
SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"rsize", CTLFLAG_RD, &keg->uk_rsize, 0,
|
|
|
|
"Real object size with alignment");
|
|
|
|
SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"ppera", CTLFLAG_RD, &keg->uk_ppera, 0,
|
|
|
|
"pages per-slab allocation");
|
|
|
|
SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"ipers", CTLFLAG_RD, &keg->uk_ipers, 0,
|
|
|
|
"items available per-slab");
|
|
|
|
SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"align", CTLFLAG_RD, &keg->uk_align, 0,
|
|
|
|
"item alignment mask");
|
2020-10-19 16:57:40 +00:00
|
|
|
SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"reserve", CTLFLAG_RD, &keg->uk_reserve, 0,
|
|
|
|
"number of reserved items");
|
2019-12-13 09:32:09 +00:00
|
|
|
SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"efficiency", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE,
|
|
|
|
keg, 0, sysctl_handle_uma_slab_efficiency, "I",
|
|
|
|
"Slab utilization (100 - internal fragmentation %)");
|
2020-01-04 03:30:08 +00:00
|
|
|
domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(oid),
|
2020-02-26 14:26:36 +00:00
|
|
|
OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
|
2020-01-04 03:30:08 +00:00
|
|
|
for (i = 0; i < domains; i++) {
|
|
|
|
dom = &keg->uk_domain[i];
|
|
|
|
oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid),
|
2020-02-26 14:26:36 +00:00
|
|
|
OID_AUTO, VM_DOMAIN(i)->vmd_name,
|
|
|
|
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
|
2020-01-04 03:30:08 +00:00
|
|
|
SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"pages", CTLFLAG_RD, &dom->ud_pages, 0,
|
|
|
|
"Total pages currently allocated from VM");
|
|
|
|
SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
2020-02-11 20:06:33 +00:00
|
|
|
"free_items", CTLFLAG_RD, &dom->ud_free_items, 0,
|
2021-09-17 12:13:47 -04:00
|
|
|
"Items free in the slab layer");
|
|
|
|
SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"free_slabs", CTLFLAG_RD, &dom->ud_free_slabs, 0,
|
|
|
|
"Unused slabs");
|
2020-01-04 03:30:08 +00:00
|
|
|
}
|
2019-11-28 00:19:09 +00:00
|
|
|
} else
|
|
|
|
SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"name", CTLFLAG_RD, nokeg, "Keg name");
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Information about zone limits.
|
|
|
|
*/
|
|
|
|
oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
|
2020-02-26 14:26:36 +00:00
|
|
|
"limit", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
|
2020-01-04 03:04:46 +00:00
|
|
|
SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"items", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
|
|
|
|
zone, 0, sysctl_handle_uma_zone_items, "QU",
|
2020-12-06 22:45:50 +00:00
|
|
|
"Current number of allocated items if limit is set");
|
2019-11-28 00:19:09 +00:00
|
|
|
SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"max_items", CTLFLAG_RD, &zone->uz_max_items, 0,
|
2020-12-06 22:45:50 +00:00
|
|
|
"Maximum number of allocated and cached items");
|
2019-11-28 00:19:09 +00:00
|
|
|
SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"sleepers", CTLFLAG_RD, &zone->uz_sleepers, 0,
|
|
|
|
"Number of threads sleeping at limit");
|
|
|
|
SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"sleeps", CTLFLAG_RD, &zone->uz_sleeps, 0,
|
|
|
|
"Total zone limit sleeps");
|
2020-01-04 03:04:46 +00:00
|
|
|
SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
2020-02-19 18:48:46 +00:00
|
|
|
"bucket_max", CTLFLAG_RD, &zone->uz_bucket_max, 0,
|
|
|
|
"Maximum number of items in each domain's bucket cache");
|
2019-11-28 00:19:09 +00:00
|
|
|
|
|
|
|
/*
|
2020-01-04 03:30:08 +00:00
|
|
|
* Per-domain zone information.
|
2019-11-28 00:19:09 +00:00
|
|
|
*/
|
|
|
|
domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid),
|
2020-02-26 14:26:36 +00:00
|
|
|
OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
|
2019-11-28 00:19:09 +00:00
|
|
|
for (i = 0; i < domains; i++) {
|
2020-02-19 18:48:46 +00:00
|
|
|
zdom = ZDOM_GET(zone, i);
|
2019-11-28 00:19:09 +00:00
|
|
|
oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid),
|
2020-02-26 14:26:36 +00:00
|
|
|
OID_AUTO, VM_DOMAIN(i)->vmd_name,
|
|
|
|
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
|
2019-11-28 00:19:09 +00:00
|
|
|
SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"nitems", CTLFLAG_RD, &zdom->uzd_nitems,
|
|
|
|
"number of items in this domain");
|
|
|
|
SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"imax", CTLFLAG_RD, &zdom->uzd_imax,
|
|
|
|
"maximum item count in this period");
|
|
|
|
SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"imin", CTLFLAG_RD, &zdom->uzd_imin,
|
|
|
|
"minimum item count in this period");
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"bimin", CTLFLAG_RD, &zdom->uzd_bimin,
|
|
|
|
"Minimum item count in this batch");
|
2019-11-28 00:19:09 +00:00
|
|
|
SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"wss", CTLFLAG_RD, &zdom->uzd_wss,
|
|
|
|
"Working set size");
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"limin", CTLFLAG_RD, &zdom->uzd_limin,
|
|
|
|
"Long time minimum item count");
|
|
|
|
SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"timin", CTLFLAG_RD, &zdom->uzd_timin, 0,
|
|
|
|
"Time since zero long time minimum item count");
|
2019-11-28 00:19:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* General statistics.
|
|
|
|
*/
|
|
|
|
oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
|
2020-02-26 14:26:36 +00:00
|
|
|
"stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
|
2019-11-28 00:19:09 +00:00
|
|
|
SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"current", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE,
|
|
|
|
zone, 1, sysctl_handle_uma_zone_cur, "I",
|
|
|
|
"Current number of allocated items");
|
|
|
|
SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"allocs", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
|
|
|
|
zone, 0, sysctl_handle_uma_zone_allocs, "QU",
|
|
|
|
"Total allocation calls");
|
|
|
|
SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"frees", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
|
|
|
|
zone, 0, sysctl_handle_uma_zone_frees, "QU",
|
|
|
|
"Total free calls");
|
|
|
|
SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"fails", CTLFLAG_RD, &zone->uz_fails,
|
|
|
|
"Number of allocation failures");
|
2020-02-19 18:48:46 +00:00
|
|
|
SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
|
|
|
|
"xdomain", CTLFLAG_RD, &zone->uz_xdomain,
|
2019-11-28 00:19:09 +00:00
|
|
|
"Free calls from the wrong domain");
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Accumulator handed to zone_count() when looking for zones that share
 * a name.
 */
struct uma_zone_count {
	const char	*name;	/* Zone name being searched for. */
	int		count;	/* Running result, updated by zone_count(). */
};
|
|
|
|
|
|
|
|
static void
|
|
|
|
zone_count(uma_zone_t zone, void *arg)
|
|
|
|
{
|
|
|
|
struct uma_zone_count *cnt;
|
|
|
|
|
|
|
|
cnt = arg;
|
2019-12-08 01:55:23 +00:00
|
|
|
/*
|
|
|
|
* Some zones are rapidly created with identical names and
|
|
|
|
* destroyed out of order. This can lead to gaps in the count.
|
|
|
|
* Use one greater than the maximum observed for this name.
|
|
|
|
*/
|
2019-11-28 00:19:09 +00:00
|
|
|
if (strcmp(zone->uz_name, cnt->name) == 0)
|
2019-12-08 01:55:23 +00:00
|
|
|
cnt->count = MAX(cnt->count,
|
|
|
|
zone->uz_namecnt + 1);
|
2019-11-28 00:19:09 +00:00
|
|
|
}
|
|
|
|
|
2019-12-25 20:57:24 +00:00
|
|
|
static void
|
|
|
|
zone_update_caches(uma_zone_t zone)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i <= mp_maxid; i++) {
|
|
|
|
cache_set_uz_size(&zone->uz_cpu[i], zone->uz_size);
|
|
|
|
cache_set_uz_flags(&zone->uz_cpu[i], zone->uz_flags);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
/*
|
|
|
|
* Zone header ctor. This initializes all fields, locks, etc.
|
|
|
|
*
|
|
|
|
* Arguments/Returns follow uma_ctor specifications
|
|
|
|
* udata Actually uma_zctor_args
|
|
|
|
*/
|
2004-08-02 00:18:36 +00:00
|
|
|
static int
|
|
|
|
zone_ctor(void *mem, int size, void *udata, int flags)
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
{
|
2019-11-28 00:19:09 +00:00
|
|
|
struct uma_zone_count cnt;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
struct uma_zctor_args *arg = udata;
|
2020-02-19 18:48:46 +00:00
|
|
|
uma_zone_domain_t zdom;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
uma_zone_t zone = mem;
|
|
|
|
uma_zone_t z;
|
|
|
|
uma_keg_t keg;
|
2019-09-01 22:22:43 +00:00
|
|
|
int i;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
|
|
|
|
bzero(zone, size);
|
|
|
|
zone->uz_name = arg->name;
|
|
|
|
zone->uz_ctor = arg->ctor;
|
|
|
|
zone->uz_dtor = arg->dtor;
|
|
|
|
zone->uz_init = NULL;
|
|
|
|
zone->uz_fini = NULL;
|
2010-06-15 19:28:37 +00:00
|
|
|
zone->uz_sleeps = 0;
|
2019-11-28 00:19:09 +00:00
|
|
|
zone->uz_bucket_size = 0;
|
|
|
|
zone->uz_bucket_size_min = 0;
|
|
|
|
zone->uz_bucket_size_max = BUCKET_MAX;
|
2020-01-31 00:49:51 +00:00
|
|
|
zone->uz_flags = (arg->flags & UMA_ZONE_SMR);
|
2012-12-07 22:27:13 +00:00
|
|
|
zone->uz_warning = NULL;
|
2018-01-12 23:25:05 +00:00
|
|
|
/* The domain structures follow the cpu structures. */
|
2020-02-19 18:48:46 +00:00
|
|
|
zone->uz_bucket_max = ULONG_MAX;
|
2012-12-07 22:27:13 +00:00
|
|
|
timevalclear(&zone->uz_ratecheck);
|
2013-06-20 19:08:12 +00:00
|
|
|
|
2019-11-28 00:19:09 +00:00
|
|
|
/* Count the number of duplicate names. */
|
|
|
|
cnt.name = arg->name;
|
|
|
|
cnt.count = 0;
|
|
|
|
zone_foreach(zone_count, &cnt);
|
|
|
|
zone->uz_namecnt = cnt.count;
|
2020-01-04 07:56:28 +00:00
|
|
|
ZONE_CROSS_LOCK_INIT(zone);
|
2019-01-15 18:24:34 +00:00
|
|
|
|
2020-02-19 18:48:46 +00:00
|
|
|
for (i = 0; i < vm_ndomains; i++) {
|
|
|
|
zdom = ZDOM_GET(zone, i);
|
|
|
|
ZDOM_LOCK_INIT(zone, zdom, (arg->flags & UMA_ZONE_MTXCLASS));
|
|
|
|
STAILQ_INIT(&zdom->uzd_buckets);
|
|
|
|
}
|
2019-09-01 22:22:43 +00:00
|
|
|
|
2021-08-10 17:15:03 -04:00
|
|
|
#if defined(INVARIANTS) && !defined(KASAN) && !defined(KMSAN)
|
2019-11-27 19:49:55 +00:00
|
|
|
if (arg->uminit == trash_init && arg->fini == trash_fini)
|
2019-12-25 20:57:24 +00:00
|
|
|
zone->uz_flags |= UMA_ZFLAG_TRASH | UMA_ZFLAG_CTORDTOR;
|
2021-04-13 17:39:50 -04:00
|
|
|
#elif defined(KASAN)
|
|
|
|
if ((arg->flags & (UMA_ZONE_NOFREE | UMA_ZFLAG_CACHE)) != 0)
|
|
|
|
arg->flags |= UMA_ZONE_NOKASAN;
|
2019-11-27 19:49:55 +00:00
|
|
|
#endif
|
|
|
|
|
2013-06-17 03:43:47 +00:00
|
|
|
/*
|
|
|
|
* This is a pure cache zone, no kegs.
|
|
|
|
*/
|
|
|
|
if (arg->import) {
|
2020-01-04 03:15:34 +00:00
|
|
|
KASSERT((arg->flags & UMA_ZFLAG_CACHE) != 0,
|
|
|
|
("zone_ctor: Import specified for non-cache zone."));
|
2013-06-26 00:57:38 +00:00
|
|
|
zone->uz_flags = arg->flags;
|
2013-06-20 19:08:12 +00:00
|
|
|
zone->uz_size = arg->size;
|
2013-06-17 03:43:47 +00:00
|
|
|
zone->uz_import = arg->import;
|
|
|
|
zone->uz_release = arg->release;
|
|
|
|
zone->uz_arg = arg->arg;
|
2020-02-19 18:48:46 +00:00
|
|
|
#ifdef NUMA
|
|
|
|
/*
|
|
|
|
* Cache zones are round-robin unless a policy is
|
|
|
|
* specified because they may have incompatible
|
|
|
|
* constraints.
|
|
|
|
*/
|
|
|
|
if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) == 0)
|
|
|
|
zone->uz_flags |= UMA_ZONE_ROUNDROBIN;
|
|
|
|
#endif
|
2014-10-05 21:34:56 +00:00
|
|
|
rw_wlock(&uma_rwlock);
|
2013-11-28 19:20:49 +00:00
|
|
|
LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
|
2014-10-05 21:34:56 +00:00
|
|
|
rw_wunlock(&uma_rwlock);
|
2013-06-20 19:08:12 +00:00
|
|
|
goto out;
|
2013-06-17 03:43:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Use the regular zone/keg/slab allocator.
|
|
|
|
*/
|
2019-12-04 18:40:05 +00:00
|
|
|
zone->uz_import = zone_import;
|
|
|
|
zone->uz_release = zone_release;
|
2013-06-17 03:43:47 +00:00
|
|
|
zone->uz_arg = zone;
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and has never
had any practical use. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account for the number of sleepers and wake
them one by one, to avoid the thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
keg = arg->keg;
|
2013-06-17 03:43:47 +00:00
|
|
|
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues; it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
if (arg->flags & UMA_ZONE_SECONDARY) {
|
2019-11-28 00:19:09 +00:00
|
|
|
KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
|
|
|
|
("Secondary zone requested UMA_ZFLAG_INTERNAL"));
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues; it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
|
|
|
|
zone->uz_init = arg->uminit;
|
|
|
|
zone->uz_fini = arg->fini;
|
2009-01-25 09:11:24 +00:00
|
|
|
zone->uz_flags |= UMA_ZONE_SECONDARY;
|
2014-10-05 21:34:56 +00:00
|
|
|
rw_wlock(&uma_rwlock);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues; it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
ZONE_LOCK(zone);
|
|
|
|
LIST_FOREACH(z, &keg->uk_zones, uz_link) {
|
|
|
|
if (LIST_NEXT(z, uz_link) == NULL) {
|
|
|
|
LIST_INSERT_AFTER(z, zone, uz_link);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ZONE_UNLOCK(zone);
|
2014-10-05 21:34:56 +00:00
|
|
|
rw_wunlock(&uma_rwlock);
|
2009-01-25 09:11:24 +00:00
|
|
|
} else if (keg == NULL) {
|
|
|
|
if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
|
|
|
|
arg->align, arg->flags)) == NULL)
|
2004-08-02 00:18:36 +00:00
|
|
|
return (ENOMEM);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues; it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
} else {
|
|
|
|
struct uma_kctor_args karg;
|
2004-08-02 00:18:36 +00:00
|
|
|
int error;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues; it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
|
|
|
|
/* We should only be here from uma_startup() */
|
|
|
|
karg.size = arg->size;
|
|
|
|
karg.uminit = arg->uminit;
|
|
|
|
karg.fini = arg->fini;
|
|
|
|
karg.align = arg->align;
|
2020-01-31 00:49:51 +00:00
|
|
|
karg.flags = (arg->flags & ~UMA_ZONE_SMR);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues; it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
karg.zone = zone;
|
2004-08-02 00:18:36 +00:00
|
|
|
error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
|
|
|
|
flags);
|
|
|
|
if (error)
|
|
|
|
return (error);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues; it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
}
|
2013-06-17 03:43:47 +00:00
|
|
|
|
2019-11-28 00:19:09 +00:00
|
|
|
/* Inherit properties from the keg. */
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and has never
had any practical use. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account for the number of sleepers and wake
them one by one, to avoid the thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
zone->uz_keg = keg;
|
2009-01-25 09:11:24 +00:00
|
|
|
zone->uz_size = keg->uk_size;
|
|
|
|
zone->uz_flags |= (keg->uk_flags &
|
|
|
|
(UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2019-11-28 00:19:09 +00:00
|
|
|
out:
|
2020-05-14 16:06:54 +00:00
|
|
|
if (booted >= BOOT_PCPU) {
|
2019-11-28 00:19:09 +00:00
|
|
|
zone_alloc_counters(zone, NULL);
|
2020-05-14 16:06:54 +00:00
|
|
|
if (booted >= BOOT_RUNNING)
|
|
|
|
zone_alloc_sysctl(zone, NULL);
|
2019-11-28 00:19:09 +00:00
|
|
|
} else {
|
|
|
|
zone->uz_allocs = EARLY_COUNTER;
|
|
|
|
zone->uz_frees = EARLY_COUNTER;
|
|
|
|
zone->uz_fails = EARLY_COUNTER;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
}
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2020-01-31 00:49:51 +00:00
|
|
|
/* Caller requests a private SMR context. */
|
|
|
|
if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
|
2020-02-22 03:44:10 +00:00
|
|
|
zone->uz_smr = smr_create(zone->uz_name, 0, 0);
|
2020-01-31 00:49:51 +00:00
|
|
|
|
2018-04-24 20:05:45 +00:00
|
|
|
KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
|
|
|
|
(UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
|
|
|
|
("Invalid zone flag combination"));
|
2019-11-28 00:19:09 +00:00
|
|
|
if (arg->flags & UMA_ZFLAG_INTERNAL)
|
|
|
|
zone->uz_bucket_size_max = zone->uz_bucket_size = 0;
|
|
|
|
if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0)
|
|
|
|
zone->uz_bucket_size = BUCKET_MAX;
|
|
|
|
else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0)
|
|
|
|
zone->uz_bucket_size = 0;
|
2018-04-24 20:05:45 +00:00
|
|
|
else
|
2019-11-28 00:19:09 +00:00
|
|
|
zone->uz_bucket_size = bucket_select(zone->uz_size);
|
|
|
|
zone->uz_bucket_size_min = zone->uz_bucket_size;
|
2019-12-25 20:57:24 +00:00
|
|
|
if (zone->uz_dtor != NULL || zone->uz_ctor != NULL)
|
|
|
|
zone->uz_flags |= UMA_ZFLAG_CTORDTOR;
|
|
|
|
zone_update_caches(zone);
|
2013-06-18 04:50:20 +00:00
|
|
|
|
2004-08-02 00:18:36 +00:00
|
|
|
return (0);
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
2004-01-30 16:26:29 +00:00
|
|
|
/*
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
* Keg header dtor. This frees all data, destroys locks, frees the hash
|
|
|
|
* table and removes the keg from the global list.
|
2002-04-08 04:48:58 +00:00
|
|
|
*
|
|
|
|
* Arguments/Returns follow uma_dtor specifications
|
|
|
|
* udata unused
|
|
|
|
*/
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
static void
|
|
|
|
keg_dtor(void *arg, int size, void *udata)
|
|
|
|
{
|
|
|
|
uma_keg_t keg;
|
2020-01-04 03:30:08 +00:00
|
|
|
uint32_t free, pages;
|
|
|
|
int i;
|
2002-04-08 04:48:58 +00:00
|
|
|
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
keg = (uma_keg_t)arg;
|
2020-01-04 03:30:08 +00:00
|
|
|
free = pages = 0;
|
|
|
|
for (i = 0; i < vm_ndomains; i++) {
|
2020-02-11 20:06:33 +00:00
|
|
|
free += keg->uk_domain[i].ud_free_items;
|
2020-01-04 03:30:08 +00:00
|
|
|
pages += keg->uk_domain[i].ud_pages;
|
|
|
|
KEG_LOCK_FINI(keg, i);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
}
|
2020-01-23 04:56:34 +00:00
|
|
|
if (pages != 0)
|
2020-01-04 03:30:08 +00:00
|
|
|
printf("Freed UMA keg (%s) was not empty (%u items). "
|
|
|
|
" Lost %u pages of memory.\n",
|
|
|
|
keg->uk_name ? keg->uk_name : "",
|
2020-01-23 04:56:34 +00:00
|
|
|
pages / keg->uk_ppera * keg->uk_ipers - free, pages);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
|
2009-01-25 09:11:24 +00:00
|
|
|
hash_free(&keg->uk_hash);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Zone header dtor.
|
|
|
|
*
|
|
|
|
* Arguments/Returns follow uma_dtor specifications
|
|
|
|
* udata unused
|
|
|
|
*/
|
2002-04-08 04:48:58 +00:00
|
|
|
static void
|
|
|
|
zone_dtor(void *arg, int size, void *udata)
|
|
|
|
{
|
|
|
|
uma_zone_t zone;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
uma_keg_t keg;
|
2020-02-19 18:48:46 +00:00
|
|
|
int i;
|
2002-04-08 04:48:58 +00:00
|
|
|
|
|
|
|
zone = (uma_zone_t)arg;
|
2003-09-19 23:27:46 +00:00
|
|
|
|
2019-11-28 00:19:09 +00:00
|
|
|
sysctl_remove_oid(zone->uz_oid, 1, 1);
|
|
|
|
|
2009-01-25 09:11:24 +00:00
|
|
|
if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
|
2003-09-19 23:27:46 +00:00
|
|
|
cache_drain(zone);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
|
2014-10-05 21:34:56 +00:00
|
|
|
rw_wlock(&uma_rwlock);
|
2009-01-25 09:11:24 +00:00
|
|
|
LIST_REMOVE(zone, uz_link);
|
2014-10-05 21:34:56 +00:00
|
|
|
rw_wunlock(&uma_rwlock);
|
2020-11-10 18:12:09 +00:00
|
|
|
if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) {
|
|
|
|
keg = zone->uz_keg;
|
|
|
|
keg->uk_reserve = 0;
|
|
|
|
}
|
2021-04-14 12:57:24 -04:00
|
|
|
zone_reclaim(zone, UMA_ANYDOMAIN, M_WAITOK, true);
|
2020-02-19 18:48:46 +00:00
|
|
|
|
2009-01-25 09:11:24 +00:00
|
|
|
/*
|
2019-04-12 12:46:25 +00:00
|
|
|
* We only destroy kegs from non secondary/non cache zones.
|
2009-01-25 09:11:24 +00:00
|
|
|
*/
|
2019-04-12 12:46:25 +00:00
|
|
|
if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) {
|
|
|
|
keg = zone->uz_keg;
|
2014-10-05 21:34:56 +00:00
|
|
|
rw_wlock(&uma_rwlock);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
LIST_REMOVE(keg, uk_link);
|
2014-10-05 21:34:56 +00:00
|
|
|
rw_wunlock(&uma_rwlock);
|
2013-06-17 03:43:47 +00:00
|
|
|
zone_free_item(kegs, keg, NULL, SKIP_NONE);
|
2003-11-30 08:04:01 +00:00
|
|
|
}
|
2019-01-15 18:24:34 +00:00
|
|
|
counter_u64_free(zone->uz_allocs);
|
|
|
|
counter_u64_free(zone->uz_frees);
|
|
|
|
counter_u64_free(zone->uz_fails);
|
2020-02-19 18:48:46 +00:00
|
|
|
counter_u64_free(zone->uz_xdomain);
|
2019-11-28 00:19:09 +00:00
|
|
|
free(zone->uz_ctlname, M_UMA);
|
2020-02-19 18:48:46 +00:00
|
|
|
for (i = 0; i < vm_ndomains; i++)
|
|
|
|
ZDOM_LOCK_FINI(ZDOM_GET(zone, i));
|
2020-01-04 07:56:28 +00:00
|
|
|
ZONE_CROSS_LOCK_FINI(zone);
|
2002-04-08 04:48:58 +00:00
|
|
|
}
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
|
2004-01-30 16:26:29 +00:00
|
|
|
static void
|
2020-01-16 05:01:21 +00:00
|
|
|
zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *arg), void *arg)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
uma_keg_t keg;
|
2002-03-19 09:11:49 +00:00
|
|
|
uma_zone_t zone;
|
|
|
|
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
LIST_FOREACH(keg, &uma_kegs, uk_link) {
|
|
|
|
LIST_FOREACH(zone, &keg->uk_zones, uz_link)
|
2019-11-28 00:19:09 +00:00
|
|
|
zfunc(zone, arg);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
}
|
2019-11-10 09:25:19 +00:00
|
|
|
LIST_FOREACH(zone, &uma_cachezones, uz_link)
|
2019-11-28 00:19:09 +00:00
|
|
|
zfunc(zone, arg);
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
Followup on r302393 by cperciva, improving calculation of boot pages required
for UMA startup.
o Introduce another stage of UMA startup, which is entered after
vm_page_startup() finishes. After this stage we don't yet enable buckets,
but we can ask VM for pages. Rename stages to meaningful names while here.
New list of stages: BOOT_COLD, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
BOOT_RUNNING.
Enabling page alloc earlier allows us to dramatically reduce the number of
boot pages required. More importantly, the number of zones becomes
consistent across different machines, as no MD allocations are done before
the BOOT_PAGEALLOC stage. Now only UMA internal zones actually need to use
startup_alloc(), however that may change, so vm_page_startup() provides
its need for early zones as argument.
o Introduce uma_startup_count() function, to avoid code duplication. The
function calculates the sizes of the zones zone and the kegs zone, and
calculates how many pages UMA will need to bootstrap.
It counts not only zone structures, but also kegs, slabs and hashes.
o Hide uma_startup_foo() declarations from public file.
o Provide several DIAGNOSTIC printfs on boot_pages usage.
o Bugfix: when calculating zone of zones size use (mp_maxid + 1) instead of
mp_ncpus. Use resulting number not only in the size argument to zone_ctor()
but also as args.size.
Reviewed by: imp, gallatin (earlier version)
Differential Revision: https://reviews.freebsd.org/D14054
2018-02-06 04:16:00 +00:00
|
|
|
/*
|
2020-01-16 05:01:21 +00:00
|
|
|
* Traverses every zone in the system and calls a callback
|
|
|
|
*
|
|
|
|
* Arguments:
|
|
|
|
* zfunc A pointer to a function which accepts a zone
|
|
|
|
* as an argument.
|
|
|
|
*
|
|
|
|
* Returns:
|
|
|
|
* Nothing
|
Followup on r302393 by cperciva, improving calculation of boot pages required
for UMA startup.
o Introduce another stage of UMA startup, which is entered after
vm_page_startup() finishes. After this stage we don't yet enable buckets,
but we can ask VM for pages. Rename stages to meaningful names while here.
New list of stages: BOOT_COLD, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
BOOT_RUNNING.
Enabling page alloc earlier allows us to dramatically reduce number of
boot pages required. What is more important number of zones becomes
consistent across different machines, as no MD allocations are done before
the BOOT_PAGEALLOC stage. Now only UMA internal zones actually need to use
startup_alloc(), however that may change, so vm_page_startup() provides
its need for early zones as argument.
o Introduce uma_startup_count() function, to avoid code duplication. The
functions calculates sizes of zones zone and kegs zone, and calculates how
many pages UMA will need to bootstrap.
It counts not only of zone structures, but also of kegs, slabs and hashes.
o Hide uma_startup_foo() declarations from public file.
o Provide several DIAGNOSTIC printfs on boot_pages usage.
o Bugfix: when calculating zone of zones size use (mp_maxid + 1) instead of
mp_ncpus. Use resulting number not only in the size argument to zone_ctor()
but also as args.size.
Reviewed by: imp, gallatin (earlier version)
Differential Revision: https://reviews.freebsd.org/D14054
2018-02-06 04:16:00 +00:00
|
|
|
*/
|
2020-01-16 05:01:21 +00:00
|
|
|
static void
|
|
|
|
zone_foreach(void (*zfunc)(uma_zone_t, void *arg), void *arg)
|
Followup on r302393 by cperciva, improving calculation of boot pages required
for UMA startup.
o Introduce another stage of UMA startup, which is entered after
vm_page_startup() finishes. After this stage we don't yet enable buckets,
but we can ask VM for pages. Rename stages to meaningful names while here.
New list of stages: BOOT_COLD, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
BOOT_RUNNING.
Enabling page alloc earlier allows us to dramatically reduce number of
boot pages required. What is more important number of zones becomes
consistent across different machines, as no MD allocations are done before
the BOOT_PAGEALLOC stage. Now only UMA internal zones actually need to use
startup_alloc(), however that may change, so vm_page_startup() provides
its need for early zones as argument.
o Introduce uma_startup_count() function, to avoid code duplication. The
functions calculates sizes of zones zone and kegs zone, and calculates how
many pages UMA will need to bootstrap.
It counts not only of zone structures, but also of kegs, slabs and hashes.
o Hide uma_startup_foo() declarations from public file.
o Provide several DIAGNOSTIC printfs on boot_pages usage.
o Bugfix: when calculating zone of zones size use (mp_maxid + 1) instead of
mp_ncpus. Use resulting number not only in the size argument to zone_ctor()
but also as args.size.
Reviewed by: imp, gallatin (earlier version)
Differential Revision: https://reviews.freebsd.org/D14054
2018-02-06 04:16:00 +00:00
|
|
|
{
|
|
|
|
|
2020-01-16 05:01:21 +00:00
|
|
|
rw_rlock(&uma_rwlock);
|
|
|
|
zone_foreach_unlocked(zfunc, arg);
|
|
|
|
rw_runlock(&uma_rwlock);
|
Followup on r302393 by cperciva, improving calculation of boot pages required
for UMA startup.
o Introduce another stage of UMA startup, which is entered after
vm_page_startup() finishes. After this stage we don't yet enable buckets,
but we can ask VM for pages. Rename stages to meaningful names while here.
New list of stages: BOOT_COLD, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
BOOT_RUNNING.
Enabling page alloc earlier allows us to dramatically reduce number of
boot pages required. What is more important number of zones becomes
consistent across different machines, as no MD allocations are done before
the BOOT_PAGEALLOC stage. Now only UMA internal zones actually need to use
startup_alloc(), however that may change, so vm_page_startup() provides
its need for early zones as argument.
o Introduce uma_startup_count() function, to avoid code duplication. The
functions calculates sizes of zones zone and kegs zone, and calculates how
many pages UMA will need to bootstrap.
It counts not only of zone structures, but also of kegs, slabs and hashes.
o Hide uma_startup_foo() declarations from public file.
o Provide several DIAGNOSTIC printfs on boot_pages usage.
o Bugfix: when calculating zone of zones size use (mp_maxid + 1) instead of
mp_ncpus. Use resulting number not only in the size argument to zone_ctor()
but also as args.size.
Reviewed by: imp, gallatin (earlier version)
Differential Revision: https://reviews.freebsd.org/D14054
2018-02-06 04:16:00 +00:00
|
|
|
}
|
|
|
|
|
2020-01-16 05:01:21 +00:00
|
|
|
/*
|
|
|
|
* Initialize the kernel memory allocator. This is done after pages can be
|
|
|
|
* allocated but before general KVA is available.
|
|
|
|
*/
|
2002-03-19 09:11:49 +00:00
|
|
|
void
|
2020-01-16 05:01:21 +00:00
|
|
|
uma_startup1(vm_offset_t virtual_avail)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
|
|
|
struct uma_zctor_args args;
|
2020-01-16 05:01:21 +00:00
|
|
|
size_t ksize, zsize, size;
|
2020-06-20 20:21:04 +00:00
|
|
|
uma_keg_t primarykeg;
|
2018-01-12 23:25:05 +00:00
|
|
|
uintptr_t m;
|
Fix boot on systems where NUMA domain 0 is unpopulated.
- Add vm_phys_early_add_seg(), complementing vm_phys_early_alloc(), to
ensure that segments registered during hammer_time() are placed in the
right domain. Otherwise, since the SRAT is not parsed at that point,
we just add them to domain 0, which may be incorrect and results in a
domain with only several MB worth of memory.
- Fix uma_startup1() to try allocating memory for zones from any domain.
If domain 0 is unpopulated, the allocation will simply fail, resulting
in a page fault slightly later during boot.
- Change _vm_phys_domain() to return -1 for addresses not covered by the
affinity table, and change vm_phys_early_alloc() to handle wildcard
domains. This is necessary on amd64, where the page array is dense
and pmap_page_array_startup() may allocate page table pages for
non-existent page frames.
Reported and tested by: Rafael Kitover <rkitover@gmail.com>
Reviewed by: cem (earlier version), kib
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D25001
2020-05-28 19:41:00 +00:00
|
|
|
int domain;
|
2020-01-16 05:01:21 +00:00
|
|
|
uint8_t pflag;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2020-01-16 05:01:21 +00:00
|
|
|
bootstart = bootmem = virtual_avail;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
|
Followup on r302393 by cperciva, improving calculation of boot pages required
for UMA startup.
o Introduce another stage of UMA startup, which is entered after
vm_page_startup() finishes. After this stage we don't yet enable buckets,
but we can ask VM for pages. Rename stages to meaningful names while here.
New list of stages: BOOT_COLD, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
BOOT_RUNNING.
Enabling page alloc earlier allows us to dramatically reduce number of
boot pages required. What is more important number of zones becomes
consistent across different machines, as no MD allocations are done before
the BOOT_PAGEALLOC stage. Now only UMA internal zones actually need to use
startup_alloc(), however that may change, so vm_page_startup() provides
its need for early zones as argument.
o Introduce uma_startup_count() function, to avoid code duplication. The
functions calculates sizes of zones zone and kegs zone, and calculates how
many pages UMA will need to bootstrap.
It counts not only of zone structures, but also of kegs, slabs and hashes.
o Hide uma_startup_foo() declarations from public file.
o Provide several DIAGNOSTIC printfs on boot_pages usage.
o Bugfix: when calculating zone of zones size use (mp_maxid + 1) instead of
mp_ncpus. Use resulting number not only in the size argument to zone_ctor()
but also as args.size.
Reviewed by: imp, gallatin (earlier version)
Differential Revision: https://reviews.freebsd.org/D14054
2018-02-06 04:16:00 +00:00
|
|
|
rw_init(&uma_rwlock, "UMA lock");
|
2020-01-16 05:01:21 +00:00
|
|
|
sx_init(&uma_reclaim_lock, "umareclaim");
|
|
|
|
|
|
|
|
ksize = sizeof(struct uma_keg) +
|
|
|
|
(sizeof(struct uma_domain) * vm_ndomains);
|
|
|
|
ksize = roundup(ksize, UMA_SUPER_ALIGN);
|
|
|
|
zsize = sizeof(struct uma_zone) +
|
|
|
|
(sizeof(struct uma_cache) * (mp_maxid + 1)) +
|
|
|
|
(sizeof(struct uma_zone_domain) * vm_ndomains);
|
|
|
|
zsize = roundup(zsize, UMA_SUPER_ALIGN);
|
2018-01-12 23:25:05 +00:00
|
|
|
|
2020-01-16 05:01:21 +00:00
|
|
|
/* Allocate the zone of zones, zone of kegs, and zone of zones keg. */
|
|
|
|
size = (zsize * 2) + ksize;
|
Fix boot on systems where NUMA domain 0 is unpopulated.
- Add vm_phys_early_add_seg(), complementing vm_phys_early_alloc(), to
ensure that segments registered during hammer_time() are placed in the
right domain. Otherwise, since the SRAT is not parsed at that point,
we just add them to domain 0, which may be incorrect and results in a
domain with only several MB worth of memory.
- Fix uma_startup1() to try allocating memory for zones from any domain.
If domain 0 is unpopulated, the allocation will simply fail, resulting
in a page fault slightly later during boot.
- Change _vm_phys_domain() to return -1 for addresses not covered by the
affinity table, and change vm_phys_early_alloc() to handle wildcard
domains. This is necessary on amd64, where the page array is dense
and pmap_page_array_startup() may allocate page table pages for
non-existent page frames.
Reported and tested by: Rafael Kitover <rkitover@gmail.com>
Reviewed by: cem (earlier version), kib
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D25001
2020-05-28 19:41:00 +00:00
|
|
|
for (domain = 0; domain < vm_ndomains; domain++) {
|
|
|
|
m = (uintptr_t)startup_alloc(NULL, size, domain, &pflag,
|
|
|
|
M_NOWAIT | M_ZERO);
|
|
|
|
if (m != 0)
|
|
|
|
break;
|
|
|
|
}
|
2018-01-12 23:25:05 +00:00
|
|
|
zones = (uma_zone_t)m;
|
2020-01-06 02:51:19 +00:00
|
|
|
m += zsize;
|
2018-01-12 23:25:05 +00:00
|
|
|
kegs = (uma_zone_t)m;
|
2020-01-06 02:51:19 +00:00
|
|
|
m += zsize;
|
2020-06-20 20:21:04 +00:00
|
|
|
primarykeg = (uma_keg_t)m;
|
2018-01-12 23:25:05 +00:00
|
|
|
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
/* "manually" create the initial zone */
|
2013-06-17 03:43:47 +00:00
|
|
|
memset(&args, 0, sizeof(args));
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
args.name = "UMA Kegs";
|
2018-01-12 23:25:05 +00:00
|
|
|
args.size = ksize;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
args.ctor = keg_ctor;
|
|
|
|
args.dtor = keg_dtor;
|
2002-03-19 09:11:49 +00:00
|
|
|
args.uminit = zero_init;
|
|
|
|
args.fini = NULL;
|
2020-06-20 20:21:04 +00:00
|
|
|
args.keg = primarykeg;
|
2020-01-06 02:51:19 +00:00
|
|
|
args.align = UMA_SUPER_ALIGN - 1;
|
2003-09-19 08:37:44 +00:00
|
|
|
args.flags = UMA_ZFLAG_INTERNAL;
|
2018-01-12 23:25:05 +00:00
|
|
|
zone_ctor(kegs, zsize, &args, M_WAITOK);
|
2002-03-19 09:11:49 +00:00
|
|
|
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
args.name = "UMA Zones";
|
Followup on r302393 by cperciva, improving calculation of boot pages required
for UMA startup.
o Introduce another stage of UMA startup, which is entered after
vm_page_startup() finishes. After this stage we don't yet enable buckets,
but we can ask VM for pages. Rename stages to meaningful names while here.
New list of stages: BOOT_COLD, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
BOOT_RUNNING.
Enabling page alloc earlier allows us to dramatically reduce number of
boot pages required. What is more important number of zones becomes
consistent across different machines, as no MD allocations are done before
the BOOT_PAGEALLOC stage. Now only UMA internal zones actually need to use
startup_alloc(), however that may change, so vm_page_startup() provides
its need for early zones as argument.
o Introduce uma_startup_count() function, to avoid code duplication. The
functions calculates sizes of zones zone and kegs zone, and calculates how
many pages UMA will need to bootstrap.
It counts not only of zone structures, but also of kegs, slabs and hashes.
o Hide uma_startup_foo() declarations from public file.
o Provide several DIAGNOSTIC printfs on boot_pages usage.
o Bugfix: when calculating zone of zones size use (mp_maxid + 1) instead of
mp_ncpus. Use resulting number not only in the size argument to zone_ctor()
but also as args.size.
Reviewed by: imp, gallatin (earlier version)
Differential Revision: https://reviews.freebsd.org/D14054
2018-02-06 04:16:00 +00:00
|
|
|
args.size = zsize;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
args.ctor = zone_ctor;
|
|
|
|
args.dtor = zone_dtor;
|
|
|
|
args.uminit = zero_init;
|
|
|
|
args.fini = NULL;
|
|
|
|
args.keg = NULL;
|
2020-01-06 02:51:19 +00:00
|
|
|
args.align = UMA_SUPER_ALIGN - 1;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
args.flags = UMA_ZFLAG_INTERNAL;
|
2018-01-12 23:25:05 +00:00
|
|
|
zone_ctor(zones, zsize, &args, M_WAITOK);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
|
2020-01-14 02:14:15 +00:00
|
|
|
/* Now make zones for slab headers */
|
|
|
|
slabzones[0] = uma_zcreate("UMA Slabs 0", SLABZONE0_SIZE,
|
|
|
|
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
|
|
|
|
slabzones[1] = uma_zcreate("UMA Slabs 1", SLABZONE1_SIZE,
|
2019-12-08 01:15:06 +00:00
|
|
|
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
|
2002-03-19 09:11:49 +00:00
|
|
|
|
|
|
|
hashzone = uma_zcreate("UMA Hash",
|
|
|
|
sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
|
2019-12-08 01:15:06 +00:00
|
|
|
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2020-01-16 05:01:21 +00:00
|
|
|
bucket_init();
|
2020-01-31 00:49:51 +00:00
|
|
|
smr_init();
|
Followup on r302393 by cperciva, improving calculation of boot pages required
for UMA startup.
o Introduce another stage of UMA startup, which is entered after
vm_page_startup() finishes. After this stage we don't yet enable buckets,
but we can ask VM for pages. Rename stages to meaningful names while here.
New list of stages: BOOT_COLD, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
BOOT_RUNNING.
Enabling page alloc earlier allows us to dramatically reduce number of
boot pages required. What is more important number of zones becomes
consistent across different machines, as no MD allocations are done before
the BOOT_PAGEALLOC stage. Now only UMA internal zones actually need to use
startup_alloc(), however that may change, so vm_page_startup() provides
its need for early zones as argument.
o Introduce uma_startup_count() function, to avoid code duplication. The
functions calculates sizes of zones zone and kegs zone, and calculates how
many pages UMA will need to bootstrap.
It counts not only of zone structures, but also of kegs, slabs and hashes.
o Hide uma_startup_foo() declarations from public file.
o Provide several DIAGNOSTIC printfs on boot_pages usage.
o Bugfix: when calculating zone of zones size use (mp_maxid + 1) instead of
mp_ncpus. Use resulting number not only in the size argument to zone_ctor()
but also as args.size.
Reviewed by: imp, gallatin (earlier version)
Differential Revision: https://reviews.freebsd.org/D14054
2018-02-06 04:16:00 +00:00
|
|
|
}
|
|
|
|
|
2020-01-16 05:01:21 +00:00
|
|
|
#ifndef UMA_MD_SMALL_ALLOC
/*
 * Declared here rather than pulled in from a header; only needed (and only
 * called, from uma_startup2()) when UMA_MD_SMALL_ALLOC is not defined.
 */
extern void vm_radix_reserve_kva(void);
#endif
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2020-01-16 05:01:21 +00:00
|
|
|
/*
|
|
|
|
* Advertise the availability of normal kva allocations and switch to
|
|
|
|
* the default back-end allocator. Marks the KVA we consumed on startup
|
|
|
|
* as used in the map.
|
|
|
|
*/
|
2002-03-19 09:11:49 +00:00
|
|
|
void
|
2002-09-18 08:26:30 +00:00
|
|
|
uma_startup2(void)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
Followup on r302393 by cperciva, improving calculation of boot pages required
for UMA startup.
o Introduce another stage of UMA startup, which is entered after
vm_page_startup() finishes. After this stage we don't yet enable buckets,
but we can ask VM for pages. Rename stages to meaningful names while here.
New list of stages: BOOT_COLD, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
BOOT_RUNNING.
Enabling page alloc earlier allows us to dramatically reduce number of
boot pages required. What is more important number of zones becomes
consistent across different machines, as no MD allocations are done before
the BOOT_PAGEALLOC stage. Now only UMA internal zones actually need to use
startup_alloc(), however that may change, so vm_page_startup() provides
its need for early zones as argument.
o Introduce uma_startup_count() function, to avoid code duplication. The
functions calculates sizes of zones zone and kegs zone, and calculates how
many pages UMA will need to bootstrap.
It counts not only of zone structures, but also of kegs, slabs and hashes.
o Hide uma_startup_foo() declarations from public file.
o Provide several DIAGNOSTIC printfs on boot_pages usage.
o Bugfix: when calculating zone of zones size use (mp_maxid + 1) instead of
mp_ncpus. Use resulting number not only in the size argument to zone_ctor()
but also as args.size.
Reviewed by: imp, gallatin (earlier version)
Differential Revision: https://reviews.freebsd.org/D14054
2018-02-06 04:16:00 +00:00
|
|
|
|
2020-01-23 03:37:35 +00:00
|
|
|
if (bootstart != bootmem) {
|
2020-01-16 05:01:21 +00:00
|
|
|
vm_map_lock(kernel_map);
|
|
|
|
(void)vm_map_insert(kernel_map, NULL, 0, bootstart, bootmem,
|
|
|
|
VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
|
|
|
|
vm_map_unlock(kernel_map);
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef UMA_MD_SMALL_ALLOC
|
|
|
|
/* Set up radix zone to use noobj_alloc. */
|
|
|
|
vm_radix_reserve_kva();
|
2018-02-09 04:45:39 +00:00
|
|
|
#endif
|
2020-01-16 05:01:21 +00:00
|
|
|
|
|
|
|
booted = BOOT_KVA;
|
|
|
|
zone_foreach_unlocked(zone_kva_available, NULL);
|
Followup on r302393 by cperciva, improving calculation of boot pages required
for UMA startup.
o Introduce another stage of UMA startup, which is entered after
vm_page_startup() finishes. After this stage we don't yet enable buckets,
but we can ask VM for pages. Rename stages to meaningful names while here.
New list of stages: BOOT_COLD, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
BOOT_RUNNING.
Enabling page alloc earlier allows us to dramatically reduce number of
boot pages required. What is more important number of zones becomes
consistent across different machines, as no MD allocations are done before
the BOOT_PAGEALLOC stage. Now only UMA internal zones actually need to use
startup_alloc(), however that may change, so vm_page_startup() provides
its need for early zones as argument.
o Introduce uma_startup_count() function, to avoid code duplication. The
functions calculates sizes of zones zone and kegs zone, and calculates how
many pages UMA will need to bootstrap.
It counts not only of zone structures, but also of kegs, slabs and hashes.
o Hide uma_startup_foo() declarations from public file.
o Provide several DIAGNOSTIC printfs on boot_pages usage.
o Bugfix: when calculating zone of zones size use (mp_maxid + 1) instead of
mp_ncpus. Use resulting number not only in the size argument to zone_ctor()
but also as args.size.
Reviewed by: imp, gallatin (earlier version)
Differential Revision: https://reviews.freebsd.org/D14054
2018-02-06 04:16:00 +00:00
|
|
|
bucket_enable();
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
2020-05-14 16:06:54 +00:00
|
|
|
/*
|
|
|
|
* Allocate counters as early as possible so that boot-time allocations are
|
|
|
|
* accounted more precisely.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
uma_startup_pcpu(void *arg __unused)
|
|
|
|
{
|
|
|
|
|
|
|
|
zone_foreach_unlocked(zone_alloc_counters, NULL);
|
|
|
|
booted = BOOT_PCPU;
|
|
|
|
}
|
|
|
|
SYSINIT(uma_startup_pcpu, SI_SUB_COUNTER, SI_ORDER_ANY, uma_startup_pcpu, NULL);
|
|
|
|
|
2020-01-16 05:01:21 +00:00
|
|
|
/*
|
|
|
|
* Finish our initialization steps.
|
|
|
|
*/
|
2002-03-19 09:11:49 +00:00
|
|
|
static void
|
2020-05-14 16:06:54 +00:00
|
|
|
uma_startup3(void *arg __unused)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
2017-06-01 18:36:52 +00:00
|
|
|
|
2018-06-08 00:15:08 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor);
|
|
|
|
uma_dbg_cnt = counter_u64_alloc(M_WAITOK);
|
|
|
|
uma_skip_cnt = counter_u64_alloc(M_WAITOK);
|
|
|
|
#endif
|
2020-01-16 05:01:21 +00:00
|
|
|
zone_foreach_unlocked(zone_alloc_sysctl, NULL);
|
2015-05-22 17:05:21 +00:00
|
|
|
callout_init(&uma_callout, 1);
|
2003-09-19 23:27:46 +00:00
|
|
|
callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
|
2018-06-08 00:15:08 +00:00
|
|
|
booted = BOOT_RUNNING;
|
2020-01-09 19:17:42 +00:00
|
|
|
|
|
|
|
EVENTHANDLER_REGISTER(shutdown_post_sync, uma_shutdown, NULL,
|
|
|
|
EVENTHANDLER_PRI_FIRST);
|
|
|
|
}
|
2020-05-14 16:06:54 +00:00
|
|
|
SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
|
2020-01-09 19:17:42 +00:00
|
|
|
|
|
|
|
static void
|
|
|
|
uma_shutdown(void)
|
|
|
|
{
|
|
|
|
|
|
|
|
booted = BOOT_SHUTDOWN;
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
2009-01-25 09:11:24 +00:00
|
|
|
static uma_keg_t
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
|
2013-04-09 17:43:48 +00:00
|
|
|
int align, uint32_t flags)
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
{
|
|
|
|
struct uma_kctor_args args;
|
|
|
|
|
|
|
|
args.size = size;
|
|
|
|
args.uminit = uminit;
|
|
|
|
args.fini = fini;
|
2007-02-11 20:13:52 +00:00
|
|
|
args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
args.flags = flags;
|
|
|
|
args.zone = zone;
|
2018-01-12 23:25:05 +00:00
|
|
|
return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK));
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
}
|
|
|
|
|
Followup on r302393 by cperciva, improving calculation of boot pages required
for UMA startup.
o Introduce another stage of UMA startup, which is entered after
vm_page_startup() finishes. After this stage we don't yet enable buckets,
but we can ask VM for pages. Rename stages to meaningful names while here.
New list of stages: BOOT_COLD, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
BOOT_RUNNING.
Enabling page alloc earlier allows us to dramatically reduce number of
boot pages required. What is more important number of zones becomes
consistent across different machines, as no MD allocations are done before
the BOOT_PAGEALLOC stage. Now only UMA internal zones actually need to use
startup_alloc(), however that may change, so vm_page_startup() provides
its need for early zones as argument.
o Introduce uma_startup_count() function, to avoid code duplication. The
functions calculates sizes of zones zone and kegs zone, and calculates how
many pages UMA will need to bootstrap.
It counts not only of zone structures, but also of kegs, slabs and hashes.
o Hide uma_startup_foo() declarations from public file.
o Provide several DIAGNOSTIC printfs on boot_pages usage.
o Bugfix: when calculating zone of zones size use (mp_maxid + 1) instead of
mp_ncpus. Use resulting number not only in the size argument to zone_ctor()
but also as args.size.
Reviewed by: imp, gallatin (earlier version)
Differential Revision: https://reviews.freebsd.org/D14054
2018-02-06 04:16:00 +00:00
|
|
|
/* Public functions */
|
2007-02-11 20:13:52 +00:00
|
|
|
/* See uma.h */
|
|
|
|
void
|
|
|
|
uma_set_align(int align)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (align != UMA_ALIGN_CACHE)
|
|
|
|
uma_align_cache = align;
|
|
|
|
}
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/* See uma.h */
|
2004-01-30 16:26:29 +00:00
|
|
|
uma_zone_t
|
2012-10-26 17:51:05 +00:00
|
|
|
uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
|
2013-04-09 17:43:48 +00:00
|
|
|
uma_init uminit, uma_fini fini, int align, uint32_t flags)
|
2004-01-30 16:26:29 +00:00
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
|
|
|
struct uma_zctor_args args;
|
2014-11-30 20:20:55 +00:00
|
|
|
uma_zone_t res;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2017-04-04 16:26:46 +00:00
|
|
|
KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
|
|
|
|
align, name));
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/* This stuff is essential for the zone ctor */
|
2013-06-17 03:43:47 +00:00
|
|
|
memset(&args, 0, sizeof(args));
|
2002-03-19 09:11:49 +00:00
|
|
|
args.name = name;
|
|
|
|
args.size = size;
|
|
|
|
args.ctor = ctor;
|
|
|
|
args.dtor = dtor;
|
|
|
|
args.uminit = uminit;
|
|
|
|
args.fini = fini;
|
2021-08-10 17:15:03 -04:00
|
|
|
#if defined(INVARIANTS) && !defined(KASAN) && !defined(KMSAN)
|
2015-06-25 20:44:46 +00:00
|
|
|
/*
|
2019-11-27 19:49:55 +00:00
|
|
|
* Inject procedures which check for memory use after free if we are
|
|
|
|
* allowed to scramble the memory while it is not allocated. This
|
|
|
|
* requires that: UMA is actually able to access the memory, no init
|
|
|
|
* or fini procedures, no dependency on the initial value of the
|
|
|
|
* memory, and no (legitimate) use of the memory after free. Note,
|
|
|
|
* the ctor and dtor do not need to be empty.
|
2015-06-25 20:44:46 +00:00
|
|
|
*/
|
2020-01-09 02:03:03 +00:00
|
|
|
if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOTOUCH |
|
|
|
|
UMA_ZONE_NOFREE))) && uminit == NULL && fini == NULL) {
|
2015-06-25 20:44:46 +00:00
|
|
|
args.uminit = trash_init;
|
|
|
|
args.fini = trash_fini;
|
|
|
|
}
|
|
|
|
#endif
|
2002-03-19 09:11:49 +00:00
|
|
|
args.align = align;
|
|
|
|
args.flags = flags;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
args.keg = NULL;
|
|
|
|
|
2021-04-14 12:57:24 -04:00
|
|
|
sx_xlock(&uma_reclaim_lock);
|
2018-01-12 23:25:05 +00:00
|
|
|
res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
|
2021-04-14 12:57:24 -04:00
|
|
|
sx_xunlock(&uma_reclaim_lock);
|
2020-01-16 05:01:21 +00:00
|
|
|
|
2014-11-30 20:20:55 +00:00
|
|
|
return (res);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* See uma.h */
|
|
|
|
uma_zone_t
|
2020-02-22 17:44:28 +00:00
|
|
|
uma_zsecond_create(const char *name, uma_ctor ctor, uma_dtor dtor,
|
2020-06-20 20:21:04 +00:00
|
|
|
uma_init zinit, uma_fini zfini, uma_zone_t primary)
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
{
|
|
|
|
struct uma_zctor_args args;
|
2009-01-25 09:11:24 +00:00
|
|
|
uma_keg_t keg;
|
2014-11-30 20:20:55 +00:00
|
|
|
uma_zone_t res;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
|
2020-06-20 20:21:04 +00:00
|
|
|
keg = primary->uz_keg;
|
2013-06-17 03:43:47 +00:00
|
|
|
memset(&args, 0, sizeof(args));
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
args.name = name;
|
2009-01-25 09:11:24 +00:00
|
|
|
args.size = keg->uk_size;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
args.ctor = ctor;
|
|
|
|
args.dtor = dtor;
|
|
|
|
args.uminit = zinit;
|
|
|
|
args.fini = zfini;
|
2009-01-25 09:11:24 +00:00
|
|
|
args.align = keg->uk_align;
|
|
|
|
args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
|
|
|
|
args.keg = keg;
|
|
|
|
|
2021-04-14 12:57:24 -04:00
|
|
|
sx_xlock(&uma_reclaim_lock);
|
2018-01-12 23:25:05 +00:00
|
|
|
res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
|
2021-04-14 12:57:24 -04:00
|
|
|
sx_xunlock(&uma_reclaim_lock);
|
2020-01-16 05:01:21 +00:00
|
|
|
|
2014-11-30 20:20:55 +00:00
|
|
|
return (res);
|
2009-01-25 09:11:24 +00:00
|
|
|
}
|
|
|
|
|
2013-06-17 03:43:47 +00:00
|
|
|
/* See uma.h */
|
|
|
|
uma_zone_t
|
2020-02-22 17:44:28 +00:00
|
|
|
uma_zcache_create(const char *name, int size, uma_ctor ctor, uma_dtor dtor,
|
|
|
|
uma_init zinit, uma_fini zfini, uma_import zimport, uma_release zrelease,
|
|
|
|
void *arg, int flags)
|
2013-06-17 03:43:47 +00:00
|
|
|
{
|
|
|
|
struct uma_zctor_args args;
|
|
|
|
|
|
|
|
memset(&args, 0, sizeof(args));
|
|
|
|
args.name = name;
|
2013-06-20 19:08:12 +00:00
|
|
|
args.size = size;
|
2013-06-17 03:43:47 +00:00
|
|
|
args.ctor = ctor;
|
|
|
|
args.dtor = dtor;
|
|
|
|
args.uminit = zinit;
|
|
|
|
args.fini = zfini;
|
|
|
|
args.import = zimport;
|
|
|
|
args.release = zrelease;
|
|
|
|
args.arg = arg;
|
|
|
|
args.align = 0;
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
args.flags = flags | UMA_ZFLAG_CACHE;
|
2013-06-17 03:43:47 +00:00
|
|
|
|
2018-01-12 23:25:05 +00:00
|
|
|
return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
|
2013-06-17 03:43:47 +00:00
|
|
|
}
|
|
|
|
|
2002-04-08 04:48:58 +00:00
|
|
|
/* See uma.h */
|
|
|
|
void
|
|
|
|
uma_zdestroy(uma_zone_t zone)
|
|
|
|
{
|
2005-07-20 18:47:42 +00:00
|
|
|
|
2020-01-09 19:17:42 +00:00
|
|
|
/*
|
|
|
|
* Large slabs are expensive to reclaim, so don't bother doing
|
|
|
|
* unnecessary work if we're shutting down.
|
|
|
|
*/
|
|
|
|
if (booted == BOOT_SHUTDOWN &&
|
|
|
|
zone->uz_fini == NULL && zone->uz_release == zone_release)
|
|
|
|
return;
|
2021-04-14 12:57:24 -04:00
|
|
|
sx_xlock(&uma_reclaim_lock);
|
2013-06-17 03:43:47 +00:00
|
|
|
zone_free_item(zones, zone, NULL, SKIP_NONE);
|
2021-04-14 12:57:24 -04:00
|
|
|
sx_xunlock(&uma_reclaim_lock);
|
2002-04-08 04:48:58 +00:00
|
|
|
}
|
|
|
|
|
2017-11-08 02:39:37 +00:00
|
|
|
void
|
|
|
|
uma_zwait(uma_zone_t zone)
|
|
|
|
{
|
|
|
|
|
2020-02-17 01:06:18 +00:00
|
|
|
if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
|
|
|
|
uma_zfree_smr(zone, uma_zalloc_smr(zone, M_WAITOK));
|
|
|
|
else if ((zone->uz_flags & UMA_ZONE_PCPU) != 0)
|
|
|
|
uma_zfree_pcpu(zone, uma_zalloc_pcpu(zone, M_WAITOK));
|
|
|
|
else
|
|
|
|
uma_zfree(zone, uma_zalloc(zone, M_WAITOK));
|
2017-11-08 02:39:37 +00:00
|
|
|
}
|
|
|
|
|
2018-06-08 21:40:03 +00:00
|
|
|
void *
|
|
|
|
uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags)
|
|
|
|
{
|
2020-02-12 11:11:22 +00:00
|
|
|
void *item, *pcpu_item;
|
2018-06-21 11:43:54 +00:00
|
|
|
#ifdef SMP
|
2018-06-08 21:40:03 +00:00
|
|
|
int i;
|
|
|
|
|
|
|
|
MPASS(zone->uz_flags & UMA_ZONE_PCPU);
|
2018-06-21 11:43:54 +00:00
|
|
|
#endif
|
Fix pre-SI_SUB_CPU initialization of per-CPU counters.
r336020 introduced pcpu_page_alloc(), replacing page_alloc() as the
backend allocator for PCPU UMA zones. Unlike page_alloc(), it does
not honour malloc(9) flags such as M_ZERO or M_NODUMP, so fix that.
r336020 also changed counter(9) to initialize each counter using a
CPU_FOREACH() loop instead of an SMP rendezvous. Before SI_SUB_CPU,
smp_rendezvous() will only execute the callback on the current CPU
(i.e., CPU 0), so only one counter gets zeroed. The rest are zeroed
by virtue of the fact that UMA gratuitously zeroes slabs when importing
them into a zone.
Prior to SI_SUB_CPU, all_cpus is clear, so with r336020 we weren't
zeroing vm_cnt counters during boot: the CPU_FOREACH() loop had no
effect, and pcpu_page_alloc() didn't honour M_ZERO. Fix this by
iterating over the full range of CPU IDs when zeroing counters,
ignoring whether the corresponding bits in all_cpus are set.
Reported and tested by: pho (previous version)
Reviewed by: kib (previous version)
Differential Revision: https://reviews.freebsd.org/D16190
2018-07-10 00:18:12 +00:00
|
|
|
item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO);
|
2020-02-12 11:11:22 +00:00
|
|
|
if (item == NULL)
|
|
|
|
return (NULL);
|
|
|
|
pcpu_item = zpcpu_base_to_offset(item);
|
|
|
|
if (flags & M_ZERO) {
|
2018-06-21 11:43:54 +00:00
|
|
|
#ifdef SMP
|
Fix pre-SI_SUB_CPU initialization of per-CPU counters.
r336020 introduced pcpu_page_alloc(), replacing page_alloc() as the
backend allocator for PCPU UMA zones. Unlike page_alloc(), it does
not honour malloc(9) flags such as M_ZERO or M_NODUMP, so fix that.
r336020 also changed counter(9) to initialize each counter using a
CPU_FOREACH() loop instead of an SMP rendezvous. Before SI_SUB_CPU,
smp_rendezvous() will only execute the callback on the current CPU
(i.e., CPU 0), so only one counter gets zeroed. The rest are zeroed
by virtue of the fact that UMA gratuitously zeroes slabs when importing
them into a zone.
Prior to SI_SUB_CPU, all_cpus is clear, so with r336020 we weren't
zeroing vm_cnt counters during boot: the CPU_FOREACH() loop had no
effect, and pcpu_page_alloc() didn't honour M_ZERO. Fix this by
iterating over the full range of CPU IDs when zeroing counters,
ignoring whether the corresponding bits in all_cpus are set.
Reported and tested by: pho (previous version)
Reviewed by: kib (previous version)
Differential Revision: https://reviews.freebsd.org/D16190
2018-07-10 00:18:12 +00:00
|
|
|
for (i = 0; i <= mp_maxid; i++)
|
2020-02-12 11:11:22 +00:00
|
|
|
bzero(zpcpu_get_cpu(pcpu_item, i), zone->uz_size);
|
2018-06-21 11:43:54 +00:00
|
|
|
#else
|
|
|
|
bzero(item, zone->uz_size);
|
|
|
|
#endif
|
2018-06-08 21:40:03 +00:00
|
|
|
}
|
2020-02-12 11:11:22 +00:00
|
|
|
return (pcpu_item);
|
2018-06-08 21:40:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A stub while both regular and pcpu cases are identical.
|
|
|
|
*/
|
|
|
|
void
|
2020-02-12 11:11:22 +00:00
|
|
|
uma_zfree_pcpu_arg(uma_zone_t zone, void *pcpu_item, void *udata)
|
2018-06-08 21:40:03 +00:00
|
|
|
{
|
2020-02-12 11:11:22 +00:00
|
|
|
void *item;
|
2018-06-08 21:40:03 +00:00
|
|
|
|
2018-06-22 20:22:26 +00:00
|
|
|
#ifdef SMP
|
2018-06-08 21:40:03 +00:00
|
|
|
MPASS(zone->uz_flags & UMA_ZONE_PCPU);
|
2018-06-22 20:22:26 +00:00
|
|
|
#endif
|
2021-03-10 15:11:59 +01:00
|
|
|
|
|
|
|
/* uma_zfree_pcu_*(..., NULL) does nothing, to match free(9). */
|
|
|
|
if (pcpu_item == NULL)
|
|
|
|
return;
|
|
|
|
|
2020-02-12 11:11:22 +00:00
|
|
|
item = zpcpu_offset_to_base(pcpu_item);
|
2018-06-08 21:40:03 +00:00
|
|
|
uma_zfree_arg(zone, item, udata);
|
|
|
|
}
|
|
|
|
|
2020-01-31 00:49:51 +00:00
|
|
|
static inline void *
|
|
|
|
item_ctor(uma_zone_t zone, int uz_flags, int size, void *udata, int flags,
|
|
|
|
void *item)
|
2019-11-26 22:17:02 +00:00
|
|
|
{
|
|
|
|
#ifdef INVARIANTS
|
2019-11-27 19:49:55 +00:00
|
|
|
bool skipdbg;
|
2021-04-13 17:39:50 -04:00
|
|
|
#endif
|
|
|
|
|
|
|
|
kasan_mark_item_valid(zone, item);
|
2021-08-10 17:15:03 -04:00
|
|
|
kmsan_mark_item_uninitialized(zone, item);
|
2019-11-26 22:17:02 +00:00
|
|
|
|
2021-04-13 17:39:50 -04:00
|
|
|
#ifdef INVARIANTS
|
2019-11-26 22:17:02 +00:00
|
|
|
skipdbg = uma_dbg_zskip(zone, item);
|
2021-04-13 17:39:50 -04:00
|
|
|
if (!skipdbg && (uz_flags & UMA_ZFLAG_TRASH) != 0 &&
|
2019-11-27 19:49:55 +00:00
|
|
|
zone->uz_ctor != trash_ctor)
|
2019-12-25 20:57:24 +00:00
|
|
|
trash_ctor(item, size, udata, flags);
|
2019-11-26 22:17:02 +00:00
|
|
|
#endif
|
2021-04-13 17:39:50 -04:00
|
|
|
|
2020-01-31 00:49:51 +00:00
|
|
|
/* Check flags before loading ctor pointer. */
|
|
|
|
if (__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0) &&
|
|
|
|
__predict_false(zone->uz_ctor != NULL) &&
|
2019-12-25 20:57:24 +00:00
|
|
|
zone->uz_ctor(item, size, udata, flags) != 0) {
|
2019-11-26 22:17:02 +00:00
|
|
|
counter_u64_add(zone->uz_fails, 1);
|
|
|
|
zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT);
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
#ifdef INVARIANTS
|
|
|
|
if (!skipdbg)
|
|
|
|
uma_dbg_alloc(zone, NULL, item);
|
|
|
|
#endif
|
2020-02-16 01:07:19 +00:00
|
|
|
if (__predict_false(flags & M_ZERO))
|
|
|
|
return (memset(item, 0, size));
|
2019-11-26 22:17:02 +00:00
|
|
|
|
|
|
|
return (item);
|
|
|
|
}
|
|
|
|
|
2019-11-27 19:49:55 +00:00
|
|
|
static inline void
|
2019-12-25 20:57:24 +00:00
|
|
|
item_dtor(uma_zone_t zone, void *item, int size, void *udata,
|
|
|
|
enum zfreeskip skip)
|
2019-11-27 19:49:55 +00:00
|
|
|
{
|
|
|
|
#ifdef INVARIANTS
|
|
|
|
bool skipdbg;
|
|
|
|
|
|
|
|
skipdbg = uma_dbg_zskip(zone, item);
|
|
|
|
if (skip == SKIP_NONE && !skipdbg) {
|
|
|
|
if ((zone->uz_flags & UMA_ZONE_MALLOC) != 0)
|
|
|
|
uma_dbg_free(zone, udata, item);
|
|
|
|
else
|
|
|
|
uma_dbg_free(zone, NULL, item);
|
|
|
|
}
|
|
|
|
#endif
|
2019-12-25 20:57:24 +00:00
|
|
|
if (__predict_true(skip < SKIP_DTOR)) {
|
2019-11-27 19:49:55 +00:00
|
|
|
if (zone->uz_dtor != NULL)
|
2019-12-25 20:57:24 +00:00
|
|
|
zone->uz_dtor(item, size, udata);
|
2019-11-27 19:49:55 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 &&
|
|
|
|
zone->uz_dtor != trash_dtor)
|
2019-12-25 20:57:24 +00:00
|
|
|
trash_dtor(item, size, udata);
|
2019-11-27 19:49:55 +00:00
|
|
|
#endif
|
|
|
|
}
|
2021-04-13 17:39:50 -04:00
|
|
|
kasan_mark_item_invalid(zone, item);
|
2019-11-27 19:49:55 +00:00
|
|
|
}
|
|
|
|
|
2020-05-29 08:30:35 +00:00
|
|
|
#ifdef NUMA
/*
 * Look up the domain of the physical memory backing an item.  The result
 * is asserted to lie within [0, vm_ndomains).
 */
static int
item_domain(void *item)
{
	int dom = vm_phys_domain(vtophys(item));

	KASSERT(domain_is_known(dom),
	    ("%s: unknown domain for item %p", __func__, item));
	return (dom);
}
#endif
|
Fix boot on systems where NUMA domain 0 is unpopulated.
- Add vm_phys_early_add_seg(), complementing vm_phys_early_alloc(), to
ensure that segments registered during hammer_time() are placed in the
right domain. Otherwise, since the SRAT is not parsed at that point,
we just add them to domain 0, which may be incorrect and results in a
domain with only several MB worth of memory.
- Fix uma_startup1() to try allocating memory for zones from any domain.
If domain 0 is unpopulated, the allocation will simply fail, resulting
in a page fault slightly later during boot.
- Change _vm_phys_domain() to return -1 for addresses not covered by the
affinity table, and change vm_phys_early_alloc() to handle wildcard
domains. This is necessary on amd64, where the page array is dense
and pmap_page_array_startup() may allocate page table pages for
non-existent page frames.
Reported and tested by: Rafael Kitover <rkitover@gmail.com>
Reviewed by: cem (earlier version), kib
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D25001
2020-05-28 19:41:00 +00:00
|
|
|
|
2020-01-31 00:49:51 +00:00
|
|
|
#if defined(INVARIANTS) || defined(DEBUG_MEMGUARD) || defined(WITNESS)
|
|
|
|
#define UMA_ZALLOC_DEBUG
|
|
|
|
static int
|
|
|
|
uma_zalloc_debug(uma_zone_t zone, void **itemp, void *udata, int flags)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
2020-01-31 00:49:51 +00:00
|
|
|
int error;
|
2002-04-08 02:42:55 +00:00
|
|
|
|
2020-01-31 00:49:51 +00:00
|
|
|
error = 0;
|
2019-12-25 20:57:24 +00:00
|
|
|
#ifdef WITNESS
|
2007-01-10 21:04:43 +00:00
|
|
|
if (flags & M_WAITOK) {
|
|
|
|
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
|
2020-01-31 00:49:51 +00:00
|
|
|
"uma_zalloc_debug: zone \"%s\"", zone->uz_name);
|
2002-05-20 17:54:48 +00:00
|
|
|
}
|
2019-12-25 20:57:24 +00:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef INVARIANTS
|
2020-01-31 00:49:51 +00:00
|
|
|
KASSERT((flags & M_EXEC) == 0,
|
|
|
|
("uma_zalloc_debug: called with M_EXEC"));
|
2015-12-11 20:05:07 +00:00
|
|
|
KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
|
2020-01-31 00:49:51 +00:00
|
|
|
("uma_zalloc_debug: called within spinlock or critical section"));
|
|
|
|
KASSERT((zone->uz_flags & UMA_ZONE_PCPU) == 0 || (flags & M_ZERO) == 0,
|
|
|
|
("uma_zalloc_debug: allocating from a pcpu zone with M_ZERO"));
|
2019-12-25 20:57:24 +00:00
|
|
|
#endif
|
2015-11-19 14:04:53 +00:00
|
|
|
|
2011-10-12 18:08:28 +00:00
|
|
|
#ifdef DEBUG_MEMGUARD
|
2020-01-31 02:03:22 +00:00
|
|
|
if ((zone->uz_flags & UMA_ZONE_SMR) == 0 && memguard_cmp_zone(zone)) {
|
2020-01-31 00:49:51 +00:00
|
|
|
void *item;
|
2011-10-12 18:08:28 +00:00
|
|
|
item = memguard_alloc(zone->uz_size, flags);
|
|
|
|
if (item != NULL) {
|
2020-01-31 00:49:51 +00:00
|
|
|
error = EJUSTRETURN;
|
2011-10-12 18:08:28 +00:00
|
|
|
if (zone->uz_init != NULL &&
|
2020-01-31 00:49:51 +00:00
|
|
|
zone->uz_init(item, zone->uz_size, flags) != 0) {
|
|
|
|
*itemp = NULL;
|
|
|
|
return (error);
|
|
|
|
}
|
2011-10-12 18:08:28 +00:00
|
|
|
if (zone->uz_ctor != NULL &&
|
2013-06-18 04:50:20 +00:00
|
|
|
zone->uz_ctor(item, zone->uz_size, udata,
|
|
|
|
flags) != 0) {
|
2019-11-27 19:49:55 +00:00
|
|
|
counter_u64_add(zone->uz_fails, 1);
|
2011-10-12 18:08:28 +00:00
|
|
|
zone->uz_fini(item, zone->uz_size);
|
2020-01-31 00:49:51 +00:00
|
|
|
*itemp = NULL;
|
|
|
|
return (error);
|
2011-10-12 18:08:28 +00:00
|
|
|
}
|
2020-01-31 00:49:51 +00:00
|
|
|
*itemp = item;
|
|
|
|
return (error);
|
2011-10-12 18:08:28 +00:00
|
|
|
}
|
|
|
|
/* This is unfortunate but should not be fatal. */
|
|
|
|
}
|
|
|
|
#endif
|
2020-01-31 00:49:51 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
uma_zfree_debug(uma_zone_t zone, void *item, void *udata)
|
|
|
|
{
|
|
|
|
KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
|
|
|
|
("uma_zfree_debug: called with spinlock or critical section held"));
|
|
|
|
|
|
|
|
#ifdef DEBUG_MEMGUARD
|
2020-01-31 02:03:22 +00:00
|
|
|
if ((zone->uz_flags & UMA_ZONE_SMR) == 0 && is_memguard_addr(item)) {
|
2020-01-31 00:49:51 +00:00
|
|
|
if (zone->uz_dtor != NULL)
|
|
|
|
zone->uz_dtor(item, zone->uz_size, udata);
|
|
|
|
if (zone->uz_fini != NULL)
|
|
|
|
zone->uz_fini(item, zone->uz_size);
|
|
|
|
memguard_free(item);
|
|
|
|
return (EJUSTRETURN);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2020-02-16 01:07:19 +00:00
|
|
|
static inline void *
|
|
|
|
cache_alloc_item(uma_zone_t zone, uma_cache_t cache, uma_cache_bucket_t bucket,
|
|
|
|
void *udata, int flags)
|
|
|
|
{
|
|
|
|
void *item;
|
|
|
|
int size, uz_flags;
|
|
|
|
|
|
|
|
item = cache_bucket_pop(cache, bucket);
|
|
|
|
size = cache_uz_size(cache);
|
|
|
|
uz_flags = cache_uz_flags(cache);
|
|
|
|
critical_exit();
|
|
|
|
return (item_ctor(zone, uz_flags, size, udata, flags, item));
|
|
|
|
}
|
|
|
|
|
2020-01-31 00:49:51 +00:00
|
|
|
static __noinline void *
|
2020-02-16 01:07:19 +00:00
|
|
|
cache_alloc_retry(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
|
2020-01-31 00:49:51 +00:00
|
|
|
{
|
2020-02-16 01:07:19 +00:00
|
|
|
uma_cache_bucket_t bucket;
|
2020-01-31 00:49:51 +00:00
|
|
|
int domain;
|
|
|
|
|
2020-02-16 01:07:19 +00:00
|
|
|
while (cache_alloc(zone, cache, udata, flags)) {
|
|
|
|
cache = &zone->uz_cpu[curcpu];
|
|
|
|
bucket = &cache->uc_allocbucket;
|
|
|
|
if (__predict_false(bucket->ucb_cnt == 0))
|
|
|
|
continue;
|
|
|
|
return (cache_alloc_item(zone, cache, bucket, udata, flags));
|
|
|
|
}
|
|
|
|
critical_exit();
|
|
|
|
|
2020-01-31 00:49:51 +00:00
|
|
|
/*
|
|
|
|
* We can not get a bucket so try to return a single item.
|
|
|
|
*/
|
|
|
|
if (zone->uz_flags & UMA_ZONE_FIRSTTOUCH)
|
|
|
|
domain = PCPU_GET(domain);
|
|
|
|
else
|
|
|
|
domain = UMA_ANYDOMAIN;
|
|
|
|
return (zone_alloc_item(zone, udata, domain, flags));
|
|
|
|
}
|
|
|
|
|
|
|
|
/* See uma.h */
|
|
|
|
void *
|
|
|
|
uma_zalloc_smr(uma_zone_t zone, int flags)
|
|
|
|
{
|
|
|
|
uma_cache_bucket_t bucket;
|
|
|
|
uma_cache_t cache;
|
|
|
|
|
|
|
|
#ifdef UMA_ZALLOC_DEBUG
|
2020-02-16 01:07:19 +00:00
|
|
|
void *item;
|
|
|
|
|
2020-01-31 00:49:51 +00:00
|
|
|
KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0,
|
2020-10-02 19:03:42 +00:00
|
|
|
("uma_zalloc_arg: called with non-SMR zone."));
|
2020-01-31 00:49:51 +00:00
|
|
|
if (uma_zalloc_debug(zone, &item, NULL, flags) == EJUSTRETURN)
|
|
|
|
return (item);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
critical_enter();
|
2020-02-16 01:07:19 +00:00
|
|
|
cache = &zone->uz_cpu[curcpu];
|
|
|
|
bucket = &cache->uc_allocbucket;
|
|
|
|
if (__predict_false(bucket->ucb_cnt == 0))
|
|
|
|
return (cache_alloc_retry(zone, cache, NULL, flags));
|
|
|
|
return (cache_alloc_item(zone, cache, bucket, NULL, flags));
|
2020-01-31 00:49:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* See uma.h */
|
|
|
|
void *
|
|
|
|
uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
|
|
|
|
{
|
|
|
|
uma_cache_bucket_t bucket;
|
|
|
|
uma_cache_t cache;
|
|
|
|
|
|
|
|
/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
|
|
|
|
random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
|
|
|
|
|
|
|
|
/* This is the fast path allocation */
|
|
|
|
CTR3(KTR_UMA, "uma_zalloc_arg zone %s(%p) flags %d", zone->uz_name,
|
|
|
|
zone, flags);
|
|
|
|
|
|
|
|
#ifdef UMA_ZALLOC_DEBUG
|
2020-02-16 01:07:19 +00:00
|
|
|
void *item;
|
|
|
|
|
2020-01-31 00:49:51 +00:00
|
|
|
KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0,
|
2020-10-02 19:03:42 +00:00
|
|
|
("uma_zalloc_arg: called with SMR zone."));
|
2020-01-31 00:49:51 +00:00
|
|
|
if (uma_zalloc_debug(zone, &item, udata, flags) == EJUSTRETURN)
|
|
|
|
return (item);
|
|
|
|
#endif
|
|
|
|
|
Modify UMA to use critical sections to protect per-CPU caches, rather than
mutexes, which offers lower overhead on both UP and SMP. When allocating
from or freeing to the per-cpu cache, without INVARIANTS enabled, we now
no longer perform any mutex operations, which offers a 1%-3% performance
improvement in a variety of micro-benchmarks. We rely on critical
sections to prevent (a) preemption resulting in reentrant access to UMA on
a single CPU, and (b) migration of the thread during access. In the event
we need to go back to the zone for a new bucket, we release the critical
section to acquire the global zone mutex, and must re-acquire the critical
section and re-evaluate which cache we are accessing in case migration has
occured, or circumstances have changed in the current cache.
Per-CPU cache statistics are now gathered lock-free by the sysctl, which
can result in small races in statistics reporting for caches.
Reviewed by: bmilekic, jeff (somewhat)
Tested by: rwatson, kris, gnn, scottl, mike at sentex dot net, others
2005-04-29 18:56:36 +00:00
|
|
|
/*
|
|
|
|
* If possible, allocate from the per-CPU cache. There are two
|
|
|
|
* requirements for safe access to the per-CPU cache: (1) the thread
|
|
|
|
* accessing the cache must not be preempted or yield during access,
|
|
|
|
* and (2) the thread must not migrate CPUs without switching which
|
|
|
|
* cache it accesses. We rely on a critical section to prevent
|
|
|
|
* preemption and migration. We release the critical section in
|
|
|
|
* order to acquire the zone mutex if we are unable to allocate from
|
|
|
|
* the current cache; when we re-acquire the critical section, we
|
|
|
|
* must detect and handle migration if it has occurred.
|
|
|
|
*/
|
|
|
|
critical_enter();
|
2020-02-16 01:07:19 +00:00
|
|
|
cache = &zone->uz_cpu[curcpu];
|
|
|
|
bucket = &cache->uc_allocbucket;
|
|
|
|
if (__predict_false(bucket->ucb_cnt == 0))
|
|
|
|
return (cache_alloc_retry(zone, cache, udata, flags));
|
|
|
|
return (cache_alloc_item(zone, cache, bucket, udata, flags));
|
2019-11-26 22:17:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Replenish an alloc bucket and possibly restore an old one. Called in
|
|
|
|
* a critical section. Returns in a critical section.
|
|
|
|
*
|
2020-01-04 03:04:46 +00:00
|
|
|
* A false return value indicates an allocation failure.
|
|
|
|
* A true return value indicates success and the caller should retry.
|
2019-11-26 22:17:02 +00:00
|
|
|
*/
|
|
|
|
static __noinline bool
|
|
|
|
cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
|
|
|
|
{
|
|
|
|
uma_bucket_t bucket;
|
2020-06-28 21:35:04 +00:00
|
|
|
int curdomain, domain;
|
2020-02-19 18:48:46 +00:00
|
|
|
bool new;
|
2019-11-26 22:17:02 +00:00
|
|
|
|
|
|
|
CRITICAL_ASSERT(curthread);
|
2013-06-18 04:50:20 +00:00
|
|
|
|
|
|
|
/*
|
2019-11-26 22:17:02 +00:00
|
|
|
* If we have run out of items in our alloc bucket see
|
|
|
|
* if we can switch with the free bucket.
|
2020-01-31 00:49:51 +00:00
|
|
|
*
|
|
|
|
* SMR Zones can't re-use the free bucket until the sequence has
|
|
|
|
* expired.
|
2013-06-18 04:50:20 +00:00
|
|
|
*/
|
2020-02-19 18:48:46 +00:00
|
|
|
if ((cache_uz_flags(cache) & UMA_ZONE_SMR) == 0 &&
|
2020-01-31 00:49:51 +00:00
|
|
|
cache->uc_freebucket.ucb_cnt != 0) {
|
|
|
|
cache_bucket_swap(&cache->uc_freebucket,
|
|
|
|
&cache->uc_allocbucket);
|
2019-11-26 22:17:02 +00:00
|
|
|
return (true);
|
2002-04-08 02:42:55 +00:00
|
|
|
}
|
2013-06-18 04:50:20 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Discard any empty allocation bucket while we hold no locks.
|
|
|
|
*/
|
2019-12-25 20:50:53 +00:00
|
|
|
bucket = cache_bucket_unload_alloc(cache);
|
2013-06-18 04:50:20 +00:00
|
|
|
critical_exit();
|
2020-02-19 18:48:46 +00:00
|
|
|
|
|
|
|
if (bucket != NULL) {
|
|
|
|
KASSERT(bucket->ub_cnt == 0,
|
|
|
|
("cache_alloc: Entered with non-empty alloc bucket."));
|
2013-06-26 00:57:38 +00:00
|
|
|
bucket_free(zone, bucket, udata);
|
2020-02-19 18:48:46 +00:00
|
|
|
}
|
2013-06-18 04:50:20 +00:00
|
|
|
|
Modify UMA to use critical sections to protect per-CPU caches, rather than
mutexes, which offers lower overhead on both UP and SMP. When allocating
from or freeing to the per-cpu cache, without INVARIANTS enabled, we now
no longer perform any mutex operations, which offers a 1%-3% performance
improvement in a variety of micro-benchmarks. We rely on critical
sections to prevent (a) preemption resulting in reentrant access to UMA on
a single CPU, and (b) migration of the thread during access. In the event
we need to go back to the zone for a new bucket, we release the critical
section to acquire the global zone mutex, and must re-acquire the critical
section and re-evaluate which cache we are accessing in case migration has
occured, or circumstances have changed in the current cache.
Per-CPU cache statistics are now gathered lock-free by the sysctl, which
can result in small races in statistics reporting for caches.
Reviewed by: bmilekic, jeff (somewhat)
Tested by: rwatson, kris, gnn, scottl, mike at sentex dot net, others
2005-04-29 18:56:36 +00:00
|
|
|
/*
|
|
|
|
* Attempt to retrieve the item from the per-CPU cache has failed, so
|
2020-02-19 18:48:46 +00:00
|
|
|
* we must go back to the zone. This requires the zdom lock, so we
|
Modify UMA to use critical sections to protect per-CPU caches, rather than
mutexes, which offers lower overhead on both UP and SMP. When allocating
from or freeing to the per-cpu cache, without INVARIANTS enabled, we now
no longer perform any mutex operations, which offers a 1%-3% performance
improvement in a variety of micro-benchmarks. We rely on critical
sections to prevent (a) preemption resulting in reentrant access to UMA on
a single CPU, and (b) migration of the thread during access. In the event
we need to go back to the zone for a new bucket, we release the critical
section to acquire the global zone mutex, and must re-acquire the critical
section and re-evaluate which cache we are accessing in case migration has
occured, or circumstances have changed in the current cache.
Per-CPU cache statistics are now gathered lock-free by the sysctl, which
can result in small races in statistics reporting for caches.
Reviewed by: bmilekic, jeff (somewhat)
Tested by: rwatson, kris, gnn, scottl, mike at sentex dot net, others
2005-04-29 18:56:36 +00:00
|
|
|
* must drop the critical section, then re-acquire it when we go back
|
|
|
|
* to the cache. Since the critical section is released, we may be
|
|
|
|
* preempted or migrate. As such, make sure not to maintain any
|
|
|
|
* thread-local state specific to the cache from prior to releasing
|
|
|
|
* the critical section.
|
|
|
|
*/
|
2020-02-19 18:48:46 +00:00
|
|
|
domain = PCPU_GET(domain);
|
2020-06-28 21:35:04 +00:00
|
|
|
if ((cache_uz_flags(cache) & UMA_ZONE_ROUNDROBIN) != 0 ||
|
|
|
|
VM_DOMAIN_EMPTY(domain))
|
2020-02-19 18:48:46 +00:00
|
|
|
domain = zone_domain_highest(zone, domain);
|
|
|
|
bucket = cache_fetch_bucket(zone, cache, domain);
|
2020-08-10 20:34:45 +00:00
|
|
|
if (bucket == NULL && zone->uz_bucket_size != 0 && !bucketdisable) {
|
2020-02-19 18:48:46 +00:00
|
|
|
bucket = zone_alloc_bucket(zone, udata, domain, flags);
|
|
|
|
new = true;
|
2020-08-10 20:34:45 +00:00
|
|
|
} else {
|
2020-02-19 18:48:46 +00:00
|
|
|
new = false;
|
2020-08-10 20:34:45 +00:00
|
|
|
}
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
|
2017-06-01 18:36:52 +00:00
|
|
|
CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
|
|
|
|
zone->uz_name, zone, bucket);
|
2020-01-04 03:04:46 +00:00
|
|
|
if (bucket == NULL) {
|
|
|
|
critical_enter();
|
2019-11-26 22:17:02 +00:00
|
|
|
return (false);
|
2020-01-04 03:04:46 +00:00
|
|
|
}
|
2013-06-18 04:50:20 +00:00
|
|
|
|
2002-10-24 07:59:03 +00:00
|
|
|
/*
|
2019-11-26 22:17:02 +00:00
|
|
|
* See if we lost the race or were migrated. Cache the
|
|
|
|
* initialized bucket to make this less likely or claim
|
|
|
|
* the memory directly.
|
2002-10-24 07:59:03 +00:00
|
|
|
*/
|
2020-01-04 03:04:46 +00:00
|
|
|
critical_enter();
|
2019-12-25 20:57:24 +00:00
|
|
|
cache = &zone->uz_cpu[curcpu];
|
2019-12-25 20:50:53 +00:00
|
|
|
if (cache->uc_allocbucket.ucb_bucket == NULL &&
|
2020-02-19 18:48:46 +00:00
|
|
|
((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) == 0 ||
|
2020-06-28 21:35:04 +00:00
|
|
|
(curdomain = PCPU_GET(domain)) == domain ||
|
|
|
|
VM_DOMAIN_EMPTY(curdomain))) {
|
2020-02-19 18:48:46 +00:00
|
|
|
if (new)
|
|
|
|
atomic_add_long(&ZDOM_GET(zone, domain)->uzd_imax,
|
|
|
|
bucket->ub_cnt);
|
2019-12-25 20:50:53 +00:00
|
|
|
cache_bucket_load_alloc(cache, bucket);
|
2019-11-26 22:17:02 +00:00
|
|
|
return (true);
|
2020-02-19 18:48:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We lost the race, release this bucket and start over.
|
|
|
|
*/
|
|
|
|
critical_exit();
|
Improve UMA cache reclamation.
When estimating working set size, measure only allocation batches, not free
batches. Allocation and free patterns can be very different. For example,
ZFS on vm_lowmem event can free to UMA few gigabytes of memory in one call,
but it does not mean it will request the same amount back that fast too, in
fact it won't.
Update working set size on every reclamation call, shrinking caches faster
under pressure. Lack of this caused repeating vm_lowmem events squeezing
more and more memory out of real consumers only to make it stuck in UMA
caches. I saw ZFS drop ARC size in half before previous algorithm after
periodic WSS update decided to reclaim UMA caches.
Introduce voluntary reclamation of UMA caches not used for a long time. For
each zdom track longterm minimal cache size watermark, freeing some unused
items every UMA_TIMEOUT after first 15 minutes without cache misses. Freed
memory can get better use by other consumers. For example, ZFS won't grow
its ARC unless it see free memory, since it does not know it is not really
used. And even if memory is not really needed, periodic free during
inactivity periods should reduce its fragmentation.
Reviewed by: markj, jeff (previous version)
MFC after: 2 weeks
Sponsored by: iXsystems, Inc.
Differential Revision: https://reviews.freebsd.org/D29790
2021-05-02 19:35:28 -04:00
|
|
|
zone_put_bucket(zone, domain, bucket, udata, !new);
|
2020-02-19 18:48:46 +00:00
|
|
|
critical_enter();
|
|
|
|
|
2019-11-26 22:17:02 +00:00
|
|
|
return (true);
|
2002-10-24 07:59:03 +00:00
|
|
|
}
|
|
|
|
|
2018-01-12 23:25:05 +00:00
|
|
|
void *
|
|
|
|
uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags)
|
|
|
|
{
|
2020-10-02 19:04:29 +00:00
|
|
|
#ifdef NUMA
|
|
|
|
uma_bucket_t bucket;
|
|
|
|
uma_zone_domain_t zdom;
|
|
|
|
void *item;
|
|
|
|
#endif
|
2018-01-12 23:25:05 +00:00
|
|
|
|
|
|
|
/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
|
2018-08-26 12:51:46 +00:00
|
|
|
random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
|
2018-01-12 23:25:05 +00:00
|
|
|
|
|
|
|
/* This is the fast path allocation */
|
2020-01-14 02:13:46 +00:00
|
|
|
CTR4(KTR_UMA, "uma_zalloc_domain zone %s(%p) domain %d flags %d",
|
|
|
|
zone->uz_name, zone, domain, flags);
|
2018-01-12 23:25:05 +00:00
|
|
|
|
|
|
|
if (flags & M_WAITOK) {
|
|
|
|
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
|
|
|
|
"uma_zalloc_domain: zone \"%s\"", zone->uz_name);
|
|
|
|
}
|
|
|
|
KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
|
|
|
|
("uma_zalloc_domain: called with spinlock or critical section held"));
|
2020-10-02 19:04:29 +00:00
|
|
|
KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0,
|
|
|
|
("uma_zalloc_domain: called with SMR zone."));
|
|
|
|
#ifdef NUMA
|
|
|
|
KASSERT((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0,
|
|
|
|
("uma_zalloc_domain: called with non-FIRSTTOUCH zone."));
|
|
|
|
|
|
|
|
if (vm_ndomains == 1)
|
|
|
|
return (uma_zalloc_arg(zone, udata, flags));
|
2018-01-12 23:25:05 +00:00
|
|
|
|
2020-10-02 19:04:29 +00:00
|
|
|
/*
|
|
|
|
* Try to allocate from the bucket cache before falling back to the keg.
|
|
|
|
* We could try harder and attempt to allocate from per-CPU caches or
|
|
|
|
* the per-domain cross-domain buckets, but the complexity is probably
|
|
|
|
* not worth it. It is more important that frees of previous
|
|
|
|
* cross-domain allocations do not blow up the cache.
|
|
|
|
*/
|
|
|
|
zdom = zone_domain_lock(zone, domain);
|
|
|
|
if ((bucket = zone_fetch_bucket(zone, zdom, false)) != NULL) {
|
|
|
|
item = bucket->ub_bucket[bucket->ub_cnt - 1];
|
|
|
|
#ifdef INVARIANTS
|
|
|
|
bucket->ub_bucket[bucket->ub_cnt - 1] = NULL;
|
|
|
|
#endif
|
|
|
|
bucket->ub_cnt--;
|
|
|
|
zone_put_bucket(zone, domain, bucket, udata, true);
|
|
|
|
item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata,
|
|
|
|
flags, item);
|
|
|
|
if (item != NULL) {
|
|
|
|
KASSERT(item_domain(item) == domain,
|
|
|
|
("%s: bucket cache item %p from wrong domain",
|
|
|
|
__func__, item));
|
|
|
|
counter_u64_add(zone->uz_allocs, 1);
|
|
|
|
}
|
|
|
|
return (item);
|
|
|
|
}
|
|
|
|
ZDOM_UNLOCK(zdom);
|
2018-01-12 23:25:05 +00:00
|
|
|
return (zone_alloc_item(zone, udata, domain, flags));
|
2020-10-02 19:04:29 +00:00
|
|
|
#else
|
|
|
|
return (uma_zalloc_arg(zone, udata, flags));
|
|
|
|
#endif
|
2018-01-12 23:25:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find a slab with some space. Prefer slabs that are partially used over those
|
|
|
|
* that are totally full. This helps to reduce fragmentation.
|
|
|
|
*
|
|
|
|
* If 'rr' is 1, search all domains starting from 'domain'. Otherwise check
|
|
|
|
* only 'domain'.
|
|
|
|
*/
|
2002-10-24 07:59:03 +00:00
|
|
|
static uma_slab_t
|
2018-10-24 16:41:47 +00:00
|
|
|
keg_first_slab(uma_keg_t keg, int domain, bool rr)
|
2002-10-24 07:59:03 +00:00
|
|
|
{
|
2018-01-12 23:25:05 +00:00
|
|
|
uma_domain_t dom;
|
2002-10-24 07:59:03 +00:00
|
|
|
uma_slab_t slab;
|
2018-01-12 23:25:05 +00:00
|
|
|
int start;
|
|
|
|
|
|
|
|
KASSERT(domain >= 0 && domain < vm_ndomains,
|
|
|
|
("keg_first_slab: domain %d out of range", domain));
|
2020-01-04 03:30:08 +00:00
|
|
|
KEG_LOCK_ASSERT(keg, domain);
|
2018-01-12 23:25:05 +00:00
|
|
|
|
|
|
|
slab = NULL;
|
|
|
|
start = domain;
|
|
|
|
do {
|
|
|
|
dom = &keg->uk_domain[domain];
|
2020-02-11 20:06:33 +00:00
|
|
|
if ((slab = LIST_FIRST(&dom->ud_part_slab)) != NULL)
|
|
|
|
return (slab);
|
|
|
|
if ((slab = LIST_FIRST(&dom->ud_free_slab)) != NULL) {
|
2018-01-12 23:25:05 +00:00
|
|
|
LIST_REMOVE(slab, us_link);
|
2020-02-11 20:06:33 +00:00
|
|
|
dom->ud_free_slabs--;
|
2018-01-12 23:25:05 +00:00
|
|
|
LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
|
|
|
|
return (slab);
|
|
|
|
}
|
|
|
|
if (rr)
|
|
|
|
domain = (domain + 1) % vm_ndomains;
|
|
|
|
} while (domain != start);
|
|
|
|
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
2020-01-04 03:30:08 +00:00
|
|
|
/*
|
|
|
|
* Fetch an existing slab from a free or partial list. Returns with the
|
|
|
|
* keg domain lock held if a slab was found or unlocked if not.
|
|
|
|
*/
|
2018-01-12 23:25:05 +00:00
|
|
|
static uma_slab_t
|
2018-10-24 16:41:47 +00:00
|
|
|
keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags)
|
|
|
|
{
|
2020-01-04 03:30:08 +00:00
|
|
|
uma_slab_t slab;
|
2018-10-24 16:41:47 +00:00
|
|
|
uint32_t reserve;
|
|
|
|
|
2020-01-04 03:30:08 +00:00
|
|
|
/* HASH has a single free list. */
|
2020-01-09 02:03:03 +00:00
|
|
|
if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0)
|
2020-01-04 03:30:08 +00:00
|
|
|
domain = 0;
|
2018-10-24 16:41:47 +00:00
|
|
|
|
2020-01-04 03:30:08 +00:00
|
|
|
KEG_LOCK(keg, domain);
|
2018-10-24 16:41:47 +00:00
|
|
|
reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve;
|
2020-02-11 20:06:33 +00:00
|
|
|
if (keg->uk_domain[domain].ud_free_items <= reserve ||
|
2020-01-04 03:30:08 +00:00
|
|
|
(slab = keg_first_slab(keg, domain, rr)) == NULL) {
|
|
|
|
KEG_UNLOCK(keg, domain);
|
2018-10-24 16:41:47 +00:00
|
|
|
return (NULL);
|
2020-01-04 03:30:08 +00:00
|
|
|
}
|
|
|
|
return (slab);
|
2018-10-24 16:41:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static uma_slab_t
|
|
|
|
keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags)
|
2018-01-12 23:25:05 +00:00
|
|
|
{
|
2018-10-24 16:41:47 +00:00
|
|
|
struct vm_domainset_iter di;
|
2018-01-12 23:25:05 +00:00
|
|
|
uma_slab_t slab;
|
2018-10-24 16:41:47 +00:00
|
|
|
int aflags, domain;
|
|
|
|
bool rr;
|
2002-10-24 07:59:03 +00:00
|
|
|
|
2018-10-24 16:41:47 +00:00
|
|
|
restart:
|
2018-01-12 23:25:05 +00:00
|
|
|
/*
|
2018-10-24 16:41:47 +00:00
|
|
|
* Use the keg's policy if upper layers haven't already specified a
|
|
|
|
* domain (as happens with first-touch zones).
|
|
|
|
*
|
|
|
|
* To avoid races we run the iterator with the keg lock held, but that
|
|
|
|
* means that we cannot allow the vm_domainset layer to sleep. Thus,
|
|
|
|
* clear M_WAITOK and handle low memory conditions locally.
|
2018-01-12 23:25:05 +00:00
|
|
|
*/
|
|
|
|
rr = rdomain == UMA_ANYDOMAIN;
|
|
|
|
if (rr) {
|
2018-10-24 16:41:47 +00:00
|
|
|
aflags = (flags & ~M_WAITOK) | M_NOWAIT;
|
|
|
|
vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
|
|
|
|
&aflags);
|
|
|
|
} else {
|
|
|
|
aflags = flags;
|
|
|
|
domain = rdomain;
|
|
|
|
}
|
2018-01-12 23:25:05 +00:00
|
|
|
|
2018-10-24 16:41:47 +00:00
|
|
|
for (;;) {
|
|
|
|
slab = keg_fetch_free_slab(keg, domain, rr, flags);
|
2019-11-28 07:49:25 +00:00
|
|
|
if (slab != NULL)
|
2002-10-24 07:59:03 +00:00
|
|
|
return (slab);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* M_NOVM means don't ask at all!
|
|
|
|
*/
|
|
|
|
if (flags & M_NOVM)
|
|
|
|
break;
|
|
|
|
|
2019-01-23 18:58:15 +00:00
|
|
|
slab = keg_alloc_slab(keg, zone, domain, flags, aflags);
|
2020-01-04 03:30:08 +00:00
|
|
|
if (slab != NULL)
|
2002-10-24 07:59:03 +00:00
|
|
|
return (slab);
|
2019-12-25 19:26:35 +00:00
|
|
|
if (!rr && (flags & M_WAITOK) == 0)
|
|
|
|
break;
|
2018-10-24 16:41:47 +00:00
|
|
|
if (rr && vm_domainset_iter_policy(&di, &domain) != 0) {
|
|
|
|
if ((flags & M_WAITOK) != 0) {
|
2020-09-08 23:28:09 +00:00
|
|
|
vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0);
|
2018-10-24 16:41:47 +00:00
|
|
|
goto restart;
|
|
|
|
}
|
|
|
|
break;
|
2018-10-01 14:14:21 +00:00
|
|
|
}
|
2002-10-24 07:59:03 +00:00
|
|
|
}
|
2018-01-12 23:25:05 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We might not have been able to get a slab but another cpu
|
|
|
|
* could have while we were unlocked. Check again before we
|
|
|
|
* fail.
|
|
|
|
*/
|
2020-01-04 03:30:08 +00:00
|
|
|
if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL)
|
2018-01-12 23:25:05 +00:00
|
|
|
return (slab);
|
2020-01-04 03:30:08 +00:00
|
|
|
|
2018-01-12 23:25:05 +00:00
|
|
|
return (NULL);
|
2002-10-24 07:59:03 +00:00
|
|
|
}
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2003-07-30 18:55:15 +00:00
|
|
|
static void *
|
2013-06-17 03:43:47 +00:00
|
|
|
slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
|
2002-10-24 07:59:03 +00:00
|
|
|
{
|
2018-01-12 23:25:05 +00:00
|
|
|
uma_domain_t dom;
|
2002-10-24 07:59:03 +00:00
|
|
|
void *item;
|
2020-01-14 02:14:15 +00:00
|
|
|
int freei;
|
2004-01-30 16:26:29 +00:00
|
|
|
|
2020-01-04 03:30:08 +00:00
|
|
|
KEG_LOCK_ASSERT(keg, slab->us_domain);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
|
2020-01-04 03:30:08 +00:00
|
|
|
dom = &keg->uk_domain[slab->us_domain];
|
2019-12-02 22:44:34 +00:00
|
|
|
freei = BIT_FFS(keg->uk_ipers, &slab->us_free) - 1;
|
|
|
|
BIT_CLR(keg->uk_ipers, freei, &slab->us_free);
|
2019-12-08 01:15:06 +00:00
|
|
|
item = slab_item(slab, keg, freei);
|
2002-10-24 07:59:03 +00:00
|
|
|
slab->us_freecount--;
|
2020-02-11 20:06:33 +00:00
|
|
|
dom->ud_free_items--;
|
2013-06-13 21:05:38 +00:00
|
|
|
|
2020-02-11 20:06:33 +00:00
|
|
|
/*
|
|
|
|
* Move this slab to the full list. It must be on the partial list, so
|
|
|
|
* we do not need to update the free slab count. In particular,
|
|
|
|
* keg_fetch_slab() always returns slabs on the partial list.
|
|
|
|
*/
|
2002-10-24 07:59:03 +00:00
|
|
|
if (slab->us_freecount == 0) {
|
|
|
|
LIST_REMOVE(slab, us_link);
|
2018-01-12 23:25:05 +00:00
|
|
|
LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link);
|
2002-10-24 07:59:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return (item);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2019-12-04 18:40:05 +00:00
|
|
|
zone_import(void *arg, void **bucket, int max, int domain, int flags)
|
2002-10-24 07:59:03 +00:00
|
|
|
{
|
2020-01-04 03:30:08 +00:00
|
|
|
uma_domain_t dom;
|
2019-12-04 18:40:05 +00:00
|
|
|
uma_zone_t zone;
|
2002-10-24 07:59:03 +00:00
|
|
|
uma_slab_t slab;
|
2009-01-25 09:11:24 +00:00
|
|
|
uma_keg_t keg;
|
2018-07-07 13:37:44 +00:00
|
|
|
#ifdef NUMA
|
2018-01-12 23:25:05 +00:00
|
|
|
int stripe;
|
2018-07-07 13:37:44 +00:00
|
|
|
#endif
|
2013-06-17 03:43:47 +00:00
|
|
|
int i;
|
2002-06-17 22:02:41 +00:00
|
|
|
|
2019-12-04 18:40:05 +00:00
|
|
|
zone = arg;
|
2013-06-17 03:43:47 +00:00
|
|
|
slab = NULL;
|
2019-11-28 07:49:25 +00:00
|
|
|
keg = zone->uz_keg;
|
2013-06-20 19:08:12 +00:00
|
|
|
/* Try to keep the buckets totally full */
|
2013-06-17 03:43:47 +00:00
|
|
|
for (i = 0; i < max; ) {
|
2019-11-28 07:49:25 +00:00
|
|
|
if ((slab = keg_fetch_slab(keg, zone, domain, flags)) == NULL)
|
2013-06-17 03:43:47 +00:00
|
|
|
break;
|
2018-07-07 13:37:44 +00:00
|
|
|
#ifdef NUMA
|
2018-01-12 23:25:05 +00:00
|
|
|
stripe = howmany(max, vm_ndomains);
|
2018-07-07 13:37:44 +00:00
|
|
|
#endif
|
2020-01-04 03:30:08 +00:00
|
|
|
dom = &keg->uk_domain[slab->us_domain];
|
2020-10-19 16:55:03 +00:00
|
|
|
do {
|
2013-06-17 03:43:47 +00:00
|
|
|
bucket[i++] = slab_alloc_item(keg, slab);
|
2020-10-19 16:55:03 +00:00
|
|
|
if (dom->ud_free_items <= keg->uk_reserve) {
|
|
|
|
/*
|
|
|
|
* Avoid depleting the reserve after a
|
|
|
|
* successful item allocation, even if
|
|
|
|
* M_USE_RESERVE is specified.
|
|
|
|
*/
|
|
|
|
KEG_UNLOCK(keg, slab->us_domain);
|
|
|
|
goto out;
|
|
|
|
}
|
2018-01-14 03:36:03 +00:00
|
|
|
#ifdef NUMA
|
2018-01-12 23:25:05 +00:00
|
|
|
/*
|
|
|
|
* If the zone is striped we pick a new slab for every
|
|
|
|
* N allocations. Eliminating this conditional will
|
|
|
|
* instead pick a new domain for each bucket rather
|
|
|
|
* than stripe within each bucket. The current option
|
|
|
|
* produces more fragmentation and requires more cpu
|
|
|
|
* time but yields better distribution.
|
|
|
|
*/
|
2020-01-04 18:48:13 +00:00
|
|
|
if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0 &&
|
2018-01-12 23:25:05 +00:00
|
|
|
vm_ndomains > 1 && --stripe == 0)
|
|
|
|
break;
|
|
|
|
#endif
|
2020-10-19 16:55:03 +00:00
|
|
|
} while (slab->us_freecount != 0 && i < max);
|
2020-01-04 03:30:08 +00:00
|
|
|
KEG_UNLOCK(keg, slab->us_domain);
|
2020-10-19 16:55:03 +00:00
|
|
|
|
2018-01-12 23:25:05 +00:00
|
|
|
/* Don't block if we allocated any successfully. */
|
2013-06-17 03:43:47 +00:00
|
|
|
flags &= ~M_WAITOK;
|
|
|
|
flags |= M_NOWAIT;
|
2002-10-24 07:59:03 +00:00
|
|
|
}
|
2020-10-19 16:55:03 +00:00
|
|
|
out:
|
2013-06-17 03:43:47 +00:00
|
|
|
return i;
|
|
|
|
}
|
|
|
|
|
2020-01-04 03:04:46 +00:00
|
|
|
static int
|
|
|
|
zone_alloc_limit_hard(uma_zone_t zone, int count, int flags)
|
|
|
|
{
|
|
|
|
uint64_t old, new, total, max;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The hard case. We're going to sleep because there were existing
|
|
|
|
* sleepers or because we ran out of items. This routine enforces
|
|
|
|
* fairness by keeping fifo order.
|
|
|
|
*
|
|
|
|
* First release our ill gotten gains and make some noise.
|
|
|
|
*/
|
|
|
|
for (;;) {
|
|
|
|
zone_free_limit(zone, count);
|
|
|
|
zone_log_warning(zone);
|
|
|
|
zone_maxaction(zone);
|
|
|
|
if (flags & M_NOWAIT)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We need to allocate an item or set ourself as a sleeper
|
|
|
|
* while the sleepq lock is held to avoid wakeup races. This
|
|
|
|
* is essentially a home rolled semaphore.
|
|
|
|
*/
|
|
|
|
sleepq_lock(&zone->uz_max_items);
|
|
|
|
old = zone->uz_items;
|
|
|
|
do {
|
|
|
|
MPASS(UZ_ITEMS_SLEEPERS(old) < UZ_ITEMS_SLEEPERS_MAX);
|
|
|
|
/* Cache the max since we will evaluate twice. */
|
|
|
|
max = zone->uz_max_items;
|
|
|
|
if (UZ_ITEMS_SLEEPERS(old) != 0 ||
|
|
|
|
UZ_ITEMS_COUNT(old) >= max)
|
|
|
|
new = old + UZ_ITEMS_SLEEPER;
|
|
|
|
else
|
|
|
|
new = old + MIN(count, max - old);
|
|
|
|
} while (atomic_fcmpset_64(&zone->uz_items, &old, new) == 0);
|
|
|
|
|
|
|
|
/* We may have successfully allocated under the sleepq lock. */
|
|
|
|
if (UZ_ITEMS_SLEEPERS(new) == 0) {
|
|
|
|
sleepq_release(&zone->uz_max_items);
|
|
|
|
return (new - old);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is in a different cacheline from uz_items so that we
|
|
|
|
* don't constantly invalidate the fastpath cacheline when we
|
|
|
|
* adjust item counts. This could be limited to toggling on
|
|
|
|
* transitions.
|
|
|
|
*/
|
|
|
|
atomic_add_32(&zone->uz_sleepers, 1);
|
|
|
|
atomic_add_64(&zone->uz_sleeps, 1);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We have added ourselves as a sleeper. The sleepq lock
|
|
|
|
* protects us from wakeup races. Sleep now and then retry.
|
|
|
|
*/
|
|
|
|
sleepq_add(&zone->uz_max_items, NULL, "zonelimit", 0, 0);
|
|
|
|
sleepq_wait(&zone->uz_max_items, PVM);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* After wakeup, remove ourselves as a sleeper and try
|
|
|
|
* again. We no longer have the sleepq lock for protection.
|
|
|
|
*
|
|
|
|
* Subract ourselves as a sleeper while attempting to add
|
|
|
|
* our count.
|
|
|
|
*/
|
|
|
|
atomic_subtract_32(&zone->uz_sleepers, 1);
|
|
|
|
old = atomic_fetchadd_64(&zone->uz_items,
|
|
|
|
-(UZ_ITEMS_SLEEPER - count));
|
|
|
|
/* We're no longer a sleeper. */
|
|
|
|
old -= UZ_ITEMS_SLEEPER;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're still at the limit, restart. Notably do not
|
|
|
|
* block on other sleepers. Cache the max value to protect
|
|
|
|
* against changes via sysctl.
|
|
|
|
*/
|
|
|
|
total = UZ_ITEMS_COUNT(old);
|
|
|
|
max = zone->uz_max_items;
|
|
|
|
if (total >= max)
|
|
|
|
continue;
|
|
|
|
/* Truncate if necessary, otherwise wake other sleepers. */
|
|
|
|
if (total + count > max) {
|
|
|
|
zone_free_limit(zone, total + count - max);
|
|
|
|
count = max - total;
|
|
|
|
} else if (total + count < max && UZ_ITEMS_SLEEPERS(old) != 0)
|
|
|
|
wakeup_one(&zone->uz_max_items);
|
|
|
|
|
|
|
|
return (count);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate 'count' items from our max_items limit. Returns the number
|
|
|
|
* available. If M_NOWAIT is not specified it will sleep until at least
|
|
|
|
* one item can be allocated.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
zone_alloc_limit(uma_zone_t zone, int count, int flags)
|
|
|
|
{
|
|
|
|
uint64_t old;
|
|
|
|
uint64_t max;
|
|
|
|
|
|
|
|
max = zone->uz_max_items;
|
|
|
|
MPASS(max > 0);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We expect normal allocations to succeed with a simple
|
|
|
|
* fetchadd.
|
|
|
|
*/
|
|
|
|
old = atomic_fetchadd_64(&zone->uz_items, count);
|
|
|
|
if (__predict_true(old + count <= max))
|
|
|
|
return (count);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we had some items and no sleepers just return the
|
|
|
|
* truncated value. We have to release the excess space
|
|
|
|
* though because that may wake sleepers who weren't woken
|
|
|
|
* because we were temporarily over the limit.
|
|
|
|
*/
|
|
|
|
if (old < max) {
|
|
|
|
zone_free_limit(zone, (old + count) - max);
|
|
|
|
return (max - old);
|
|
|
|
}
|
|
|
|
return (zone_alloc_limit_hard(zone, count, flags));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free a number of items back to the limit.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zone_free_limit(uma_zone_t zone, int count)
|
|
|
|
{
|
|
|
|
uint64_t old;
|
|
|
|
|
|
|
|
MPASS(count > 0);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* In the common case we either have no sleepers or
|
|
|
|
* are still over the limit and can just return.
|
|
|
|
*/
|
|
|
|
old = atomic_fetchadd_64(&zone->uz_items, -count);
|
|
|
|
if (__predict_true(UZ_ITEMS_SLEEPERS(old) == 0 ||
|
|
|
|
UZ_ITEMS_COUNT(old) - count >= zone->uz_max_items))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Moderate the rate of wakeups. Sleepers will continue
|
|
|
|
* to generate wakeups if necessary.
|
|
|
|
*/
|
|
|
|
wakeup_one(&zone->uz_max_items);
|
|
|
|
}
|
|
|
|
|
2013-06-18 04:50:20 +00:00
|
|
|
static uma_bucket_t
|
2019-11-26 22:17:02 +00:00
|
|
|
zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags)
|
2013-06-17 03:43:47 +00:00
|
|
|
{
|
|
|
|
uma_bucket_t bucket;
|
2021-04-13 17:39:50 -04:00
|
|
|
int error, maxbucket, cnt;
|
2002-10-24 07:59:03 +00:00
|
|
|
|
2020-01-14 02:13:46 +00:00
|
|
|
CTR3(KTR_UMA, "zone_alloc_bucket zone %s(%p) domain %d", zone->uz_name,
|
|
|
|
zone, domain);
|
2018-10-01 14:14:21 +00:00
|
|
|
|
2019-08-06 21:50:34 +00:00
|
|
|
/* Avoid allocs targeting empty domains. */
|
|
|
|
if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
|
|
|
|
domain = UMA_ANYDOMAIN;
|
2020-06-28 21:35:04 +00:00
|
|
|
else if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0)
|
2020-02-19 18:48:46 +00:00
|
|
|
domain = UMA_ANYDOMAIN;
|
2019-08-06 21:50:34 +00:00
|
|
|
|
2020-01-04 03:04:46 +00:00
|
|
|
if (zone->uz_max_items > 0)
|
|
|
|
maxbucket = zone_alloc_limit(zone, zone->uz_bucket_size,
|
|
|
|
M_NOWAIT);
|
|
|
|
else
|
2019-11-28 00:19:09 +00:00
|
|
|
maxbucket = zone->uz_bucket_size;
|
2020-01-04 03:04:46 +00:00
|
|
|
if (maxbucket == 0)
|
|
|
|
return (false);
|
2019-11-26 22:17:02 +00:00
|
|
|
|
2013-06-26 00:57:38 +00:00
|
|
|
/* Don't wait for buckets, preserve caller's NOVM setting. */
|
|
|
|
bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
|
2019-11-26 22:17:02 +00:00
|
|
|
if (bucket == NULL) {
|
|
|
|
cnt = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
2013-06-17 03:43:47 +00:00
|
|
|
|
|
|
|
bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
|
2019-11-26 22:17:02 +00:00
|
|
|
MIN(maxbucket, bucket->ub_entries), domain, flags);
|
2002-04-08 02:42:55 +00:00
|
|
|
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
/*
|
2013-06-17 03:43:47 +00:00
|
|
|
* Initialize the memory if necessary.
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
*/
|
2013-06-17 03:43:47 +00:00
|
|
|
if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
int i;
|
|
|
|
|
2021-04-13 17:39:50 -04:00
|
|
|
for (i = 0; i < bucket->ub_cnt; i++) {
|
|
|
|
kasan_mark_item_valid(zone, bucket->ub_bucket[i]);
|
|
|
|
error = zone->uz_init(bucket->ub_bucket[i],
|
|
|
|
zone->uz_size, flags);
|
|
|
|
kasan_mark_item_invalid(zone, bucket->ub_bucket[i]);
|
|
|
|
if (error != 0)
|
2004-08-02 00:18:36 +00:00
|
|
|
break;
|
2021-04-13 17:39:50 -04:00
|
|
|
}
|
|
|
|
|
2004-08-02 00:18:36 +00:00
|
|
|
/*
|
|
|
|
* If we couldn't initialize the whole bucket, put the
|
|
|
|
* rest back onto the freelist.
|
|
|
|
*/
|
|
|
|
if (i != bucket->ub_cnt) {
|
2013-06-20 19:08:12 +00:00
|
|
|
zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
|
2013-06-17 03:43:47 +00:00
|
|
|
bucket->ub_cnt - i);
|
2004-10-27 21:19:35 +00:00
|
|
|
#ifdef INVARIANTS
|
2013-06-17 03:43:47 +00:00
|
|
|
bzero(&bucket->ub_bucket[i],
|
|
|
|
sizeof(void *) * (bucket->ub_cnt - i));
|
2004-10-27 21:19:35 +00:00
|
|
|
#endif
|
2004-08-02 00:18:36 +00:00
|
|
|
bucket->ub_cnt = i;
|
|
|
|
}
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
}
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2019-11-26 22:17:02 +00:00
|
|
|
cnt = bucket->ub_cnt;
|
2013-11-27 20:16:18 +00:00
|
|
|
if (bucket->ub_cnt == 0) {
|
|
|
|
bucket_free(zone, bucket, udata);
|
2019-01-15 18:24:34 +00:00
|
|
|
counter_u64_add(zone->uz_fails, 1);
|
2019-11-26 22:17:02 +00:00
|
|
|
bucket = NULL;
|
|
|
|
}
|
|
|
|
out:
|
2020-01-04 03:04:46 +00:00
|
|
|
if (zone->uz_max_items > 0 && cnt < maxbucket)
|
|
|
|
zone_free_limit(zone, maxbucket - cnt);
|
2002-10-24 07:59:03 +00:00
|
|
|
|
2013-06-18 04:50:20 +00:00
|
|
|
return (bucket);
|
2002-10-24 07:59:03 +00:00
|
|
|
}
|
2013-06-18 04:50:20 +00:00
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/*
|
2013-06-17 03:43:47 +00:00
|
|
|
* Allocates a single item from a zone.
|
2002-03-19 09:11:49 +00:00
|
|
|
*
|
|
|
|
* Arguments
|
|
|
|
* zone The zone to alloc for.
|
|
|
|
* udata The data to be passed to the constructor.
|
2018-01-12 23:25:05 +00:00
|
|
|
* domain The domain to allocate from or UMA_ANYDOMAIN.
|
2003-02-19 05:47:46 +00:00
|
|
|
* flags M_WAITOK, M_NOWAIT, M_ZERO.
|
2002-03-19 09:11:49 +00:00
|
|
|
*
|
|
|
|
* Returns
|
|
|
|
* NULL if there is no memory and M_NOWAIT is set
|
2002-10-24 07:59:03 +00:00
|
|
|
* An item if successful
|
2002-03-19 09:11:49 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
static void *
|
2018-01-12 23:25:05 +00:00
|
|
|
zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
|
|
|
void *item;
|
|
|
|
|
2020-08-21 18:31:57 +00:00
|
|
|
if (zone->uz_max_items > 0 && zone_alloc_limit(zone, 1, flags) == 0) {
|
|
|
|
counter_u64_add(zone->uz_fails, 1);
|
2020-01-04 03:04:46 +00:00
|
|
|
return (NULL);
|
2020-08-21 18:31:57 +00:00
|
|
|
}
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2019-08-06 21:50:34 +00:00
|
|
|
/* Avoid allocs targeting empty domains. */
|
|
|
|
if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
|
|
|
|
domain = UMA_ANYDOMAIN;
|
|
|
|
|
2018-01-12 23:25:05 +00:00
|
|
|
if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
|
2019-11-26 22:17:02 +00:00
|
|
|
goto fail_cnt;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
/*
|
|
|
|
* We have to call both the zone's init (not the keg's init)
|
|
|
|
* and the zone's ctor. This is because the item is going from
|
|
|
|
* a keg slab directly to the user, and the user is expecting it
|
|
|
|
* to be both zone-init'd as well as zone-ctor'd.
|
|
|
|
*/
|
2004-08-02 00:18:36 +00:00
|
|
|
if (zone->uz_init != NULL) {
|
2021-04-13 17:39:50 -04:00
|
|
|
int error;
|
|
|
|
|
|
|
|
kasan_mark_item_valid(zone, item);
|
|
|
|
error = zone->uz_init(item, zone->uz_size, flags);
|
|
|
|
kasan_mark_item_invalid(zone, item);
|
|
|
|
if (error != 0) {
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT);
|
2019-11-26 22:17:02 +00:00
|
|
|
goto fail_cnt;
|
2004-08-02 00:18:36 +00:00
|
|
|
}
|
|
|
|
}
|
2020-01-31 00:49:51 +00:00
|
|
|
item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata, flags,
|
|
|
|
item);
|
2019-11-26 22:17:02 +00:00
|
|
|
if (item == NULL)
|
2018-06-08 00:15:08 +00:00
|
|
|
goto fail;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2019-01-15 18:24:34 +00:00
|
|
|
counter_u64_add(zone->uz_allocs, 1);
|
2017-06-01 18:36:52 +00:00
|
|
|
CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item,
|
|
|
|
zone->uz_name, zone);
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
return (item);
|
2013-06-17 03:43:47 +00:00
|
|
|
|
2019-11-26 22:17:02 +00:00
|
|
|
fail_cnt:
|
|
|
|
counter_u64_add(zone->uz_fails, 1);
|
2013-06-17 03:43:47 +00:00
|
|
|
fail:
|
2020-01-04 03:04:46 +00:00
|
|
|
if (zone->uz_max_items > 0)
|
|
|
|
zone_free_limit(zone, 1);
|
2017-06-01 18:36:52 +00:00
|
|
|
CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
|
|
|
|
zone->uz_name, zone);
|
2020-01-04 03:04:46 +00:00
|
|
|
|
2013-06-17 03:43:47 +00:00
|
|
|
return (NULL);
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
2020-01-31 00:49:51 +00:00
|
|
|
/* See uma.h */
|
|
|
|
void
|
|
|
|
uma_zfree_smr(uma_zone_t zone, void *item)
|
|
|
|
{
|
|
|
|
uma_cache_t cache;
|
|
|
|
uma_cache_bucket_t bucket;
|
2020-02-19 18:48:46 +00:00
|
|
|
int itemdomain, uz_flags;
|
2020-01-31 00:49:51 +00:00
|
|
|
|
|
|
|
#ifdef UMA_ZALLOC_DEBUG
|
|
|
|
KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0,
|
2020-10-02 19:03:42 +00:00
|
|
|
("uma_zfree_smr: called with non-SMR zone."));
|
2020-01-31 00:49:51 +00:00
|
|
|
KASSERT(item != NULL, ("uma_zfree_smr: Called with NULL pointer."));
|
2020-02-19 18:48:46 +00:00
|
|
|
SMR_ASSERT_NOT_ENTERED(zone->uz_smr);
|
2020-01-31 00:49:51 +00:00
|
|
|
if (uma_zfree_debug(zone, item, NULL) == EJUSTRETURN)
|
|
|
|
return;
|
|
|
|
#endif
|
|
|
|
cache = &zone->uz_cpu[curcpu];
|
|
|
|
uz_flags = cache_uz_flags(cache);
|
2020-02-19 18:48:46 +00:00
|
|
|
itemdomain = 0;
|
2020-01-31 00:49:51 +00:00
|
|
|
#ifdef NUMA
|
|
|
|
if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0)
|
Fix boot on systems where NUMA domain 0 is unpopulated.
- Add vm_phys_early_add_seg(), complementing vm_phys_early_alloc(), to
ensure that segments registered during hammer_time() are placed in the
right domain. Otherwise, since the SRAT is not parsed at that point,
we just add them to domain 0, which may be incorrect and results in a
domain with only several MB worth of memory.
- Fix uma_startup1() to try allocating memory for zones from any domain.
If domain 0 is unpopulated, the allocation will simply fail, resulting
in a page fault slightly later during boot.
- Change _vm_phys_domain() to return -1 for addresses not covered by the
affinity table, and change vm_phys_early_alloc() to handle wildcard
domains. This is necessary on amd64, where the page array is dense
and pmap_page_array_startup() may allocate page table pages for
non-existent page frames.
Reported and tested by: Rafael Kitover <rkitover@gmail.com>
Reviewed by: cem (earlier version), kib
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D25001
2020-05-28 19:41:00 +00:00
|
|
|
itemdomain = item_domain(item);
|
2020-01-31 00:49:51 +00:00
|
|
|
#endif
|
|
|
|
critical_enter();
|
|
|
|
do {
|
|
|
|
cache = &zone->uz_cpu[curcpu];
|
|
|
|
/* SMR Zones must free to the free bucket. */
|
|
|
|
bucket = &cache->uc_freebucket;
|
|
|
|
#ifdef NUMA
|
|
|
|
if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 &&
|
2020-02-19 18:48:46 +00:00
|
|
|
PCPU_GET(domain) != itemdomain) {
|
2020-01-31 00:49:51 +00:00
|
|
|
bucket = &cache->uc_crossbucket;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) {
|
|
|
|
cache_bucket_push(cache, bucket, item);
|
|
|
|
critical_exit();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
} while (cache_free(zone, cache, NULL, item, itemdomain));
|
|
|
|
critical_exit();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If nothing else caught this, we'll just do an internal free.
|
|
|
|
*/
|
|
|
|
zone_free_item(zone, item, NULL, SKIP_NONE);
|
|
|
|
}
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/* See uma.h */
|
|
|
|
void
|
|
|
|
uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
|
|
|
|
{
|
|
|
|
uma_cache_t cache;
|
2019-12-25 20:50:53 +00:00
|
|
|
uma_cache_bucket_t bucket;
|
2020-02-19 18:48:46 +00:00
|
|
|
int itemdomain, uz_flags;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2015-08-22 12:59:05 +00:00
|
|
|
/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
|
2018-08-26 12:51:46 +00:00
|
|
|
random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
|
This is the much-discussed major upgrade to the random(4) device, known to you all as /dev/random.
This code has had an extensive rewrite and a good series of reviews, both by the author and other parties. This means a lot of code has been simplified. Pluggable structures for high-rate entropy generators are available, and it is most definitely not the case that /dev/random can be driven by only a hardware source any more. This has been designed out of the device. Hardware sources are stirred into the CSPRNG (Yarrow, Fortuna) like any other entropy source. Pluggable modules may be written by third parties for additional sources.
The harvesting structures and consequently the locking have been simplified. Entropy harvesting is done in a more general way (the documentation for this will follow). There is some GREAT entropy to be had in the UMA allocator, but it is disabled for now as messing with that is likely to annoy many people.
The venerable (but effective) Yarrow algorithm, which is no longer supported by its authors now has an alternative, Fortuna. For now, Yarrow is retained as the default algorithm, but this may be changed using a kernel option. It is intended to make Fortuna the default algorithm for 11.0. Interested parties are encouraged to read ISBN 978-0-470-47424-2 "Cryptography Engineering" By Ferguson, Schneier and Kohno for Fortuna's gory details. Heck, read it anyway.
Many thanks to Arthur Mesh who did early grunt work, and who got caught in the crossfire rather more than he deserved to.
My thanks also to folks who helped me thresh this out on whiteboards and in the odd "Hallway track", or otherwise.
My Nomex pants are on. Let the feedback commence!
Reviewed by: trasz,des(partial),imp(partial?),rwatson(partial?)
Approved by: so(des)
2014-10-30 21:21:53 +00:00
|
|
|
|
2020-01-14 02:13:46 +00:00
|
|
|
CTR2(KTR_UMA, "uma_zfree_arg zone %s(%p)", zone->uz_name, zone);
|
2004-08-06 21:52:38 +00:00
|
|
|
|
2020-01-31 00:49:51 +00:00
|
|
|
#ifdef UMA_ZALLOC_DEBUG
|
|
|
|
KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0,
|
2020-10-02 19:03:42 +00:00
|
|
|
("uma_zfree_arg: called with SMR zone."));
|
2020-01-31 00:49:51 +00:00
|
|
|
if (uma_zfree_debug(zone, item, udata) == EJUSTRETURN)
|
|
|
|
return;
|
|
|
|
#endif
|
2010-10-19 16:06:00 +00:00
|
|
|
/* uma_zfree(..., NULL) does nothing, to match free(9). */
|
|
|
|
if (item == NULL)
|
|
|
|
return;
|
2019-12-25 20:57:24 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We are accessing the per-cpu cache without a critical section to
|
|
|
|
* fetch size and flags. This is acceptable, if we are preempted we
|
|
|
|
* will simply read another cpu's line.
|
|
|
|
*/
|
|
|
|
cache = &zone->uz_cpu[curcpu];
|
|
|
|
uz_flags = cache_uz_flags(cache);
|
2020-01-31 00:49:51 +00:00
|
|
|
if (UMA_ALWAYS_CTORDTOR ||
|
|
|
|
__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0))
|
2019-12-25 20:57:24 +00:00
|
|
|
item_dtor(zone, item, cache_uz_size(cache), udata, SKIP_NONE);
|
2013-06-13 21:05:38 +00:00
|
|
|
|
2002-04-14 01:56:25 +00:00
|
|
|
/*
|
|
|
|
* The race here is acceptable. If we miss it we'll just have to wait
|
|
|
|
* a little longer for the limits to be reset.
|
|
|
|
*/
|
2019-12-25 20:57:24 +00:00
|
|
|
if (__predict_false(uz_flags & UMA_ZFLAG_LIMIT)) {
|
2020-12-06 22:45:22 +00:00
|
|
|
if (atomic_load_32(&zone->uz_sleepers) > 0)
|
2019-12-25 20:57:24 +00:00
|
|
|
goto zfree_item;
|
|
|
|
}
|
2002-04-14 01:56:25 +00:00
|
|
|
|
Modify UMA to use critical sections to protect per-CPU caches, rather than
mutexes, which offers lower overhead on both UP and SMP. When allocating
from or freeing to the per-cpu cache, without INVARIANTS enabled, we now
no longer perform any mutex operations, which offers a 1%-3% performance
improvement in a variety of micro-benchmarks. We rely on critical
sections to prevent (a) preemption resulting in reentrant access to UMA on
a single CPU, and (b) migration of the thread during access. In the event
we need to go back to the zone for a new bucket, we release the critical
section to acquire the global zone mutex, and must re-acquire the critical
section and re-evaluate which cache we are accessing in case migration has
occurred, or circumstances have changed in the current cache.
Per-CPU cache statistics are now gathered lock-free by the sysctl, which
can result in small races in statistics reporting for caches.
Reviewed by: bmilekic, jeff (somewhat)
Tested by: rwatson, kris, gnn, scottl, mike at sentex dot net, others
2005-04-29 18:56:36 +00:00
|
|
|
/*
|
|
|
|
* If possible, free to the per-CPU cache. There are two
|
|
|
|
* requirements for safe access to the per-CPU cache: (1) the thread
|
|
|
|
* accessing the cache must not be preempted or yield during access,
|
|
|
|
* and (2) the thread must not migrate CPUs without switching which
|
|
|
|
* cache it accesses. We rely on a critical section to prevent
|
|
|
|
* preemption and migration. We release the critical section in
|
|
|
|
* order to acquire the zone mutex if we are unable to free to the
|
|
|
|
* current cache; when we re-acquire the critical section, we must
|
|
|
|
* detect and handle migration if it has occurred.
|
|
|
|
*/
|
2020-02-19 18:48:46 +00:00
|
|
|
itemdomain = 0;
|
2020-01-04 18:48:13 +00:00
|
|
|
#ifdef NUMA
|
|
|
|
if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0)
|
Fix boot on systems where NUMA domain 0 is unpopulated.
- Add vm_phys_early_add_seg(), complementing vm_phys_early_alloc(), to
ensure that segments registered during hammer_time() are placed in the
right domain. Otherwise, since the SRAT is not parsed at that point,
we just add them to domain 0, which may be incorrect and results in a
domain with only several MB worth of memory.
- Fix uma_startup1() to try allocating memory for zones from any domain.
If domain 0 is unpopulated, the allocation will simply fail, resulting
in a page fault slightly later during boot.
- Change _vm_phys_domain() to return -1 for addresses not covered by the
affinity table, and change vm_phys_early_alloc() to handle wildcard
domains. This is necessary on amd64, where the page array is dense
and pmap_page_array_startup() may allocate page table pages for
non-existent page frames.
Reported and tested by: Rafael Kitover <rkitover@gmail.com>
Reviewed by: cem (earlier version), kib
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D25001
2020-05-28 19:41:00 +00:00
|
|
|
itemdomain = item_domain(item);
|
2020-01-04 18:48:13 +00:00
|
|
|
#endif
|
Modify UMA to use critical sections to protect per-CPU caches, rather than
mutexes, which offers lower overhead on both UP and SMP. When allocating
from or freeing to the per-cpu cache, without INVARIANTS enabled, we now
no longer perform any mutex operations, which offers a 1%-3% performance
improvement in a variety of micro-benchmarks. We rely on critical
sections to prevent (a) preemption resulting in reentrant access to UMA on
a single CPU, and (b) migration of the thread during access. In the event
we need to go back to the zone for a new bucket, we release the critical
section to acquire the global zone mutex, and must re-acquire the critical
section and re-evaluate which cache we are accessing in case migration has
occurred, or circumstances have changed in the current cache.
Per-CPU cache statistics are now gathered lock-free by the sysctl, which
can result in small races in statistics reporting for caches.
Reviewed by: bmilekic, jeff (somewhat)
Tested by: rwatson, kris, gnn, scottl, mike at sentex dot net, others
2005-04-29 18:56:36 +00:00
|
|
|
critical_enter();
|
2019-11-27 23:19:06 +00:00
|
|
|
do {
|
2019-12-25 20:57:24 +00:00
|
|
|
cache = &zone->uz_cpu[curcpu];
|
2020-01-31 00:49:51 +00:00
|
|
|
/*
|
|
|
|
* Try to free into the allocbucket first to give LIFO
|
|
|
|
* ordering for cache-hot datastructures. Spill over
|
|
|
|
* into the freebucket if necessary. Alloc will swap
|
|
|
|
* them if one runs dry.
|
|
|
|
*/
|
|
|
|
bucket = &cache->uc_allocbucket;
|
2020-01-04 18:48:13 +00:00
|
|
|
#ifdef NUMA
|
|
|
|
if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 &&
|
2020-02-19 18:48:46 +00:00
|
|
|
PCPU_GET(domain) != itemdomain) {
|
2019-12-25 20:50:53 +00:00
|
|
|
bucket = &cache->uc_crossbucket;
|
2019-11-27 23:19:06 +00:00
|
|
|
} else
|
2019-08-06 21:50:34 +00:00
|
|
|
#endif
|
2020-02-27 08:23:10 +00:00
|
|
|
if (bucket->ucb_cnt == bucket->ucb_entries &&
|
|
|
|
cache->uc_freebucket.ucb_cnt <
|
|
|
|
cache->uc_freebucket.ucb_entries)
|
|
|
|
cache_bucket_swap(&cache->uc_freebucket,
|
|
|
|
&cache->uc_allocbucket);
|
2019-12-25 20:50:53 +00:00
|
|
|
if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) {
|
|
|
|
cache_bucket_push(cache, bucket, item);
|
2019-11-27 23:19:06 +00:00
|
|
|
critical_exit();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
} while (cache_free(zone, cache, udata, item, itemdomain));
|
|
|
|
critical_exit();
|
|
|
|
|
2013-06-18 04:50:20 +00:00
|
|
|
/*
|
2019-11-27 23:19:06 +00:00
|
|
|
* If nothing else caught this, we'll just do an internal free.
|
2013-06-18 04:50:20 +00:00
|
|
|
*/
|
2019-11-27 23:19:06 +00:00
|
|
|
zfree_item:
|
|
|
|
zone_free_item(zone, item, udata, SKIP_DTOR);
|
|
|
|
}
|
|
|
|
|
2020-01-04 18:48:13 +00:00
|
|
|
#ifdef NUMA
|
2020-01-04 07:56:28 +00:00
|
|
|
/*
|
|
|
|
* sort crossdomain free buckets to domain correct buckets and cache
|
|
|
|
* them.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zone_free_cross(uma_zone_t zone, uma_bucket_t bucket, void *udata)
|
|
|
|
{
|
2020-11-30 16:18:33 +00:00
|
|
|
struct uma_bucketlist emptybuckets, fullbuckets;
|
2020-01-04 07:56:28 +00:00
|
|
|
uma_zone_domain_t zdom;
|
|
|
|
uma_bucket_t b;
|
2020-02-13 20:58:51 +00:00
|
|
|
smr_seq_t seq;
|
2020-01-04 07:56:28 +00:00
|
|
|
void *item;
|
|
|
|
int domain;
|
|
|
|
|
|
|
|
CTR3(KTR_UMA,
|
|
|
|
"uma_zfree: zone %s(%p) draining cross bucket %p",
|
|
|
|
zone->uz_name, zone, bucket);
|
|
|
|
|
2020-02-22 03:44:10 +00:00
|
|
|
/*
|
|
|
|
* It is possible for buckets to arrive here out of order so we fetch
|
|
|
|
* the current smr seq rather than accepting the bucket's.
|
|
|
|
*/
|
|
|
|
seq = SMR_SEQ_INVALID;
|
|
|
|
if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
|
|
|
|
seq = smr_advance(zone->uz_smr);
|
2020-01-04 07:56:28 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* To avoid having ndomain * ndomain buckets for sorting we have a
|
|
|
|
* lock on the current crossfree bucket. A full matrix with
|
|
|
|
* per-domain locking could be used if necessary.
|
|
|
|
*/
|
2020-11-30 16:18:33 +00:00
|
|
|
STAILQ_INIT(&emptybuckets);
|
2020-02-22 03:44:10 +00:00
|
|
|
STAILQ_INIT(&fullbuckets);
|
2020-01-04 07:56:28 +00:00
|
|
|
ZONE_CROSS_LOCK(zone);
|
2020-11-30 16:18:33 +00:00
|
|
|
for (; bucket->ub_cnt > 0; bucket->ub_cnt--) {
|
2020-01-04 07:56:28 +00:00
|
|
|
item = bucket->ub_bucket[bucket->ub_cnt - 1];
|
Fix boot on systems where NUMA domain 0 is unpopulated.
- Add vm_phys_early_add_seg(), complementing vm_phys_early_alloc(), to
ensure that segments registered during hammer_time() are placed in the
right domain. Otherwise, since the SRAT is not parsed at that point,
we just add them to domain 0, which may be incorrect and results in a
domain with only several MB worth of memory.
- Fix uma_startup1() to try allocating memory for zones from any domain.
If domain 0 is unpopulated, the allocation will simply fail, resulting
in a page fault slightly later during boot.
- Change _vm_phys_domain() to return -1 for addresses not covered by the
affinity table, and change vm_phys_early_alloc() to handle wildcard
domains. This is necessary on amd64, where the page array is dense
and pmap_page_array_startup() may allocate page table pages for
non-existent page frames.
Reported and tested by: Rafael Kitover <rkitover@gmail.com>
Reviewed by: cem (earlier version), kib
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D25001
2020-05-28 19:41:00 +00:00
|
|
|
domain = item_domain(item);
|
2020-02-19 18:48:46 +00:00
|
|
|
zdom = ZDOM_GET(zone, domain);
|
2020-01-04 07:56:28 +00:00
|
|
|
if (zdom->uzd_cross == NULL) {
|
2020-11-30 16:18:33 +00:00
|
|
|
if ((b = STAILQ_FIRST(&emptybuckets)) != NULL) {
|
|
|
|
STAILQ_REMOVE_HEAD(&emptybuckets, ub_link);
|
|
|
|
zdom->uzd_cross = b;
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Avoid allocating a bucket with the cross lock
|
|
|
|
* held, since allocation can trigger a
|
|
|
|
* cross-domain free and bucket zones may
|
|
|
|
* allocate from each other.
|
|
|
|
*/
|
|
|
|
ZONE_CROSS_UNLOCK(zone);
|
|
|
|
b = bucket_alloc(zone, udata, M_NOWAIT);
|
|
|
|
if (b == NULL)
|
|
|
|
goto out;
|
|
|
|
ZONE_CROSS_LOCK(zone);
|
|
|
|
if (zdom->uzd_cross != NULL) {
|
|
|
|
STAILQ_INSERT_HEAD(&emptybuckets, b,
|
|
|
|
ub_link);
|
|
|
|
} else {
|
|
|
|
zdom->uzd_cross = b;
|
|
|
|
}
|
|
|
|
}
|
2020-01-04 07:56:28 +00:00
|
|
|
}
|
2020-02-13 20:58:51 +00:00
|
|
|
b = zdom->uzd_cross;
|
|
|
|
b->ub_bucket[b->ub_cnt++] = item;
|
|
|
|
b->ub_seq = seq;
|
|
|
|
if (b->ub_cnt == b->ub_entries) {
|
|
|
|
STAILQ_INSERT_HEAD(&fullbuckets, b, ub_link);
|
2020-11-30 16:18:33 +00:00
|
|
|
if ((b = STAILQ_FIRST(&emptybuckets)) != NULL)
|
|
|
|
STAILQ_REMOVE_HEAD(&emptybuckets, ub_link);
|
|
|
|
zdom->uzd_cross = b;
|
2020-01-04 07:56:28 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
ZONE_CROSS_UNLOCK(zone);
|
2020-11-30 16:18:33 +00:00
|
|
|
out:
|
2020-02-19 18:48:46 +00:00
|
|
|
if (bucket->ub_cnt == 0)
|
|
|
|
bucket->ub_seq = SMR_SEQ_INVALID;
|
2020-01-04 07:56:28 +00:00
|
|
|
bucket_free(zone, bucket, udata);
|
2020-02-19 18:48:46 +00:00
|
|
|
|
2020-11-30 16:18:33 +00:00
|
|
|
while ((b = STAILQ_FIRST(&emptybuckets)) != NULL) {
|
|
|
|
STAILQ_REMOVE_HEAD(&emptybuckets, ub_link);
|
|
|
|
bucket_free(zone, b, udata);
|
|
|
|
}
|
2020-02-19 18:48:46 +00:00
|
|
|
while ((b = STAILQ_FIRST(&fullbuckets)) != NULL) {
|
|
|
|
STAILQ_REMOVE_HEAD(&fullbuckets, ub_link);
|
Fix boot on systems where NUMA domain 0 is unpopulated.
- Add vm_phys_early_add_seg(), complementing vm_phys_early_alloc(), to
ensure that segments registered during hammer_time() are placed in the
right domain. Otherwise, since the SRAT is not parsed at that point,
we just add them to domain 0, which may be incorrect and results in a
domain with only several MB worth of memory.
- Fix uma_startup1() to try allocating memory for zones from any domain.
If domain 0 is unpopulated, the allocation will simply fail, resulting
in a page fault slightly later during boot.
- Change _vm_phys_domain() to return -1 for addresses not covered by the
affinity table, and change vm_phys_early_alloc() to handle wildcard
domains. This is necessary on amd64, where the page array is dense
and pmap_page_array_startup() may allocate page table pages for
non-existent page frames.
Reported and tested by: Rafael Kitover <rkitover@gmail.com>
Reviewed by: cem (earlier version), kib
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D25001
2020-05-28 19:41:00 +00:00
|
|
|
domain = item_domain(b->ub_bucket[0]);
|
2020-02-19 18:48:46 +00:00
|
|
|
zone_put_bucket(zone, domain, b, udata, true);
|
|
|
|
}
|
2020-01-04 07:56:28 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2019-11-27 23:19:06 +00:00
|
|
|
static void
|
|
|
|
zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata,
|
2020-02-19 18:48:46 +00:00
|
|
|
int itemdomain, bool ws)
|
2019-11-27 23:19:06 +00:00
|
|
|
{
|
|
|
|
|
2020-01-04 18:48:13 +00:00
|
|
|
#ifdef NUMA
|
2019-11-27 23:19:06 +00:00
|
|
|
/*
|
|
|
|
* Buckets coming from the wrong domain will be entirely for the
|
|
|
|
* only other domain on two domain systems. In this case we can
|
|
|
|
* simply cache them. Otherwise we need to sort them back to
|
2020-01-04 07:56:28 +00:00
|
|
|
* correct domains.
|
2019-11-27 23:19:06 +00:00
|
|
|
*/
|
2020-02-19 18:48:46 +00:00
|
|
|
if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 &&
|
|
|
|
vm_ndomains > 2 && PCPU_GET(domain) != itemdomain) {
|
2020-01-04 07:56:28 +00:00
|
|
|
zone_free_cross(zone, bucket, udata);
|
2013-06-18 04:50:20 +00:00
|
|
|
return;
|
2004-01-30 16:26:29 +00:00
|
|
|
}
|
2019-11-27 23:19:06 +00:00
|
|
|
#endif
|
2020-01-04 07:56:28 +00:00
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/*
|
2019-11-27 23:19:06 +00:00
|
|
|
* Attempt to save the bucket in the zone's domain bucket cache.
|
2002-03-19 09:11:49 +00:00
|
|
|
*/
|
2019-11-27 23:19:06 +00:00
|
|
|
CTR3(KTR_UMA,
|
|
|
|
"uma_zfree: zone %s(%p) putting bucket %p on free list",
|
|
|
|
zone->uz_name, zone, bucket);
|
|
|
|
/* ub_cnt is pointing to the last free item */
|
2020-02-19 18:48:46 +00:00
|
|
|
if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0)
|
|
|
|
itemdomain = zone_domain_lowest(zone, itemdomain);
|
|
|
|
zone_put_bucket(zone, itemdomain, bucket, udata, ws);
|
2019-11-27 23:19:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Populate a free or cross bucket for the current cpu cache. Free any
|
|
|
|
* existing full bucket either to the zone cache or back to the slab layer.
|
|
|
|
*
|
|
|
|
* Enters and returns in a critical section. false return indicates that
|
|
|
|
* we can not satisfy this free in the cache layer. true indicates that
|
|
|
|
* the caller should retry.
|
|
|
|
*/
|
|
|
|
static __noinline bool
|
|
|
|
cache_free(uma_zone_t zone, uma_cache_t cache, void *udata, void *item,
|
|
|
|
int itemdomain)
|
|
|
|
{
|
2020-01-04 18:48:13 +00:00
|
|
|
uma_cache_bucket_t cbucket;
|
2020-01-31 00:49:51 +00:00
|
|
|
uma_bucket_t newbucket, bucket;
|
2019-11-27 23:19:06 +00:00
|
|
|
|
|
|
|
CRITICAL_ASSERT(curthread);
|
|
|
|
|
2020-01-31 00:49:51 +00:00
|
|
|
if (zone->uz_bucket_size == 0)
|
2019-11-27 23:19:06 +00:00
|
|
|
return false;
|
|
|
|
|
2019-12-25 20:57:24 +00:00
|
|
|
cache = &zone->uz_cpu[curcpu];
|
2020-01-31 00:49:51 +00:00
|
|
|
newbucket = NULL;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2019-11-27 23:19:06 +00:00
|
|
|
/*
|
2020-01-04 18:48:13 +00:00
|
|
|
* FIRSTTOUCH domains need to free to the correct zdom. When
|
|
|
|
* enabled this is the zdom of the item. The bucket is the
|
|
|
|
* cross bucket if the current domain and itemdomain do not match.
|
2019-11-27 23:19:06 +00:00
|
|
|
*/
|
2020-01-04 18:48:13 +00:00
|
|
|
cbucket = &cache->uc_freebucket;
|
|
|
|
#ifdef NUMA
|
2020-02-19 18:48:46 +00:00
|
|
|
if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) {
|
|
|
|
if (PCPU_GET(domain) != itemdomain) {
|
2020-01-04 18:48:13 +00:00
|
|
|
cbucket = &cache->uc_crossbucket;
|
|
|
|
if (cbucket->ucb_cnt != 0)
|
2020-02-19 18:48:46 +00:00
|
|
|
counter_u64_add(zone->uz_xdomain,
|
2020-01-04 18:48:13 +00:00
|
|
|
cbucket->ucb_cnt);
|
|
|
|
}
|
2020-02-19 18:48:46 +00:00
|
|
|
}
|
2019-08-06 21:50:34 +00:00
|
|
|
#endif
|
2020-01-04 18:48:13 +00:00
|
|
|
bucket = cache_bucket_unload(cbucket);
|
2020-02-19 18:48:46 +00:00
|
|
|
KASSERT(bucket == NULL || bucket->ub_cnt == bucket->ub_entries,
|
|
|
|
("cache_free: Entered with non-full free bucket."));
|
2019-11-27 23:19:06 +00:00
|
|
|
|
2016-07-20 01:01:50 +00:00
|
|
|
/* We are no longer associated with this CPU. */
|
|
|
|
critical_exit();
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2020-01-31 00:49:51 +00:00
|
|
|
/*
|
|
|
|
* Don't let SMR zones operate without a free bucket. Force
|
|
|
|
* a synchronize and re-use this one. We will only degrade
|
|
|
|
* to a synchronize every bucket_size items rather than every
|
|
|
|
* item if we fail to allocate a bucket.
|
|
|
|
*/
|
|
|
|
if ((zone->uz_flags & UMA_ZONE_SMR) != 0) {
|
|
|
|
if (bucket != NULL)
|
|
|
|
bucket->ub_seq = smr_advance(zone->uz_smr);
|
|
|
|
newbucket = bucket_alloc(zone, udata, M_NOWAIT);
|
|
|
|
if (newbucket == NULL && bucket != NULL) {
|
|
|
|
bucket_drain(zone, bucket);
|
|
|
|
newbucket = bucket;
|
|
|
|
bucket = NULL;
|
|
|
|
}
|
|
|
|
} else if (!bucketdisable)
|
|
|
|
newbucket = bucket_alloc(zone, udata, M_NOWAIT);
|
|
|
|
|
2019-11-27 23:19:06 +00:00
|
|
|
if (bucket != NULL)
|
2020-02-19 18:48:46 +00:00
|
|
|
zone_free_bucket(zone, bucket, udata, itemdomain, true);
|
2019-11-27 23:19:06 +00:00
|
|
|
|
|
|
|
critical_enter();
|
2020-01-31 00:49:51 +00:00
|
|
|
if ((bucket = newbucket) == NULL)
|
2019-11-27 23:19:06 +00:00
|
|
|
return (false);
|
2019-12-25 20:57:24 +00:00
|
|
|
cache = &zone->uz_cpu[curcpu];
|
2020-01-04 18:48:13 +00:00
|
|
|
#ifdef NUMA
|
2019-11-27 23:19:06 +00:00
|
|
|
/*
|
|
|
|
* Check to see if we should be populating the cross bucket. If it
|
|
|
|
* is already populated we will fall through and attempt to populate
|
|
|
|
* the free bucket.
|
|
|
|
*/
|
2020-02-19 18:48:46 +00:00
|
|
|
if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) {
|
|
|
|
if (PCPU_GET(domain) != itemdomain &&
|
2019-12-25 20:50:53 +00:00
|
|
|
cache->uc_crossbucket.ucb_bucket == NULL) {
|
|
|
|
cache_bucket_load_cross(cache, bucket);
|
2019-11-27 23:19:06 +00:00
|
|
|
return (true);
|
2019-08-06 21:50:34 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
2013-11-19 10:17:10 +00:00
|
|
|
/*
|
2019-11-27 23:19:06 +00:00
|
|
|
* We may have lost the race to fill the bucket or switched CPUs.
|
2013-11-19 10:17:10 +00:00
|
|
|
*/
|
2019-12-25 20:50:53 +00:00
|
|
|
if (cache->uc_freebucket.ucb_bucket != NULL) {
|
2013-06-18 04:50:20 +00:00
|
|
|
critical_exit();
|
2013-06-26 00:57:38 +00:00
|
|
|
bucket_free(zone, bucket, udata);
|
2019-11-27 23:19:06 +00:00
|
|
|
critical_enter();
|
|
|
|
} else
|
2019-12-25 20:50:53 +00:00
|
|
|
cache_bucket_load_free(cache, bucket);
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2019-11-27 23:19:06 +00:00
|
|
|
return (true);
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
uma_keg_t keg;
|
2018-01-12 23:25:05 +00:00
|
|
|
uma_domain_t dom;
|
2020-01-14 02:14:15 +00:00
|
|
|
int freei;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
keg = zone->uz_keg;
|
2020-01-04 03:30:08 +00:00
|
|
|
KEG_LOCK_ASSERT(keg, slab->us_domain);
|
2018-01-12 23:25:05 +00:00
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/* Do we need to remove from any lists? */
|
2020-01-04 03:30:08 +00:00
|
|
|
dom = &keg->uk_domain[slab->us_domain];
|
2020-02-11 20:06:33 +00:00
|
|
|
if (slab->us_freecount + 1 == keg->uk_ipers) {
|
2003-06-09 22:51:36 +00:00
|
|
|
LIST_REMOVE(slab, us_link);
|
2018-01-12 23:25:05 +00:00
|
|
|
LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
|
2020-02-11 20:06:33 +00:00
|
|
|
dom->ud_free_slabs++;
|
2002-03-19 09:11:49 +00:00
|
|
|
} else if (slab->us_freecount == 0) {
|
|
|
|
LIST_REMOVE(slab, us_link);
|
2018-01-12 23:25:05 +00:00
|
|
|
LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
2013-06-13 21:05:38 +00:00
|
|
|
/* Slab management. */
|
2019-12-08 01:15:06 +00:00
|
|
|
freei = slab_item_index(slab, keg, item);
|
2019-12-02 22:44:34 +00:00
|
|
|
BIT_SET(keg->uk_ipers, freei, &slab->us_free);
|
2002-03-19 09:11:49 +00:00
|
|
|
slab->us_freecount++;
|
|
|
|
|
2013-06-13 21:05:38 +00:00
|
|
|
/* Keg statistics. */
|
2020-02-11 20:06:33 +00:00
|
|
|
dom->ud_free_items++;
|
2013-06-17 03:43:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2019-12-04 18:40:05 +00:00
|
|
|
zone_release(void *arg, void **bucket, int cnt)
|
2013-06-17 03:43:47 +00:00
|
|
|
{
|
2020-01-04 03:30:08 +00:00
|
|
|
struct mtx *lock;
|
2019-12-04 18:40:05 +00:00
|
|
|
uma_zone_t zone;
|
2013-06-17 03:43:47 +00:00
|
|
|
uma_slab_t slab;
|
|
|
|
uma_keg_t keg;
|
|
|
|
uint8_t *mem;
|
2020-01-04 03:30:08 +00:00
|
|
|
void *item;
|
2013-06-17 03:43:47 +00:00
|
|
|
int i;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2019-12-04 18:40:05 +00:00
|
|
|
zone = arg;
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
keg = zone->uz_keg;
|
2020-01-04 03:30:08 +00:00
|
|
|
lock = NULL;
|
2020-01-09 02:03:03 +00:00
|
|
|
if (__predict_false((zone->uz_flags & UMA_ZFLAG_HASH) != 0))
|
2020-01-04 03:30:08 +00:00
|
|
|
lock = KEG_LOCK(keg, 0);
|
2013-06-17 03:43:47 +00:00
|
|
|
for (i = 0; i < cnt; i++) {
|
|
|
|
item = bucket[i];
|
2020-01-09 02:03:03 +00:00
|
|
|
if (__predict_true((zone->uz_flags & UMA_ZFLAG_VTOSLAB) != 0)) {
|
2020-01-04 03:30:08 +00:00
|
|
|
slab = vtoslab((vm_offset_t)item);
|
|
|
|
} else {
|
2013-06-17 03:43:47 +00:00
|
|
|
mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
|
2020-01-09 02:03:03 +00:00
|
|
|
if ((zone->uz_flags & UMA_ZFLAG_HASH) != 0)
|
2013-06-17 03:43:47 +00:00
|
|
|
slab = hash_sfind(&keg->uk_hash, mem);
|
2020-01-04 03:30:08 +00:00
|
|
|
else
|
|
|
|
slab = (uma_slab_t)(mem + keg->uk_pgoff);
|
|
|
|
}
|
|
|
|
if (lock != KEG_LOCKPTR(keg, slab->us_domain)) {
|
|
|
|
if (lock != NULL)
|
|
|
|
mtx_unlock(lock);
|
|
|
|
lock = KEG_LOCK(keg, slab->us_domain);
|
|
|
|
}
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
slab_free_item(zone, slab, item);
|
2002-04-14 01:56:25 +00:00
|
|
|
}
|
2020-01-04 03:30:08 +00:00
|
|
|
if (lock != NULL)
|
|
|
|
mtx_unlock(lock);
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
2013-06-17 03:43:47 +00:00
|
|
|
/*
|
|
|
|
* Frees a single item to any zone.
|
|
|
|
*
|
|
|
|
* Arguments:
|
|
|
|
* zone The zone to free to
|
|
|
|
* item The item we're freeing
|
|
|
|
* udata User supplied data for the dtor
|
|
|
|
* skip Skip dtors and finis
|
|
|
|
*/
|
2020-02-16 01:07:19 +00:00
|
|
|
static __noinline void
|
2013-06-17 03:43:47 +00:00
|
|
|
zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
|
|
|
|
{
|
2018-06-08 00:15:08 +00:00
|
|
|
|
2020-01-31 00:49:51 +00:00
|
|
|
/*
|
|
|
|
* If a free is sent directly to an SMR zone we have to
|
|
|
|
* synchronize immediately because the item can instantly
|
|
|
|
* be reallocated. This should only happen in degenerate
|
|
|
|
* cases when no memory is available for per-cpu caches.
|
|
|
|
*/
|
|
|
|
if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && skip == SKIP_NONE)
|
|
|
|
smr_synchronize(zone->uz_smr);
|
|
|
|
|
2019-12-25 20:57:24 +00:00
|
|
|
item_dtor(zone, item, zone->uz_size, udata, skip);
|
2013-06-17 03:43:47 +00:00
|
|
|
|
2021-04-13 17:39:50 -04:00
|
|
|
if (skip < SKIP_FINI && zone->uz_fini) {
|
|
|
|
kasan_mark_item_valid(zone, item);
|
2013-06-17 03:43:47 +00:00
|
|
|
zone->uz_fini(item, zone->uz_size);
|
2021-04-13 17:39:50 -04:00
|
|
|
kasan_mark_item_invalid(zone, item);
|
|
|
|
}
|
2013-06-17 03:43:47 +00:00
|
|
|
|
|
|
|
zone->uz_release(zone->uz_arg, &item, 1);
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
|
|
|
|
if (skip & SKIP_CNT)
|
|
|
|
return;
|
|
|
|
|
2019-01-15 18:24:34 +00:00
|
|
|
counter_u64_add(zone->uz_frees, 1);
|
|
|
|
|
2020-01-04 03:04:46 +00:00
|
|
|
if (zone->uz_max_items > 0)
|
|
|
|
zone_free_limit(zone, 1);
|
2013-06-17 03:43:47 +00:00
|
|
|
}
|
|
|
|
|
2002-03-20 05:28:34 +00:00
|
|
|
/* See uma.h */
|
2010-10-16 04:41:45 +00:00
|
|
|
int
|
2002-03-20 05:28:34 +00:00
|
|
|
uma_zone_set_max(uma_zone_t zone, int nitems)
|
|
|
|
{
|
2020-12-06 22:45:50 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If the limit is small, we may need to constrain the maximum per-CPU
|
|
|
|
* cache size, or disable caching entirely.
|
|
|
|
*/
|
|
|
|
uma_zone_set_maxcache(zone, nitems);
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
|
2020-01-04 03:04:46 +00:00
|
|
|
/*
|
|
|
|
* XXX This can misbehave if the zone has any allocations with
|
|
|
|
* no limit and a limit is imposed. There is currently no
|
|
|
|
* way to clear a limit.
|
|
|
|
*/
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
ZONE_LOCK(zone);
|
|
|
|
zone->uz_max_items = nitems;
|
2019-12-25 20:57:24 +00:00
|
|
|
zone->uz_flags |= UMA_ZFLAG_LIMIT;
|
|
|
|
zone_update_caches(zone);
|
2020-01-04 03:04:46 +00:00
|
|
|
/* We may need to wake waiters. */
|
|
|
|
wakeup(&zone->uz_max_items);
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
ZONE_UNLOCK(zone);
|
|
|
|
|
|
|
|
return (nitems);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* See uma.h */
|
2019-11-22 16:30:47 +00:00
|
|
|
void
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
uma_zone_set_maxcache(uma_zone_t zone, int nitems)
|
|
|
|
{
|
2020-12-06 22:45:50 +00:00
|
|
|
int bpcpu, bpdom, bsize, nb;
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
|
|
|
|
ZONE_LOCK(zone);
|
2020-12-06 22:45:50 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Compute a lower bound on the number of items that may be cached in
|
|
|
|
* the zone. Each CPU gets at least two buckets, and for cross-domain
|
|
|
|
* frees we use an additional bucket per CPU and per domain. Select the
|
|
|
|
* largest bucket size that does not exceed half of the requested limit,
|
|
|
|
* with the left over space given to the full bucket cache.
|
|
|
|
*/
|
|
|
|
bpdom = 0;
|
|
|
|
bpcpu = 2;
|
|
|
|
#ifdef NUMA
|
|
|
|
if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && vm_ndomains > 1) {
|
|
|
|
bpcpu++;
|
|
|
|
bpdom++;
|
2019-11-22 16:30:47 +00:00
|
|
|
}
|
2020-12-06 22:45:50 +00:00
|
|
|
#endif
|
|
|
|
nb = bpcpu * mp_ncpus + bpdom * vm_ndomains;
|
|
|
|
bsize = nitems / nb / 2;
|
|
|
|
if (bsize > BUCKET_MAX)
|
|
|
|
bsize = BUCKET_MAX;
|
|
|
|
else if (bsize == 0 && nitems / nb > 0)
|
|
|
|
bsize = 1;
|
|
|
|
zone->uz_bucket_size_max = zone->uz_bucket_size = bsize;
|
2019-11-28 00:19:09 +00:00
|
|
|
if (zone->uz_bucket_size_min > zone->uz_bucket_size_max)
|
|
|
|
zone->uz_bucket_size_min = zone->uz_bucket_size_max;
|
2020-12-06 22:45:50 +00:00
|
|
|
zone->uz_bucket_max = nitems - nb * bsize;
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
ZONE_UNLOCK(zone);
|
2002-03-20 05:28:34 +00:00
|
|
|
}
|
|
|
|
|
2010-08-16 14:24:00 +00:00
|
|
|
/* See uma.h */
|
|
|
|
int
|
|
|
|
uma_zone_get_max(uma_zone_t zone)
|
|
|
|
{
|
|
|
|
int nitems;
|
|
|
|
|
2020-01-04 03:15:34 +00:00
|
|
|
nitems = atomic_load_64(&zone->uz_max_items);
|
2010-08-16 14:24:00 +00:00
|
|
|
|
|
|
|
return (nitems);
|
|
|
|
}
|
|
|
|
|
2012-12-07 22:27:13 +00:00
|
|
|
/* See uma.h */
|
|
|
|
void
|
|
|
|
uma_zone_set_warning(uma_zone_t zone, const char *warning)
|
|
|
|
{
|
|
|
|
|
2020-01-04 03:15:34 +00:00
|
|
|
ZONE_ASSERT_COLD(zone);
|
2012-12-07 22:27:13 +00:00
|
|
|
zone->uz_warning = warning;
|
|
|
|
}
|
|
|
|
|
2015-12-20 02:05:33 +00:00
|
|
|
/* See uma.h */
|
|
|
|
void
|
|
|
|
uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
|
|
|
|
{
|
|
|
|
|
2020-01-04 03:15:34 +00:00
|
|
|
ZONE_ASSERT_COLD(zone);
|
2016-02-03 23:30:17 +00:00
|
|
|
TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
|
2015-12-20 02:05:33 +00:00
|
|
|
}
|
|
|
|
|
2010-10-16 04:14:45 +00:00
|
|
|
/* See uma.h */
|
|
|
|
int
|
|
|
|
uma_zone_get_cur(uma_zone_t zone)
|
|
|
|
{
|
|
|
|
int64_t nitems;
|
|
|
|
u_int i;
|
|
|
|
|
2020-01-05 22:54:25 +00:00
|
|
|
nitems = 0;
|
|
|
|
if (zone->uz_allocs != EARLY_COUNTER && zone->uz_frees != EARLY_COUNTER)
|
|
|
|
nitems = counter_u64_fetch(zone->uz_allocs) -
|
|
|
|
counter_u64_fetch(zone->uz_frees);
|
2020-01-04 03:15:34 +00:00
|
|
|
CPU_FOREACH(i)
|
|
|
|
nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs) -
|
|
|
|
atomic_load_64(&zone->uz_cpu[i].uc_frees);
|
2010-10-16 04:14:45 +00:00
|
|
|
|
|
|
|
return (nitems < 0 ? 0 : nitems);
|
|
|
|
}
|
|
|
|
|
2019-11-28 00:19:09 +00:00
|
|
|
static uint64_t
|
|
|
|
uma_zone_get_allocs(uma_zone_t zone)
|
|
|
|
{
|
|
|
|
uint64_t nitems;
|
|
|
|
u_int i;
|
|
|
|
|
2020-01-05 22:54:25 +00:00
|
|
|
nitems = 0;
|
|
|
|
if (zone->uz_allocs != EARLY_COUNTER)
|
|
|
|
nitems = counter_u64_fetch(zone->uz_allocs);
|
2020-01-04 03:15:34 +00:00
|
|
|
CPU_FOREACH(i)
|
|
|
|
nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs);
|
2019-11-28 00:19:09 +00:00
|
|
|
|
|
|
|
return (nitems);
|
|
|
|
}
|
|
|
|
|
|
|
|
static uint64_t
|
|
|
|
uma_zone_get_frees(uma_zone_t zone)
|
|
|
|
{
|
|
|
|
uint64_t nitems;
|
|
|
|
u_int i;
|
|
|
|
|
2020-01-05 22:54:25 +00:00
|
|
|
nitems = 0;
|
|
|
|
if (zone->uz_frees != EARLY_COUNTER)
|
|
|
|
nitems = counter_u64_fetch(zone->uz_frees);
|
2020-01-04 03:15:34 +00:00
|
|
|
CPU_FOREACH(i)
|
|
|
|
nitems += atomic_load_64(&zone->uz_cpu[i].uc_frees);
|
2019-11-28 00:19:09 +00:00
|
|
|
|
|
|
|
return (nitems);
|
|
|
|
}
|
|
|
|
|
2020-01-04 19:29:25 +00:00
|
|
|
#ifdef INVARIANTS
/*
 * Used only for KEG_ASSERT_COLD().
 *
 * Sums uma_zone_get_allocs() over every zone on the keg's uk_zones list.
 */
static uint64_t
uma_keg_get_allocs(uma_keg_t keg)
{
	uma_zone_t zone;
	uint64_t total = 0;

	LIST_FOREACH(zone, &keg->uk_zones, uz_link) {
		total += uma_zone_get_allocs(zone);
	}

	return (total);
}
#endif
|
|
|
|
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
/* See uma.h */
|
|
|
|
void
|
|
|
|
uma_zone_set_init(uma_zone_t zone, uma_init uminit)
|
|
|
|
{
|
2009-01-25 09:11:24 +00:00
|
|
|
uma_keg_t keg;
|
|
|
|
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
KEG_GET(zone, keg);
|
2020-01-04 03:15:34 +00:00
|
|
|
KEG_ASSERT_COLD(keg);
|
2009-01-25 09:11:24 +00:00
|
|
|
keg->uk_init = uminit;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* See uma.h */
|
|
|
|
void
|
|
|
|
uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
|
|
|
|
{
|
2009-01-25 09:11:24 +00:00
|
|
|
uma_keg_t keg;
|
|
|
|
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
KEG_GET(zone, keg);
|
2020-01-04 03:15:34 +00:00
|
|
|
KEG_ASSERT_COLD(keg);
|
2009-01-25 09:11:24 +00:00
|
|
|
keg->uk_fini = fini;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* See uma.h */
|
|
|
|
void
|
|
|
|
uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
|
|
|
|
{
|
2013-06-20 19:08:12 +00:00
|
|
|
|
2020-01-04 03:15:34 +00:00
|
|
|
ZONE_ASSERT_COLD(zone);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
zone->uz_init = zinit;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* See uma.h */
|
|
|
|
void
|
|
|
|
uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
|
|
|
|
{
|
2013-06-20 19:08:12 +00:00
|
|
|
|
2020-01-04 03:15:34 +00:00
|
|
|
ZONE_ASSERT_COLD(zone);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
zone->uz_fini = zfini;
|
|
|
|
}
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/* See uma.h */
|
|
|
|
void
|
|
|
|
uma_zone_set_freef(uma_zone_t zone, uma_free freef)
|
|
|
|
{
|
2013-06-17 03:43:47 +00:00
|
|
|
uma_keg_t keg;
|
2009-01-25 09:11:24 +00:00
|
|
|
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
KEG_GET(zone, keg);
|
2020-01-04 03:15:34 +00:00
|
|
|
KEG_ASSERT_COLD(keg);
|
2013-06-17 03:43:47 +00:00
|
|
|
keg->uk_freef = freef;
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* See uma.h */
|
|
|
|
void
|
|
|
|
uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
|
|
|
|
{
|
2009-01-25 09:11:24 +00:00
|
|
|
uma_keg_t keg;
|
|
|
|
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
KEG_GET(zone, keg);
|
2020-01-04 03:15:34 +00:00
|
|
|
KEG_ASSERT_COLD(keg);
|
2009-01-25 09:11:24 +00:00
|
|
|
keg->uk_allocf = allocf;
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
2020-01-31 00:49:51 +00:00
|
|
|
/* See uma.h */
|
|
|
|
void
|
|
|
|
uma_zone_set_smr(uma_zone_t zone, smr_t smr)
|
|
|
|
{
|
|
|
|
|
|
|
|
ZONE_ASSERT_COLD(zone);
|
|
|
|
|
2020-03-01 21:49:16 +00:00
|
|
|
KASSERT(smr != NULL, ("Got NULL smr"));
|
|
|
|
KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0,
|
|
|
|
("zone %p (%s) already uses SMR", zone, zone->uz_name));
|
2020-01-31 00:49:51 +00:00
|
|
|
zone->uz_flags |= UMA_ZONE_SMR;
|
|
|
|
zone->uz_smr = smr;
|
|
|
|
zone_update_caches(zone);
|
|
|
|
}
|
|
|
|
|
|
|
|
smr_t
|
|
|
|
uma_zone_get_smr(uma_zone_t zone)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (zone->uz_smr);
|
|
|
|
}
|
|
|
|
|
2013-06-26 00:57:38 +00:00
|
|
|
/* See uma.h */
|
|
|
|
void
|
|
|
|
uma_zone_reserve(uma_zone_t zone, int items)
|
|
|
|
{
|
|
|
|
uma_keg_t keg;
|
|
|
|
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
KEG_GET(zone, keg);
|
2020-01-04 03:15:34 +00:00
|
|
|
KEG_ASSERT_COLD(keg);
|
2013-06-26 00:57:38 +00:00
|
|
|
keg->uk_reserve = items;
|
|
|
|
}
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/* See uma.h */
|
|
|
|
int
|
2013-02-26 23:35:27 +00:00
|
|
|
uma_zone_reserve_kva(uma_zone_t zone, int count)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
uma_keg_t keg;
|
2002-03-19 09:11:49 +00:00
|
|
|
vm_offset_t kva;
|
2015-08-10 17:16:49 +00:00
|
|
|
u_int pages;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
KEG_GET(zone, keg);
|
2020-01-04 03:15:34 +00:00
|
|
|
KEG_ASSERT_COLD(keg);
|
|
|
|
ZONE_ASSERT_COLD(zone);
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2020-01-06 02:51:19 +00:00
|
|
|
pages = howmany(count, keg->uk_ipers) * keg->uk_ppera;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
2013-02-26 23:35:27 +00:00
|
|
|
#ifdef UMA_MD_SMALL_ALLOC
|
|
|
|
if (keg->uk_ppera > 1) {
|
|
|
|
#else
|
|
|
|
if (1) {
|
|
|
|
#endif
|
2017-03-11 16:43:38 +00:00
|
|
|
kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
|
2013-02-26 23:35:27 +00:00
|
|
|
if (kva == 0)
|
|
|
|
return (0);
|
|
|
|
} else
|
|
|
|
kva = 0;
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
|
|
|
|
MPASS(keg->uk_kva == 0);
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
keg->uk_kva = kva;
|
2013-02-26 23:35:27 +00:00
|
|
|
keg->uk_offset = 0;
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
zone->uz_max_items = pages * keg->uk_ipers;
|
2013-02-26 23:35:27 +00:00
|
|
|
#ifdef UMA_MD_SMALL_ALLOC
|
|
|
|
keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
|
|
|
|
#else
|
|
|
|
keg->uk_allocf = noobj_alloc;
|
|
|
|
#endif
|
2019-12-25 20:57:24 +00:00
|
|
|
keg->uk_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE;
|
|
|
|
zone->uz_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE;
|
|
|
|
zone_update_caches(zone);
|
2013-06-20 19:08:12 +00:00
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* See uma.h */
|
|
|
|
void
|
|
|
|
uma_prealloc(uma_zone_t zone, int items)
|
|
|
|
{
|
2018-10-30 17:57:40 +00:00
|
|
|
struct vm_domainset_iter di;
|
2018-01-12 23:25:05 +00:00
|
|
|
uma_domain_t dom;
|
2002-03-19 09:11:49 +00:00
|
|
|
uma_slab_t slab;
|
Bring in mbuma to replace mballoc.
mbuma is an Mbuf & Cluster allocator built on top of a number of
extensions to the UMA framework, all included herein.
Extensions to UMA worth noting:
- Better layering between slab <-> zone caches; introduce
Keg structure which splits off slab cache away from the
zone structure and allows multiple zones to be stacked
on top of a single Keg (single type of slab cache);
perhaps we should look into defining a subset API on
top of the Keg for special use by malloc(9),
for example.
- UMA_ZONE_REFCNT zones can now be added, and reference
counters automagically allocated for them within the end
of the associated slab structures. uma_find_refcnt()
does a kextract to fetch the slab struct reference from
the underlying page, and lookup the corresponding refcnt.
mbuma things worth noting:
- integrates mbuf & cluster allocations with extended UMA
and provides caches for commonly-allocated items; defines
several zones (two primary, one secondary) and two kegs.
- change up certain code paths that always used to do:
m_get() + m_clget() to instead just use m_getcl() and
try to take advantage of the newly defined secondary
Packet zone.
- netstat(1) and systat(1) quickly hacked up to do basic
stat reporting but additional stats work needs to be
done once some other details within UMA have been taken
care of and it becomes clearer to how stats will work
within the modified framework.
From the user perspective, one implication is that the
NMBCLUSTERS compile-time option is no longer used. The
maximum number of clusters is still capped off according
to maxusers, but it can be made unlimited by setting
the kern.ipc.nmbclusters boot-time tunable to zero.
Work should be done to write an appropriate sysctl
handler allowing dynamic tuning of kern.ipc.nmbclusters
at runtime.
Additional things worth noting/known issues (READ):
- One report of 'ips' (ServeRAID) driver acting really
slow in conjunction with mbuma. Need more data.
Latest report is that ips is equally sucking with
and without mbuma.
- Giant leak in NFS code sometimes occurs, can't
reproduce but currently analyzing; brueffer is
able to reproduce but THIS IS NOT an mbuma-specific
problem and currently occurs even WITHOUT mbuma.
- Issues in network locking: there is at least one
code path in the rip code where one or more locks
are acquired and we end up in m_prepend() with
M_WAITOK, which causes WITNESS to whine from within
UMA. Current temporary solution: force all UMA
allocations to be M_NOWAIT from within UMA for now
to avoid deadlocks unless WITNESS is defined and we
can determine with certainty that we're not holding
any locks when we're M_WAITOK.
- I've seen at least one weird socketbuffer empty-but-
mbuf-still-attached panic. I don't believe this
to be related to mbuma but please keep your eyes
open, turn on debugging, and capture crash dumps.
This change removes more code than it adds.
A paper is available detailing the change and considering
various performance issues, it was presented at BSDCan2004:
http://www.unixdaemons.com/~bmilekic/netbuf_bmilekic.pdf
Please read the paper for Future Work and implementation
details, as well as credits.
Testing and Debugging:
rwatson,
brueffer,
Ketrien I. Saihr-Kesenchedra,
...
Reviewed by: Lots of people (for different parts)
2004-05-31 21:46:06 +00:00
|
|
|
uma_keg_t keg;
|
2019-01-23 18:58:15 +00:00
|
|
|
int aflags, domain, slabs;
|
2002-03-19 09:11:49 +00:00
|
|
|
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
KEG_GET(zone, keg);
|
2020-01-06 02:51:19 +00:00
|
|
|
slabs = howmany(items, keg->uk_ipers);
|
2018-10-24 16:41:47 +00:00
|
|
|
while (slabs-- > 0) {
|
2019-01-23 18:58:15 +00:00
|
|
|
aflags = M_NOWAIT;
|
|
|
|
vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
|
|
|
|
&aflags);
|
|
|
|
for (;;) {
|
|
|
|
slab = keg_alloc_slab(keg, zone, domain, M_WAITOK,
|
|
|
|
aflags);
|
|
|
|
if (slab != NULL) {
|
|
|
|
dom = &keg->uk_domain[slab->us_domain];
|
2020-02-11 20:06:33 +00:00
|
|
|
/*
|
|
|
|
* keg_alloc_slab() always returns a slab on the
|
|
|
|
* partial list.
|
|
|
|
*/
|
2020-01-04 03:30:08 +00:00
|
|
|
LIST_REMOVE(slab, us_link);
|
2019-01-23 18:58:15 +00:00
|
|
|
LIST_INSERT_HEAD(&dom->ud_free_slab, slab,
|
|
|
|
us_link);
|
2020-02-11 20:06:33 +00:00
|
|
|
dom->ud_free_slabs++;
|
2020-01-04 03:30:08 +00:00
|
|
|
KEG_UNLOCK(keg, slab->us_domain);
|
2019-01-23 18:58:15 +00:00
|
|
|
break;
|
|
|
|
}
|
2020-01-04 03:30:08 +00:00
|
|
|
if (vm_domainset_iter_policy(&di, &domain) != 0)
|
2020-09-08 23:28:09 +00:00
|
|
|
vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0);
|
2019-01-23 18:58:15 +00:00
|
|
|
}
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-02-17 01:59:55 +00:00
|
|
|
/*
|
|
|
|
* Returns a snapshot of memory consumption in bytes.
|
|
|
|
*/
|
|
|
|
size_t
|
|
|
|
uma_zone_memory(uma_zone_t zone)
|
|
|
|
{
|
|
|
|
size_t sz;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
sz = 0;
|
|
|
|
if (zone->uz_flags & UMA_ZFLAG_CACHE) {
|
|
|
|
for (i = 0; i < vm_ndomains; i++)
|
2020-02-19 18:48:46 +00:00
|
|
|
sz += ZDOM_GET(zone, i)->uzd_nitems;
|
2020-02-17 01:59:55 +00:00
|
|
|
return (sz * zone->uz_size);
|
|
|
|
}
|
|
|
|
for (i = 0; i < vm_ndomains; i++)
|
|
|
|
sz += zone->uz_keg->uk_domain[i].ud_pages;
|
|
|
|
|
|
|
|
return (sz * PAGE_SIZE);
|
|
|
|
}
|
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/* See uma.h */
|
2019-09-01 22:22:43 +00:00
|
|
|
void
|
|
|
|
uma_reclaim(int req)
|
2002-03-19 09:11:49 +00:00
|
|
|
{
|
2021-04-14 12:57:24 -04:00
|
|
|
uma_reclaim_domain(req, UMA_ANYDOMAIN);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
uma_reclaim_domain(int req, int domain)
|
|
|
|
{
|
|
|
|
void *arg;
|
2015-05-09 20:08:36 +00:00
|
|
|
|
2002-04-08 06:20:34 +00:00
|
|
|
bucket_enable();
|
2019-09-01 22:22:43 +00:00
|
|
|
|
2021-04-14 12:57:24 -04:00
|
|
|
arg = (void *)(uintptr_t)domain;
|
|
|
|
sx_slock(&uma_reclaim_lock);
|
2019-09-01 22:22:43 +00:00
|
|
|
switch (req) {
|
|
|
|
case UMA_RECLAIM_TRIM:
|
2021-04-14 12:57:24 -04:00
|
|
|
zone_foreach(zone_trim, arg);
|
2019-09-01 22:22:43 +00:00
|
|
|
break;
|
|
|
|
case UMA_RECLAIM_DRAIN:
|
2021-04-14 12:57:24 -04:00
|
|
|
zone_foreach(zone_drain, arg);
|
|
|
|
break;
|
2019-09-01 22:22:43 +00:00
|
|
|
case UMA_RECLAIM_DRAIN_CPU:
|
2021-04-14 12:57:24 -04:00
|
|
|
zone_foreach(zone_drain, arg);
|
|
|
|
pcpu_cache_drain_safe(NULL);
|
|
|
|
zone_foreach(zone_drain, arg);
|
2019-09-01 22:22:43 +00:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
panic("unhandled reclamation request %d", req);
|
2013-11-19 10:51:46 +00:00
|
|
|
}
|
2018-11-13 19:44:40 +00:00
|
|
|
|
2002-03-19 09:11:49 +00:00
|
|
|
/*
|
|
|
|
* Some slabs may have been freed but this zone will be visited early
|
|
|
|
* we visit again so that we can free pages that are empty once other
|
|
|
|
* zones are drained. We have to do the same for buckets.
|
|
|
|
*/
|
2021-04-14 12:57:24 -04:00
|
|
|
zone_drain(slabzones[0], arg);
|
|
|
|
zone_drain(slabzones[1], arg);
|
|
|
|
bucket_zone_drain(domain);
|
|
|
|
sx_sunlock(&uma_reclaim_lock);
|
2002-03-19 09:11:49 +00:00
|
|
|
}
|
|
|
|
|
2017-11-28 23:40:54 +00:00
|
|
|
static volatile int uma_reclaim_needed;
|
2015-05-09 20:08:36 +00:00
|
|
|
|
|
|
|
void
|
|
|
|
uma_reclaim_wakeup(void)
|
|
|
|
{
|
|
|
|
|
2017-11-28 23:40:54 +00:00
|
|
|
if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
|
|
|
|
wakeup(uma_reclaim);
|
2015-05-09 20:08:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
uma_reclaim_worker(void *arg __unused)
|
|
|
|
{
|
|
|
|
|
|
|
|
for (;;) {
|
2019-09-01 22:22:43 +00:00
|
|
|
sx_xlock(&uma_reclaim_lock);
|
2017-12-19 10:06:55 +00:00
|
|
|
while (atomic_load_int(&uma_reclaim_needed) == 0)
|
2019-09-01 22:22:43 +00:00
|
|
|
sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl",
|
2017-11-28 23:40:54 +00:00
|
|
|
hz);
|
2019-09-01 22:22:43 +00:00
|
|
|
sx_xunlock(&uma_reclaim_lock);
|
2017-11-28 23:40:54 +00:00
|
|
|
EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
|
2019-09-01 22:22:43 +00:00
|
|
|
uma_reclaim(UMA_RECLAIM_DRAIN_CPU);
|
2017-12-19 10:06:55 +00:00
|
|
|
atomic_store_int(&uma_reclaim_needed, 0);
|
2017-11-28 23:40:54 +00:00
|
|
|
/* Don't fire more than once per-second. */
|
|
|
|
pause("umarclslp", hz);
|
2015-05-09 20:08:36 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-09-01 22:22:43 +00:00
|
|
|
/* See uma.h */
|
|
|
|
void
|
|
|
|
uma_zone_reclaim(uma_zone_t zone, int req)
|
|
|
|
{
|
2021-04-14 12:57:24 -04:00
|
|
|
uma_zone_reclaim_domain(zone, req, UMA_ANYDOMAIN);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
uma_zone_reclaim_domain(uma_zone_t zone, int req, int domain)
|
|
|
|
{
|
|
|
|
void *arg;
|
2019-09-01 22:22:43 +00:00
|
|
|
|
2021-04-14 12:57:24 -04:00
|
|
|
arg = (void *)(uintptr_t)domain;
|
2019-09-01 22:22:43 +00:00
|
|
|
switch (req) {
|
|
|
|
case UMA_RECLAIM_TRIM:
|
2021-04-14 12:57:24 -04:00
|
|
|
zone_trim(zone, arg);
|
2019-09-01 22:22:43 +00:00
|
|
|
break;
|
|
|
|
case UMA_RECLAIM_DRAIN:
|
2021-04-14 12:57:24 -04:00
|
|
|
zone_drain(zone, arg);
|
2019-09-01 22:22:43 +00:00
|
|
|
break;
|
|
|
|
case UMA_RECLAIM_DRAIN_CPU:
|
|
|
|
pcpu_cache_drain_safe(zone);
|
2021-04-14 12:57:24 -04:00
|
|
|
zone_drain(zone, arg);
|
2019-09-01 22:22:43 +00:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
panic("unhandled reclamation request %d", req);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-01-05 19:09:01 +00:00
|
|
|
/* See uma.h */
|
|
|
|
int
|
|
|
|
uma_zone_exhausted(uma_zone_t zone)
|
|
|
|
{
|
|
|
|
|
2020-01-04 03:15:34 +00:00
|
|
|
return (atomic_load_32(&zone->uz_sleepers) > 0);
|
2007-01-25 01:05:23 +00:00
|
|
|
}
|
|
|
|
|
2017-11-28 23:40:54 +00:00
|
|
|
unsigned long
|
|
|
|
uma_limit(void)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (uma_kmem_limit);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
uma_set_limit(unsigned long limit)
|
|
|
|
{
|
|
|
|
|
|
|
|
uma_kmem_limit = limit;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned long
|
|
|
|
uma_size(void)
|
|
|
|
{
|
|
|
|
|
2019-06-06 16:23:44 +00:00
|
|
|
return (atomic_load_long(&uma_kmem_total));
|
2018-01-02 04:35:56 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
long
|
|
|
|
uma_avail(void)
|
|
|
|
{
|
|
|
|
|
2019-06-06 16:23:44 +00:00
|
|
|
return (uma_kmem_limit - uma_size());
|
2017-11-28 23:40:54 +00:00
|
|
|
}
|
|
|
|
|
2006-07-18 01:13:18 +00:00
|
|
|
#ifdef DDB
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
/*
|
|
|
|
* Generate statistics across both the zone and its per-cpu cache's. Return
|
|
|
|
* desired statistics if the pointer is non-NULL for that statistic.
|
|
|
|
*
|
|
|
|
* Note: does not update the zone statistics, as it can't safely clear the
|
|
|
|
* per-CPU cache statistic.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
static void
|
2018-11-13 19:44:40 +00:00
|
|
|
uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp,
|
2019-08-06 21:50:34 +00:00
|
|
|
uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp)
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
{
|
|
|
|
uma_cache_t cache;
|
2019-08-06 21:50:34 +00:00
|
|
|
uint64_t allocs, frees, sleeps, xdomain;
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
int cachefree, cpu;
|
|
|
|
|
2019-08-06 21:50:34 +00:00
|
|
|
allocs = frees = sleeps = xdomain = 0;
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
cachefree = 0;
|
2010-06-11 18:46:34 +00:00
|
|
|
CPU_FOREACH(cpu) {
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
cache = &z->uz_cpu[cpu];
|
2019-12-25 20:50:53 +00:00
|
|
|
cachefree += cache->uc_allocbucket.ucb_cnt;
|
|
|
|
cachefree += cache->uc_freebucket.ucb_cnt;
|
|
|
|
xdomain += cache->uc_crossbucket.ucb_cnt;
|
|
|
|
cachefree += cache->uc_crossbucket.ucb_cnt;
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
allocs += cache->uc_allocs;
|
|
|
|
frees += cache->uc_frees;
|
|
|
|
}
|
2019-01-15 18:24:34 +00:00
|
|
|
allocs += counter_u64_fetch(z->uz_allocs);
|
|
|
|
frees += counter_u64_fetch(z->uz_frees);
|
2020-02-19 18:48:46 +00:00
|
|
|
xdomain += counter_u64_fetch(z->uz_xdomain);
|
2010-06-15 19:28:37 +00:00
|
|
|
sleeps += z->uz_sleeps;
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
if (cachefreep != NULL)
|
|
|
|
*cachefreep = cachefree;
|
|
|
|
if (allocsp != NULL)
|
|
|
|
*allocsp = allocs;
|
|
|
|
if (freesp != NULL)
|
|
|
|
*freesp = frees;
|
2010-06-15 19:28:37 +00:00
|
|
|
if (sleepsp != NULL)
|
|
|
|
*sleepsp = sleeps;
|
2019-08-06 21:50:34 +00:00
|
|
|
if (xdomainp != NULL)
|
|
|
|
*xdomainp = xdomain;
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
}
|
2006-07-18 01:13:18 +00:00
|
|
|
#endif /* DDB */
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user space buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
|
|
|
|
static int
|
|
|
|
sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
uma_keg_t kz;
|
|
|
|
uma_zone_t z;
|
|
|
|
int count;
|
|
|
|
|
|
|
|
count = 0;
|
2014-10-05 21:34:56 +00:00
|
|
|
rw_rlock(&uma_rwlock);
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
LIST_FOREACH(kz, &uma_kegs, uk_link) {
|
|
|
|
LIST_FOREACH(z, &kz->uk_zones, uz_link)
|
|
|
|
count++;
|
|
|
|
}
|
2019-02-07 03:32:45 +00:00
|
|
|
LIST_FOREACH(z, &uma_cachezones, uz_link)
|
|
|
|
count++;
|
|
|
|
|
2014-10-05 21:34:56 +00:00
|
|
|
rw_runlock(&uma_rwlock);
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
return (sysctl_handle_int(oidp, &count, 0, req));
|
|
|
|
}
|
|
|
|
|
2019-02-07 03:32:45 +00:00
|
|
|
static void
|
|
|
|
uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf,
|
|
|
|
struct uma_percpu_stat *ups, bool internal)
|
|
|
|
{
|
|
|
|
uma_zone_domain_t zdom;
|
|
|
|
uma_cache_t cache;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < vm_ndomains; i++) {
|
2020-02-19 18:48:46 +00:00
|
|
|
zdom = ZDOM_GET(z, i);
|
2019-02-07 03:32:45 +00:00
|
|
|
uth->uth_zone_free += zdom->uzd_nitems;
|
|
|
|
}
|
|
|
|
uth->uth_allocs = counter_u64_fetch(z->uz_allocs);
|
|
|
|
uth->uth_frees = counter_u64_fetch(z->uz_frees);
|
|
|
|
uth->uth_fails = counter_u64_fetch(z->uz_fails);
|
2020-02-19 18:48:46 +00:00
|
|
|
uth->uth_xdomain = counter_u64_fetch(z->uz_xdomain);
|
2019-02-07 03:32:45 +00:00
|
|
|
uth->uth_sleeps = z->uz_sleeps;
|
2019-10-22 14:20:06 +00:00
|
|
|
|
2019-02-07 03:32:45 +00:00
|
|
|
for (i = 0; i < mp_maxid + 1; i++) {
|
|
|
|
bzero(&ups[i], sizeof(*ups));
|
|
|
|
if (internal || CPU_ABSENT(i))
|
|
|
|
continue;
|
|
|
|
cache = &z->uz_cpu[i];
|
2019-12-25 20:50:53 +00:00
|
|
|
ups[i].ups_cache_free += cache->uc_allocbucket.ucb_cnt;
|
|
|
|
ups[i].ups_cache_free += cache->uc_freebucket.ucb_cnt;
|
|
|
|
ups[i].ups_cache_free += cache->uc_crossbucket.ucb_cnt;
|
2019-02-07 03:32:45 +00:00
|
|
|
ups[i].ups_allocs = cache->uc_allocs;
|
|
|
|
ups[i].ups_frees = cache->uc_frees;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user space buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
static int
|
|
|
|
sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
struct uma_stream_header ush;
|
|
|
|
struct uma_type_header uth;
|
2018-03-24 13:48:53 +00:00
|
|
|
struct uma_percpu_stat *ups;
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
struct sbuf sbuf;
|
|
|
|
uma_keg_t kz;
|
|
|
|
uma_zone_t z;
|
2020-01-04 03:04:46 +00:00
|
|
|
uint64_t items;
|
2020-01-04 03:30:08 +00:00
|
|
|
uint32_t kfree, pages;
|
2010-09-16 16:13:12 +00:00
|
|
|
int count, error, i;
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
|
2011-01-27 00:34:12 +00:00
|
|
|
error = sysctl_wire_old_buffer(req, 0);
|
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
2010-09-16 16:13:12 +00:00
|
|
|
sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
|
2015-03-14 17:08:28 +00:00
|
|
|
sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
|
2018-03-24 13:48:53 +00:00
|
|
|
ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);
|
2010-09-13 18:48:23 +00:00
|
|
|
|
2010-09-16 16:13:12 +00:00
|
|
|
count = 0;
|
2014-10-05 21:34:56 +00:00
|
|
|
rw_rlock(&uma_rwlock);
|
2010-09-13 18:48:23 +00:00
|
|
|
LIST_FOREACH(kz, &uma_kegs, uk_link) {
|
|
|
|
LIST_FOREACH(z, &kz->uk_zones, uz_link)
|
2010-09-16 16:13:12 +00:00
|
|
|
count++;
|
2010-09-13 18:48:23 +00:00
|
|
|
}
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
|
2019-02-07 03:32:45 +00:00
|
|
|
LIST_FOREACH(z, &uma_cachezones, uz_link)
|
|
|
|
count++;
|
|
|
|
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
/*
|
|
|
|
* Insert stream header.
|
|
|
|
*/
|
|
|
|
bzero(&ush, sizeof(ush));
|
|
|
|
ush.ush_version = UMA_STREAM_VERSION;
|
2005-07-16 11:03:06 +00:00
|
|
|
ush.ush_maxcpus = (mp_maxid + 1);
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
ush.ush_count = count;
|
2010-09-16 16:13:12 +00:00
|
|
|
(void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
|
|
|
|
LIST_FOREACH(kz, &uma_kegs, uk_link) {
|
2020-01-04 03:30:08 +00:00
|
|
|
kfree = pages = 0;
|
|
|
|
for (i = 0; i < vm_ndomains; i++) {
|
2020-02-11 20:06:33 +00:00
|
|
|
kfree += kz->uk_domain[i].ud_free_items;
|
2020-01-04 03:30:08 +00:00
|
|
|
pages += kz->uk_domain[i].ud_pages;
|
|
|
|
}
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
LIST_FOREACH(z, &kz->uk_zones, uz_link) {
|
|
|
|
bzero(&uth, sizeof(uth));
|
2005-07-25 00:47:32 +00:00
|
|
|
strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
uth.uth_align = kz->uk_align;
|
|
|
|
uth.uth_size = kz->uk_size;
|
|
|
|
uth.uth_rsize = kz->uk_rsize;
|
2020-01-04 03:04:46 +00:00
|
|
|
if (z->uz_max_items > 0) {
|
|
|
|
items = UZ_ITEMS_COUNT(z->uz_items);
|
|
|
|
uth.uth_pages = (items / kz->uk_ipers) *
|
2019-01-15 18:32:26 +00:00
|
|
|
kz->uk_ppera;
|
2020-01-04 03:04:46 +00:00
|
|
|
} else
|
2020-01-04 03:30:08 +00:00
|
|
|
uth.uth_pages = pages;
|
2019-01-15 18:49:31 +00:00
|
|
|
uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) *
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
kz->uk_ppera;
|
|
|
|
uth.uth_limit = z->uz_max_items;
|
2020-01-04 03:30:08 +00:00
|
|
|
uth.uth_keg_free = kfree;
|
2005-07-25 00:47:32 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* A zone is secondary is it is not the first entry
|
|
|
|
* on the keg's zone list.
|
|
|
|
*/
|
2009-01-25 09:11:24 +00:00
|
|
|
if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
|
2005-07-25 00:47:32 +00:00
|
|
|
(LIST_FIRST(&kz->uk_zones) != z))
|
|
|
|
uth.uth_zone_flags = UTH_ZONE_SECONDARY;
|
2019-02-07 03:32:45 +00:00
|
|
|
uma_vm_zone_stats(&uth, z, &sbuf, ups,
|
|
|
|
kz->uk_flags & UMA_ZFLAG_INTERNAL);
|
2018-03-24 13:48:53 +00:00
|
|
|
(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
|
|
|
|
for (i = 0; i < mp_maxid + 1; i++)
|
|
|
|
(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
}
|
|
|
|
}
|
2019-02-07 03:32:45 +00:00
|
|
|
LIST_FOREACH(z, &uma_cachezones, uz_link) {
|
|
|
|
bzero(&uth, sizeof(uth));
|
|
|
|
strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
|
|
|
|
uth.uth_size = z->uz_size;
|
|
|
|
uma_vm_zone_stats(&uth, z, &sbuf, ups, false);
|
|
|
|
(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
|
|
|
|
for (i = 0; i < mp_maxid + 1; i++)
|
|
|
|
(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
|
|
|
|
}
|
|
|
|
|
2014-10-05 21:34:56 +00:00
|
|
|
rw_runlock(&uma_rwlock);
|
2010-09-16 16:13:12 +00:00
|
|
|
error = sbuf_finish(&sbuf);
|
|
|
|
sbuf_delete(&sbuf);
|
2018-03-24 13:48:53 +00:00
|
|
|
free(ups, M_TEMP);
|
Introduce a new sysctl, vm.zone_stats, which exports UMA(9) allocator
statistics via a binary structure stream:
- Add structure 'uma_stream_header', which defines a stream version,
definition of MAXCPUs used in the stream, and the number of zone
records in the stream.
- Add structure 'uma_type_header', which defines the name, alignment,
size, resource allocation limits, current pages allocated, preferred
bucket size, and central zone + keg statistics.
- Add structure 'uma_percpu_stat', which, for each per-CPU cache,
includes the number of allocations and frees, as well as the number
of free items in the cache.
- When the sysctl is queried, return a stream header, followed by a
series of type descriptions, each consisting of a type header
followed by a series of MAXCPUs uma_percpu_stat structures holding
per-CPU allocation information. Typical values of MAXCPU will be
1 (UP compiled kernel) and 16 (SMP compiled kernel).
This query mechanism allows user space monitoring tools to extract
memory allocation statistics in a machine-readable form, and to do so
at a per-CPU granularity, allowing monitoring of allocation patterns
across CPUs in order to better understand the distribution of work and
memory flow over multiple CPUs.
While here, also export the number of UMA zones as a sysctl
vm.uma_count, in order to assist in sizing user swpace buffers to
receive the stream.
A follow-up commit of libmemstat(3), a library to monitor kernel memory
allocation, will occur in the next few days. This change directly
supports converting netstat(1)'s "-mb" mode to using UMA-sourced stats
rather than separately maintained mbuf allocator statistics.
MFC after: 1 week
2005-07-14 16:35:13 +00:00
|
|
|
return (error);
|
|
|
|
}
|
2005-10-20 16:39:33 +00:00
|
|
|
|
2014-02-07 14:29:03 +00:00
|
|
|
int
|
|
|
|
sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
uma_zone_t zone = *(uma_zone_t *)arg1;
|
2015-04-10 06:56:49 +00:00
|
|
|
int error, max;
|
2014-02-07 14:29:03 +00:00
|
|
|
|
2015-04-10 06:56:49 +00:00
|
|
|
max = uma_zone_get_max(zone);
|
2014-02-07 14:29:03 +00:00
|
|
|
error = sysctl_handle_int(oidp, &max, 0, req);
|
|
|
|
if (error || !req->newptr)
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
uma_zone_set_max(zone, max);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
2019-11-28 00:19:09 +00:00
|
|
|
uma_zone_t zone;
|
2014-02-07 14:29:03 +00:00
|
|
|
int cur;
|
|
|
|
|
2019-11-28 00:19:09 +00:00
|
|
|
/*
|
|
|
|
* Some callers want to add sysctls for global zones that
|
|
|
|
* may not yet exist so they pass a pointer to a pointer.
|
|
|
|
*/
|
|
|
|
if (arg2 == 0)
|
|
|
|
zone = *(uma_zone_t *)arg1;
|
|
|
|
else
|
|
|
|
zone = arg1;
|
2014-02-07 14:29:03 +00:00
|
|
|
cur = uma_zone_get_cur(zone);
|
|
|
|
return (sysctl_handle_int(oidp, &cur, 0, req));
|
|
|
|
}
|
|
|
|
|
2019-11-28 00:19:09 +00:00
|
|
|
static int
|
|
|
|
sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
uma_zone_t zone = arg1;
|
|
|
|
uint64_t cur;
|
|
|
|
|
|
|
|
cur = uma_zone_get_allocs(zone);
|
|
|
|
return (sysctl_handle_64(oidp, &cur, 0, req));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
uma_zone_t zone = arg1;
|
|
|
|
uint64_t cur;
|
|
|
|
|
|
|
|
cur = uma_zone_get_frees(zone);
|
|
|
|
return (sysctl_handle_64(oidp, &cur, 0, req));
|
|
|
|
}
|
|
|
|
|
2019-12-11 06:50:55 +00:00
|
|
|
static int
|
|
|
|
sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
struct sbuf sbuf;
|
|
|
|
uma_zone_t zone = arg1;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
sbuf_new_for_sysctl(&sbuf, NULL, 0, req);
|
|
|
|
if (zone->uz_flags != 0)
|
|
|
|
sbuf_printf(&sbuf, "0x%b", zone->uz_flags, PRINT_UMA_ZFLAGS);
|
|
|
|
else
|
|
|
|
sbuf_printf(&sbuf, "0");
|
|
|
|
error = sbuf_finish(&sbuf);
|
|
|
|
sbuf_delete(&sbuf);
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2019-12-13 09:32:09 +00:00
|
|
|
static int
|
|
|
|
sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
uma_keg_t keg = arg1;
|
|
|
|
int avail, effpct, total;
|
|
|
|
|
|
|
|
total = keg->uk_ppera * PAGE_SIZE;
|
2020-01-09 02:03:03 +00:00
|
|
|
if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0)
|
2020-01-14 02:14:15 +00:00
|
|
|
total += slabzone(keg->uk_ipers)->uz_keg->uk_rsize;
|
2019-12-13 09:32:09 +00:00
|
|
|
/*
|
|
|
|
* We consider the client's requested size and alignment here, not the
|
|
|
|
* real size determination uk_rsize, because we also adjust the real
|
|
|
|
* size for internal implementation reasons (max bitset size).
|
|
|
|
*/
|
|
|
|
avail = keg->uk_ipers * roundup2(keg->uk_size, keg->uk_align + 1);
|
|
|
|
if ((keg->uk_flags & UMA_ZONE_PCPU) != 0)
|
|
|
|
avail *= mp_maxid + 1;
|
|
|
|
effpct = 100 * avail / total;
|
|
|
|
return (sysctl_handle_int(oidp, &effpct, 0, req));
|
|
|
|
}
|
|
|
|
|
2020-01-04 03:04:46 +00:00
|
|
|
static int
|
|
|
|
sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
uma_zone_t zone = arg1;
|
|
|
|
uint64_t cur;
|
|
|
|
|
|
|
|
cur = UZ_ITEMS_COUNT(atomic_load_64(&zone->uz_items));
|
|
|
|
return (sysctl_handle_64(oidp, &cur, 0, req));
|
|
|
|
}
|
|
|
|
|
2016-02-03 22:02:36 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
static uma_slab_t
|
|
|
|
uma_dbg_getslab(uma_zone_t zone, void *item)
|
|
|
|
{
|
|
|
|
uma_slab_t slab;
|
|
|
|
uma_keg_t keg;
|
|
|
|
uint8_t *mem;
|
|
|
|
|
2020-01-04 03:15:34 +00:00
|
|
|
/*
|
|
|
|
* It is safe to return the slab here even though the
|
|
|
|
* zone is unlocked because the item's allocation state
|
|
|
|
* essentially holds a reference.
|
|
|
|
*/
|
2016-02-03 22:02:36 +00:00
|
|
|
mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
|
2020-01-04 03:15:34 +00:00
|
|
|
if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0)
|
|
|
|
return (NULL);
|
2020-01-09 02:03:03 +00:00
|
|
|
if (zone->uz_flags & UMA_ZFLAG_VTOSLAB)
|
2020-01-04 03:15:34 +00:00
|
|
|
return (vtoslab((vm_offset_t)mem));
|
|
|
|
keg = zone->uz_keg;
|
2020-01-09 02:03:03 +00:00
|
|
|
if ((keg->uk_flags & UMA_ZFLAG_HASH) == 0)
|
2020-01-04 03:15:34 +00:00
|
|
|
return ((uma_slab_t)(mem + keg->uk_pgoff));
|
2020-01-04 03:30:08 +00:00
|
|
|
KEG_LOCK(keg, 0);
|
2020-01-04 03:15:34 +00:00
|
|
|
slab = hash_sfind(&keg->uk_hash, mem);
|
2020-01-04 03:30:08 +00:00
|
|
|
KEG_UNLOCK(keg, 0);
|
2016-02-03 22:02:36 +00:00
|
|
|
|
|
|
|
return (slab);
|
|
|
|
}
|
|
|
|
|
2018-06-08 00:15:08 +00:00
|
|
|
static bool
|
|
|
|
uma_dbg_zskip(uma_zone_t zone, void *mem)
|
|
|
|
{
|
|
|
|
|
2020-01-04 03:15:34 +00:00
|
|
|
if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0)
|
2018-06-08 00:15:08 +00:00
|
|
|
return (true);
|
|
|
|
|
o Move zone limit from keg level up to zone level. This means that now
two zones sharing a keg may have different limits. Now this is going
to work:
zone = uma_zcreate();
uma_zone_set_max(zone, limit);
zone2 = uma_zsecond_create(zone);
uma_zone_set_max(zone2, limit2);
Kegs no longer have uk_maxpages field, but zones have uz_items. When
set, it may be rounded up to minimum possible CPU bucket cache size.
For small limits bucket cache can also be reconfigured to be smaller.
Counter uz_items is updated whenever items transition from keg to a
bucket cache or directly to a consumer. If zone has uz_maxitems set and
it is reached, then we are going to sleep.
o Since new limits don't play well with multi-keg zones, remove them. The
idea of multi-keg zones was introduced exactly 10 years ago, and never
have had a practical usage. In discussion with Jeff we came to a wild
agreement that if we ever want to reintroduce the idea of a smart allocator
that would be able to choose between two (or more) totally different
backing stores, that choice should be made one level higher than UMA,
e.g. in malloc(9) or in mget(), or whatever and choice should be controlled
by the caller.
o Sleeping code is improved to account number of sleepers and wake them one
by one, to avoid thundering herd problem.
o Flag UMA_ZONE_NOBUCKETCACHE removed, instead uma_zone_set_maxcache()
KPI added. Having no bucket cache basically means setting maxcache to 0.
o Now with many fields added and many removed (no multi-keg zones!) make
sure that struct uma_zone is perfectly aligned.
Reviewed by: markj, jeff
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D17773
2019-01-15 00:02:06 +00:00
|
|
|
return (uma_dbg_kskip(zone->uz_keg, mem));
|
2018-06-08 00:15:08 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static bool
|
|
|
|
uma_dbg_kskip(uma_keg_t keg, void *mem)
|
|
|
|
{
|
|
|
|
uintptr_t idx;
|
|
|
|
|
|
|
|
if (dbg_divisor == 0)
|
|
|
|
return (true);
|
|
|
|
|
|
|
|
if (dbg_divisor == 1)
|
|
|
|
return (false);
|
|
|
|
|
|
|
|
idx = (uintptr_t)mem >> PAGE_SHIFT;
|
|
|
|
if (keg->uk_ipers > 1) {
|
|
|
|
idx *= keg->uk_ipers;
|
|
|
|
idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((idx / dbg_divisor) * dbg_divisor != idx) {
|
|
|
|
counter_u64_add(uma_skip_cnt, 1);
|
|
|
|
return (true);
|
|
|
|
}
|
|
|
|
counter_u64_add(uma_dbg_cnt, 1);
|
|
|
|
|
|
|
|
return (false);
|
|
|
|
}
|
|
|
|
|
2016-02-03 22:02:36 +00:00
|
|
|
/*
|
|
|
|
* Set up the slab's freei data such that uma_dbg_free can function.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
|
|
|
|
{
|
|
|
|
uma_keg_t keg;
|
|
|
|
int freei;
|
|
|
|
|
|
|
|
if (slab == NULL) {
|
|
|
|
slab = uma_dbg_getslab(zone, item);
|
|
|
|
if (slab == NULL)
|
2020-10-02 19:03:42 +00:00
|
|
|
panic("uma: item %p did not belong to zone %s",
|
2016-02-03 22:02:36 +00:00
|
|
|
item, zone->uz_name);
|
|
|
|
}
|
2019-11-28 07:49:25 +00:00
|
|
|
keg = zone->uz_keg;
|
2019-12-08 01:15:06 +00:00
|
|
|
freei = slab_item_index(slab, keg, item);
|
2016-02-03 22:02:36 +00:00
|
|
|
|
2020-12-31 13:02:45 -08:00
|
|
|
if (BIT_TEST_SET_ATOMIC(keg->uk_ipers, freei,
|
|
|
|
slab_dbg_bits(slab, keg)))
|
2020-10-02 19:03:42 +00:00
|
|
|
panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)",
|
2016-02-03 22:02:36 +00:00
|
|
|
item, zone, zone->uz_name, slab, freei);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Verifies freed addresses. Checks for alignment, valid slab membership
|
|
|
|
* and duplicate frees.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
|
|
|
|
{
|
|
|
|
uma_keg_t keg;
|
|
|
|
int freei;
|
|
|
|
|
|
|
|
if (slab == NULL) {
|
|
|
|
slab = uma_dbg_getslab(zone, item);
|
|
|
|
if (slab == NULL)
|
2020-10-02 19:03:42 +00:00
|
|
|
panic("uma: Freed item %p did not belong to zone %s",
|
2016-02-03 22:02:36 +00:00
|
|
|
item, zone->uz_name);
|
|
|
|
}
|
2019-11-28 07:49:25 +00:00
|
|
|
keg = zone->uz_keg;
|
2019-12-08 01:15:06 +00:00
|
|
|
freei = slab_item_index(slab, keg, item);
|
2016-02-03 22:02:36 +00:00
|
|
|
|
|
|
|
if (freei >= keg->uk_ipers)
|
2020-10-02 19:03:42 +00:00
|
|
|
panic("Invalid free of %p from zone %p(%s) slab %p(%d)",
|
2016-02-03 22:02:36 +00:00
|
|
|
item, zone, zone->uz_name, slab, freei);
|
|
|
|
|
2019-12-08 01:15:06 +00:00
|
|
|
if (slab_item(slab, keg, freei) != item)
|
2020-10-02 19:03:42 +00:00
|
|
|
panic("Unaligned free of %p from zone %p(%s) slab %p(%d)",
|
2016-02-03 22:02:36 +00:00
|
|
|
item, zone, zone->uz_name, slab, freei);
|
|
|
|
|
2020-12-31 13:02:45 -08:00
|
|
|
if (!BIT_TEST_CLR_ATOMIC(keg->uk_ipers, freei,
|
|
|
|
slab_dbg_bits(slab, keg)))
|
2020-10-02 19:03:42 +00:00
|
|
|
panic("Duplicate free of %p from zone %p(%s) slab %p(%d)",
|
2016-02-03 22:02:36 +00:00
|
|
|
item, zone, zone->uz_name, slab, freei);
|
|
|
|
}
|
|
|
|
#endif /* INVARIANTS */
|
|
|
|
|
2005-10-20 16:39:33 +00:00
|
|
|
#ifdef DDB
|
2019-10-11 01:31:31 +00:00
|
|
|
static int64_t
|
|
|
|
get_uma_stats(uma_keg_t kz, uma_zone_t z, uint64_t *allocs, uint64_t *used,
|
2019-10-11 06:02:03 +00:00
|
|
|
uint64_t *sleeps, long *cachefree, uint64_t *xdomain)
|
2019-10-11 01:31:31 +00:00
|
|
|
{
|
|
|
|
uint64_t frees;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
|
|
|
|
*allocs = counter_u64_fetch(z->uz_allocs);
|
|
|
|
frees = counter_u64_fetch(z->uz_frees);
|
|
|
|
*sleeps = z->uz_sleeps;
|
|
|
|
*cachefree = 0;
|
|
|
|
*xdomain = 0;
|
|
|
|
} else
|
|
|
|
uma_zone_sumstat(z, cachefree, allocs, &frees, sleeps,
|
|
|
|
xdomain);
|
2020-01-04 03:30:08 +00:00
|
|
|
for (i = 0; i < vm_ndomains; i++) {
|
2020-02-19 18:48:46 +00:00
|
|
|
*cachefree += ZDOM_GET(z, i)->uzd_nitems;
|
2020-01-04 03:30:08 +00:00
|
|
|
if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
|
|
|
|
(LIST_FIRST(&kz->uk_zones) != z)))
|
2020-02-11 20:06:33 +00:00
|
|
|
*cachefree += kz->uk_domain[i].ud_free_items;
|
2020-01-04 03:30:08 +00:00
|
|
|
}
|
2019-10-11 01:31:31 +00:00
|
|
|
*used = *allocs - frees;
|
|
|
|
return (((int64_t)*used + *cachefree) * kz->uk_size);
|
|
|
|
}
|
|
|
|
|
2005-10-20 16:39:33 +00:00
|
|
|
DB_SHOW_COMMAND(uma, db_show_uma)
|
|
|
|
{
|
2019-10-11 01:31:31 +00:00
|
|
|
const char *fmt_hdr, *fmt_entry;
|
2005-10-20 16:39:33 +00:00
|
|
|
uma_keg_t kz;
|
|
|
|
uma_zone_t z;
|
2019-10-11 01:31:31 +00:00
|
|
|
uint64_t allocs, used, sleeps, xdomain;
|
2018-11-13 19:44:40 +00:00
|
|
|
long cachefree;
|
2019-10-11 01:31:31 +00:00
|
|
|
/* variables for sorting */
|
|
|
|
uma_keg_t cur_keg;
|
|
|
|
uma_zone_t cur_zone, last_zone;
|
|
|
|
int64_t cur_size, last_size, size;
|
|
|
|
int ties;
|
|
|
|
|
|
|
|
/* /i option produces machine-parseable CSV output */
|
|
|
|
if (modif[0] == 'i') {
|
|
|
|
fmt_hdr = "%s,%s,%s,%s,%s,%s,%s,%s,%s\n";
|
|
|
|
fmt_entry = "\"%s\",%ju,%jd,%ld,%ju,%ju,%u,%jd,%ju\n";
|
|
|
|
} else {
|
|
|
|
fmt_hdr = "%18s %6s %7s %7s %11s %7s %7s %10s %8s\n";
|
|
|
|
fmt_entry = "%18s %6ju %7jd %7ld %11ju %7ju %7u %10jd %8ju\n";
|
|
|
|
}
|
2005-10-20 16:39:33 +00:00
|
|
|
|
2019-10-11 01:31:31 +00:00
|
|
|
db_printf(fmt_hdr, "Zone", "Size", "Used", "Free", "Requests",
|
|
|
|
"Sleeps", "Bucket", "Total Mem", "XFree");
|
|
|
|
|
|
|
|
/* Sort the zones with largest size first. */
|
|
|
|
last_zone = NULL;
|
|
|
|
last_size = INT64_MAX;
|
|
|
|
for (;;) {
|
|
|
|
cur_zone = NULL;
|
|
|
|
cur_size = -1;
|
|
|
|
ties = 0;
|
|
|
|
LIST_FOREACH(kz, &uma_kegs, uk_link) {
|
|
|
|
LIST_FOREACH(z, &kz->uk_zones, uz_link) {
|
|
|
|
/*
|
|
|
|
* In the case of size ties, print out zones
|
|
|
|
* in the order they are encountered. That is,
|
|
|
|
* when we encounter the most recently output
|
|
|
|
* zone, we have already printed all preceding
|
|
|
|
* ties, and we must print all following ties.
|
|
|
|
*/
|
|
|
|
if (z == last_zone) {
|
|
|
|
ties = 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
size = get_uma_stats(kz, z, &allocs, &used,
|
|
|
|
&sleeps, &cachefree, &xdomain);
|
|
|
|
if (size > cur_size && size < last_size + ties)
|
|
|
|
{
|
|
|
|
cur_size = size;
|
|
|
|
cur_zone = z;
|
|
|
|
cur_keg = kz;
|
|
|
|
}
|
|
|
|
}
|
2005-10-20 16:39:33 +00:00
|
|
|
}
|
2019-10-11 01:31:31 +00:00
|
|
|
if (cur_zone == NULL)
|
|
|
|
break;
|
|
|
|
|
|
|
|
size = get_uma_stats(cur_keg, cur_zone, &allocs, &used,
|
|
|
|
&sleeps, &cachefree, &xdomain);
|
|
|
|
db_printf(fmt_entry, cur_zone->uz_name,
|
|
|
|
(uintmax_t)cur_keg->uk_size, (intmax_t)used, cachefree,
|
|
|
|
(uintmax_t)allocs, (uintmax_t)sleeps,
|
2019-11-28 00:19:09 +00:00
|
|
|
(unsigned)cur_zone->uz_bucket_size, (intmax_t)size,
|
|
|
|
xdomain);
|
2019-10-11 01:31:31 +00:00
|
|
|
|
|
|
|
if (db_pager_quit)
|
|
|
|
return;
|
|
|
|
last_zone = cur_zone;
|
|
|
|
last_size = cur_size;
|
2005-10-20 16:39:33 +00:00
|
|
|
}
|
|
|
|
}
|
2013-11-28 19:20:49 +00:00
|
|
|
|
|
|
|
DB_SHOW_COMMAND(umacache, db_show_umacache)
|
|
|
|
{
|
|
|
|
uma_zone_t z;
|
2018-01-12 23:25:05 +00:00
|
|
|
uint64_t allocs, frees;
|
2018-11-13 19:44:40 +00:00
|
|
|
long cachefree;
|
|
|
|
int i;
|
2013-11-28 19:20:49 +00:00
|
|
|
|
|
|
|
db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
|
|
|
|
"Requests", "Bucket");
|
|
|
|
LIST_FOREACH(z, &uma_cachezones, uz_link) {
|
2019-08-06 21:50:34 +00:00
|
|
|
uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL);
|
2018-11-13 19:44:40 +00:00
|
|
|
for (i = 0; i < vm_ndomains; i++)
|
2020-02-19 18:48:46 +00:00
|
|
|
cachefree += ZDOM_GET(z, i)->uzd_nitems;
|
2018-11-13 19:44:40 +00:00
|
|
|
db_printf("%18s %8ju %8jd %8ld %12ju %8u\n",
|
2013-11-28 19:20:49 +00:00
|
|
|
z->uz_name, (uintmax_t)z->uz_size,
|
|
|
|
(intmax_t)(allocs - frees), cachefree,
|
2019-11-28 00:19:09 +00:00
|
|
|
(uintmax_t)allocs, z->uz_bucket_size);
|
2013-11-28 19:20:49 +00:00
|
|
|
if (db_pager_quit)
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2016-02-03 22:02:36 +00:00
|
|
|
#endif /* DDB */
|