Remove support for lazy deallocation. Benchmarks across a wide range of
allocation patterns, number of CPUs, and MALLOC_OPTIONS settings indicate that lazy deallocation has the potential to worsen throughput dramatically. Performance degradation occurs when multiple threads try to clear the lazy free cache simultaneously. Various experiments to avoid this bottleneck failed to completely solve this problem, while adding yet more complexity.
This commit is contained in:
parent
370f990d30
commit
196d0d4b59
@ -32,7 +32,7 @@
|
||||
.\" @(#)malloc.3 8.1 (Berkeley) 6/4/93
|
||||
.\" $FreeBSD$
|
||||
.\"
|
||||
.Dd February 5, 2008
|
||||
.Dd February 17, 2008
|
||||
.Dt MALLOC 3
|
||||
.Os
|
||||
.Sh NAME
|
||||
@ -231,17 +231,6 @@ This is intended for debugging and will impact performance negatively.
|
||||
.It K
|
||||
Double/halve the virtual memory chunk size.
|
||||
The default chunk size is 1 MB.
|
||||
.It L
|
||||
Double/halve the per-arena number of slots for lazy deallocation.
|
||||
Lazy deallocation can decrease lock contention, especially for programs that use
|
||||
the producer/consumer model.
|
||||
The default is 256 slots per arena (so
|
||||
.Ev MALLOC_OPTIONS=9l
|
||||
will disable lazy deallocation), but note that due to algorithmic details, the
|
||||
cache is typically flushed well before it completely fills.
|
||||
This option has no impact unless there are multiple CPUs, and lazy deallocation
|
||||
does not activate unless the program uses multiple threads.
|
||||
This option is not available for some configurations (non-PIC).
|
||||
.It M
|
||||
Use
|
||||
.Xr mmap 2
|
||||
|
@ -111,13 +111,6 @@
|
||||
# define MALLOC_STATS
|
||||
#endif
|
||||
|
||||
/*
|
||||
* MALLOC_LAZY_FREE enables the use of a per-thread vector of slots that free()
|
||||
* can atomically stuff object pointers into. This can reduce arena lock
|
||||
* contention.
|
||||
*/
|
||||
#define MALLOC_LAZY_FREE
|
||||
|
||||
/*
|
||||
* MALLOC_BALANCE enables monitoring of arena lock contention and dynamically
|
||||
* re-balances arena load if exponentially averaged contention exceeds a
|
||||
@ -241,10 +234,6 @@ __FBSDID("$FreeBSD$");
|
||||
/* MALLOC_BALANCE requires TLS. */
|
||||
# ifdef MALLOC_BALANCE
|
||||
# undef MALLOC_BALANCE
|
||||
# endif
|
||||
/* MALLOC_LAZY_FREE requires TLS. */
|
||||
# ifdef MALLOC_LAZY_FREE
|
||||
# undef MALLOC_LAZY_FREE
|
||||
# endif
|
||||
#endif
|
||||
|
||||
@ -305,20 +294,6 @@ __FBSDID("$FreeBSD$");
|
||||
#define RUN_MAX_SMALL_2POW 15
|
||||
#define RUN_MAX_SMALL (1U << RUN_MAX_SMALL_2POW)
|
||||
|
||||
#ifdef MALLOC_LAZY_FREE
|
||||
/* Default size of each arena's lazy free cache. */
|
||||
# define LAZY_FREE_2POW_DEFAULT 8
|
||||
/*
|
||||
* Number of pseudo-random probes to conduct before considering the cache to
|
||||
* be overly full. It takes on average n probes to detect fullness of
|
||||
* (n-1)/n. However, we are effectively doing multiple non-independent
|
||||
* trials (each deallocation is a trial), so the actual average threshold
|
||||
* for clearing the cache is somewhat lower.
|
||||
*/
|
||||
# define LAZY_FREE_NPROBES_2POW_MIN 2
|
||||
# define LAZY_FREE_NPROBES_2POW_MAX 3
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Hyper-threaded CPUs may need a special instruction inside spin loops in
|
||||
* order to yield to another virtual CPU. If no such instruction is defined
|
||||
@ -652,16 +627,6 @@ struct arena_s {
|
||||
uint32_t contention;
|
||||
#endif
|
||||
|
||||
#ifdef MALLOC_LAZY_FREE
|
||||
/*
|
||||
* Deallocation of small objects can be lazy, in which case free_cache
|
||||
* stores pointers to those objects that have not yet been deallocated.
|
||||
* In order to avoid lock contention, slots are chosen randomly. Empty
|
||||
* slots contain NULL.
|
||||
*/
|
||||
void **free_cache;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* bins is used to store rings of free regions of the following sizes,
|
||||
* assuming a 16-byte quantum, 4kB pagesize, and default MALLOC_OPTIONS.
|
||||
@ -831,9 +796,6 @@ static bool opt_dss = true;
|
||||
static bool opt_mmap = true;
|
||||
#endif
|
||||
static size_t opt_dirty_max = DIRTY_MAX_DEFAULT;
|
||||
#ifdef MALLOC_LAZY_FREE
|
||||
static int opt_lazy_free_2pow = LAZY_FREE_2POW_DEFAULT;
|
||||
#endif
|
||||
#ifdef MALLOC_BALANCE
|
||||
static uint64_t opt_balance_threshold = BALANCE_THRESHOLD_DEFAULT;
|
||||
#endif
|
||||
@ -930,10 +892,6 @@ static void *arena_malloc_large(arena_t *arena, size_t size, bool zero);
|
||||
static void *arena_palloc(arena_t *arena, size_t alignment, size_t size,
|
||||
size_t alloc_size);
|
||||
static size_t arena_salloc(const void *ptr);
|
||||
#ifdef MALLOC_LAZY_FREE
|
||||
static void arena_dalloc_lazy_hard(arena_t *arena, arena_chunk_t *chunk,
|
||||
void *ptr, size_t pageind, arena_chunk_map_t *mapelm, unsigned slot);
|
||||
#endif
|
||||
static void arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk,
|
||||
void *ptr);
|
||||
static void arena_ralloc_resize_shrink(arena_t *arena, arena_chunk_t *chunk,
|
||||
@ -1115,7 +1073,7 @@ pow2_ceil(size_t x)
|
||||
return (x);
|
||||
}
|
||||
|
||||
#if (defined(MALLOC_LAZY_FREE) || defined(MALLOC_BALANCE))
|
||||
#ifdef MALLOC_BALANCE
|
||||
/*
|
||||
* Use a simple linear congruential pseudo-random number generator:
|
||||
*
|
||||
@ -1159,17 +1117,6 @@ prn_##suffix(uint32_t lg_range) \
|
||||
# define PRN(suffix, lg_range) prn_##suffix(lg_range)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Define PRNGs, one for each purpose, in order to avoid auto-correlation
|
||||
* problems.
|
||||
*/
|
||||
|
||||
#ifdef MALLOC_LAZY_FREE
|
||||
/* Define the per-thread PRNG used for lazy deallocation. */
|
||||
static __thread uint32_t lazy_free_x;
|
||||
PRN_DEFINE(lazy_free, lazy_free_x, 12345, 12347)
|
||||
#endif
|
||||
|
||||
#ifdef MALLOC_BALANCE
|
||||
/* Define the PRNG used for arena assignment. */
|
||||
static __thread uint32_t balance_x;
|
||||
@ -2000,27 +1947,8 @@ choose_arena_hard(void)
|
||||
|
||||
assert(__isthreaded);
|
||||
|
||||
#ifdef MALLOC_LAZY_FREE
|
||||
/*
|
||||
* Seed the PRNG used for lazy deallocation. Since seeding only occurs
|
||||
* on the first allocation by a thread, it is possible for a thread to
|
||||
* deallocate before seeding. This is not a critical issue though,
|
||||
* since it is extremely unusual for an application to to use threads
|
||||
* that deallocate but *never* allocate, and because even if seeding
|
||||
* never occurs for multiple threads, they will tend to drift apart
|
||||
* unless some aspect of the application forces deallocation
|
||||
* synchronization.
|
||||
*/
|
||||
SPRN(lazy_free, (uint32_t)(uintptr_t)(_pthread_self()));
|
||||
#endif
|
||||
|
||||
#ifdef MALLOC_BALANCE
|
||||
/*
|
||||
* Seed the PRNG used for arena load balancing. We can get away with
|
||||
* using the same seed here as for the lazy_free PRNG without
|
||||
* introducing autocorrelation because the PRNG parameters are
|
||||
* distinct.
|
||||
*/
|
||||
/* Seed the PRNG used for arena load balancing. */
|
||||
SPRN(balance, (uint32_t)(uintptr_t)(_pthread_self()));
|
||||
#endif
|
||||
|
||||
@ -3343,92 +3271,6 @@ arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr,
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef MALLOC_LAZY_FREE
|
||||
static inline void
|
||||
arena_dalloc_lazy(arena_t *arena, arena_chunk_t *chunk, void *ptr,
|
||||
size_t pageind, arena_chunk_map_t *mapelm)
|
||||
{
|
||||
void **free_cache = arena->free_cache;
|
||||
unsigned i, nprobes, slot;
|
||||
|
||||
if (__isthreaded == false || opt_lazy_free_2pow < 0) {
|
||||
malloc_spin_lock(&arena->lock);
|
||||
arena_dalloc_small(arena, chunk, ptr, pageind, *mapelm);
|
||||
malloc_spin_unlock(&arena->lock);
|
||||
return;
|
||||
}
|
||||
|
||||
nprobes = (1U << LAZY_FREE_NPROBES_2POW_MIN) + PRN(lazy_free,
|
||||
(LAZY_FREE_NPROBES_2POW_MAX - LAZY_FREE_NPROBES_2POW_MIN));
|
||||
for (i = 0; i < nprobes; i++) {
|
||||
slot = PRN(lazy_free, opt_lazy_free_2pow);
|
||||
if (atomic_cmpset_ptr((uintptr_t *)&free_cache[slot],
|
||||
(uintptr_t)NULL, (uintptr_t)ptr)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
arena_dalloc_lazy_hard(arena, chunk, ptr, pageind, mapelm, slot);
|
||||
}
|
||||
|
||||
static void
|
||||
arena_dalloc_lazy_hard(arena_t *arena, arena_chunk_t *chunk, void *ptr,
|
||||
size_t pageind, arena_chunk_map_t *mapelm, unsigned slot)
|
||||
{
|
||||
void **free_cache = arena->free_cache;
|
||||
unsigned i;
|
||||
|
||||
malloc_spin_lock(&arena->lock);
|
||||
arena_dalloc_small(arena, chunk, ptr, pageind, *mapelm);
|
||||
|
||||
/*
|
||||
* Check whether another thread already cleared the cache. It is
|
||||
* possible that another thread cleared the cache *and* this slot was
|
||||
* already refilled, which could result in a mostly fruitless cache
|
||||
* sweep, but such a sequence of events causes no correctness issues.
|
||||
*/
|
||||
if ((ptr = (void *)atomic_readandclear_ptr(
|
||||
(uintptr_t *)&free_cache[slot]))
|
||||
!= NULL) {
|
||||
unsigned lazy_free_mask;
|
||||
|
||||
/*
|
||||
* Clear the cache, since we failed to find a slot. It is
|
||||
* possible that other threads will continue to insert objects
|
||||
* into the cache while this one sweeps, but that is okay,
|
||||
* since on average the cache is still swept with the same
|
||||
* frequency.
|
||||
*/
|
||||
|
||||
/* Handle pointer at current slot. */
|
||||
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
|
||||
pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >>
|
||||
pagesize_2pow);
|
||||
mapelm = &chunk->map[pageind];
|
||||
arena_dalloc_small(arena, chunk, ptr, pageind, *mapelm);
|
||||
|
||||
/* Sweep remainder of slots. */
|
||||
lazy_free_mask = (1U << opt_lazy_free_2pow) - 1;
|
||||
for (i = (slot + 1) & lazy_free_mask;
|
||||
i != slot;
|
||||
i = (i + 1) & lazy_free_mask) {
|
||||
ptr = (void *)atomic_readandclear_ptr(
|
||||
(uintptr_t *)&free_cache[i]);
|
||||
if (ptr != NULL) {
|
||||
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
|
||||
pageind = (((uintptr_t)ptr - (uintptr_t)chunk)
|
||||
>> pagesize_2pow);
|
||||
mapelm = &chunk->map[pageind];
|
||||
arena_dalloc_small(arena, chunk, ptr, pageind,
|
||||
*mapelm);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
malloc_spin_unlock(&arena->lock);
|
||||
}
|
||||
#endif
|
||||
|
||||
static void
|
||||
arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr)
|
||||
{
|
||||
@ -3479,13 +3321,9 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
|
||||
mapelm = &chunk->map[pageind];
|
||||
if ((*mapelm & CHUNK_MAP_LARGE) == 0) {
|
||||
/* Small allocation. */
|
||||
#ifdef MALLOC_LAZY_FREE
|
||||
arena_dalloc_lazy(arena, chunk, ptr, pageind, mapelm);
|
||||
#else
|
||||
malloc_spin_lock(&arena->lock);
|
||||
arena_dalloc_small(arena, chunk, ptr, pageind, *mapelm);
|
||||
malloc_spin_unlock(&arena->lock);
|
||||
#endif
|
||||
} else {
|
||||
assert((*mapelm & CHUNK_MAP_POS_MASK) == 0);
|
||||
arena_dalloc_large(arena, chunk, ptr);
|
||||
@ -3710,15 +3548,6 @@ arena_new(arena_t *arena)
|
||||
#ifdef MALLOC_BALANCE
|
||||
arena->contention = 0;
|
||||
#endif
|
||||
#ifdef MALLOC_LAZY_FREE
|
||||
if (opt_lazy_free_2pow >= 0) {
|
||||
arena->free_cache = (void **) base_calloc(1, sizeof(void *)
|
||||
* (1U << opt_lazy_free_2pow));
|
||||
if (arena->free_cache == NULL)
|
||||
return (true);
|
||||
} else
|
||||
arena->free_cache = NULL;
|
||||
#endif
|
||||
|
||||
/* Initialize bins. */
|
||||
prev_run_size = pagesize;
|
||||
@ -4037,13 +3866,6 @@ malloc_print_stats(void)
|
||||
|
||||
_malloc_message("CPUs: ", umax2s(ncpus, s), "\n", "");
|
||||
_malloc_message("Max arenas: ", umax2s(narenas, s), "\n", "");
|
||||
#ifdef MALLOC_LAZY_FREE
|
||||
if (opt_lazy_free_2pow >= 0) {
|
||||
_malloc_message("Lazy free slots: ",
|
||||
umax2s(1U << opt_lazy_free_2pow, s), "\n", "");
|
||||
} else
|
||||
_malloc_message("Lazy free slots: 0\n", "", "", "");
|
||||
#endif
|
||||
#ifdef MALLOC_BALANCE
|
||||
_malloc_message("Arena balance threshold: ",
|
||||
umax2s(opt_balance_threshold, s), "\n", "");
|
||||
@ -4189,11 +4011,6 @@ malloc_init_hard(void)
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef MALLOC_LAZY_FREE
|
||||
if (ncpus == 1)
|
||||
opt_lazy_free_2pow = -1;
|
||||
#endif
|
||||
|
||||
/* Get page size. */
|
||||
{
|
||||
long result;
|
||||
@ -4345,18 +4162,6 @@ MALLOC_OUT:
|
||||
(sizeof(size_t) << 3))
|
||||
opt_chunk_2pow++;
|
||||
break;
|
||||
case 'l':
|
||||
#ifdef MALLOC_LAZY_FREE
|
||||
if (opt_lazy_free_2pow >= 0)
|
||||
opt_lazy_free_2pow--;
|
||||
#endif
|
||||
break;
|
||||
case 'L':
|
||||
#ifdef MALLOC_LAZY_FREE
|
||||
if (ncpus > 1)
|
||||
opt_lazy_free_2pow++;
|
||||
#endif
|
||||
break;
|
||||
case 'm':
|
||||
#ifdef MALLOC_DSS
|
||||
opt_mmap = false;
|
||||
@ -4493,14 +4298,6 @@ MALLOC_OUT:
|
||||
}
|
||||
arena_maxclass = chunksize - (arena_chunk_header_npages <<
|
||||
pagesize_2pow);
|
||||
#ifdef MALLOC_LAZY_FREE
|
||||
/*
|
||||
* Make sure that allocating the free_cache does not exceed the limits
|
||||
* of what base_alloc() can handle.
|
||||
*/
|
||||
while ((sizeof(void *) << opt_lazy_free_2pow) > chunksize)
|
||||
opt_lazy_free_2pow--;
|
||||
#endif
|
||||
|
||||
UTRACE(0, 0, 0);
|
||||
|
||||
@ -4644,11 +4441,8 @@ MALLOC_OUT:
|
||||
#endif
|
||||
/*
|
||||
* Seed here for the initial thread, since choose_arena_hard() is only
|
||||
* called for other threads. The seed values don't really matter.
|
||||
* called for other threads. The seed value doesn't really matter.
|
||||
*/
|
||||
#ifdef MALLOC_LAZY_FREE
|
||||
SPRN(lazy_free, 42);
|
||||
#endif
|
||||
#ifdef MALLOC_BALANCE
|
||||
SPRN(balance, 42);
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user