Implement lazy deallocation of small objects. For each arena, maintain a
vector of slots for lazily freed objects. For each deallocation, before doing the hard work of locking the arena and deallocating, try several times to randomly insert the object into the vector using atomic operations. This approach is particularly effective at reducing contention for multi-threaded applications that use the producer-consumer model, wherein one producer thread allocates objects, then multiple consumer threads deallocate those objects.
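As a rough illustration of the idea (not the committed code): a minimal sketch of the same probe-then-sweep scheme, assuming C11 atomics and a plain mutex in place of the arena lock and <machine/atomic.h>; the names lazy_free, dalloc_locked, prn, and NSLOTS are placeholders.

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h>

#define NSLOTS  256     /* 1 << LAZY_FREE_2POW_DEFAULT */
#define NPROBES 5

static _Atomic(void *) free_cache[NSLOTS];      /* empty slots hold NULL */
static pthread_mutex_t arena_mtx = PTHREAD_MUTEX_INITIALIZER;
static _Thread_local uint32_t prn_state = 42;

/* Minimal per-thread LCG standing in for the real PRNG; upper bits used. */
static unsigned
prn(void)
{
        prn_state = prn_state * 12345 + 12347;
        return (prn_state >> 24);
}

/* Stand-in for "lock the arena and deallocate". */
static void
dalloc_locked(void *ptr)
{
        pthread_mutex_lock(&arena_mtx);
        free(ptr);
        pthread_mutex_unlock(&arena_mtx);
}

void
lazy_free(void *ptr)
{
        unsigned i;

        /* Probe a few random slots; an empty one lets us skip the lock. */
        for (i = 0; i < NPROBES; i++) {
                void *expected = NULL;

                if (atomic_compare_exchange_strong(
                    &free_cache[prn() & (NSLOTS - 1)], &expected, ptr))
                        return;
        }

        /* Cache looks full: free ptr under the lock and sweep all slots. */
        dalloc_locked(ptr);
        for (i = 0; i < NSLOTS; i++) {
                void *p = atomic_exchange(&free_cache[i], NULL);

                if (p != NULL)
                        dalloc_locked(p);
        }
}

The point of the random probes is that the common case performs a single compare-and-swap and never takes the mutex; only when the cache looks full does a deallocation pay for the lock and drain everything.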
commit 26b5e3a18e (parent bcd3523138)
Notes (svn2git): 2020-12-20 02:59:44 +00:00
    svn path=/head/; revision=173966
@@ -259,6 +259,17 @@ __FBSDID("$FreeBSD$");
#define RUN_MAX_SMALL_2POW      15
#define RUN_MAX_SMALL           (1U << RUN_MAX_SMALL_2POW)

/* Default size of each arena's lazy free cache. */
#define LAZY_FREE_2POW_DEFAULT  8
/*
 * Number of pseudo-random probes to conduct before considering the cache to be
 * overly full.  It takes on average n probes to detect fullness of (n-1)/n.
 * However, we are effectively doing multiple non-independent trials (each
 * deallocation is a trial), so the actual average threshold for clearing the
 * cache is somewhat lower.
 */
#define LAZY_FREE_NPROBES       5

/******************************************************************************/

/*
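To put numbers on the LAZY_FREE_NPROBES comment above (a back-of-the-envelope estimate, not from the commit): if the cache has fill fraction f, a deallocation falls through to the locked sweep only when all five probes land on occupied slots, i.e. with probability roughly f^5 if the probes are treated as independent. At f = 4/5, an unbounded probe sequence needs 1/(1 - f) = 5 probes on average to find an empty slot, which is the "(n-1)/n" rule of thumb in the comment; and f^5 crosses 1/2 near f ≈ 0.87, so by the time the cache is about 87% full roughly every other deallocation triggers a sweep. Because consecutive deallocations are not independent trials, the effective threshold sits somewhat lower, as the comment notes.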
@@ -529,6 +540,16 @@ struct arena_s {
         */
        arena_chunk_t   *spare;

#ifndef NO_TLS
        /*
         * Deallocation of small objects can be lazy, in which case free_cache
         * stores pointers to those objects that have not yet been deallocated.
         * In order to avoid lock contention, slots are chosen randomly.  Empty
         * slots contain NULL.
         */
        void            **free_cache;
#endif

        /*
         * bins is used to store rings of free regions of the following sizes,
         * assuming a 16-byte quantum, 4kB pagesize, and default MALLOC_OPTIONS.
@@ -691,6 +712,9 @@ static bool opt_abort = false;
static bool     opt_junk = false;
#endif
static bool     opt_hint = false;
#ifndef NO_TLS
static int      opt_lazy_free_2pow = LAZY_FREE_2POW_DEFAULT;
#endif
static bool     opt_print_stats = false;
static size_t   opt_quantum_2pow = QUANTUM_2POW_MIN;
static size_t   opt_small_max_2pow = SMALL_MAX_2POW_DEFAULT;
@@ -851,6 +875,59 @@ pow2_ceil(size_t x)
        return (x);
}

#ifndef NO_TLS
/*
 * Use a simple linear congruential pseudo-random number generator:
 *
 *   prn(y) = (a*x + c) % m
 *
 * where the following constants ensure maximal period:
 *
 *   a == Odd number (relatively prime to 2^n), and (a-1) is a multiple of 4.
 *   c == Odd number (relatively prime to 2^n).
 *   m == 2^32
 *
 * See Knuth's TAOCP 3rd Ed., Vol. 2, pg. 17 for details on these constraints.
 *
 * This choice of m has the disadvantage that the quality of the bits is
 * proportional to bit position.  For example, the lowest bit has a cycle of 2,
 * the next has a cycle of 4, etc.  For this reason, we prefer to use the upper
 * bits.
 */
#define PRN_DEFINE(suffix, var, a, c) \
static inline void \
sprn_##suffix(uint32_t seed) \
{ \
        var = seed; \
} \
\
static inline uint32_t \
prn_##suffix(uint32_t lg_range) \
{ \
        uint32_t ret, x; \
\
        assert(lg_range > 0); \
        assert(lg_range <= 32); \
\
        x = (var * (a)) + (c); \
        var = x; \
        ret = x >> (32 - lg_range); \
\
        return (ret); \
}
#define SPRN(suffix, seed)      sprn_##suffix(seed)
#define PRN(suffix, lg_range)   prn_##suffix(lg_range)

/*
 * Define PRNGs, one for each purpose, in order to avoid auto-correlation
 * problems.
 */

/* Define the per-thread PRNG used for lazy deallocation. */
static __thread uint32_t lazy_free_x;
PRN_DEFINE(lazy_free, lazy_free_x, 12345, 12347)
#endif

static void
wrtmessage(const char *p1, const char *p2, const char *p3, const char *p4)
{
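The low-bit weakness that the PRNG comment describes is easy to see in isolation; a standalone snippet (not part of the diff) using the same 12345/12347 constants:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint32_t x = 42;
        int i;

        /* With a odd and c odd, bit 0 of the state simply alternates. */
        for (i = 0; i < 8; i++) {
                x = x * 12345 + 12347;
                printf("low bit %u  top 8 bits %3u\n",
                    (unsigned)(x & 1), (unsigned)(x >> 24));
        }
        return (0);
}

The low bit flips 0,1,0,1,... on every step while the top bits look well mixed, which is why prn_##suffix returns x >> (32 - lg_range) instead of masking off the low bits.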
@@ -1524,6 +1601,18 @@ choose_arena_hard(void)

        assert(__isthreaded);

        /*
         * Seed the PRNG used for lazy deallocation.  Since seeding only occurs
         * on the first allocation by a thread, it is possible for a thread to
         * deallocate before seeding.  This is not a critical issue though,
         * since it is extremely unusual for an application to use threads
         * that deallocate but *never* allocate, and because even if seeding
         * never occurs for multiple threads, they will tend to drift apart
         * unless some aspect of the application forces deallocation
         * synchronization.
         */
        SPRN(lazy_free, (uint32_t)(uintptr_t)(_pthread_self()));

        /* Assign one of the arenas to this thread, in a round-robin fashion. */
        malloc_mutex_lock(&arenas_mtx);
        ret = arenas[next_arena];
@@ -2577,6 +2666,80 @@ arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr,
#endif
}

#ifndef NO_TLS
static inline void
arena_dalloc_lazy(arena_t *arena, arena_chunk_t *chunk, void *ptr,
    unsigned pageind, arena_chunk_map_t *mapelm)
{
        void **free_cache = arena->free_cache;
        unsigned i, slot;

        if (!__isthreaded || opt_lazy_free_2pow < 0) {
                malloc_mutex_lock(&arena->mtx);
                arena_dalloc_small(arena, chunk, ptr, pageind, mapelm);
                malloc_mutex_unlock(&arena->mtx);
                return;
        }

        for (i = 0; i < LAZY_FREE_NPROBES; i++) {
                slot = PRN(lazy_free, opt_lazy_free_2pow);
                if (atomic_cmpset_ptr((uintptr_t *)&free_cache[slot],
                    (uintptr_t)NULL, (uintptr_t)ptr)) {
                        return;
                }
        }

        malloc_mutex_lock(&arena->mtx);
        arena_dalloc_small(arena, chunk, ptr, pageind, mapelm);

        /*
         * Check whether another thread already cleared the cache.  It is
         * possible that another thread cleared the cache *and* this slot was
         * already refilled, which could result in a mostly fruitless cache
         * sweep, but such a sequence of events causes no correctness issues.
         */
        if ((ptr = (void *)atomic_readandclear_ptr(
            (uintptr_t *)&free_cache[slot]))
            != NULL) {
                unsigned lazy_free_mask;

                /*
                 * Clear the cache, since we failed to find a slot.  It is
                 * possible that other threads will continue to insert objects
                 * into the cache while this one sweeps, but that is okay,
                 * since on average the cache is still swept with the same
                 * frequency.
                 */

                /* Handle pointer at current slot. */
                chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
                pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >>
                    pagesize_2pow);
                mapelm = &chunk->map[pageind];
                arena_dalloc_small(arena, chunk, ptr, pageind, mapelm);

                /* Sweep remainder of slots. */
                lazy_free_mask = (1U << opt_lazy_free_2pow) - 1;
                for (i = (slot + 1) & lazy_free_mask;
                     i != slot;
                     i = (i + 1) & lazy_free_mask) {
                        ptr = (void *)atomic_readandclear_ptr(
                            (uintptr_t *)&free_cache[i]);
                        if (ptr != NULL) {
                                chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
                                pageind = (((uintptr_t)ptr - (uintptr_t)chunk)
                                    >> pagesize_2pow);
                                mapelm = &chunk->map[pageind];
                                arena_dalloc_small(arena, chunk, ptr, pageind,
                                    mapelm);
                        }
                }
        }

        malloc_mutex_unlock(&arena->mtx);
}
#endif

static void
arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
{
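One detail worth noting in the sweep loop above: because the cache size is a power of two, stepping with (i + 1) & lazy_free_mask from the slot after the failed probe visits every slot exactly once before returning to the starting slot. A tiny standalone illustration (the names NSLOTS_2POW and slot are made up, not from malloc.c):

#include <stdio.h>

#define NSLOTS_2POW 3
#define NSLOTS      (1U << NSLOTS_2POW)

int
main(void)
{
        unsigned mask = NSLOTS - 1;
        unsigned slot = 5;      /* stands in for the slot chosen by PRN() */
        unsigned i;

        printf("%u", slot);
        for (i = (slot + 1) & mask; i != slot; i = (i + 1) & mask)
                printf(" %u", i);
        printf("\n");           /* prints: 5 6 7 0 1 2 3 4 */
        return (0);
}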
@@ -2594,9 +2757,13 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
        if (mapelm->pos != 0 || ptr != (void *)((uintptr_t)chunk) + (pageind <<
            pagesize_2pow)) {
                /* Small allocation. */
#ifndef NO_TLS
                arena_dalloc_lazy(arena, chunk, ptr, pageind, mapelm);
#else
                malloc_mutex_lock(&arena->mtx);
                arena_dalloc_small(arena, chunk, ptr, pageind, mapelm);
                malloc_mutex_unlock(&arena->mtx);
#endif
        } else {
                size_t size;

@@ -2635,6 +2802,18 @@ arena_new(arena)
        RB_INIT(&arena->chunks);
        arena->spare = NULL;

#ifndef NO_TLS
        if (opt_lazy_free_2pow >= 0) {
                arena->free_cache = (void **) base_alloc(sizeof(void *)
                    * (1U << opt_lazy_free_2pow));
                if (arena->free_cache == NULL)
                        return (true);
                memset(arena->free_cache, 0, sizeof(void *)
                    * (1U << opt_lazy_free_2pow));
        } else
                arena->free_cache = NULL;
#endif

        /* Initialize bins. */
        prev_run_size = pagesize;

@@ -3147,6 +3326,13 @@ malloc_print_stats(void)

        _malloc_message("CPUs: ", umax2s(ncpus, s), "\n", "");
        _malloc_message("Max arenas: ", umax2s(narenas, s), "\n", "");
#ifndef NO_TLS
        if (opt_lazy_free_2pow >= 0) {
                _malloc_message("Lazy free slots: ",
                    umax2s(1U << opt_lazy_free_2pow, s), "\n", "");
        } else
                _malloc_message("Lazy free slots: 0\n", "", "", "");
#endif
        _malloc_message("Pointer size: ", umax2s(sizeof(void *), s),
            "\n", "");
        _malloc_message("Quantum size: ", umax2s(quantum, s), "\n", "");
@@ -3275,6 +3461,11 @@ malloc_init_hard(void)
                }
        }

#ifndef NO_TLS
        if (ncpus == 1)
                opt_lazy_free_2pow = -1;
#endif

        /* Get page size. */
        {
                long result;
@@ -3381,6 +3572,18 @@ malloc_init_hard(void)
                                    < (sizeof(uint32_t) << 3) - 1)
                                        opt_chunk_2pow++;
                                break;
                        case 'l':
#ifndef NO_TLS
                                if (opt_lazy_free_2pow >= 0)
                                        opt_lazy_free_2pow--;
#endif
                                break;
                        case 'L':
#ifndef NO_TLS
                                if (ncpus > 1)
                                        opt_lazy_free_2pow++;
#endif
                                break;
                        case 'n':
                                opt_narenas_lshift--;
                                break;
@@ -3489,6 +3692,14 @@ malloc_init_hard(void)
        }
        arena_maxclass = chunksize - (arena_chunk_header_npages <<
            pagesize_2pow);
#ifndef NO_TLS
        /*
         * Make sure that allocating the free_cache does not exceed the limits
         * of what base_alloc() can handle.
         */
        while ((sizeof(void *) << opt_lazy_free_2pow) > chunksize)
                opt_lazy_free_2pow--;
#endif

        UTRACE(0, 0, 0);

@@ -3612,6 +3823,13 @@ malloc_init_hard(void)
                malloc_mutex_unlock(&init_lock);
                return (true);
        }
#ifndef NO_TLS
        /*
         * Seed here for the initial thread, since choose_arena_hard() is only
         * called for other threads.  The seed values don't really matter.
         */
        SPRN(lazy_free, 42);
#endif

        malloc_mutex_init(&arenas_mtx);
