Implement lazy deallocation of small objects. For each arena, maintain a
vector of slots for lazily freed objects. For each deallocation, before
doing the hard work of locking the arena and deallocating, try several times
to randomly insert the object into the vector using atomic operations.

This approach is particularly effective at reducing contention for
multi-threaded applications that use the producer-consumer model, wherein
one producer thread allocates objects, then multiple consumer threads
deallocate those objects.
Jason Evans 2007-11-27 03:13:15 +00:00
parent bcd3523138
commit 26b5e3a18e
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=173966
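
Editor's sketch (not part of the commit): a minimal, self-contained model of the
lazy-free scheme described above, for readers skimming the diff. It uses C11
<stdatomic.h> in place of FreeBSD's atomic_cmpset_ptr()/atomic_readandclear_ptr(),
a plain free() stands in for the locked arena_dalloc_small() path, and all names
(lazy_free, slow_path_free, CACHE_2POW, NPROBES) are illustrative, not taken from
the tree.

#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define CACHE_2POW	8		/* 256 slots, like LAZY_FREE_2POW_DEFAULT */
#define CACHE_SLOTS	(1U << CACHE_2POW)
#define NPROBES		5		/* like LAZY_FREE_NPROBES */

static _Atomic(void *) free_cache[CACHE_SLOTS];	/* NULL means the slot is empty */
static _Thread_local uint32_t prng_state = 42;	/* the commit seeds from _pthread_self() */

/* Same LCG shape as the commit; keep the high bits, which have long periods. */
static uint32_t
prng(uint32_t lg_range)
{
	prng_state = prng_state * 12345 + 12347;
	return (prng_state >> (32 - lg_range));
}

/* Stand-in for "lock the arena and call arena_dalloc_small()". */
static void
slow_path_free(void *ptr)
{
	free(ptr);
}

/* Lazy free: try to park ptr in a random empty slot, else sweep the cache. */
static void
lazy_free(void *ptr)
{
	void *expected;
	unsigned i;

	for (i = 0; i < NPROBES; i++) {
		expected = NULL;
		if (atomic_compare_exchange_strong(
		    &free_cache[prng(CACHE_2POW)], &expected, ptr))
			return;		/* Parked without taking a lock. */
	}

	/* Every probe hit an occupied slot: do the real free, then sweep. */
	slow_path_free(ptr);
	for (i = 0; i < CACHE_SLOTS; i++) {
		void *p = atomic_exchange(&free_cache[i], NULL);
		if (p != NULL)
			slow_path_free(p);
	}
}

int
main(void)
{
	int i;

	for (i = 0; i < 100000; i++)
		lazy_free(malloc(32));
	/* Objects still parked in the cache at exit are reclaimed by the OS. */
	printf("done\n");
	return (0);
}

The committed code below differs in two ways worth noting: it only sweeps when
the slot it last probed is still occupied (another thread may already have swept
the cache), and the sweep runs under the arena lock it had to take anyway.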

@@ -259,6 +259,17 @@ __FBSDID("$FreeBSD$");
#define RUN_MAX_SMALL_2POW 15
#define RUN_MAX_SMALL (1U << RUN_MAX_SMALL_2POW)
/* Default size of each arena's lazy free cache. */
#define LAZY_FREE_2POW_DEFAULT 8
/*
* Number of pseudo-random probes to conduct before considering the cache to be
* overly full. It takes on average n probes to detect fullness of (n-1)/n.
* However, we are effectively doing multiple non-independent trials (each
* deallocation is a trial), so the actual average threshold for clearing the
* cache is somewhat lower.
*/
#define LAZY_FREE_NPROBES 5
/******************************************************************************/
/*
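
Editor's note on LAZY_FREE_NPROBES: treating each probe as an independent,
uniform pick among the 2^opt_lazy_free_2pow slots, a deallocation falls through
to the locked path (and sweeps the cache) only when every probe lands on an
occupied slot:

    P(all n probes occupied)     = f^n              (f = fraction of slots in use)
    E(probes to find empty slot) = 1/(1 - f) = n    when f = (n - 1)/n

So with NPROBES == 5 the sweep fires, on average, once the cache is about 4/5
full; at exactly f = 4/5 a single deallocation sweeps with probability
(4/5)^5 ~= 0.33. As the comment above says, successive deallocations are not
truly independent trials, so in practice the cache is cleared somewhat earlier.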
@@ -529,6 +540,16 @@ struct arena_s {
*/
arena_chunk_t *spare;
#ifndef NO_TLS
/*
* Deallocation of small objects can be lazy, in which case free_cache
* stores pointers to those objects that have not yet been deallocated.
* In order to avoid lock contention, slots are chosen randomly. Empty
* slots contain NULL.
*/
void **free_cache;
#endif
/*
* bins is used to store rings of free regions of the following sizes,
* assuming a 16-byte quantum, 4kB pagesize, and default MALLOC_OPTIONS.
@@ -691,6 +712,9 @@ static bool opt_abort = false;
static bool opt_junk = false;
#endif
static bool opt_hint = false;
#ifndef NO_TLS
static int opt_lazy_free_2pow = LAZY_FREE_2POW_DEFAULT;
#endif
static bool opt_print_stats = false;
static size_t opt_quantum_2pow = QUANTUM_2POW_MIN;
static size_t opt_small_max_2pow = SMALL_MAX_2POW_DEFAULT;
@@ -851,6 +875,59 @@ pow2_ceil(size_t x)
return (x);
}
#ifndef NO_TLS
/*
* Use a simple linear congruential pseudo-random number generator:
*
* prn(x) = (a*x + c) % m
*
* where the following constants ensure maximal period:
*
* a == Odd number (relatively prime to 2^n), and (a-1) is a multiple of 4.
* c == Odd number (relatively prime to 2^n).
* m == 2^32
*
* See Knuth's TAOCP 3rd Ed., Vol. 2, pg. 17 for details on these constraints.
*
* This choice of m has the disadvantage that the quality of the bits is
* proportional to bit position. For example, the lowest bit has a cycle of 2,
* the next has a cycle of 4, etc. For this reason, we prefer to use the upper
* bits.
*/
#define PRN_DEFINE(suffix, var, a, c) \
static inline void \
sprn_##suffix(uint32_t seed) \
{ \
var = seed; \
} \
\
static inline uint32_t \
prn_##suffix(uint32_t lg_range) \
{ \
uint32_t ret, x; \
\
assert(lg_range > 0); \
assert(lg_range <= 32); \
\
x = (var * (a)) + (c); \
var = x; \
ret = x >> (32 - lg_range); \
\
return (ret); \
}
#define SPRN(suffix, seed) sprn_##suffix(seed)
#define PRN(suffix, lg_range) prn_##suffix(lg_range)
/*
* Define PRNGs, one for each purpose, in order to avoid auto-correlation
* problems.
*/
/* Define the per-thread PRNG used for lazy deallocation. */
static __thread uint32_t lazy_free_x;
PRN_DEFINE(lazy_free, lazy_free_x, 12345, 12347)
#endif
static void
wrtmessage(const char *p1, const char *p2, const char *p3, const char *p4)
{
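
Editor's sketch: a standalone way to see why prn_##suffix() keeps the upper
bits. The program below (illustrative only, not part of the diff) runs the same
recurrence with the lazy_free constants a == 12345, c == 12347 and prints the
lowest bit next to the top eight bits; the lowest bit strictly alternates, while
the top bits show no such short cycle.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t x = 42;	/* arbitrary seed */
	int i;

	for (i = 0; i < 8; i++) {
		x = x * 12345 + 12347;	/* same LCG, m == 2^32 */
		printf("low bit = %u   top 8 bits = %3u\n",
		    (unsigned)(x & 1), (unsigned)(x >> 24));
	}
	return (0);
}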
@@ -1524,6 +1601,18 @@ choose_arena_hard(void)
assert(__isthreaded);
/*
* Seed the PRNG used for lazy deallocation. Since seeding only occurs
* on the first allocation by a thread, it is possible for a thread to
* deallocate before seeding. This is not a critical issue though,
* since it is extremely unusual for an application to use threads
* that deallocate but *never* allocate, and because even if seeding
* never occurs for multiple threads, they will tend to drift apart
* unless some aspect of the application forces deallocation
* synchronization.
*/
SPRN(lazy_free, (uint32_t)(uintptr_t)(_pthread_self()));
/* Assign one of the arenas to this thread, in a round-robin fashion. */
malloc_mutex_lock(&arenas_mtx);
ret = arenas[next_arena];
@@ -2577,6 +2666,80 @@ arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr,
#endif
}
#ifndef NO_TLS
static inline void
arena_dalloc_lazy(arena_t *arena, arena_chunk_t *chunk, void *ptr,
    unsigned pageind, arena_chunk_map_t *mapelm)
{
	void **free_cache = arena->free_cache;
	unsigned i, slot;

	if (!__isthreaded || opt_lazy_free_2pow < 0) {
		malloc_mutex_lock(&arena->mtx);
		arena_dalloc_small(arena, chunk, ptr, pageind, mapelm);
		malloc_mutex_unlock(&arena->mtx);
		return;
	}

	for (i = 0; i < LAZY_FREE_NPROBES; i++) {
		slot = PRN(lazy_free, opt_lazy_free_2pow);
		if (atomic_cmpset_ptr((uintptr_t *)&free_cache[slot],
		    (uintptr_t)NULL, (uintptr_t)ptr)) {
			return;
		}
	}

	malloc_mutex_lock(&arena->mtx);
	arena_dalloc_small(arena, chunk, ptr, pageind, mapelm);

	/*
	 * Check whether another thread already cleared the cache. It is
	 * possible that another thread cleared the cache *and* this slot was
	 * already refilled, which could result in a mostly fruitless cache
	 * sweep, but such a sequence of events causes no correctness issues.
	 */
	if ((ptr = (void *)atomic_readandclear_ptr(
	    (uintptr_t *)&free_cache[slot])) != NULL) {
		unsigned lazy_free_mask;

		/*
		 * Clear the cache, since we failed to find a slot. It is
		 * possible that other threads will continue to insert objects
		 * into the cache while this one sweeps, but that is okay,
		 * since on average the cache is still swept with the same
		 * frequency.
		 */

		/* Handle pointer at current slot. */
		chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
		pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >>
		    pagesize_2pow);
		mapelm = &chunk->map[pageind];
		arena_dalloc_small(arena, chunk, ptr, pageind, mapelm);

		/* Sweep remainder of slots. */
		lazy_free_mask = (1U << opt_lazy_free_2pow) - 1;
		for (i = (slot + 1) & lazy_free_mask; i != slot;
		    i = (i + 1) & lazy_free_mask) {
			ptr = (void *)atomic_readandclear_ptr(
			    (uintptr_t *)&free_cache[i]);
			if (ptr != NULL) {
				chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
				pageind = (((uintptr_t)ptr - (uintptr_t)chunk)
				    >> pagesize_2pow);
				mapelm = &chunk->map[pageind];
				arena_dalloc_small(arena, chunk, ptr, pageind,
				    mapelm);
			}
		}
	}
	malloc_mutex_unlock(&arena->mtx);
}
#endif
static void
arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
{
@@ -2594,9 +2757,13 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
if (mapelm->pos != 0 || ptr != (void *)((uintptr_t)chunk) + (pageind <<
pagesize_2pow)) {
/* Small allocation. */
#ifndef NO_TLS
arena_dalloc_lazy(arena, chunk, ptr, pageind, mapelm);
#else
malloc_mutex_lock(&arena->mtx);
arena_dalloc_small(arena, chunk, ptr, pageind, mapelm);
malloc_mutex_unlock(&arena->mtx);
#endif
} else {
size_t size;
@@ -2635,6 +2802,18 @@ arena_new(arena_t *arena)
RB_INIT(&arena->chunks);
arena->spare = NULL;
#ifndef NO_TLS
if (opt_lazy_free_2pow >= 0) {
arena->free_cache = (void **) base_alloc(sizeof(void *)
* (1U << opt_lazy_free_2pow));
if (arena->free_cache == NULL)
return (true);
memset(arena->free_cache, 0, sizeof(void *)
* (1U << opt_lazy_free_2pow));
} else
arena->free_cache = NULL;
#endif
/* Initialize bins. */
prev_run_size = pagesize;
@@ -3147,6 +3326,13 @@ malloc_print_stats(void)
_malloc_message("CPUs: ", umax2s(ncpus, s), "\n", "");
_malloc_message("Max arenas: ", umax2s(narenas, s), "\n", "");
#ifndef NO_TLS
if (opt_lazy_free_2pow >= 0) {
_malloc_message("Lazy free slots: ",
umax2s(1U << opt_lazy_free_2pow, s), "\n", "");
} else
_malloc_message("Lazy free slots: 0\n", "", "", "");
#endif
_malloc_message("Pointer size: ", umax2s(sizeof(void *), s),
"\n", "");
_malloc_message("Quantum size: ", umax2s(quantum, s), "\n", "");
@@ -3275,6 +3461,11 @@ malloc_init_hard(void)
}
}
#ifndef NO_TLS
if (ncpus == 1)
opt_lazy_free_2pow = -1;
#endif
/* Get page size. */
{
long result;
@@ -3381,6 +3572,18 @@ malloc_init_hard(void)
< (sizeof(uint32_t) << 3) - 1)
opt_chunk_2pow++;
break;
case 'l':
#ifndef NO_TLS
if (opt_lazy_free_2pow >= 0)
opt_lazy_free_2pow--;
#endif
break;
case 'L':
#ifndef NO_TLS
if (ncpus > 1)
opt_lazy_free_2pow++;
#endif
break;
case 'n':
opt_narenas_lshift--;
break;
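
Editor's note on the new 'l'/'L' option letters: assuming the usual FreeBSD
malloc option plumbing (the MALLOC_OPTIONS environment variable, the
/etc/malloc.conf symlink, and the _malloc_options global), each 'L' doubles the
lazy-free cache and each 'l' halves it, starting from
LAZY_FREE_2POW_DEFAULT == 8. For example:

    MALLOC_OPTIONS=LL          ->  opt_lazy_free_2pow == 10  (1024 slots)
    MALLOC_OPTIONS=lll         ->  opt_lazy_free_2pow == 5   (32 slots)
    MALLOC_OPTIONS=lllllllll   ->  opt_lazy_free_2pow == -1  (lazy free disabled)

'L' is ignored on single-CPU systems, where lazy freeing is disabled up front by
the ncpus == 1 check earlier in malloc_init_hard().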
@@ -3489,6 +3692,14 @@ malloc_init_hard(void)
}
arena_maxclass = chunksize - (arena_chunk_header_npages <<
pagesize_2pow);
#ifndef NO_TLS
/*
* Make sure that allocating the free_cache does not exceed the limits
* of what base_alloc() can handle.
*/
while ((sizeof(void *) << opt_lazy_free_2pow) > chunksize)
opt_lazy_free_2pow--;
#endif
UTRACE(0, 0, 0);
@@ -3612,6 +3823,13 @@ malloc_init_hard(void)
malloc_mutex_unlock(&init_lock);
return (true);
}
#ifndef NO_TLS
/*
* Seed here for the initial thread, since choose_arena_hard() is only
* called for other threads. The seed values don't really matter.
*/
SPRN(lazy_free, 42);
#endif
malloc_mutex_init(&arenas_mtx);