Update jemalloc to version 5.1.0.

Jason Evans 2018-05-11 00:32:31 +00:00
parent 20f8d7bc7e
commit 0ef50b4ec8
77 changed files with 4398 additions and 2103 deletions

View File

@ -1,10 +1,10 @@
Unless otherwise specified, files in the jemalloc source distribution are
subject to the following license:
--------------------------------------------------------------------------------
Copyright (C) 2002-2017 Jason Evans <jasone@canonware.com>.
Copyright (C) 2002-2018 Jason Evans <jasone@canonware.com>.
All rights reserved.
Copyright (C) 2007-2012 Mozilla Foundation. All rights reserved.
Copyright (C) 2009-2017 Facebook, Inc. All rights reserved.
Copyright (C) 2009-2018 Facebook, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

View File

@ -4,6 +4,123 @@ brevity. Much more detail can be found in the git revision history:
https://github.com/jemalloc/jemalloc
* 5.1.0 (May 4th, 2018)
This release is primarily about fine-tuning, ranging from several new features
to numerous notable performance and portability enhancements. The release and
prior dev versions have been running in multiple large scale applications for
months, and the cumulative improvements are substantial in many cases.
Given the long and successful production runs, this release is likely a good
candidate for applications to upgrade, from both jemalloc 5.0 and before. For
performance-critical applications, the newly added TUNING.md provides
guidelines on jemalloc tuning.
New features:
- Implement transparent huge page support for internal metadata. (@interwq)
- Add opt.thp to allow enabling / disabling transparent huge pages for all
mappings. (@interwq)
- Add maximum background thread count option. (@djwatson)
- Allow prof_active to control opt.lg_prof_interval and prof.gdump.
(@interwq)
- Allow arena index lookup based on allocation addresses via mallctl.
(@lionkov)
- Allow disabling initial-exec TLS model. (@davidtgoldblatt, @KenMacD)
- Add opt.lg_extent_max_active_fit to set the max ratio between the size of
the active extent selected (to split off from) and the size of the requested
allocation. (@interwq, @davidtgoldblatt)
- Add retain_grow_limit to set the max size when growing virtual address
space. (@interwq)
- Add mallctl interfaces:
+ arena.<i>.retain_grow_limit (@interwq)
+ arenas.lookup (@lionkov)
+ max_background_threads (@djwatson)
+ opt.lg_extent_max_active_fit (@interwq)
+ opt.max_background_threads (@djwatson)
+ opt.metadata_thp (@interwq)
+ opt.thp (@interwq)
+ stats.metadata_thp (@interwq)
Portability improvements:
- Support GNU/kFreeBSD configuration. (@paravoid)
- Support m68k, nios2 and SH3 architectures. (@paravoid)
- Fall back to FD_CLOEXEC when O_CLOEXEC is unavailable. (@zonyitoo)
- Fix symbol listing for cross-compiling. (@tamird)
- Fix high bits computation on ARM. (@davidtgoldblatt, @paravoid)
- Disable the CPU_SPINWAIT macro for Power. (@davidtgoldblatt, @marxin)
- Fix MSVC 2015 & 2017 builds. (@rustyx)
- Improve RISC-V support. (@EdSchouten)
- Set name mangling script in strict mode. (@nicolov)
- Avoid MADV_HUGEPAGE on ARM. (@marxin)
- Modify configure to determine return value of strerror_r.
(@davidtgoldblatt, @cferris1000)
- Make sure CXXFLAGS is tested with CPP compiler. (@nehaljwani)
- Fix 32-bit build on MSVC. (@rustyx)
- Fix external symbol on MSVC. (@maksqwe)
- Avoid a printf format specifier warning. (@jasone)
- Add configure option --disable-initial-exec-tls which can allow jemalloc to
be dynamically loaded after program startup. (@davidtgoldblatt, @KenMacD)
- AArch64: Add ILP32 support. (@cmuellner)
- Add --with-lg-vaddr configure option to support cross compiling.
(@cmuellner, @davidtgoldblatt)
Optimizations and refactors:
- Improve active extent fit with extent_max_active_fit. This considerably
reduces fragmentation over time and improves virtual memory and metadata
usage. (@davidtgoldblatt, @interwq)
- Eagerly coalesce large extents to reduce fragmentation. (@interwq)
- sdallocx: only read size info when page aligned (i.e. possibly sampled),
which speeds up the sized deallocation path significantly. (@interwq)
- Avoid attempting new mappings for in place expansion with retain, since
it rarely succeeds in practice and causes high overhead. (@interwq)
- Refactor OOM handling in newImpl. (@wqfish)
- Add internal fine-grained logging functionality for debugging use.
(@davidtgoldblatt)
- Refactor arena / tcache interactions. (@davidtgoldblatt)
- Refactor extent management with dumpable flag. (@davidtgoldblatt)
- Add runtime detection of lazy purging. (@interwq)
- Use pairing heap instead of red-black tree for extents_avail. (@djwatson)
- Use sysctl on startup in FreeBSD. (@trasz)
- Use thread local prng state instead of atomic. (@djwatson)
- Make decay always purge one more extent than before, because in
practice large extents are usually the ones that cross the decay threshold.
Purging the additional extent helps save memory as well as reduce VM
fragmentation. (@interwq)
- Fast division by dynamic values. (@davidtgoldblatt)
- Improve the fit for aligned allocation. (@interwq, @edwinsmith)
- Refactor extent_t bitpacking. (@rkmisra)
- Optimize the generated assembly for ticker operations. (@davidtgoldblatt)
- Convert stats printing to use a structured text emitter. (@davidtgoldblatt)
- Remove preserve_lru feature for extents management. (@djwatson)
- Consolidate two memory loads into one on the fast deallocation path.
(@davidtgoldblatt, @interwq)
Bug fixes (most of the issues are only relevant to jemalloc 5.0):
- Fix deadlock with multithreaded fork in OS X. (@davidtgoldblatt)
- Validate returned file descriptor before use. (@zonyitoo)
- Fix a few background thread initialization and shutdown issues. (@interwq)
- Fix an extent coalesce + decay race by taking both coalescing extents off
the LRU list. (@interwq)
- Fix a potentially unbounded increase during decay, caused by one thread
repeatedly stashing memory to purge while other threads generate new pages.
The number of pages to purge is now checked to prevent this. (@interwq)
- Fix a FreeBSD bootstrap assertion. (@strejda, @interwq)
- Handle 32 bit mutex counters. (@rkmisra)
- Fix an indexing bug when creating background threads. (@davidtgoldblatt,
@binliu19)
- Fix arguments passed to extent_init. (@yuleniwo, @interwq)
- Fix addresses used for ordering mutexes. (@rkmisra)
- Fix abort_conf processing during bootstrap. (@interwq)
- Fix include path order for out-of-tree builds. (@cmuellner)
Incompatible changes:
- Remove --disable-thp. (@interwq)
- Remove mallctl interfaces:
+ config.thp (@interwq)
Documentation:
- Add TUNING.md. (@interwq, @davidtgoldblatt, @djwatson)
* 5.0.1 (July 1, 2017)
This bugfix release fixes several issues, most of which are obscure enough
@ -515,7 +632,7 @@ brevity. Much more detail can be found in the git revision history:
these fixes, xallocx() now tries harder to partially fulfill requests for
optional extra space. Note that a couple of minor heap profiling
optimizations are included, but these are better thought of as performance
fixes that were integral to disovering most of the other bugs.
fixes that were integral to discovering most of the other bugs.
Optimizations:
- Avoid a chunk metadata read in arena_prof_tctx_set(), since it is in the

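The mallctl interfaces added above are reached through jemalloc's long-standing mallctl() entry point. A minimal sketch, not part of the upstream change, exercising two of the new controls (on FreeBSD, mallctl() is declared in <malloc_np.h>):

#include <stdio.h>
#include <stdlib.h>
#include <malloc_np.h>	/* mallctl() on FreeBSD */

int
main(void) {
	void *p = malloc(64);

	/* arenas.lookup: index of the arena an allocation belongs to. */
	unsigned arena_ind;
	size_t sz = sizeof(arena_ind);
	if (mallctl("arenas.lookup", &arena_ind, &sz, &p, sizeof(p)) == 0)
		printf("allocation belongs to arena %u\n", arena_ind);

	/* max_background_threads: cap the number of background workers. */
	size_t max_bg = 2;
	mallctl("max_background_threads", NULL, NULL, &max_bg, sizeof(max_bg));

	free(p);
	return (0);
}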
View File

@ -7,6 +7,7 @@ FREEBSD-*
INSTALL.md
Makefile*
README
TUNING.md
autogen.sh
autom4te.cache/
bin/

View File

@ -1,5 +1,5 @@
diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in
index 21e401ac..c26f9f4a 100644
index 1e12fd3a..c42a7e10 100644
--- a/doc/jemalloc.xml.in
+++ b/doc/jemalloc.xml.in
@@ -53,11 +53,22 @@
@ -26,7 +26,7 @@ index 21e401ac..c26f9f4a 100644
<refsect2>
<title>Standard API</title>
<funcprototype>
@@ -3252,4 +3263,18 @@ malloc_conf = "narenas:1";]]></programlisting></para>
@@ -3376,4 +3387,18 @@ malloc_conf = "narenas:1";]]></programlisting></para>
<para>The <function>posix_memalign()</function> function conforms
to IEEE Std 1003.1-2001 (<quote>POSIX.1</quote>).</para>
</refsect1>
@ -64,7 +64,7 @@ index cd49afcb..85e2a991 100644
#define _Unwind_Backtrace JEMALLOC_HOOK(_Unwind_Backtrace, hooks_libc_hook)
diff --git a/include/jemalloc/internal/jemalloc_internal_decls.h b/include/jemalloc/internal/jemalloc_internal_decls.h
index 1efdb56b..12a7e5a8 100644
index be70df51..84cd70da 100644
--- a/include/jemalloc/internal/jemalloc_internal_decls.h
+++ b/include/jemalloc/internal/jemalloc_internal_decls.h
@@ -1,6 +1,9 @@
@ -78,7 +78,7 @@ index 1efdb56b..12a7e5a8 100644
#ifdef _WIN32
# include <windows.h>
diff --git a/include/jemalloc/internal/jemalloc_preamble.h.in b/include/jemalloc/internal/jemalloc_preamble.h.in
index 18539a09..c8af8683 100644
index e621fbc8..dbdd5d6b 100644
--- a/include/jemalloc/internal/jemalloc_preamble.h.in
+++ b/include/jemalloc/internal/jemalloc_preamble.h.in
@@ -8,6 +8,9 @@
@ -91,7 +91,7 @@ index 18539a09..c8af8683 100644
#define JEMALLOC_NO_DEMANGLE
#ifdef JEMALLOC_JET
# undef JEMALLOC_IS_MALLOC
@@ -68,13 +71,7 @@ static const bool config_fill =
@@ -79,13 +82,7 @@ static const bool config_fill =
false
#endif
;
@ -128,9 +128,23 @@ index 6520c251..0013cbe9 100644
bool malloc_mutex_boot(void);
void malloc_mutex_prof_data_reset(tsdn_t *tsdn, malloc_mutex_t *mutex);
diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h
index 0b9841aa..f03eee61 100644
--- a/include/jemalloc/internal/tsd.h
+++ b/include/jemalloc/internal/tsd.h
@@ -122,7 +122,8 @@ struct tsd_s {
t use_a_getter_or_setter_instead_##n;
MALLOC_TSD
#undef O
-};
+/* AddressSanitizer requires TLS data to be aligned to at least 8 bytes. */
+} JEMALLOC_ALIGNED(16);
/*
* Wrapper around tsd_t that makes it possible to avoid implicit conversion
diff --git a/include/jemalloc/jemalloc_FreeBSD.h b/include/jemalloc/jemalloc_FreeBSD.h
new file mode 100644
index 00000000..355b565c
index 00000000..b752b0e7
--- /dev/null
+++ b/include/jemalloc/jemalloc_FreeBSD.h
@@ -0,0 +1,185 @@
@ -203,7 +217,7 @@ index 00000000..355b565c
+# define LG_VADDR 32
+# define LG_SIZEOF_PTR 2
+#endif
+#ifdef __riscv__
+#ifdef __riscv
+# define LG_VADDR 64
+# define LG_SIZEOF_PTR 3
+#endif
@ -331,10 +345,10 @@ index f9438912..47d032c1 100755
+#include "jemalloc_FreeBSD.h"
EOF
diff --git a/src/jemalloc.c b/src/jemalloc.c
index 52c86aa6..868c9e86 100644
index f93c16fa..e0ad297b 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -20,6 +20,10 @@
@@ -21,6 +21,10 @@
/******************************************************************************/
/* Data. */
@ -345,7 +359,7 @@ index 52c86aa6..868c9e86 100644
/* Runtime configuration options. */
const char *je_malloc_conf
#ifndef _WIN32
@@ -2981,6 +2985,103 @@ je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) {
@@ -3160,6 +3164,103 @@ je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) {
*/
/******************************************************************************/
/*
@ -449,7 +463,7 @@ index 52c86aa6..868c9e86 100644
* The following functions are used by threading libraries for protection of
* malloc during fork().
*/
@@ -3141,4 +3242,11 @@ jemalloc_postfork_child(void) {
@@ -3323,4 +3424,11 @@ jemalloc_postfork_child(void) {
ctl_postfork_child(tsd_tsdn(tsd));
}
@ -462,10 +476,10 @@ index 52c86aa6..868c9e86 100644
+
/******************************************************************************/
diff --git a/src/malloc_io.c b/src/malloc_io.c
index 6b99afcd..4363cb83 100644
index 7bdc13f9..c8802c70 100644
--- a/src/malloc_io.c
+++ b/src/malloc_io.c
@@ -88,6 +88,20 @@ wrtmessage(void *cbopaque, const char *s) {
@@ -75,6 +75,20 @@ wrtmessage(void *cbopaque, const char *s) {
JEMALLOC_EXPORT void (*je_malloc_message)(void *, const char *s);
@ -487,10 +501,10 @@ index 6b99afcd..4363cb83 100644
* Wrapper around malloc_message() that avoids the need for
* je_malloc_message(...) throughout the code.
diff --git a/src/mutex.c b/src/mutex.c
index a528ef0c..820af613 100644
index 30222b3e..b2c36283 100644
--- a/src/mutex.c
+++ b/src/mutex.c
@@ -40,6 +40,17 @@ pthread_create(pthread_t *__restrict thread,
@@ -41,6 +41,17 @@ pthread_create(pthread_t *__restrict thread,
#ifdef JEMALLOC_MUTEX_INIT_CB
JEMALLOC_EXPORT int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex,
void *(calloc_cb)(size_t, size_t));
@ -508,7 +522,7 @@ index a528ef0c..820af613 100644
#endif
void
@@ -130,6 +141,16 @@ mutex_addr_comp(const witness_t *witness1, void *mutex1,
@@ -131,6 +142,16 @@ mutex_addr_comp(const witness_t *witness1, void *mutex1,
}
bool

View File

@ -1 +1 @@
5.0.1-0-g896ed3a8b3f41998d4fb4d625d30ac63ef2d51fb
5.1.0-0-g61efbda7098de6fe64c362d309824864308c36d4

View File

@ -2,12 +2,12 @@
.\" Title: JEMALLOC
.\" Author: Jason Evans
.\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
.\" Date: 07/01/2017
.\" Date: 05/08/2018
.\" Manual: User Manual
.\" Source: jemalloc 5.0.1-0-g896ed3a8b3f41998d4fb4d625d30ac63ef2d51fb
.\" Source: jemalloc 5.1.0-0-g61efbda7098de6fe64c362d309824864308c36d4
.\" Language: English
.\"
.TH "JEMALLOC" "3" "07/01/2017" "jemalloc 5.0.1-0-g896ed3a8b3f4" "User Manual"
.TH "JEMALLOC" "3" "05/08/2018" "jemalloc 5.1.0-0-g61efbda7098d" "User Manual"
.\" -----------------------------------------------------------------
.\" * Define some portability stuff
.\" -----------------------------------------------------------------
@ -31,7 +31,7 @@
jemalloc \- general purpose memory allocation functions
.SH "LIBRARY"
.PP
This manual describes jemalloc 5\&.0\&.1\-0\-g896ed3a8b3f41998d4fb4d625d30ac63ef2d51fb\&. More information can be found at the
This manual describes jemalloc 5\&.1\&.0\-0\-g61efbda7098de6fe64c362d309824864308c36d4\&. More information can be found at the
\m[blue]\fBjemalloc website\fR\m[]\&\s-2\u[1]\d\s+2\&.
.PP
The following configuration options are enabled in libc\*(Aqs built\-in jemalloc:
@ -741,6 +741,13 @@ opt\&.background_thread
can be used to set the default option\&. This option is only available on selected pthread\-based platforms\&.
.RE
.PP
max_background_threads (\fBsize_t\fR) rw
.RS 4
Maximum number of background worker threads that will be created\&. This value is capped at
opt\&.max_background_threads
at startup\&.
.RE
.PP
config\&.cache_oblivious (\fBbool\fR) r\-
.RS 4
\fB\-\-enable\-cache\-oblivious\fR
@ -796,12 +803,6 @@ config\&.stats (\fBbool\fR) r\-
was specified during build configuration\&.
.RE
.PP
config\&.thp (\fBbool\fR) r\-
.RS 4
\fB\-\-disable\-thp\fR
was not specified during build configuration, and the system supports transparent huge page manipulation\&.
.RE
.PP
config\&.utrace (\fBbool\fR) r\-
.RS 4
\fB\-\-enable\-utrace\fR
@ -834,6 +835,17 @@ in these cases\&. This option is disabled by default unless
is specified during configuration, in which case it is enabled by default\&.
.RE
.PP
opt\&.metadata_thp (\fBconst char *\fR) r\-
.RS 4
Controls whether to allow jemalloc to use transparent huge page (THP) for internal metadata (see
stats\&.metadata)\&.
\(lqalways\(rq
allows such usage\&.
\(lqauto\(rq
uses no THP initially, but may begin to do so when metadata usage reaches a certain level\&. The default is
\(lqdisabled\(rq\&.
.RE
.PP
opt\&.retain (\fBbool\fR) r\-
.RS 4
If true, retain unused virtual memory for later reuse rather than discarding it by calling
@ -883,11 +895,18 @@ setting uses one arena per physical CPU, which means the two hyper threads on th
.PP
opt\&.background_thread (\fBconst bool\fR) r\-
.RS 4
Internal background worker threads enabled/disabled\&. See
Internal background worker threads enabled/disabled\&. Because of potential circular dependencies, enabling background threads using this option may cause a crash or deadlock during initialization\&. For a reliable way to use this feature, see
background_thread
for dynamic control options and details\&. This option is disabled by default\&.
.RE
.PP
opt\&.max_background_threads (\fBconst size_t\fR) r\-
.RS 4
Maximum number of background threads that will be created if
background_thread
is set\&. Defaults to the number of CPUs\&.
.RE
.PP
opt\&.dirty_decay_ms (\fBssize_t\fR) r\-
.RS 4
Approximate time in milliseconds from the creation of a set of unused dirty pages until an equivalent set of unused dirty pages is purged (i\&.e\&. converted to muzzy via e\&.g\&.
@ -895,7 +914,7 @@ madvise(\fI\&.\&.\&.\fR\fI\fBMADV_FREE\fR\fR)
if supported by the operating system, or converted to clean otherwise) and/or reused\&. Dirty pages are defined as previously having been potentially written to by the application, and therefore consuming physical memory, yet having no current use\&. The pages are incrementally purged according to a sigmoidal decay curve that starts and ends with zero purge rate\&. A decay time of 0 causes all unused dirty pages to be purged immediately upon creation\&. A decay time of \-1 disables purging\&. The default decay time is 10 seconds\&. See
arenas\&.dirty_decay_ms
and
arena\&.<i>\&.muzzy_decay_ms
arena\&.<i>\&.dirty_decay_ms
for related dynamic control options\&. See
opt\&.muzzy_decay_ms
for a description of muzzy pages\&.
@ -911,6 +930,11 @@ arena\&.<i>\&.muzzy_decay_ms
for related dynamic control options\&.
.RE
.PP
opt\&.lg_extent_max_active_fit (\fBsize_t\fR) r\-
.RS 4
When reusing dirty extents, this determines the (log base 2 of the) maximum ratio between the size of the active extent selected (to split off from) and the size of the requested allocation\&. This prevents the splitting of large active extents for smaller allocations, which can reduce fragmentation over the long run (especially for non\-active extents)\&. A lower value may reduce fragmentation, at the cost of extra active extents\&. The default value is 6, which gives a maximum ratio of 64 (2^6)\&.
.RE
.PP
opt\&.stats_print (\fBbool\fR) r\-
.RS 4
Enable/disable statistics printing at exit\&. If enabled, the
@ -1008,6 +1032,15 @@ opt\&.lg_tcache_max (\fBsize_t\fR) r\-
Maximum size class (log base 2) to cache in the thread\-specific cache (tcache)\&. At a minimum, all small size classes are cached, and at a maximum all large size classes are cached\&. The default maximum is 32 KiB (2^15)\&.
.RE
.PP
opt\&.thp (\fBconst char *\fR) r\-
.RS 4
Transparent hugepage (THP) mode\&. Settings "always", "never" and "default" are available if THP is supported by the operating system\&. The "always" setting enables transparent hugepage for all user memory mappings with
\fI\fBMADV_HUGEPAGE\fR\fR; "never" ensures no transparent hugepage with
\fI\fBMADV_NOHUGEPAGE\fR\fR; the default setting "default" makes no changes\&. Note that this option does not affect THP for jemalloc internal metadata (see
opt\&.metadata_thp); in addition, for arenas with customized
extent_hooks, this option is bypassed as it is implemented as part of the default extent hooks\&.
.RE
.PP
opt\&.prof (\fBbool\fR) r\- [\fB\-\-enable\-prof\fR]
.RS 4
Memory profiling enabled/disabled\&. If enabled, profile memory allocation activity\&. See the
@ -1248,6 +1281,14 @@ opt\&.muzzy_decay_ms
for additional information\&.
.RE
.PP
arena\&.<i>\&.retain_grow_limit (\fBsize_t\fR) rw
.RS 4
Maximum size to grow retained region (only relevant when
opt\&.retain
is enabled)\&. This controls the maximum increment to expand virtual memory, or allocation through
arena\&.<i>\&.extent_hooks\&. In particular, if customized extent hooks reserve physical memory (e\&.g\&. 1G huge pages), this is useful to control the allocation hook\*(Aqs input size\&. The default is no limit\&.
.RE
.PP
arena\&.<i>\&.extent_hooks (\fBextent_hooks_t *\fR) rw
.RS 4
Get or set the extent management hook functions for arena <i>\&. The functions must be capable of operating on all extant extents associated with arena <i>, usually by passing unknown extents to the replaced functions\&. In practice, it is feasible to control allocation for arenas explicitly created via
@ -1278,7 +1319,7 @@ struct extent_hooks_s {
The
\fBextent_hooks_t\fR
structure comprises function pointers which are described individually below\&. jemalloc uses these functions to manage extent lifetime, which starts off with allocation of mapped committed memory, in the simplest case followed by deallocation\&. However, there are performance and platform reasons to retain extents for later reuse\&. Cleanup attempts cascade from deallocation to decommit to forced purging to lazy purging, which gives the extent management functions opportunities to reject the most permanent cleanup operations in favor of less permanent (and often less costly) operations\&. All operations except allocation can be universally opted out of by setting the hook pointers to
\fBNULL\fR, or selectively opted out of by returning failure\&.
\fBNULL\fR, or selectively opted out of by returning failure\&. Note that once the extent hook is set, the structure is accessed directly by the associated arenas, so it must remain valid for the entire lifetime of the arenas\&.
.HP \w'typedef\ void\ *(extent_alloc_t)('u
.BI "typedef void *(extent_alloc_t)(extent_hooks_t\ *" "extent_hooks" ", void\ *" "new_addr" ", size_t\ " "size" ", size_t\ " "alignment" ", bool\ *" "zero" ", bool\ *" "commit" ", unsigned\ " "arena_ind" ");"
.sp
@ -1572,6 +1613,11 @@ arenas\&.create (\fBunsigned\fR, \fBextent_hooks_t *\fR) rw
Explicitly create a new arena outside the range of automatically managed arenas, with optionally specified extent hooks, and return the new arena index\&.
.RE
.PP
arenas\&.lookup (\fBunsigned\fR, \fBvoid*\fR) rw
.RS 4
Index of the arena to which an allocation belongs\&.
.RE
.PP
prof\&.thread_active_init (\fBbool\fR) rw [\fB\-\-enable\-prof\fR]
.RS 4
Control the initial setting for
@ -1648,7 +1694,16 @@ stats\&.metadata (\fBsize_t\fR) r\- [\fB\-\-enable\-stats\fR]
.RS 4
Total number of bytes dedicated to metadata, which comprise base allocations used for bootstrap\-sensitive allocator metadata structures (see
stats\&.arenas\&.<i>\&.base) and internal allocations (see
stats\&.arenas\&.<i>\&.internal)\&.
stats\&.arenas\&.<i>\&.internal)\&. Transparent huge page (enabled with
opt\&.metadata_thp) usage is not considered\&.
.RE
.PP
stats\&.metadata_thp (\fBsize_t\fR) r\- [\fB\-\-enable\-stats\fR]
.RS 4
Number of transparent huge pages (THP) used for metadata\&. See
stats\&.metadata
and
opt\&.metadata_thp for details\&.
.RE
.PP
stats\&.resident (\fBsize_t\fR) r\- [\fB\-\-enable\-stats\fR]
@ -1831,6 +1886,13 @@ stats\&.arenas\&.<i>\&.internal (\fBsize_t\fR) r\- [\fB\-\-enable\-stats\fR]
Number of bytes dedicated to internal allocations\&. Internal allocations differ from application\-originated allocations in that they are for internal use, and that they are omitted from heap profiles\&.
.RE
.PP
stats\&.arenas\&.<i>\&.metadata_thp (\fBsize_t\fR) r\- [\fB\-\-enable\-stats\fR]
.RS 4
Number of transparent huge pages (THP) used for metadata\&. See
opt\&.metadata_thp
for details\&.
.RE
.PP
stats\&.arenas\&.<i>\&.resident (\fBsize_t\fR) r\- [\fB\-\-enable\-stats\fR]
.RS 4
Maximum number of bytes in physically resident data pages mapped by the arena, comprising all pages dedicated to allocator metadata, pages backing active allocations, and unused dirty pages\&. This is a maximum rather than precise because pages may not actually be physically resident if they correspond to demand\-zeroed virtual memory that has not yet been touched\&. This is a multiple of the page size\&.
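The newly documented opt.* settings are configured the same way as the existing ones, typically through the MALLOC_CONF environment variable or a malloc_conf string compiled into the application. A sketch with illustrative values only, not a recommendation:

/* Application-supplied defaults for a few of the options added in 5.1.0. */
const char *malloc_conf = "metadata_thp:auto,thp:default,"
    "background_thread:true,max_background_threads:4";

The same string can equally be supplied at run time in MALLOC_CONF before program start.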

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_ARENA_EXTERNS_H
#define JEMALLOC_INTERNAL_ARENA_EXTERNS_H
#include "jemalloc/internal/bin.h"
#include "jemalloc/internal/extent_dss.h"
#include "jemalloc/internal/pages.h"
#include "jemalloc/internal/size_classes.h"
@ -9,25 +10,19 @@
extern ssize_t opt_dirty_decay_ms;
extern ssize_t opt_muzzy_decay_ms;
extern const arena_bin_info_t arena_bin_info[NBINS];
extern percpu_arena_mode_t opt_percpu_arena;
extern const char *percpu_arena_mode_names[];
extern const uint64_t h_steps[SMOOTHSTEP_NSTEPS];
extern malloc_mutex_t arenas_lock;
void arena_stats_large_nrequests_add(tsdn_t *tsdn, arena_stats_t *arena_stats,
szind_t szind, uint64_t nrequests);
void arena_stats_mapped_add(tsdn_t *tsdn, arena_stats_t *arena_stats,
size_t size);
void arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena,
unsigned *nthreads, const char **dss, ssize_t *dirty_decay_ms,
ssize_t *muzzy_decay_ms, size_t *nactive, size_t *ndirty, size_t *nmuzzy);
void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats,
malloc_bin_stats_t *bstats, malloc_large_stats_t *lstats);
bin_stats_t *bstats, arena_stats_large_t *lstats);
void arena_extents_dirty_dalloc(tsdn_t *tsdn, arena_t *arena,
extent_hooks_t **r_extent_hooks, extent_t *extent);
#ifdef JEMALLOC_JET
@ -50,11 +45,11 @@ void arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread,
void arena_reset(tsd_t *tsd, arena_t *arena);
void arena_destroy(tsd_t *tsd, arena_t *arena);
void arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
tcache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes);
void arena_alloc_junk_small(void *ptr, const arena_bin_info_t *bin_info,
cache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes);
void arena_alloc_junk_small(void *ptr, const bin_info_t *bin_info,
bool zero);
typedef void (arena_dalloc_junk_small_t)(void *, const arena_bin_info_t *);
typedef void (arena_dalloc_junk_small_t)(void *, const bin_info_t *);
extern arena_dalloc_junk_small_t *JET_MUTABLE arena_dalloc_junk_small;
void *arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size,
@ -77,6 +72,8 @@ ssize_t arena_dirty_decay_ms_default_get(void);
bool arena_dirty_decay_ms_default_set(ssize_t decay_ms);
ssize_t arena_muzzy_decay_ms_default_get(void);
bool arena_muzzy_decay_ms_default_set(ssize_t decay_ms);
bool arena_retain_grow_limit_get_set(tsd_t *tsd, arena_t *arena,
size_t *old_limit, size_t *new_limit);
unsigned arena_nthreads_get(arena_t *arena, bool internal);
void arena_nthreads_inc(arena_t *arena, bool internal);
void arena_nthreads_dec(arena_t *arena, bool internal);

View File

@ -25,7 +25,7 @@ static inline bool
arena_prof_accum(tsdn_t *tsdn, arena_t *arena, uint64_t accumbytes) {
cassert(config_prof);
if (likely(prof_interval == 0)) {
if (likely(prof_interval == 0 || !prof_active_get_unlocked())) {
return false;
}

View File

@ -8,13 +8,6 @@
#include "jemalloc/internal/sz.h"
#include "jemalloc/internal/ticker.h"
static inline szind_t
arena_bin_index(arena_t *arena, arena_bin_t *bin) {
szind_t binind = (szind_t)(bin - arena->bins);
assert(binind < NBINS);
return binind;
}
JEMALLOC_ALWAYS_INLINE prof_tctx_t *
arena_prof_tctx_get(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx) {
cassert(config_prof);
@ -35,7 +28,7 @@ arena_prof_tctx_get(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx) {
}
JEMALLOC_ALWAYS_INLINE void
arena_prof_tctx_set(tsdn_t *tsdn, const void *ptr, size_t usize,
arena_prof_tctx_set(tsdn_t *tsdn, const void *ptr, UNUSED size_t usize,
alloc_ctx_t *alloc_ctx, prof_tctx_t *tctx) {
cassert(config_prof);
assert(ptr != NULL);
@ -54,7 +47,7 @@ arena_prof_tctx_set(tsdn_t *tsdn, const void *ptr, size_t usize,
}
static inline void
arena_prof_tctx_reset(tsdn_t *tsdn, const void *ptr, prof_tctx_t *tctx) {
arena_prof_tctx_reset(tsdn_t *tsdn, const void *ptr, UNUSED prof_tctx_t *tctx) {
cassert(config_prof);
assert(ptr != NULL);

View File

@ -0,0 +1,237 @@
#ifndef JEMALLOC_INTERNAL_ARENA_STATS_H
#define JEMALLOC_INTERNAL_ARENA_STATS_H
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/mutex_prof.h"
#include "jemalloc/internal/size_classes.h"
/*
* In those architectures that support 64-bit atomics, we use atomic updates for
* our 64-bit values. Otherwise, we use a plain uint64_t and synchronize
* externally.
*/
#ifdef JEMALLOC_ATOMIC_U64
typedef atomic_u64_t arena_stats_u64_t;
#else
/* Must hold the arena stats mutex while reading atomically. */
typedef uint64_t arena_stats_u64_t;
#endif
typedef struct arena_stats_large_s arena_stats_large_t;
struct arena_stats_large_s {
/*
* Total number of allocation/deallocation requests served directly by
* the arena.
*/
arena_stats_u64_t nmalloc;
arena_stats_u64_t ndalloc;
/*
* Number of allocation requests that correspond to this size class.
* This includes requests served by tcache, though tcache only
* periodically merges into this counter.
*/
arena_stats_u64_t nrequests; /* Partially derived. */
/* Current number of allocations of this size class. */
size_t curlextents; /* Derived. */
};
typedef struct arena_stats_decay_s arena_stats_decay_t;
struct arena_stats_decay_s {
/* Total number of purge sweeps. */
arena_stats_u64_t npurge;
/* Total number of madvise calls made. */
arena_stats_u64_t nmadvise;
/* Total number of pages purged. */
arena_stats_u64_t purged;
};
/*
* Arena stats. Note that fields marked "derived" are not directly maintained
* within the arena code; rather their values are derived during stats merge
* requests.
*/
typedef struct arena_stats_s arena_stats_t;
struct arena_stats_s {
#ifndef JEMALLOC_ATOMIC_U64
malloc_mutex_t mtx;
#endif
/* Number of bytes currently mapped, excluding retained memory. */
atomic_zu_t mapped; /* Partially derived. */
/*
* Number of unused virtual memory bytes currently retained. Retained
* bytes are technically mapped (though always decommitted or purged),
* but they are excluded from the mapped statistic (above).
*/
atomic_zu_t retained; /* Derived. */
arena_stats_decay_t decay_dirty;
arena_stats_decay_t decay_muzzy;
atomic_zu_t base; /* Derived. */
atomic_zu_t internal;
atomic_zu_t resident; /* Derived. */
atomic_zu_t metadata_thp;
atomic_zu_t allocated_large; /* Derived. */
arena_stats_u64_t nmalloc_large; /* Derived. */
arena_stats_u64_t ndalloc_large; /* Derived. */
arena_stats_u64_t nrequests_large; /* Derived. */
/* Number of bytes cached in tcache associated with this arena. */
atomic_zu_t tcache_bytes; /* Derived. */
mutex_prof_data_t mutex_prof_data[mutex_prof_num_arena_mutexes];
/* One element for each large size class. */
arena_stats_large_t lstats[NSIZES - NBINS];
/* Arena uptime. */
nstime_t uptime;
};
static inline bool
arena_stats_init(UNUSED tsdn_t *tsdn, arena_stats_t *arena_stats) {
if (config_debug) {
for (size_t i = 0; i < sizeof(arena_stats_t); i++) {
assert(((char *)arena_stats)[i] == 0);
}
}
#ifndef JEMALLOC_ATOMIC_U64
if (malloc_mutex_init(&arena_stats->mtx, "arena_stats",
WITNESS_RANK_ARENA_STATS, malloc_mutex_rank_exclusive)) {
return true;
}
#endif
/* Memory is zeroed, so there is no need to clear stats. */
return false;
}
static inline void
arena_stats_lock(tsdn_t *tsdn, arena_stats_t *arena_stats) {
#ifndef JEMALLOC_ATOMIC_U64
malloc_mutex_lock(tsdn, &arena_stats->mtx);
#endif
}
static inline void
arena_stats_unlock(tsdn_t *tsdn, arena_stats_t *arena_stats) {
#ifndef JEMALLOC_ATOMIC_U64
malloc_mutex_unlock(tsdn, &arena_stats->mtx);
#endif
}
static inline uint64_t
arena_stats_read_u64(tsdn_t *tsdn, arena_stats_t *arena_stats,
arena_stats_u64_t *p) {
#ifdef JEMALLOC_ATOMIC_U64
return atomic_load_u64(p, ATOMIC_RELAXED);
#else
malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
return *p;
#endif
}
static inline void
arena_stats_add_u64(tsdn_t *tsdn, arena_stats_t *arena_stats,
arena_stats_u64_t *p, uint64_t x) {
#ifdef JEMALLOC_ATOMIC_U64
atomic_fetch_add_u64(p, x, ATOMIC_RELAXED);
#else
malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
*p += x;
#endif
}
UNUSED static inline void
arena_stats_sub_u64(tsdn_t *tsdn, arena_stats_t *arena_stats,
arena_stats_u64_t *p, uint64_t x) {
#ifdef JEMALLOC_ATOMIC_U64
UNUSED uint64_t r = atomic_fetch_sub_u64(p, x, ATOMIC_RELAXED);
assert(r - x <= r);
#else
malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
*p -= x;
assert(*p + x >= *p);
#endif
}
/*
* Non-atomically sets *dst += src. *dst needs external synchronization.
* This lets us avoid the cost of a fetch_add when it's unnecessary (note that
* the types here are atomic).
*/
static inline void
arena_stats_accum_u64(arena_stats_u64_t *dst, uint64_t src) {
#ifdef JEMALLOC_ATOMIC_U64
uint64_t cur_dst = atomic_load_u64(dst, ATOMIC_RELAXED);
atomic_store_u64(dst, src + cur_dst, ATOMIC_RELAXED);
#else
*dst += src;
#endif
}
static inline size_t
arena_stats_read_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p) {
#ifdef JEMALLOC_ATOMIC_U64
return atomic_load_zu(p, ATOMIC_RELAXED);
#else
malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
return atomic_load_zu(p, ATOMIC_RELAXED);
#endif
}
static inline void
arena_stats_add_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p,
size_t x) {
#ifdef JEMALLOC_ATOMIC_U64
atomic_fetch_add_zu(p, x, ATOMIC_RELAXED);
#else
malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
size_t cur = atomic_load_zu(p, ATOMIC_RELAXED);
atomic_store_zu(p, cur + x, ATOMIC_RELAXED);
#endif
}
static inline void
arena_stats_sub_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p,
size_t x) {
#ifdef JEMALLOC_ATOMIC_U64
UNUSED size_t r = atomic_fetch_sub_zu(p, x, ATOMIC_RELAXED);
assert(r - x <= r);
#else
malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
size_t cur = atomic_load_zu(p, ATOMIC_RELAXED);
atomic_store_zu(p, cur - x, ATOMIC_RELAXED);
#endif
}
/* Like the _u64 variant, needs an externally synchronized *dst. */
static inline void
arena_stats_accum_zu(atomic_zu_t *dst, size_t src) {
size_t cur_dst = atomic_load_zu(dst, ATOMIC_RELAXED);
atomic_store_zu(dst, src + cur_dst, ATOMIC_RELAXED);
}
static inline void
arena_stats_large_nrequests_add(tsdn_t *tsdn, arena_stats_t *arena_stats,
szind_t szind, uint64_t nrequests) {
arena_stats_lock(tsdn, arena_stats);
arena_stats_add_u64(tsdn, arena_stats, &arena_stats->lstats[szind -
NBINS].nrequests, nrequests);
arena_stats_unlock(tsdn, arena_stats);
}
static inline void
arena_stats_mapped_add(tsdn_t *tsdn, arena_stats_t *arena_stats, size_t size) {
arena_stats_lock(tsdn, arena_stats);
arena_stats_add_zu(tsdn, arena_stats, &arena_stats->mapped, size);
arena_stats_unlock(tsdn, arena_stats);
}
#endif /* JEMALLOC_INTERNAL_ARENA_STATS_H */
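The helpers above hide whether a 64-bit counter is a real atomic (JEMALLOC_ATOMIC_U64) or a plain uint64_t guarded by the stats mutex. A hypothetical caller bumping a per-size-class counter would follow the same lock/add/unlock pattern as arena_stats_large_nrequests_add() above:

/* Sketch only; this helper does not exist in the tree. */
static inline void
arena_stats_large_nmalloc_add(tsdn_t *tsdn, arena_stats_t *arena_stats,
    szind_t szind, uint64_t n) {
	arena_stats_lock(tsdn, arena_stats);	/* no-op with 64-bit atomics */
	arena_stats_add_u64(tsdn, arena_stats,
	    &arena_stats->lstats[szind - NBINS].nmalloc, n);
	arena_stats_unlock(tsdn, arena_stats);
}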

View File

@ -1,7 +1,9 @@
#ifndef JEMALLOC_INTERNAL_ARENA_STRUCTS_B_H
#define JEMALLOC_INTERNAL_ARENA_STRUCTS_B_H
#include "jemalloc/internal/arena_stats.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/bin.h"
#include "jemalloc/internal/bitmap.h"
#include "jemalloc/internal/extent_dss.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
@ -10,45 +12,8 @@
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/size_classes.h"
#include "jemalloc/internal/smoothstep.h"
#include "jemalloc/internal/stats.h"
#include "jemalloc/internal/ticker.h"
/*
* Read-only information associated with each element of arena_t's bins array
* is stored separately, partly to reduce memory usage (only one copy, rather
* than one per arena), but mainly to avoid false cacheline sharing.
*
* Each slab has the following layout:
*
* /--------------------\
* | region 0 |
* |--------------------|
* | region 1 |
* |--------------------|
* | ... |
* | ... |
* | ... |
* |--------------------|
* | region nregs-1 |
* \--------------------/
*/
struct arena_bin_info_s {
/* Size of regions in a slab for this bin's size class. */
size_t reg_size;
/* Total size of a slab for this bin's size class. */
size_t slab_size;
/* Total number of regions in a slab for this bin's size class. */
uint32_t nregs;
/*
* Metadata used to manipulate bitmaps for slabs associated with this
* bin.
*/
bitmap_info_t bitmap_info;
};
struct arena_decay_s {
/* Synchronizes all non-atomic fields. */
malloc_mutex_t mtx;
@ -104,37 +69,11 @@ struct arena_decay_s {
* arena and ctl code.
*
* Synchronization: Same as associated arena's stats field. */
decay_stats_t *stats;
arena_stats_decay_t *stats;
/* Peak number of pages in associated extents. Used for debug only. */
uint64_t ceil_npages;
};
struct arena_bin_s {
/* All operations on arena_bin_t fields require lock ownership. */
malloc_mutex_t lock;
/*
* Current slab being used to service allocations of this bin's size
* class. slabcur is independent of slabs_{nonfull,full}; whenever
* slabcur is reassigned, the previous slab must be deallocated or
* inserted into slabs_{nonfull,full}.
*/
extent_t *slabcur;
/*
* Heap of non-full slabs. This heap is used to assure that new
* allocations come from the non-full slab that is oldest/lowest in
* memory.
*/
extent_heap_t slabs_nonfull;
/* List used to track full slabs. */
extent_list_t slabs_full;
/* Bin statistics. */
malloc_bin_stats_t stats;
};
struct arena_s {
/*
* Number of threads currently assigned to this arena. Each thread has
@ -162,14 +101,15 @@ struct arena_s {
arena_stats_t stats;
/*
* List of tcaches for extant threads associated with this arena.
* Stats from these are merged incrementally, and at exit if
* opt_stats_print is enabled.
* Lists of tcaches and cache_bin_array_descriptors for extant threads
* associated with this arena. Stats from these are merged
* incrementally, and at exit if opt_stats_print is enabled.
*
* Synchronization: tcache_ql_mtx.
*/
ql_head(tcache_t) tcache_ql;
malloc_mutex_t tcache_ql_mtx;
ql_head(tcache_t) tcache_ql;
ql_head(cache_bin_array_descriptor_t) cache_bin_array_descriptor_ql;
malloc_mutex_t tcache_ql_mtx;
/* Synchronization: internal. */
prof_accum_t prof_accum;
@ -239,9 +179,14 @@ struct arena_s {
* be effective even if multiple arenas' extent allocation requests are
* highly interleaved.
*
* retain_grow_limit is the max allowed size ind to expand (unless the
* required size is greater). Default is no limit, and controlled
* through mallctl only.
*
* Synchronization: extent_grow_mtx
*/
pszind_t extent_grow_next;
pszind_t retain_grow_limit;
malloc_mutex_t extent_grow_mtx;
/*
@ -258,7 +203,7 @@ struct arena_s {
*
* Synchronization: internal.
*/
arena_bin_t bins[NBINS];
bin_t bins[NBINS];
/*
* Base allocator, from which arena metadata are allocated.

View File

@ -12,9 +12,7 @@
#define DECAY_NTICKS_PER_UPDATE 1000
typedef struct arena_slab_data_s arena_slab_data_t;
typedef struct arena_bin_info_s arena_bin_info_t;
typedef struct arena_decay_s arena_decay_t;
typedef struct arena_bin_s arena_bin_t;
typedef struct arena_s arena_t;
typedef struct arena_tdata_s arena_tdata_t;
typedef struct alloc_ctx_s alloc_ctx_t;

View File

@ -2,9 +2,11 @@
#define JEMALLOC_INTERNAL_BACKGROUND_THREAD_EXTERNS_H
extern bool opt_background_thread;
extern size_t opt_max_background_threads;
extern malloc_mutex_t background_thread_lock;
extern atomic_b_t background_thread_enabled_state;
extern size_t n_background_threads;
extern size_t max_background_threads;
extern background_thread_info_t *background_thread_info;
extern bool can_enable_background_thread;

View File

@ -8,6 +8,7 @@
#endif
#define BACKGROUND_THREAD_INDEFINITE_SLEEP UINT64_MAX
#define MAX_BACKGROUND_THREAD_LIMIT MALLOCX_ARENA_LIMIT
typedef enum {
background_thread_stopped,

View File

@ -1,6 +1,9 @@
#ifndef JEMALLOC_INTERNAL_BASE_EXTERNS_H
#define JEMALLOC_INTERNAL_BASE_EXTERNS_H
extern metadata_thp_mode_t opt_metadata_thp;
extern const char *metadata_thp_mode_names[];
base_t *b0get(void);
base_t *base_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks);
void base_delete(tsdn_t *tsdn, base_t *base);
@ -10,7 +13,7 @@ extent_hooks_t *base_extent_hooks_set(base_t *base,
void *base_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment);
extent_t *base_alloc_extent(tsdn_t *tsdn, base_t *base);
void base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated,
size_t *resident, size_t *mapped);
size_t *resident, size_t *mapped, size_t *n_thp);
void base_prefork(tsdn_t *tsdn, base_t *base);
void base_postfork_parent(tsdn_t *tsdn, base_t *base);
void base_postfork_child(tsdn_t *tsdn, base_t *base);

View File

@ -6,4 +6,8 @@ base_ind_get(const base_t *base) {
return base->ind;
}
static inline bool
metadata_thp_enabled(void) {
return (opt_metadata_thp != metadata_thp_disabled);
}
#endif /* JEMALLOC_INTERNAL_BASE_INLINES_H */

View File

@ -30,6 +30,8 @@ struct base_s {
/* Protects base_alloc() and base_stats_get() operations. */
malloc_mutex_t mtx;
/* Using THP when true (metadata_thp auto mode). */
bool auto_thp_switched;
/*
* Most recent size class in the series of increasingly large base
* extents. Logarithmic spacing between subsequent allocations ensures
@ -50,6 +52,8 @@ struct base_s {
size_t allocated;
size_t resident;
size_t mapped;
/* Number of THP regions touched. */
size_t n_thp;
};
#endif /* JEMALLOC_INTERNAL_BASE_STRUCTS_H */

View File

@ -4,4 +4,30 @@
typedef struct base_block_s base_block_t;
typedef struct base_s base_t;
#define METADATA_THP_DEFAULT metadata_thp_disabled
/*
* In auto mode, arenas switch to huge pages for the base allocator on the
* second base block. a0 switches to thp on the 5th block (after 20 megabytes
* of metadata), since more metadata (e.g. rtree nodes) come from a0's base.
*/
#define BASE_AUTO_THP_THRESHOLD 2
#define BASE_AUTO_THP_THRESHOLD_A0 5
typedef enum {
metadata_thp_disabled = 0,
/*
* Lazily enable hugepage for metadata. To avoid high RSS caused by THP
* + low usage arena (i.e. THP becomes a significant percentage), the
* "auto" option only starts using THP after a base allocator used up
* the first THP region. Starting from the second hugepage (in a single
* arena), "auto" behaves the same as "always", i.e. madvise hugepage
* right away.
*/
metadata_thp_auto = 1,
metadata_thp_always = 2,
metadata_thp_mode_limit = 3
} metadata_thp_mode_t;
#endif /* JEMALLOC_INTERNAL_BASE_TYPES_H */

View File

@ -0,0 +1,106 @@
#ifndef JEMALLOC_INTERNAL_BIN_H
#define JEMALLOC_INTERNAL_BIN_H
#include "jemalloc/internal/extent_types.h"
#include "jemalloc/internal/extent_structs.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/bin_stats.h"
/*
* A bin contains a set of extents that are currently being used for slab
* allocations.
*/
/*
* Read-only information associated with each element of arena_t's bins array
* is stored separately, partly to reduce memory usage (only one copy, rather
* than one per arena), but mainly to avoid false cacheline sharing.
*
* Each slab has the following layout:
*
* /--------------------\
* | region 0 |
* |--------------------|
* | region 1 |
* |--------------------|
* | ... |
* | ... |
* | ... |
* |--------------------|
* | region nregs-1 |
* \--------------------/
*/
typedef struct bin_info_s bin_info_t;
struct bin_info_s {
/* Size of regions in a slab for this bin's size class. */
size_t reg_size;
/* Total size of a slab for this bin's size class. */
size_t slab_size;
/* Total number of regions in a slab for this bin's size class. */
uint32_t nregs;
/*
* Metadata used to manipulate bitmaps for slabs associated with this
* bin.
*/
bitmap_info_t bitmap_info;
};
extern const bin_info_t bin_infos[NBINS];
typedef struct bin_s bin_t;
struct bin_s {
/* All operations on bin_t fields require lock ownership. */
malloc_mutex_t lock;
/*
* Current slab being used to service allocations of this bin's size
* class. slabcur is independent of slabs_{nonfull,full}; whenever
* slabcur is reassigned, the previous slab must be deallocated or
* inserted into slabs_{nonfull,full}.
*/
extent_t *slabcur;
/*
* Heap of non-full slabs. This heap is used to assure that new
* allocations come from the non-full slab that is oldest/lowest in
* memory.
*/
extent_heap_t slabs_nonfull;
/* List used to track full slabs. */
extent_list_t slabs_full;
/* Bin statistics. */
bin_stats_t stats;
};
/* Initializes a bin to empty. Returns true on error. */
bool bin_init(bin_t *bin);
/* Forking. */
void bin_prefork(tsdn_t *tsdn, bin_t *bin);
void bin_postfork_parent(tsdn_t *tsdn, bin_t *bin);
void bin_postfork_child(tsdn_t *tsdn, bin_t *bin);
/* Stats. */
static inline void
bin_stats_merge(tsdn_t *tsdn, bin_stats_t *dst_bin_stats, bin_t *bin) {
malloc_mutex_lock(tsdn, &bin->lock);
malloc_mutex_prof_read(tsdn, &dst_bin_stats->mutex_data, &bin->lock);
dst_bin_stats->nmalloc += bin->stats.nmalloc;
dst_bin_stats->ndalloc += bin->stats.ndalloc;
dst_bin_stats->nrequests += bin->stats.nrequests;
dst_bin_stats->curregs += bin->stats.curregs;
dst_bin_stats->nfills += bin->stats.nfills;
dst_bin_stats->nflushes += bin->stats.nflushes;
dst_bin_stats->nslabs += bin->stats.nslabs;
dst_bin_stats->reslabs += bin->stats.reslabs;
dst_bin_stats->curslabs += bin->stats.curslabs;
malloc_mutex_unlock(tsdn, &bin->lock);
}
#endif /* JEMALLOC_INTERNAL_BIN_H */
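As a usage sketch (the wrapper below is hypothetical), a stats-collection path merges every bin of an arena by calling bin_stats_merge() once per small size class:

static void
arena_all_bin_stats_merge(tsdn_t *tsdn, arena_t *arena, bin_stats_t *dst) {
	for (unsigned i = 0; i < NBINS; i++) {
		/* Locks each bin briefly and accumulates into dst[i]. */
		bin_stats_merge(tsdn, &dst[i], &arena->bins[i]);
	}
}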

View File

@ -0,0 +1,51 @@
#ifndef JEMALLOC_INTERNAL_BIN_STATS_H
#define JEMALLOC_INTERNAL_BIN_STATS_H
#include "jemalloc/internal/mutex_prof.h"
typedef struct bin_stats_s bin_stats_t;
struct bin_stats_s {
/*
* Total number of allocation/deallocation requests served directly by
* the bin. Note that tcache may allocate an object, then recycle it
* many times, resulting many increments to nrequests, but only one
* each to nmalloc and ndalloc.
*/
uint64_t nmalloc;
uint64_t ndalloc;
/*
* Number of allocation requests that correspond to the size of this
* bin. This includes requests served by tcache, though tcache only
* periodically merges into this counter.
*/
uint64_t nrequests;
/*
* Current number of regions of this size class, including regions
* currently cached by tcache.
*/
size_t curregs;
/* Number of tcache fills from this bin. */
uint64_t nfills;
/* Number of tcache flushes to this bin. */
uint64_t nflushes;
/* Total number of slabs created for this bin's size class. */
uint64_t nslabs;
/*
* Total number of slabs reused by extracting them from the slabs heap
* for this bin's size class.
*/
uint64_t reslabs;
/* Current number of slabs in this bin. */
size_t curslabs;
mutex_prof_data_t mutex_data;
};
#endif /* JEMALLOC_INTERNAL_BIN_STATS_H */

View File

@ -0,0 +1,114 @@
#ifndef JEMALLOC_INTERNAL_CACHE_BIN_H
#define JEMALLOC_INTERNAL_CACHE_BIN_H
#include "jemalloc/internal/ql.h"
/*
* The cache_bins are the mechanism that the tcache and the arena use to
* communicate. The tcache fills from and flushes to the arena by passing a
* cache_bin_t to fill/flush. When the arena needs to pull stats from the
* tcaches associated with it, it does so by iterating over its
* cache_bin_array_descriptor_t objects and reading out per-bin stats it
* contains. This makes it so that the arena need not know about the existence
* of the tcache at all.
*/
/*
* The count of the number of cached allocations in a bin. We make this signed
* so that negative numbers can encode "invalid" states (e.g. a low water mark
* of -1 for a cache that has been depleted).
*/
typedef int32_t cache_bin_sz_t;
typedef struct cache_bin_stats_s cache_bin_stats_t;
struct cache_bin_stats_s {
/*
* Number of allocation requests that corresponded to the size of this
* bin.
*/
uint64_t nrequests;
};
/*
* Read-only information associated with each element of tcache_t's tbins array
* is stored separately, mainly to reduce memory usage.
*/
typedef struct cache_bin_info_s cache_bin_info_t;
struct cache_bin_info_s {
/* Upper limit on ncached. */
cache_bin_sz_t ncached_max;
};
typedef struct cache_bin_s cache_bin_t;
struct cache_bin_s {
/* Min # cached since last GC. */
cache_bin_sz_t low_water;
/* # of cached objects. */
cache_bin_sz_t ncached;
/*
* ncached and stats are both modified frequently. Let's keep them
* close so that they have a higher chance of being on the same
* cacheline, thus less write-backs.
*/
cache_bin_stats_t tstats;
/*
* Stack of available objects.
*
* To make use of adjacent cacheline prefetch, the items in the avail
* stack go to higher addresses for newer allocations. avail points
* just above the available space, which means that
* avail[-ncached, ... -1] are available items and the lowest item will
* be allocated first.
*/
void **avail;
};
typedef struct cache_bin_array_descriptor_s cache_bin_array_descriptor_t;
struct cache_bin_array_descriptor_s {
/*
* The arena keeps a list of the cache bins associated with it, for
* stats collection.
*/
ql_elm(cache_bin_array_descriptor_t) link;
/* Pointers to the tcache bins. */
cache_bin_t *bins_small;
cache_bin_t *bins_large;
};
static inline void
cache_bin_array_descriptor_init(cache_bin_array_descriptor_t *descriptor,
cache_bin_t *bins_small, cache_bin_t *bins_large) {
ql_elm_new(descriptor, link);
descriptor->bins_small = bins_small;
descriptor->bins_large = bins_large;
}
JEMALLOC_ALWAYS_INLINE void *
cache_bin_alloc_easy(cache_bin_t *bin, bool *success) {
void *ret;
if (unlikely(bin->ncached == 0)) {
bin->low_water = -1;
*success = false;
return NULL;
}
/*
* success (instead of ret) should be checked upon the return of this
* function. We avoid checking (ret == NULL) because there is never a
* null stored on the avail stack (which is unknown to the compiler),
* and eagerly checking ret would cause pipeline stall (waiting for the
* cacheline).
*/
*success = true;
ret = *(bin->avail - bin->ncached);
bin->ncached--;
if (unlikely(bin->ncached < bin->low_water)) {
bin->low_water = bin->ncached;
}
return ret;
}
#endif /* JEMALLOC_INTERNAL_CACHE_BIN_H */
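As the comment in cache_bin_alloc_easy() explains, callers test success rather than the returned pointer. A sketch of that fast path (the refill function named here is hypothetical):

JEMALLOC_ALWAYS_INLINE void *
tcache_alloc_small_sketch(tsdn_t *tsdn, cache_bin_t *bin, szind_t binind) {
	bool success;
	void *ret = cache_bin_alloc_easy(bin, &success);
	if (unlikely(!success)) {
		/* Bin exhausted: refill from the arena and retry. */
		ret = tcache_fill_and_retry(tsdn, bin, binind);	/* hypothetical */
	}
	return ret;
}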

View File

@ -40,14 +40,15 @@ typedef struct ctl_arena_stats_s {
uint64_t ndalloc_small;
uint64_t nrequests_small;
malloc_bin_stats_t bstats[NBINS];
malloc_large_stats_t lstats[NSIZES - NBINS];
bin_stats_t bstats[NBINS];
arena_stats_large_t lstats[NSIZES - NBINS];
} ctl_arena_stats_t;
typedef struct ctl_stats_s {
size_t allocated;
size_t active;
size_t metadata;
size_t metadata_thp;
size_t resident;
size_t mapped;
size_t retained;

View File

@ -0,0 +1,41 @@
#ifndef JEMALLOC_INTERNAL_DIV_H
#define JEMALLOC_INTERNAL_DIV_H
#include "jemalloc/internal/assert.h"
/*
* This module does the division that computes the index of a region in a slab,
* given its offset relative to the base.
* That is, given a divisor d, an n = i * d (all integers), we'll return i.
* We do some pre-computation to do this more quickly than a CPU division
* instruction.
* We bound n < 2^32, and don't support dividing by one.
*/
typedef struct div_info_s div_info_t;
struct div_info_s {
uint32_t magic;
#ifdef JEMALLOC_DEBUG
size_t d;
#endif
};
void div_init(div_info_t *div_info, size_t divisor);
static inline size_t
div_compute(div_info_t *div_info, size_t n) {
assert(n <= (uint32_t)-1);
/*
* This generates, e.g. mov; imul; shr on x86-64. On a 32-bit machine,
* the compilers I tried were all smart enough to turn this into the
* appropriate "get the high 32 bits of the result of a multiply" (e.g.
* mul; mov edx eax; on x86, umull on arm, etc.).
*/
size_t i = ((uint64_t)n * (uint64_t)div_info->magic) >> 32;
#ifdef JEMALLOC_DEBUG
assert(i * div_info->d == n);
#endif
return i;
}
#endif /* JEMALLOC_INTERNAL_DIV_H */
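A small usage sketch of the API above, with illustrative numbers: precompute the divisor once, then turn a region's byte offset within a slab into its region index without a hardware divide. The offset must be an exact multiple of the divisor and below 2^32.

div_info_t reg_div;
div_init(&reg_div, 48);			/* divisor must be > 1 */

size_t offset = 7 * 48;			/* byte offset of region 7 */
size_t idx = div_compute(&reg_div, offset);
assert(idx == 7);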

View File

@ -0,0 +1,435 @@
#ifndef JEMALLOC_INTERNAL_EMITTER_H
#define JEMALLOC_INTERNAL_EMITTER_H
#include "jemalloc/internal/ql.h"
typedef enum emitter_output_e emitter_output_t;
enum emitter_output_e {
emitter_output_json,
emitter_output_table
};
typedef enum emitter_justify_e emitter_justify_t;
enum emitter_justify_e {
emitter_justify_left,
emitter_justify_right,
/* Not for users; just to pass to internal functions. */
emitter_justify_none
};
typedef enum emitter_type_e emitter_type_t;
enum emitter_type_e {
emitter_type_bool,
emitter_type_int,
emitter_type_unsigned,
emitter_type_uint32,
emitter_type_uint64,
emitter_type_size,
emitter_type_ssize,
emitter_type_string,
/*
* A title is a column title in a table; it's just a string, but it's
* not quoted.
*/
emitter_type_title,
};
typedef struct emitter_col_s emitter_col_t;
struct emitter_col_s {
/* Filled in by the user. */
emitter_justify_t justify;
int width;
emitter_type_t type;
union {
bool bool_val;
int int_val;
unsigned unsigned_val;
uint32_t uint32_val;
uint64_t uint64_val;
size_t size_val;
ssize_t ssize_val;
const char *str_val;
};
/* Filled in by initialization. */
ql_elm(emitter_col_t) link;
};
typedef struct emitter_row_s emitter_row_t;
struct emitter_row_s {
ql_head(emitter_col_t) cols;
};
static inline void
emitter_row_init(emitter_row_t *row) {
ql_new(&row->cols);
}
static inline void
emitter_col_init(emitter_col_t *col, emitter_row_t *row) {
ql_elm_new(col, link);
ql_tail_insert(&row->cols, col, link);
}
typedef struct emitter_s emitter_t;
struct emitter_s {
emitter_output_t output;
/* The output information. */
void (*write_cb)(void *, const char *);
void *cbopaque;
int nesting_depth;
/* True if we've already emitted a value at the given depth. */
bool item_at_depth;
};
static inline void
emitter_init(emitter_t *emitter, emitter_output_t emitter_output,
void (*write_cb)(void *, const char *), void *cbopaque) {
emitter->output = emitter_output;
emitter->write_cb = write_cb;
emitter->cbopaque = cbopaque;
emitter->item_at_depth = false;
emitter->nesting_depth = 0;
}
/* Internal convenience function. Write to the emitter the given string. */
JEMALLOC_FORMAT_PRINTF(2, 3)
static inline void
emitter_printf(emitter_t *emitter, const char *format, ...) {
va_list ap;
va_start(ap, format);
malloc_vcprintf(emitter->write_cb, emitter->cbopaque, format, ap);
va_end(ap);
}
/* Write to the emitter the given string, but only in table mode. */
JEMALLOC_FORMAT_PRINTF(2, 3)
static inline void
emitter_table_printf(emitter_t *emitter, const char *format, ...) {
if (emitter->output == emitter_output_table) {
va_list ap;
va_start(ap, format);
malloc_vcprintf(emitter->write_cb, emitter->cbopaque, format, ap);
va_end(ap);
}
}
static inline void
emitter_gen_fmt(char *out_fmt, size_t out_size, const char *fmt_specifier,
emitter_justify_t justify, int width) {
size_t written;
if (justify == emitter_justify_none) {
written = malloc_snprintf(out_fmt, out_size,
"%%%s", fmt_specifier);
} else if (justify == emitter_justify_left) {
written = malloc_snprintf(out_fmt, out_size,
"%%-%d%s", width, fmt_specifier);
} else {
written = malloc_snprintf(out_fmt, out_size,
"%%%d%s", width, fmt_specifier);
}
/* Only happens in case of bad format string, which *we* choose. */
assert(written < out_size);
}
/*
* Internal. Emit the given value type in the relevant encoding (so that the
* bool true gets mapped to json "true", but the string "true" gets mapped to
* json "\"true\"", for instance.
*
* Width is ignored if justify is emitter_justify_none.
*/
static inline void
emitter_print_value(emitter_t *emitter, emitter_justify_t justify, int width,
emitter_type_t value_type, const void *value) {
size_t str_written;
#define BUF_SIZE 256
#define FMT_SIZE 10
/*
* We dynamically generate a format string to emit, to let us use the
* snprintf machinery. This is kinda hacky, but gets the job done
* quickly without having to think about the various snprintf edge
* cases.
*/
char fmt[FMT_SIZE];
char buf[BUF_SIZE];
#define EMIT_SIMPLE(type, format) \
emitter_gen_fmt(fmt, FMT_SIZE, format, justify, width); \
emitter_printf(emitter, fmt, *(const type *)value); \
switch (value_type) {
case emitter_type_bool:
emitter_gen_fmt(fmt, FMT_SIZE, "s", justify, width);
emitter_printf(emitter, fmt, *(const bool *)value ?
"true" : "false");
break;
case emitter_type_int:
EMIT_SIMPLE(int, "d")
break;
case emitter_type_unsigned:
EMIT_SIMPLE(unsigned, "u")
break;
case emitter_type_ssize:
EMIT_SIMPLE(ssize_t, "zd")
break;
case emitter_type_size:
EMIT_SIMPLE(size_t, "zu")
break;
case emitter_type_string:
str_written = malloc_snprintf(buf, BUF_SIZE, "\"%s\"",
*(const char *const *)value);
/*
* We control the strings we output; we shouldn't get anything
* anywhere near the fmt size.
*/
assert(str_written < BUF_SIZE);
emitter_gen_fmt(fmt, FMT_SIZE, "s", justify, width);
emitter_printf(emitter, fmt, buf);
break;
case emitter_type_uint32:
EMIT_SIMPLE(uint32_t, FMTu32)
break;
case emitter_type_uint64:
EMIT_SIMPLE(uint64_t, FMTu64)
break;
case emitter_type_title:
EMIT_SIMPLE(char *const, "s");
break;
default:
unreachable();
}
#undef BUF_SIZE
#undef FMT_SIZE
}
/* Internal functions. In json mode, tracks nesting state. */
static inline void
emitter_nest_inc(emitter_t *emitter) {
emitter->nesting_depth++;
emitter->item_at_depth = false;
}
static inline void
emitter_nest_dec(emitter_t *emitter) {
emitter->nesting_depth--;
emitter->item_at_depth = true;
}
static inline void
emitter_indent(emitter_t *emitter) {
int amount = emitter->nesting_depth;
const char *indent_str;
if (emitter->output == emitter_output_json) {
indent_str = "\t";
} else {
amount *= 2;
indent_str = " ";
}
for (int i = 0; i < amount; i++) {
emitter_printf(emitter, "%s", indent_str);
}
}
static inline void
emitter_json_key_prefix(emitter_t *emitter) {
emitter_printf(emitter, "%s\n", emitter->item_at_depth ? "," : "");
emitter_indent(emitter);
}
static inline void
emitter_begin(emitter_t *emitter) {
if (emitter->output == emitter_output_json) {
assert(emitter->nesting_depth == 0);
emitter_printf(emitter, "{");
emitter_nest_inc(emitter);
} else {
// tabular init
emitter_printf(emitter, "%s", "");
}
}
static inline void
emitter_end(emitter_t *emitter) {
if (emitter->output == emitter_output_json) {
assert(emitter->nesting_depth == 1);
emitter_nest_dec(emitter);
emitter_printf(emitter, "\n}\n");
}
}
/*
* The note variant emits an additional kv pair as well, but only in table
* mode; the note is omitted if table_note_key is NULL.
*/
static inline void
emitter_kv_note(emitter_t *emitter, const char *json_key, const char *table_key,
emitter_type_t value_type, const void *value,
const char *table_note_key, emitter_type_t table_note_value_type,
const void *table_note_value) {
if (emitter->output == emitter_output_json) {
assert(emitter->nesting_depth > 0);
emitter_json_key_prefix(emitter);
emitter_printf(emitter, "\"%s\": ", json_key);
emitter_print_value(emitter, emitter_justify_none, -1,
value_type, value);
} else {
emitter_indent(emitter);
emitter_printf(emitter, "%s: ", table_key);
emitter_print_value(emitter, emitter_justify_none, -1,
value_type, value);
if (table_note_key != NULL) {
emitter_printf(emitter, " (%s: ", table_note_key);
emitter_print_value(emitter, emitter_justify_none, -1,
table_note_value_type, table_note_value);
emitter_printf(emitter, ")");
}
emitter_printf(emitter, "\n");
}
emitter->item_at_depth = true;
}
static inline void
emitter_kv(emitter_t *emitter, const char *json_key, const char *table_key,
emitter_type_t value_type, const void *value) {
emitter_kv_note(emitter, json_key, table_key, value_type, value, NULL,
emitter_type_bool, NULL);
}
static inline void
emitter_json_kv(emitter_t *emitter, const char *json_key,
emitter_type_t value_type, const void *value) {
if (emitter->output == emitter_output_json) {
emitter_kv(emitter, json_key, NULL, value_type, value);
}
}
static inline void
emitter_table_kv(emitter_t *emitter, const char *table_key,
emitter_type_t value_type, const void *value) {
if (emitter->output == emitter_output_table) {
emitter_kv(emitter, NULL, table_key, value_type, value);
}
}
static inline void
emitter_dict_begin(emitter_t *emitter, const char *json_key,
const char *table_header) {
if (emitter->output == emitter_output_json) {
emitter_json_key_prefix(emitter);
emitter_printf(emitter, "\"%s\": {", json_key);
emitter_nest_inc(emitter);
} else {
emitter_indent(emitter);
emitter_printf(emitter, "%s\n", table_header);
emitter_nest_inc(emitter);
}
}
static inline void
emitter_dict_end(emitter_t *emitter) {
if (emitter->output == emitter_output_json) {
assert(emitter->nesting_depth > 0);
emitter_nest_dec(emitter);
emitter_printf(emitter, "\n");
emitter_indent(emitter);
emitter_printf(emitter, "}");
} else {
emitter_nest_dec(emitter);
}
}
static inline void
emitter_json_dict_begin(emitter_t *emitter, const char *json_key) {
if (emitter->output == emitter_output_json) {
emitter_dict_begin(emitter, json_key, NULL);
}
}
static inline void
emitter_json_dict_end(emitter_t *emitter) {
if (emitter->output == emitter_output_json) {
emitter_dict_end(emitter);
}
}
static inline void
emitter_table_dict_begin(emitter_t *emitter, const char *table_key) {
if (emitter->output == emitter_output_table) {
emitter_dict_begin(emitter, NULL, table_key);
}
}
static inline void
emitter_table_dict_end(emitter_t *emitter) {
if (emitter->output == emitter_output_table) {
emitter_dict_end(emitter);
}
}
static inline void
emitter_json_arr_begin(emitter_t *emitter, const char *json_key) {
if (emitter->output == emitter_output_json) {
emitter_json_key_prefix(emitter);
emitter_printf(emitter, "\"%s\": [", json_key);
emitter_nest_inc(emitter);
}
}
static inline void
emitter_json_arr_end(emitter_t *emitter) {
if (emitter->output == emitter_output_json) {
assert(emitter->nesting_depth > 0);
emitter_nest_dec(emitter);
emitter_printf(emitter, "\n");
emitter_indent(emitter);
emitter_printf(emitter, "]");
}
}
static inline void
emitter_json_arr_obj_begin(emitter_t *emitter) {
if (emitter->output == emitter_output_json) {
emitter_json_key_prefix(emitter);
emitter_printf(emitter, "{");
emitter_nest_inc(emitter);
}
}
static inline void
emitter_json_arr_obj_end(emitter_t *emitter) {
if (emitter->output == emitter_output_json) {
assert(emitter->nesting_depth > 0);
emitter_nest_dec(emitter);
emitter_printf(emitter, "\n");
emitter_indent(emitter);
emitter_printf(emitter, "}");
}
}
static inline void
emitter_json_arr_value(emitter_t *emitter, emitter_type_t value_type,
const void *value) {
if (emitter->output == emitter_output_json) {
emitter_json_key_prefix(emitter);
emitter_print_value(emitter, emitter_justify_none, -1,
value_type, value);
}
}
static inline void
emitter_table_row(emitter_t *emitter, emitter_row_t *row) {
if (emitter->output != emitter_output_table) {
return;
}
emitter_col_t *col;
ql_foreach(col, &row->cols, link) {
emitter_print_value(emitter, col->justify, col->width,
col->type, (const void *)&col->bool_val);
}
emitter_table_printf(emitter, "\n");
}
#endif /* JEMALLOC_INTERNAL_EMITTER_H */
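To make the JSON/table duality of the emitter concrete, here is a minimal usage sketch (illustrative only, not part of the header; it assumes an emitter_t that was already initialized via the emitter's init routine earlier in this header, for either emitter_output_json or emitter_output_table):

static void
emit_opts_example(emitter_t *emitter) {
	int narenas = 4;
	bool bg_thread = true;

	emitter_begin(emitter);
	emitter_dict_begin(emitter, "opt", "Run-time options:");
	emitter_kv(emitter, "narenas", "narenas", emitter_type_int, &narenas);
	emitter_kv(emitter, "background_thread", "background_thread",
	    emitter_type_bool, &bg_thread);
	emitter_dict_end(emitter);
	emitter_end(emitter);
	/*
	 * JSON mode renders {"opt": {"narenas": 4, "background_thread": true}}
	 * across indented lines; table mode prints "Run-time options:"
	 * followed by indented "narenas: 4" and "background_thread: true".
	 */
}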

View File

@ -4,12 +4,13 @@
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/mutex_pool.h"
#include "jemalloc/internal/ph.h"
#include "jemalloc/internal/rb.h"
#include "jemalloc/internal/rtree.h"
extern rtree_t extents_rtree;
extern const extent_hooks_t extent_hooks_default;
extern mutex_pool_t extent_mutex_pool;
extern size_t opt_lg_extent_max_active_fit;
extern rtree_t extents_rtree;
extern const extent_hooks_t extent_hooks_default;
extern mutex_pool_t extent_mutex_pool;
extent_t *extent_alloc(tsdn_t *tsdn, arena_t *arena);
void extent_dalloc(tsdn_t *tsdn, arena_t *arena, extent_t *extent);

View File

@ -93,6 +93,12 @@ extent_committed_get(const extent_t *extent) {
EXTENT_BITS_COMMITTED_SHIFT);
}
static inline bool
extent_dumpable_get(const extent_t *extent) {
return (bool)((extent->e_bits & EXTENT_BITS_DUMPABLE_MASK) >>
EXTENT_BITS_DUMPABLE_SHIFT);
}
static inline bool
extent_slab_get(const extent_t *extent) {
return (bool)((extent->e_bits & EXTENT_BITS_SLAB_MASK) >>
@ -184,15 +190,22 @@ extent_addr_set(extent_t *extent, void *addr) {
}
static inline void
extent_addr_randomize(tsdn_t *tsdn, extent_t *extent, size_t alignment) {
extent_addr_randomize(UNUSED tsdn_t *tsdn, extent_t *extent, size_t alignment) {
assert(extent_base_get(extent) == extent_addr_get(extent));
if (alignment < PAGE) {
unsigned lg_range = LG_PAGE -
lg_floor(CACHELINE_CEILING(alignment));
size_t r =
prng_lg_range_zu(&extent_arena_get(extent)->offset_state,
lg_range, true);
size_t r;
if (!tsdn_null(tsdn)) {
tsd_t *tsd = tsdn_tsd(tsdn);
r = (size_t)prng_lg_range_u64(
tsd_offset_statep_get(tsd), lg_range);
} else {
r = prng_lg_range_zu(
&extent_arena_get(extent)->offset_state,
lg_range, true);
}
uintptr_t random_offset = ((uintptr_t)r) << (LG_PAGE -
lg_range);
extent->e_addr = (void *)((uintptr_t)extent->e_addr +
@ -269,6 +282,12 @@ extent_committed_set(extent_t *extent, bool committed) {
((uint64_t)committed << EXTENT_BITS_COMMITTED_SHIFT);
}
static inline void
extent_dumpable_set(extent_t *extent, bool dumpable) {
extent->e_bits = (extent->e_bits & ~EXTENT_BITS_DUMPABLE_MASK) |
((uint64_t)dumpable << EXTENT_BITS_DUMPABLE_SHIFT);
}
static inline void
extent_slab_set(extent_t *extent, bool slab) {
extent->e_bits = (extent->e_bits & ~EXTENT_BITS_SLAB_MASK) |
@ -283,7 +302,7 @@ extent_prof_tctx_set(extent_t *extent, prof_tctx_t *tctx) {
static inline void
extent_init(extent_t *extent, arena_t *arena, void *addr, size_t size,
bool slab, szind_t szind, size_t sn, extent_state_t state, bool zeroed,
bool committed) {
bool committed, bool dumpable) {
assert(addr == PAGE_ADDR2BASE(addr) || !slab);
extent_arena_set(extent, arena);
@ -295,6 +314,7 @@ extent_init(extent_t *extent, arena_t *arena, void *addr, size_t size,
extent_state_set(extent, state);
extent_zeroed_set(extent, zeroed);
extent_committed_set(extent, committed);
extent_dumpable_set(extent, dumpable);
ql_elm_new(extent, ql_link);
if (config_prof) {
extent_prof_tctx_set(extent, NULL);
@ -312,6 +332,7 @@ extent_binit(extent_t *extent, void *addr, size_t bsize, size_t sn) {
extent_state_set(extent, extent_state_active);
extent_zeroed_set(extent, true);
extent_committed_set(extent, true);
extent_dumpable_set(extent, true);
}
static inline void
@ -334,6 +355,11 @@ extent_list_append(extent_list_t *list, extent_t *extent) {
ql_tail_insert(list, extent, ql_link);
}
static inline void
extent_list_prepend(extent_list_t *list, extent_t *extent) {
ql_head_insert(list, extent, ql_link);
}
static inline void
extent_list_replace(extent_list_t *list, extent_t *to_remove,
extent_t *to_insert) {

View File

@ -5,7 +5,6 @@
#include "jemalloc/internal/bitmap.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/rb.h"
#include "jemalloc/internal/ph.h"
#include "jemalloc/internal/size_classes.h"
@ -24,13 +23,14 @@ struct extent_s {
* a: arena_ind
* b: slab
* c: committed
* d: dumpable
* z: zeroed
* t: state
* i: szind
* f: nfree
* n: sn
*
* nnnnnnnn ... nnnnnfff fffffffi iiiiiiit tzcbaaaa aaaaaaaa
* nnnnnnnn ... nnnnffff ffffffii iiiiiitt zdcbaaaa aaaaaaaa
*
* arena_ind: Arena from which this extent came, or all 1 bits if
* unassociated.
@ -45,6 +45,23 @@ struct extent_s {
* as on a system that overcommits and satisfies physical
* memory needs on demand via soft page faults.
*
* dumpable: The dumpable flag indicates whether or not we've set the
* memory in question to be dumpable. Note that this
* interacts somewhat subtly with user-specified extent hooks,
* since we don't know if *they* are fiddling with
* dumpability (in which case, we don't want to undo whatever
* they're doing). To deal with this scenario, we:
* - Make dumpable false only for memory allocated with the
* default hooks.
* - Only allow memory to go from non-dumpable to dumpable,
* and only once.
* - Never make the OS call to allow dumping when the
* dumpable bit is already set.
* These three constraints mean that we will never
* accidentally dump user memory that the user meant to set
* nondumpable with their extent hooks.
*
*
* zeroed: The zeroed flag is used by extent recycling code to track
* whether memory is zero-filled.
*
@ -69,38 +86,42 @@ struct extent_s {
* serial number to both resulting adjacent extents.
*/
uint64_t e_bits;
#define EXTENT_BITS_ARENA_SHIFT 0
#define EXTENT_BITS_ARENA_MASK \
(((uint64_t)(1U << MALLOCX_ARENA_BITS) - 1) << EXTENT_BITS_ARENA_SHIFT)
#define MASK(CURRENT_FIELD_WIDTH, CURRENT_FIELD_SHIFT) ((((((uint64_t)0x1U) << (CURRENT_FIELD_WIDTH)) - 1)) << (CURRENT_FIELD_SHIFT))
#define EXTENT_BITS_SLAB_SHIFT MALLOCX_ARENA_BITS
#define EXTENT_BITS_SLAB_MASK \
((uint64_t)0x1U << EXTENT_BITS_SLAB_SHIFT)
#define EXTENT_BITS_ARENA_WIDTH MALLOCX_ARENA_BITS
#define EXTENT_BITS_ARENA_SHIFT 0
#define EXTENT_BITS_ARENA_MASK MASK(EXTENT_BITS_ARENA_WIDTH, EXTENT_BITS_ARENA_SHIFT)
#define EXTENT_BITS_COMMITTED_SHIFT (MALLOCX_ARENA_BITS + 1)
#define EXTENT_BITS_COMMITTED_MASK \
((uint64_t)0x1U << EXTENT_BITS_COMMITTED_SHIFT)
#define EXTENT_BITS_SLAB_WIDTH 1
#define EXTENT_BITS_SLAB_SHIFT (EXTENT_BITS_ARENA_WIDTH + EXTENT_BITS_ARENA_SHIFT)
#define EXTENT_BITS_SLAB_MASK MASK(EXTENT_BITS_SLAB_WIDTH, EXTENT_BITS_SLAB_SHIFT)
#define EXTENT_BITS_ZEROED_SHIFT (MALLOCX_ARENA_BITS + 2)
#define EXTENT_BITS_ZEROED_MASK \
((uint64_t)0x1U << EXTENT_BITS_ZEROED_SHIFT)
#define EXTENT_BITS_COMMITTED_WIDTH 1
#define EXTENT_BITS_COMMITTED_SHIFT (EXTENT_BITS_SLAB_WIDTH + EXTENT_BITS_SLAB_SHIFT)
#define EXTENT_BITS_COMMITTED_MASK MASK(EXTENT_BITS_COMMITTED_WIDTH, EXTENT_BITS_COMMITTED_SHIFT)
#define EXTENT_BITS_STATE_SHIFT (MALLOCX_ARENA_BITS + 3)
#define EXTENT_BITS_STATE_MASK \
((uint64_t)0x3U << EXTENT_BITS_STATE_SHIFT)
#define EXTENT_BITS_DUMPABLE_WIDTH 1
#define EXTENT_BITS_DUMPABLE_SHIFT (EXTENT_BITS_COMMITTED_WIDTH + EXTENT_BITS_COMMITTED_SHIFT)
#define EXTENT_BITS_DUMPABLE_MASK MASK(EXTENT_BITS_DUMPABLE_WIDTH, EXTENT_BITS_DUMPABLE_SHIFT)
#define EXTENT_BITS_SZIND_SHIFT (MALLOCX_ARENA_BITS + 5)
#define EXTENT_BITS_SZIND_MASK \
(((uint64_t)(1U << LG_CEIL_NSIZES) - 1) << EXTENT_BITS_SZIND_SHIFT)
#define EXTENT_BITS_ZEROED_WIDTH 1
#define EXTENT_BITS_ZEROED_SHIFT (EXTENT_BITS_DUMPABLE_WIDTH + EXTENT_BITS_DUMPABLE_SHIFT)
#define EXTENT_BITS_ZEROED_MASK MASK(EXTENT_BITS_ZEROED_WIDTH, EXTENT_BITS_ZEROED_SHIFT)
#define EXTENT_BITS_NFREE_SHIFT \
(MALLOCX_ARENA_BITS + 5 + LG_CEIL_NSIZES)
#define EXTENT_BITS_NFREE_MASK \
((uint64_t)((1U << (LG_SLAB_MAXREGS + 1)) - 1) << EXTENT_BITS_NFREE_SHIFT)
#define EXTENT_BITS_STATE_WIDTH 2
#define EXTENT_BITS_STATE_SHIFT (EXTENT_BITS_ZEROED_WIDTH + EXTENT_BITS_ZEROED_SHIFT)
#define EXTENT_BITS_STATE_MASK MASK(EXTENT_BITS_STATE_WIDTH, EXTENT_BITS_STATE_SHIFT)
#define EXTENT_BITS_SN_SHIFT \
(MALLOCX_ARENA_BITS + 5 + LG_CEIL_NSIZES + (LG_SLAB_MAXREGS + 1))
#define EXTENT_BITS_SN_MASK (UINT64_MAX << EXTENT_BITS_SN_SHIFT)
#define EXTENT_BITS_SZIND_WIDTH LG_CEIL_NSIZES
#define EXTENT_BITS_SZIND_SHIFT (EXTENT_BITS_STATE_WIDTH + EXTENT_BITS_STATE_SHIFT)
#define EXTENT_BITS_SZIND_MASK MASK(EXTENT_BITS_SZIND_WIDTH, EXTENT_BITS_SZIND_SHIFT)
#define EXTENT_BITS_NFREE_WIDTH (LG_SLAB_MAXREGS + 1)
#define EXTENT_BITS_NFREE_SHIFT (EXTENT_BITS_SZIND_WIDTH + EXTENT_BITS_SZIND_SHIFT)
#define EXTENT_BITS_NFREE_MASK MASK(EXTENT_BITS_NFREE_WIDTH, EXTENT_BITS_NFREE_SHIFT)
#define EXTENT_BITS_SN_SHIFT (EXTENT_BITS_NFREE_WIDTH + EXTENT_BITS_NFREE_SHIFT)
#define EXTENT_BITS_SN_MASK (UINT64_MAX << EXTENT_BITS_SN_SHIFT)
/* Pointer to the extent that this structure is responsible for. */
void *e_addr;
@ -120,20 +141,19 @@ struct extent_s {
size_t e_bsize;
};
union {
/*
* List linkage, used by a variety of lists:
* - arena_bin_t's slabs_full
* - extents_t's LRU
* - stashed dirty extents
* - arena's large allocations
*/
ql_elm(extent_t) ql_link;
/* Red-black tree linkage, used by arena's extent_avail. */
rb_node(extent_t) rb_link;
};
/*
* List linkage, used by a variety of lists:
* - bin_t's slabs_full
* - extents_t's LRU
* - stashed dirty extents
* - arena's large allocations
*/
ql_elm(extent_t) ql_link;
/* Linkage for per size class sn/address-ordered heaps. */
/*
* Linkage for per size class sn/address-ordered heaps, and
* for extent_avail
*/
phn(extent_t) ph_link;
union {
@ -148,7 +168,7 @@ struct extent_s {
};
};
typedef ql_head(extent_t) extent_list_t;
typedef rb_tree(extent_t) extent_tree_t;
typedef ph(extent_t) extent_tree_t;
typedef ph(extent_t) extent_heap_t;
/* Quantized collection of extents, with built-in LRU queue. */
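To make the reworked e_bits packing easier to follow, here is a worked decoding of the low bits (illustrative; MALLOCX_ARENA_BITS == 12 is an assumption, since its definition is not part of this hunk):

/*
 * MASK(width, shift) builds a contiguous bit mask, e.g.
 * MASK(2, 3) == ((((uint64_t)0x1U << 2) - 1) << 3) == 0x18.
 * Assuming MALLOCX_ARENA_BITS == 12, e_bits decodes from the bottom up as:
 *   bits  0-11 : arena_ind (a)
 *   bit     12 : slab      (b)
 *   bit     13 : committed (c)
 *   bit     14 : dumpable  (d)
 *   bit     15 : zeroed    (z)
 *   bits 16-17 : state     (t)
 * with szind, nfree, and the serial number packed above them, matching the
 * "... zdcbaaaa aaaaaaaa" layout string in the struct comment.
 */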

View File

@ -6,4 +6,12 @@ typedef struct extents_s extents_t;
#define EXTENT_HOOKS_INITIALIZER NULL
#define EXTENT_GROW_MAX_PIND (NPSIZES - 1)
/*
* When reusing (and splitting) an active extent, (1U << opt_lg_extent_max_active_fit)
* is the max ratio between the size of the active extent and the new extent.
*/
#define LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT 6
#endif /* JEMALLOC_INTERNAL_EXTENT_TYPES_H */
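A quick worked example of this ratio (illustrative only):

/*
 * With opt_lg_extent_max_active_fit at its default of 6, the cutoff ratio is
 * (1U << 6) == 64, so a 4 KiB request will not split an active extent larger
 * than 64 * 4 KiB == 256 KiB; larger active extents are not selected for
 * that allocation.
 */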

View File

@ -260,22 +260,22 @@ hash_x64_128(const void *key, const int len, const uint32_t seed,
uint64_t k2 = 0;
switch (len & 15) {
case 15: k2 ^= ((uint64_t)(tail[14])) << 48;
case 14: k2 ^= ((uint64_t)(tail[13])) << 40;
case 13: k2 ^= ((uint64_t)(tail[12])) << 32;
case 12: k2 ^= ((uint64_t)(tail[11])) << 24;
case 11: k2 ^= ((uint64_t)(tail[10])) << 16;
case 10: k2 ^= ((uint64_t)(tail[ 9])) << 8;
case 15: k2 ^= ((uint64_t)(tail[14])) << 48; /* falls through */
case 14: k2 ^= ((uint64_t)(tail[13])) << 40; /* falls through */
case 13: k2 ^= ((uint64_t)(tail[12])) << 32; /* falls through */
case 12: k2 ^= ((uint64_t)(tail[11])) << 24; /* falls through */
case 11: k2 ^= ((uint64_t)(tail[10])) << 16; /* falls through */
case 10: k2 ^= ((uint64_t)(tail[ 9])) << 8; /* falls through */
case 9: k2 ^= ((uint64_t)(tail[ 8])) << 0;
k2 *= c2; k2 = hash_rotl_64(k2, 33); k2 *= c1; h2 ^= k2;
case 8: k1 ^= ((uint64_t)(tail[ 7])) << 56;
case 7: k1 ^= ((uint64_t)(tail[ 6])) << 48;
case 6: k1 ^= ((uint64_t)(tail[ 5])) << 40;
case 5: k1 ^= ((uint64_t)(tail[ 4])) << 32;
case 4: k1 ^= ((uint64_t)(tail[ 3])) << 24;
case 3: k1 ^= ((uint64_t)(tail[ 2])) << 16;
case 2: k1 ^= ((uint64_t)(tail[ 1])) << 8;
/* falls through */
case 8: k1 ^= ((uint64_t)(tail[ 7])) << 56; /* falls through */
case 7: k1 ^= ((uint64_t)(tail[ 6])) << 48; /* falls through */
case 6: k1 ^= ((uint64_t)(tail[ 5])) << 40; /* falls through */
case 5: k1 ^= ((uint64_t)(tail[ 4])) << 32; /* falls through */
case 4: k1 ^= ((uint64_t)(tail[ 3])) << 24; /* falls through */
case 3: k1 ^= ((uint64_t)(tail[ 2])) << 16; /* falls through */
case 2: k1 ^= ((uint64_t)(tail[ 1])) << 8; /* falls through */
case 1: k1 ^= ((uint64_t)(tail[ 0])) << 0;
k1 *= c1; k1 = hash_rotl_64(k1, 31); k1 *= c2; h1 ^= k1;
}

View File

@ -8,7 +8,16 @@
#ifdef _WIN32
# include <windows.h>
# include "msvc_compat/windows_extra.h"
# ifdef _WIN64
# if LG_VADDR <= 32
# error Generate the headers using x64 vcargs
# endif
# else
# if LG_VADDR > 32
# undef LG_VADDR
# define LG_VADDR 32
# endif
# endif
#else
# include <sys/param.h>
# include <sys/mman.h>

View File

@ -34,6 +34,8 @@
* order to yield to another virtual CPU.
*/
#define CPU_SPINWAIT __asm__ volatile("pause")
/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */
#define HAVE_CPU_SPINWAIT 1
/*
* Number of significant bits in virtual addresses. This may be less than the
@ -238,6 +240,12 @@
*/
#define JEMALLOC_CACHE_OBLIVIOUS
/*
* If defined, enable logging facilities. We make this a configure option to
* avoid taking extra branches everywhere.
*/
/* #undef JEMALLOC_LOG */
/*
* Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings.
*/
@ -255,6 +263,12 @@
/* Defined if madvise(2) is available. */
#define JEMALLOC_HAVE_MADVISE
/*
* Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE
* arguments to madvise(2).
*/
/* #undef JEMALLOC_HAVE_MADVISE_HUGE */
/*
* Methods for purging unused pages differ between operating systems.
*
@ -272,6 +286,14 @@
#define JEMALLOC_PURGE_MADVISE_DONTNEED
/* #undef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS */
/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */
/* #undef JEMALLOC_DEFINE_MADVISE_FREE */
/*
* Defined if MADV_DO[NT]DUMP is supported as an argument to madvise.
*/
/* #undef JEMALLOC_MADVISE_DONTDUMP */
/*
* Defined if transparent huge pages (THPs) are supported via the
* MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled.
@ -337,4 +359,9 @@
/* If defined, jemalloc takes the malloc/free/etc. symbol names. */
#define JEMALLOC_IS_MALLOC 1
/*
* Defined if strerror_r returns char * if _GNU_SOURCE is defined.
*/
/* #undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE */
#endif /* JEMALLOC_INTERNAL_DEFS_H_ */

View File

@ -106,16 +106,16 @@ decay_ticker_get(tsd_t *tsd, unsigned ind) {
return &tdata->decay_ticker;
}
JEMALLOC_ALWAYS_INLINE tcache_bin_t *
JEMALLOC_ALWAYS_INLINE cache_bin_t *
tcache_small_bin_get(tcache_t *tcache, szind_t binind) {
assert(binind < NBINS);
return &tcache->tbins_small[binind];
return &tcache->bins_small[binind];
}
JEMALLOC_ALWAYS_INLINE tcache_bin_t *
JEMALLOC_ALWAYS_INLINE cache_bin_t *
tcache_large_bin_get(tcache_t *tcache, szind_t binind) {
assert(binind >= NBINS && binind < nhbins);
return &tcache->tbins_large[binind - NBINS];
return &tcache->bins_large[binind - NBINS];
}
JEMALLOC_ALWAYS_INLINE bool
@ -151,6 +151,7 @@ pre_reentrancy(tsd_t *tsd, arena_t *arena) {
assert(arena != arena_get(tsd_tsdn(tsd), 0, false));
bool fast = tsd_fast(tsd);
assert(tsd_reentrancy_level_get(tsd) < INT8_MAX);
++*tsd_reentrancy_levelp_get(tsd);
if (fast) {
/* Prepare slow path for reentrancy. */

View File

@ -5,6 +5,24 @@
#include "jemalloc/internal/sz.h"
#include "jemalloc/internal/witness.h"
/*
* Translating the names of the 'i' functions:
* Abbreviations used in the first part of the function name (before
* alloc/dalloc) describe what that function accomplishes:
* a: arena (query)
* s: size (query, or sized deallocation)
* e: extent (query)
* p: aligned (allocates)
* vs: size (query, without knowing that the pointer is into the heap)
* r: rallocx implementation
* x: xallocx implementation
* Abbreviations used in the second part of the function name (after
* alloc/dalloc) describe the arguments it takes
* z: whether to return zeroed memory
* t: accepts a tcache_t * parameter
* m: accepts an arena_t * parameter
*/
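/*
 * A few illustrative decodings under this scheme (the names are assumed to
 * be ones defined in this header):
 *   iallocztm  -> allocation taking "zero", tcache_t *, and arena_t * args.
 *   ipallocztm -> aligned ("p") allocation with the same z/t/m arguments.
 *   isdalloct  -> sized ("s") deallocation taking a tcache_t * ("t").
 */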
JEMALLOC_ALWAYS_INLINE arena_t *
iaalloc(tsdn_t *tsdn, const void *ptr) {
assert(ptr != NULL);
@ -27,8 +45,10 @@ iallocztm(tsdn_t *tsdn, size_t size, szind_t ind, bool zero, tcache_t *tcache,
assert(size != 0);
assert(!is_internal || tcache == NULL);
assert(!is_internal || arena == NULL || arena_is_auto(arena));
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 0);
if (!tsdn_null(tsdn) && tsd_reentrancy_level_get(tsdn_tsd(tsdn)) == 0) {
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 0);
}
ret = arena_malloc(tsdn, arena, size, ind, zero, tcache, slow_path);
if (config_stats && is_internal && likely(ret != NULL)) {
@ -91,7 +111,7 @@ idalloctm(tsdn_t *tsdn, void *ptr, tcache_t *tcache, alloc_ctx_t *alloc_ctx,
if (config_stats && is_internal) {
arena_internal_sub(iaalloc(tsdn, ptr), isalloc(tsdn, ptr));
}
if (!is_internal && !tsdn_null(tsdn) &&
if (!is_internal && !tsdn_null(tsdn) &&
tsd_reentrancy_level_get(tsdn_tsd(tsdn)) != 0) {
assert(tcache == NULL);
}

View File

@ -37,4 +37,7 @@
# define JET_MUTABLE const
#endif
#define JEMALLOC_VA_ARGS_HEAD(head, ...) head
#define JEMALLOC_VA_ARGS_TAIL(head, ...) __VA_ARGS__
#endif /* JEMALLOC_INTERNAL_MACROS_H */

View File

@ -79,22 +79,29 @@ typedef int malloc_cpuid_t;
# ifdef __hppa__
# define LG_QUANTUM 4
# endif
# ifdef __m68k__
# define LG_QUANTUM 3
# endif
# ifdef __mips__
# define LG_QUANTUM 3
# endif
# ifdef __nios2__
# define LG_QUANTUM 3
# endif
# ifdef __or1k__
# define LG_QUANTUM 3
# endif
# ifdef __powerpc__
# define LG_QUANTUM 4
# endif
# ifdef __riscv
# if defined(__riscv) || defined(__riscv__)
# define LG_QUANTUM 4
# endif
# ifdef __s390__
# define LG_QUANTUM 4
# endif
# ifdef __SH4__
# if (defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || \
defined(__SH4_SINGLE_ONLY__))
# define LG_QUANTUM 4
# endif
# ifdef __tile__

View File

@ -50,6 +50,10 @@
#endif
#include "jemalloc/internal/hooks.h"
#ifdef JEMALLOC_DEFINE_MADVISE_FREE
# define JEMALLOC_MADV_FREE 8
#endif
static const bool config_debug =
#ifdef JEMALLOC_DEBUG
true
@ -64,6 +68,13 @@ static const bool have_dss =
false
#endif
;
static const bool have_madvise_huge =
#ifdef JEMALLOC_HAVE_MADVISE_HUGE
true
#else
false
#endif
;
static const bool config_fill =
#ifdef JEMALLOC_FILL
true
@ -108,13 +119,6 @@ static const bool config_stats =
false
#endif
;
static const bool config_thp =
#ifdef JEMALLOC_THP
true
#else
false
#endif
;
static const bool config_tls =
#ifdef JEMALLOC_TLS
true
@ -143,6 +147,17 @@ static const bool config_cache_oblivious =
false
#endif
;
/*
* Undocumented, for jemalloc development use only at the moment. See the note
* in jemalloc/internal/log.h.
*/
static const bool config_log =
#ifdef JEMALLOC_LOG
true
#else
false
#endif
;
#ifdef JEMALLOC_HAVE_SCHED_GETCPU
/* Currently percpu_arena depends on sched_getcpu. */
#define JEMALLOC_PERCPU_ARENA

View File

@ -0,0 +1,115 @@
#ifndef JEMALLOC_INTERNAL_LOG_H
#define JEMALLOC_INTERNAL_LOG_H
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/malloc_io.h"
#include "jemalloc/internal/mutex.h"
#ifdef JEMALLOC_LOG
# define JEMALLOC_LOG_VAR_BUFSIZE 1000
#else
# define JEMALLOC_LOG_VAR_BUFSIZE 1
#endif
#define JEMALLOC_LOG_BUFSIZE 4096
/*
* The log malloc_conf option is a '|'-delimited list of log_var name segments
* which should be logged. The names are themselves hierarchical, with '.' as
* the delimiter (a "segment" is just a prefix in the log namespace). So, if
* you have:
*
* log("arena", "log msg for arena"); // 1
* log("arena.a", "log msg for arena.a"); // 2
* log("arena.b", "log msg for arena.b"); // 3
* log("arena.a.a", "log msg for arena.a.a"); // 4
* log("extent.a", "log msg for extent.a"); // 5
* log("extent.b", "log msg for extent.b"); // 6
*
* And your malloc_conf option is "log=arena.a|extent", then lines 2, 4, 5, and
* 6 will print at runtime. You can enable logging from all log vars by
* writing "log=.".
*
* None of this should be regarded as a stable API for right now. It's intended
* as a debugging interface, to let us keep around some of our printf-debugging
* statements.
*/
extern char log_var_names[JEMALLOC_LOG_VAR_BUFSIZE];
extern atomic_b_t log_init_done;
typedef struct log_var_s log_var_t;
struct log_var_s {
/*
* Lowest bit is "inited", second lowest is "enabled". Putting them in
* a single word lets us avoid any fences on weak architectures.
*/
atomic_u_t state;
const char *name;
};
#define LOG_NOT_INITIALIZED 0U
#define LOG_INITIALIZED_NOT_ENABLED 1U
#define LOG_ENABLED 2U
#define LOG_VAR_INIT(name_str) {ATOMIC_INIT(LOG_NOT_INITIALIZED), name_str}
/*
* Returns the value we should assume for state (which is not necessarily
* accurate; if logging is done before logging has finished initializing, then
* we default to doing the safe thing by logging everything).
*/
unsigned log_var_update_state(log_var_t *log_var);
/* We factor out the metadata management to allow us to test more easily. */
#define log_do_begin(log_var) \
if (config_log) { \
unsigned log_state = atomic_load_u(&(log_var).state, \
ATOMIC_RELAXED); \
if (unlikely(log_state == LOG_NOT_INITIALIZED)) { \
log_state = log_var_update_state(&(log_var)); \
assert(log_state != LOG_NOT_INITIALIZED); \
} \
if (log_state == LOG_ENABLED) { \
{
/* User code executes here. */
#define log_do_end(log_var) \
} \
} \
}
/*
* MSVC has some preprocessor bugs in its expansion of __VA_ARGS__ during
* preprocessing. To work around this, we take all potential extra arguments in
* a var-args function. Since a varargs macro needs at least one argument in
* the "...", we accept the format string there, and require that the first
* argument in this "..." is a const char *.
*/
static inline void
log_impl_varargs(const char *name, ...) {
char buf[JEMALLOC_LOG_BUFSIZE];
va_list ap;
va_start(ap, name);
const char *format = va_arg(ap, const char *);
size_t dst_offset = 0;
dst_offset += malloc_snprintf(buf, JEMALLOC_LOG_BUFSIZE, "%s: ", name);
dst_offset += malloc_vsnprintf(buf + dst_offset,
JEMALLOC_LOG_BUFSIZE - dst_offset, format, ap);
dst_offset += malloc_snprintf(buf + dst_offset,
JEMALLOC_LOG_BUFSIZE - dst_offset, "\n");
va_end(ap);
malloc_write(buf);
}
/* Call as log("log.var.str", "format_string %d", arg_for_format_string); */
#define LOG(log_var_str, ...) \
do { \
static log_var_t log_var = LOG_VAR_INIT(log_var_str); \
log_do_begin(log_var) \
log_impl_varargs((log_var).name, __VA_ARGS__); \
log_do_end(log_var) \
} while (0)
#endif /* JEMALLOC_INTERNAL_LOG_H */
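A minimal usage sketch for this facility (illustrative; it assumes a build with JEMALLOC_LOG defined, and "extent.split" is just an example log var name, not one the library necessarily defines):

static void
log_example(void *ptr, size_t size) {
	/* The first argument after the name is the format string. */
	LOG("extent.split", "splitting extent %p (size %zu)", ptr, size);
}
/*
 * Running with MALLOC_CONF="log=extent" (or "log=extent.split", or "log=."
 * for everything) makes the call above write a line prefixed with
 * "extent.split: " via malloc_write(); otherwise it is effectively a no-op.
 */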

View File

@ -53,10 +53,50 @@ size_t malloc_vsnprintf(char *str, size_t size, const char *format,
va_list ap);
size_t malloc_snprintf(char *str, size_t size, const char *format, ...)
JEMALLOC_FORMAT_PRINTF(3, 4);
/*
* The caller can set write_cb and cbopaque to null to choose to print with the
* je_malloc_message hook.
*/
void malloc_vcprintf(void (*write_cb)(void *, const char *), void *cbopaque,
const char *format, va_list ap);
void malloc_cprintf(void (*write_cb)(void *, const char *), void *cbopaque,
const char *format, ...) JEMALLOC_FORMAT_PRINTF(3, 4);
void malloc_printf(const char *format, ...) JEMALLOC_FORMAT_PRINTF(1, 2);
static inline ssize_t
malloc_write_fd(int fd, const void *buf, size_t count) {
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_write)
/*
* Use syscall(2) rather than write(2) when possible in order to avoid
* the possibility of memory allocation within libc. This is necessary
* on FreeBSD; most operating systems do not have this problem though.
*
* syscall() returns long or int, depending on platform, so capture the
* result in the widest plausible type to avoid compiler warnings.
*/
long result = syscall(SYS_write, fd, buf, count);
#else
ssize_t result = (ssize_t)write(fd, buf,
#ifdef _WIN32
(unsigned int)
#endif
count);
#endif
return (ssize_t)result;
}
static inline ssize_t
malloc_read_fd(int fd, void *buf, size_t count) {
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_read)
long result = syscall(SYS_read, fd, buf, count);
#else
ssize_t result = read(fd, buf,
#ifdef _WIN32
(unsigned int)
#endif
count);
#endif
return (ssize_t)result;
}
#endif /* JEMALLOC_INTERNAL_MALLOC_IO_H */
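A small usage sketch (illustrative only): these raw-fd helpers are intended for diagnostic output paths where going through stdio could itself allocate:

static void
write_diag_example(void) {
	static const char msg[] = "<jemalloc>: example diagnostic\n";
	/*
	 * Best-effort raw write to fd 2 (stderr); the return value can be
	 * checked, but callers typically cannot do much on failure.
	 */
	malloc_write_fd(2, msg, sizeof(msg) - 1);
}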

View File

@ -35,21 +35,34 @@ typedef enum {
mutex_prof_num_arena_mutexes
} mutex_prof_arena_ind_t;
#define MUTEX_PROF_COUNTERS \
OP(num_ops, uint64_t) \
OP(num_wait, uint64_t) \
OP(num_spin_acq, uint64_t) \
OP(num_owner_switch, uint64_t) \
OP(total_wait_time, uint64_t) \
OP(max_wait_time, uint64_t) \
OP(max_num_thds, uint32_t)
#define MUTEX_PROF_UINT64_COUNTERS \
OP(num_ops, uint64_t, "n_lock_ops") \
OP(num_wait, uint64_t, "n_waiting") \
OP(num_spin_acq, uint64_t, "n_spin_acq") \
OP(num_owner_switch, uint64_t, "n_owner_switch") \
OP(total_wait_time, uint64_t, "total_wait_ns") \
OP(max_wait_time, uint64_t, "max_wait_ns")
typedef enum {
#define OP(counter, type) mutex_counter_##counter,
MUTEX_PROF_COUNTERS
#define MUTEX_PROF_UINT32_COUNTERS \
OP(max_num_thds, uint32_t, "max_n_thds")
#define MUTEX_PROF_COUNTERS \
MUTEX_PROF_UINT64_COUNTERS \
MUTEX_PROF_UINT32_COUNTERS
#define OP(counter, type, human) mutex_counter_##counter,
#define COUNTER_ENUM(counter_list, t) \
typedef enum { \
counter_list \
mutex_prof_num_##t##_counters \
} mutex_prof_##t##_counter_ind_t;
COUNTER_ENUM(MUTEX_PROF_UINT64_COUNTERS, uint64_t)
COUNTER_ENUM(MUTEX_PROF_UINT32_COUNTERS, uint32_t)
#undef COUNTER_ENUM
#undef OP
mutex_prof_num_counters
} mutex_prof_counter_ind_t;
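The third OP() column added here is a human-readable label, which lets a consumer generate both storage and printable output from one list. A hypothetical, self-contained sketch of the X-macro pattern (not jemalloc's actual stats code; example_counters_t and example_counters_print are made up for illustration):

typedef struct {
#define OP(counter, type, human)	type counter;
	MUTEX_PROF_COUNTERS
#undef OP
} example_counters_t;

static void
example_counters_print(const example_counters_t *c) {
#define OP(counter, type, human)					\
	malloc_printf("%s: %"FMTu64"\n", human, (uint64_t)c->counter);
	MUTEX_PROF_COUNTERS
#undef OP
}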
typedef struct {
/*

View File

@ -58,6 +58,20 @@ static const bool pages_can_purge_forced =
#endif
;
typedef enum {
thp_mode_default = 0, /* Do not change hugepage settings. */
thp_mode_always = 1, /* Always set MADV_HUGEPAGE. */
thp_mode_never = 2, /* Always set MADV_NOHUGEPAGE. */
thp_mode_names_limit = 3, /* Used for option processing. */
thp_mode_not_supported = 3 /* No THP support detected. */
} thp_mode_t;
#define THP_MODE_DEFAULT thp_mode_default
extern thp_mode_t opt_thp;
extern thp_mode_t init_system_thp_mode; /* Initial system wide state. */
extern const char *thp_mode_names[];
void *pages_map(void *addr, size_t size, size_t alignment, bool *commit);
void pages_unmap(void *addr, size_t size);
bool pages_commit(void *addr, size_t size);
@ -66,6 +80,9 @@ bool pages_purge_lazy(void *addr, size_t size);
bool pages_purge_forced(void *addr, size_t size);
bool pages_huge(void *addr, size_t size);
bool pages_nohuge(void *addr, size_t size);
bool pages_dontdump(void *addr, size_t size);
bool pages_dodump(void *addr, size_t size);
bool pages_boot(void);
void pages_set_thp_state (void *ptr, size_t size);
#endif /* JEMALLOC_INTERNAL_PAGES_EXTERNS_H */
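For context, opt_thp above is driven by the usual option machinery; a hedged example of selecting a mode (the option name "thp" mirroring opt_thp, and the value strings matching thp_mode_names, are assumptions):

/* Illustrative: disable transparent huge pages for jemalloc's mappings. */
const char *malloc_conf = "thp:never";	/* i.e., always set MADV_NOHUGEPAGE */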

View File

@ -30,7 +30,6 @@
#define opt_zero JEMALLOC_N(opt_zero)
#define arena_alloc_junk_small JEMALLOC_N(arena_alloc_junk_small)
#define arena_basic_stats_merge JEMALLOC_N(arena_basic_stats_merge)
#define arena_bin_info JEMALLOC_N(arena_bin_info)
#define arena_boot JEMALLOC_N(arena_boot)
#define arena_dalloc_bin_junked_locked JEMALLOC_N(arena_dalloc_bin_junked_locked)
#define arena_dalloc_junk_small JEMALLOC_N(arena_dalloc_junk_small)
@ -74,8 +73,7 @@
#define arena_ralloc JEMALLOC_N(arena_ralloc)
#define arena_ralloc_no_move JEMALLOC_N(arena_ralloc_no_move)
#define arena_reset JEMALLOC_N(arena_reset)
#define arena_stats_large_nrequests_add JEMALLOC_N(arena_stats_large_nrequests_add)
#define arena_stats_mapped_add JEMALLOC_N(arena_stats_mapped_add)
#define arena_retain_grow_limit_get_set JEMALLOC_N(arena_retain_grow_limit_get_set)
#define arena_stats_merge JEMALLOC_N(arena_stats_merge)
#define arena_tcache_fill_small JEMALLOC_N(arena_tcache_fill_small)
#define h_steps JEMALLOC_N(h_steps)
@ -99,8 +97,10 @@
#define background_threads_disable JEMALLOC_N(background_threads_disable)
#define background_threads_enable JEMALLOC_N(background_threads_enable)
#define can_enable_background_thread JEMALLOC_N(can_enable_background_thread)
#define max_background_threads JEMALLOC_N(max_background_threads)
#define n_background_threads JEMALLOC_N(n_background_threads)
#define opt_background_thread JEMALLOC_N(opt_background_thread)
#define opt_max_background_threads JEMALLOC_N(opt_max_background_threads)
#define pthread_create_wrapper JEMALLOC_N(pthread_create_wrapper)
#define b0get JEMALLOC_N(b0get)
#define base_alloc JEMALLOC_N(base_alloc)
@ -114,6 +114,13 @@
#define base_postfork_parent JEMALLOC_N(base_postfork_parent)
#define base_prefork JEMALLOC_N(base_prefork)
#define base_stats_get JEMALLOC_N(base_stats_get)
#define metadata_thp_mode_names JEMALLOC_N(metadata_thp_mode_names)
#define opt_metadata_thp JEMALLOC_N(opt_metadata_thp)
#define bin_infos JEMALLOC_N(bin_infos)
#define bin_init JEMALLOC_N(bin_init)
#define bin_postfork_child JEMALLOC_N(bin_postfork_child)
#define bin_postfork_parent JEMALLOC_N(bin_postfork_parent)
#define bin_prefork JEMALLOC_N(bin_prefork)
#define bitmap_info_init JEMALLOC_N(bitmap_info_init)
#define bitmap_init JEMALLOC_N(bitmap_init)
#define bitmap_size JEMALLOC_N(bitmap_size)
@ -135,27 +142,17 @@
#define ctl_postfork_child JEMALLOC_N(ctl_postfork_child)
#define ctl_postfork_parent JEMALLOC_N(ctl_postfork_parent)
#define ctl_prefork JEMALLOC_N(ctl_prefork)
#define div_init JEMALLOC_N(div_init)
#define extent_alloc JEMALLOC_N(extent_alloc)
#define extent_alloc_wrapper JEMALLOC_N(extent_alloc_wrapper)
#define extent_avail_destroy JEMALLOC_N(extent_avail_destroy)
#define extent_avail_destroy_recurse JEMALLOC_N(extent_avail_destroy_recurse)
#define extent_avail_any JEMALLOC_N(extent_avail_any)
#define extent_avail_empty JEMALLOC_N(extent_avail_empty)
#define extent_avail_first JEMALLOC_N(extent_avail_first)
#define extent_avail_insert JEMALLOC_N(extent_avail_insert)
#define extent_avail_iter JEMALLOC_N(extent_avail_iter)
#define extent_avail_iter_recurse JEMALLOC_N(extent_avail_iter_recurse)
#define extent_avail_iter_start JEMALLOC_N(extent_avail_iter_start)
#define extent_avail_last JEMALLOC_N(extent_avail_last)
#define extent_avail_new JEMALLOC_N(extent_avail_new)
#define extent_avail_next JEMALLOC_N(extent_avail_next)
#define extent_avail_nsearch JEMALLOC_N(extent_avail_nsearch)
#define extent_avail_prev JEMALLOC_N(extent_avail_prev)
#define extent_avail_psearch JEMALLOC_N(extent_avail_psearch)
#define extent_avail_remove JEMALLOC_N(extent_avail_remove)
#define extent_avail_reverse_iter JEMALLOC_N(extent_avail_reverse_iter)
#define extent_avail_reverse_iter_recurse JEMALLOC_N(extent_avail_reverse_iter_recurse)
#define extent_avail_reverse_iter_start JEMALLOC_N(extent_avail_reverse_iter_start)
#define extent_avail_search JEMALLOC_N(extent_avail_search)
#define extent_avail_remove_any JEMALLOC_N(extent_avail_remove_any)
#define extent_avail_remove_first JEMALLOC_N(extent_avail_remove_first)
#define extent_boot JEMALLOC_N(extent_boot)
#define extent_commit_wrapper JEMALLOC_N(extent_commit_wrapper)
#define extent_dalloc JEMALLOC_N(extent_dalloc)
@ -189,6 +186,7 @@
#define extents_prefork JEMALLOC_N(extents_prefork)
#define extents_rtree JEMALLOC_N(extents_rtree)
#define extents_state_get JEMALLOC_N(extents_state_get)
#define opt_lg_extent_max_active_fit JEMALLOC_N(opt_lg_extent_max_active_fit)
#define dss_prec_names JEMALLOC_N(dss_prec_names)
#define extent_alloc_dss JEMALLOC_N(extent_alloc_dss)
#define extent_dss_boot JEMALLOC_N(extent_dss_boot)
@ -215,6 +213,9 @@
#define large_ralloc JEMALLOC_N(large_ralloc)
#define large_ralloc_no_move JEMALLOC_N(large_ralloc_no_move)
#define large_salloc JEMALLOC_N(large_salloc)
#define log_init_done JEMALLOC_N(log_init_done)
#define log_var_names JEMALLOC_N(log_var_names)
#define log_var_update_state JEMALLOC_N(log_var_update_state)
#define buferror JEMALLOC_N(buferror)
#define malloc_cprintf JEMALLOC_N(malloc_cprintf)
#define malloc_printf JEMALLOC_N(malloc_printf)
@ -248,15 +249,21 @@
#define nstime_sec JEMALLOC_N(nstime_sec)
#define nstime_subtract JEMALLOC_N(nstime_subtract)
#define nstime_update JEMALLOC_N(nstime_update)
#define init_system_thp_mode JEMALLOC_N(init_system_thp_mode)
#define opt_thp JEMALLOC_N(opt_thp)
#define pages_boot JEMALLOC_N(pages_boot)
#define pages_commit JEMALLOC_N(pages_commit)
#define pages_decommit JEMALLOC_N(pages_decommit)
#define pages_dodump JEMALLOC_N(pages_dodump)
#define pages_dontdump JEMALLOC_N(pages_dontdump)
#define pages_huge JEMALLOC_N(pages_huge)
#define pages_map JEMALLOC_N(pages_map)
#define pages_nohuge JEMALLOC_N(pages_nohuge)
#define pages_purge_forced JEMALLOC_N(pages_purge_forced)
#define pages_purge_lazy JEMALLOC_N(pages_purge_lazy)
#define pages_set_thp_state JEMALLOC_N(pages_set_thp_state)
#define pages_unmap JEMALLOC_N(pages_unmap)
#define thp_mode_names JEMALLOC_N(thp_mode_names)
#define bt2gctx_mtx JEMALLOC_N(bt2gctx_mtx)
#define bt_init JEMALLOC_N(bt_init)
#define lg_prof_sample JEMALLOC_N(lg_prof_sample)
@ -318,7 +325,6 @@
#define opt_stats_print JEMALLOC_N(opt_stats_print)
#define opt_stats_print_opts JEMALLOC_N(opt_stats_print_opts)
#define stats_print JEMALLOC_N(stats_print)
#define spin_adaptive JEMALLOC_N(spin_adaptive)
#define sz_index2size_tab JEMALLOC_N(sz_index2size_tab)
#define sz_pind2sz_tab JEMALLOC_N(sz_pind2sz_tab)
#define sz_size2index_tab JEMALLOC_N(sz_size2index_tab)

View File

@ -69,4 +69,15 @@ prof_accum_cancel(tsdn_t *tsdn, prof_accum_t *prof_accum, size_t usize) {
#endif
}
JEMALLOC_ALWAYS_INLINE bool
prof_active_get_unlocked(void) {
/*
* Even if opt_prof is true, sampling can be temporarily disabled by
* setting prof_active to false. No locking is used when reading
* prof_active in the fast path, so there are no guarantees regarding
* how long it will take for all threads to notice state changes.
*/
return prof_active;
}
#endif /* JEMALLOC_INTERNAL_PROF_INLINES_A_H */

View File

@ -3,17 +3,6 @@
#include "jemalloc/internal/sz.h"
JEMALLOC_ALWAYS_INLINE bool
prof_active_get_unlocked(void) {
/*
* Even if opt_prof is true, sampling can be temporarily disabled by
* setting prof_active to false. No locking is used when reading
* prof_active in the fast path, so there are no guarantees regarding
* how long it will take for all threads to notice state changes.
*/
return prof_active;
}
JEMALLOC_ALWAYS_INLINE bool
prof_gdump_get_unlocked(void) {
/*

View File

@ -178,9 +178,21 @@ rtree_leaf_elm_bits_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
JEMALLOC_ALWAYS_INLINE extent_t *
rtree_leaf_elm_bits_extent_get(uintptr_t bits) {
# ifdef __aarch64__
/*
* aarch64 doesn't sign extend the highest virtual address bit to set
* the higher ones. Instead, the high bits get zeroed.
*/
uintptr_t high_bit_mask = ((uintptr_t)1 << LG_VADDR) - 1;
/* Mask off the slab bit. */
uintptr_t low_bit_mask = ~(uintptr_t)1;
uintptr_t mask = high_bit_mask & low_bit_mask;
return (extent_t *)(bits & mask);
# else
/* Restore sign-extended high bits, mask slab bit. */
return (extent_t *)((uintptr_t)((intptr_t)(bits << RTREE_NHIB) >>
RTREE_NHIB) & ~((uintptr_t)0x1));
# endif
}
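/*
 * Worked example (illustrative; assumes LG_VADDR == 48): high_bit_mask ==
 * 0x0000ffffffffffff and low_bit_mask == ~(uintptr_t)1, so (bits & mask)
 * clears bit 0 (the slab flag) and anything stored above bit 47, leaving the
 * extent_t pointer.  On other architectures the high bits are instead
 * recovered by sign extension, as in the #else branch of the function above.
 */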
JEMALLOC_ALWAYS_INLINE szind_t
@ -196,8 +208,8 @@ rtree_leaf_elm_bits_slab_get(uintptr_t bits) {
# endif
JEMALLOC_ALWAYS_INLINE extent_t *
rtree_leaf_elm_extent_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
bool dependent) {
rtree_leaf_elm_extent_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
rtree_leaf_elm_t *elm, bool dependent) {
#ifdef RTREE_LEAF_COMPACT
uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, dependent);
return rtree_leaf_elm_bits_extent_get(bits);
@ -209,8 +221,8 @@ rtree_leaf_elm_extent_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
}
JEMALLOC_ALWAYS_INLINE szind_t
rtree_leaf_elm_szind_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
bool dependent) {
rtree_leaf_elm_szind_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
rtree_leaf_elm_t *elm, bool dependent) {
#ifdef RTREE_LEAF_COMPACT
uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, dependent);
return rtree_leaf_elm_bits_szind_get(bits);
@ -221,8 +233,8 @@ rtree_leaf_elm_szind_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
}
JEMALLOC_ALWAYS_INLINE bool
rtree_leaf_elm_slab_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
bool dependent) {
rtree_leaf_elm_slab_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
rtree_leaf_elm_t *elm, bool dependent) {
#ifdef RTREE_LEAF_COMPACT
uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, dependent);
return rtree_leaf_elm_bits_slab_get(bits);
@ -233,8 +245,8 @@ rtree_leaf_elm_slab_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
}
static inline void
rtree_leaf_elm_extent_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
extent_t *extent) {
rtree_leaf_elm_extent_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
rtree_leaf_elm_t *elm, extent_t *extent) {
#ifdef RTREE_LEAF_COMPACT
uintptr_t old_bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, true);
uintptr_t bits = ((uintptr_t)rtree_leaf_elm_bits_szind_get(old_bits) <<
@ -247,8 +259,8 @@ rtree_leaf_elm_extent_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
}
static inline void
rtree_leaf_elm_szind_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
szind_t szind) {
rtree_leaf_elm_szind_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
rtree_leaf_elm_t *elm, szind_t szind) {
assert(szind <= NSIZES);
#ifdef RTREE_LEAF_COMPACT
@ -265,8 +277,8 @@ rtree_leaf_elm_szind_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
}
static inline void
rtree_leaf_elm_slab_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
bool slab) {
rtree_leaf_elm_slab_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree,
rtree_leaf_elm_t *elm, bool slab) {
#ifdef RTREE_LEAF_COMPACT
uintptr_t old_bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm,
true);
@ -448,8 +460,14 @@ rtree_szind_slab_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
if (!dependent && elm == NULL) {
return true;
}
#ifdef RTREE_LEAF_COMPACT
uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, dependent);
*r_szind = rtree_leaf_elm_bits_szind_get(bits);
*r_slab = rtree_leaf_elm_bits_slab_get(bits);
#else
*r_szind = rtree_leaf_elm_szind_read(tsdn, rtree, elm, dependent);
*r_slab = rtree_leaf_elm_slab_read(tsdn, rtree, elm, dependent);
#endif
return false;
}

View File

@ -26,7 +26,7 @@
* Zero initializer required for tsd initialization only. Proper initialization
* done via rtree_ctx_data_init().
*/
#define RTREE_CTX_ZERO_INITIALIZER {{{0}}}
#define RTREE_CTX_ZERO_INITIALIZER {{{0}}, {{0}}}
typedef struct rtree_leaf_elm_s rtree_leaf_elm_t;

View File

@ -7,13 +7,23 @@ typedef struct {
unsigned iteration;
} spin_t;
static inline void
spin_cpu_spinwait() {
# if HAVE_CPU_SPINWAIT
CPU_SPINWAIT;
# else
volatile int x = 0;
x = x;
# endif
}
static inline void
spin_adaptive(spin_t *spin) {
volatile uint32_t i;
if (spin->iteration < 5) {
for (i = 0; i < (1U << spin->iteration); i++) {
CPU_SPINWAIT;
spin_cpu_spinwait();
}
spin->iteration++;
} else {

View File

@ -1,12 +1,6 @@
#ifndef JEMALLOC_INTERNAL_STATS_H
#define JEMALLOC_INTERNAL_STATS_H
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/mutex_prof.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/size_classes.h"
#include "jemalloc/internal/stats_tsd.h"
/* OPTION(opt, var_name, default, set_value_to) */
#define STATS_PRINT_OPTIONS \
OPTION('J', json, false, true) \
@ -33,132 +27,4 @@ extern char opt_stats_print_opts[stats_print_tot_num_options+1];
void stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
const char *opts);
/*
* In those architectures that support 64-bit atomics, we use atomic updates for
* our 64-bit values. Otherwise, we use a plain uint64_t and synchronize
* externally.
*/
#ifdef JEMALLOC_ATOMIC_U64
typedef atomic_u64_t arena_stats_u64_t;
#else
/* Must hold the arena stats mutex while reading atomically. */
typedef uint64_t arena_stats_u64_t;
#endif
typedef struct malloc_bin_stats_s {
/*
* Total number of allocation/deallocation requests served directly by
* the bin. Note that tcache may allocate an object, then recycle it
* many times, resulting in many increments to nrequests, but only one
* each to nmalloc and ndalloc.
*/
uint64_t nmalloc;
uint64_t ndalloc;
/*
* Number of allocation requests that correspond to the size of this
* bin. This includes requests served by tcache, though tcache only
* periodically merges into this counter.
*/
uint64_t nrequests;
/*
* Current number of regions of this size class, including regions
* currently cached by tcache.
*/
size_t curregs;
/* Number of tcache fills from this bin. */
uint64_t nfills;
/* Number of tcache flushes to this bin. */
uint64_t nflushes;
/* Total number of slabs created for this bin's size class. */
uint64_t nslabs;
/*
* Total number of slabs reused by extracting them from the slabs heap
* for this bin's size class.
*/
uint64_t reslabs;
/* Current number of slabs in this bin. */
size_t curslabs;
mutex_prof_data_t mutex_data;
} malloc_bin_stats_t;
typedef struct malloc_large_stats_s {
/*
* Total number of allocation/deallocation requests served directly by
* the arena.
*/
arena_stats_u64_t nmalloc;
arena_stats_u64_t ndalloc;
/*
* Number of allocation requests that correspond to this size class.
* This includes requests served by tcache, though tcache only
* periodically merges into this counter.
*/
arena_stats_u64_t nrequests; /* Partially derived. */
/* Current number of allocations of this size class. */
size_t curlextents; /* Derived. */
} malloc_large_stats_t;
typedef struct decay_stats_s {
/* Total number of purge sweeps. */
arena_stats_u64_t npurge;
/* Total number of madvise calls made. */
arena_stats_u64_t nmadvise;
/* Total number of pages purged. */
arena_stats_u64_t purged;
} decay_stats_t;
/*
* Arena stats. Note that fields marked "derived" are not directly maintained
* within the arena code; rather their values are derived during stats merge
* requests.
*/
typedef struct arena_stats_s {
#ifndef JEMALLOC_ATOMIC_U64
malloc_mutex_t mtx;
#endif
/* Number of bytes currently mapped, excluding retained memory. */
atomic_zu_t mapped; /* Partially derived. */
/*
* Number of unused virtual memory bytes currently retained. Retained
* bytes are technically mapped (though always decommitted or purged),
* but they are excluded from the mapped statistic (above).
*/
atomic_zu_t retained; /* Derived. */
decay_stats_t decay_dirty;
decay_stats_t decay_muzzy;
atomic_zu_t base; /* Derived. */
atomic_zu_t internal;
atomic_zu_t resident; /* Derived. */
atomic_zu_t allocated_large; /* Derived. */
arena_stats_u64_t nmalloc_large; /* Derived. */
arena_stats_u64_t ndalloc_large; /* Derived. */
arena_stats_u64_t nrequests_large; /* Derived. */
/* Number of bytes cached in tcache associated with this arena. */
atomic_zu_t tcache_bytes; /* Derived. */
mutex_prof_data_t mutex_prof_data[mutex_prof_num_arena_mutexes];
/* One element for each large size class. */
malloc_large_stats_t lstats[NSIZES - NBINS];
/* Arena uptime. */
nstime_t uptime;
} arena_stats_t;
#endif /* JEMALLOC_INTERNAL_STATS_H */

View File

@ -1,12 +0,0 @@
#ifndef JEMALLOC_INTERNAL_STATS_TSD_H
#define JEMALLOC_INTERNAL_STATS_TSD_H
typedef struct tcache_bin_stats_s {
/*
* Number of allocation requests that corresponded to the size of this
* bin.
*/
uint64_t nrequests;
} tcache_bin_stats_t;
#endif /* JEMALLOC_INTERNAL_STATS_TSD_H */

View File

@ -61,7 +61,7 @@ sz_psz2ind(size_t psz) {
pszind_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_PAGE + 1) ?
LG_PAGE : x - LG_SIZE_CLASS_GROUP - 1;
size_t delta_inverse_mask = ZD(-1) << lg_delta;
size_t delta_inverse_mask = ZU(-1) << lg_delta;
pszind_t mod = ((((psz-1) & delta_inverse_mask) >> lg_delta)) &
((ZU(1) << LG_SIZE_CLASS_GROUP) - 1);
@ -142,7 +142,7 @@ sz_size2index_compute(size_t size) {
szind_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM + 1)
? LG_QUANTUM : x - LG_SIZE_CLASS_GROUP - 1;
size_t delta_inverse_mask = ZD(-1) << lg_delta;
size_t delta_inverse_mask = ZU(-1) << lg_delta;
szind_t mod = ((((size-1) & delta_inverse_mask) >> lg_delta)) &
((ZU(1) << LG_SIZE_CLASS_GROUP) - 1);

View File

@ -6,7 +6,7 @@
extern bool opt_tcache;
extern ssize_t opt_lg_tcache_max;
extern tcache_bin_info_t *tcache_bin_info;
extern cache_bin_info_t *tcache_bin_info;
/*
* Number of tcache bins. There are NBINS small-object bins, plus 0 or more
@ -30,10 +30,10 @@ extern tcaches_t *tcaches;
size_t tcache_salloc(tsdn_t *tsdn, const void *ptr);
void tcache_event_hard(tsd_t *tsd, tcache_t *tcache);
void *tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
tcache_bin_t *tbin, szind_t binind, bool *tcache_success);
void tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
cache_bin_t *tbin, szind_t binind, bool *tcache_success);
void tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
szind_t binind, unsigned rem);
void tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
void tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind,
unsigned rem, tcache_t *tcache);
void tcache_arena_reassociate(tsdn_t *tsdn, tcache_t *tcache,
arena_t *arena);

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_TCACHE_INLINES_H
#define JEMALLOC_INTERNAL_TCACHE_INLINES_H
#include "jemalloc/internal/bin.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/size_classes.h"
#include "jemalloc/internal/sz.h"
@ -38,43 +39,16 @@ tcache_event(tsd_t *tsd, tcache_t *tcache) {
}
JEMALLOC_ALWAYS_INLINE void *
tcache_alloc_easy(tcache_bin_t *tbin, bool *tcache_success) {
tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache,
UNUSED size_t size, szind_t binind, bool zero, bool slow_path) {
void *ret;
if (unlikely(tbin->ncached == 0)) {
tbin->low_water = -1;
*tcache_success = false;
return NULL;
}
/*
* tcache_success (instead of ret) should be checked upon the return of
* this function. We avoid checking (ret == NULL) because there is
* never a null stored on the avail stack (which is unknown to the
* compiler), and eagerly checking ret would cause pipeline stall
* (waiting for the cacheline).
*/
*tcache_success = true;
ret = *(tbin->avail - tbin->ncached);
tbin->ncached--;
if (unlikely((low_water_t)tbin->ncached < tbin->low_water)) {
tbin->low_water = tbin->ncached;
}
return ret;
}
JEMALLOC_ALWAYS_INLINE void *
tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
szind_t binind, bool zero, bool slow_path) {
void *ret;
tcache_bin_t *tbin;
cache_bin_t *bin;
bool tcache_success;
size_t usize JEMALLOC_CC_SILENCE_INIT(0);
assert(binind < NBINS);
tbin = tcache_small_bin_get(tcache, binind);
ret = tcache_alloc_easy(tbin, &tcache_success);
bin = tcache_small_bin_get(tcache, binind);
ret = cache_bin_alloc_easy(bin, &tcache_success);
assert(tcache_success == (ret != NULL));
if (unlikely(!tcache_success)) {
bool tcache_hard_success;
@ -84,7 +58,7 @@ tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
}
ret = tcache_alloc_small_hard(tsd_tsdn(tsd), arena, tcache,
tbin, binind, &tcache_hard_success);
bin, binind, &tcache_hard_success);
if (tcache_hard_success == false) {
return NULL;
}
@ -103,22 +77,21 @@ tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
if (likely(!zero)) {
if (slow_path && config_fill) {
if (unlikely(opt_junk_alloc)) {
arena_alloc_junk_small(ret,
&arena_bin_info[binind], false);
arena_alloc_junk_small(ret, &bin_infos[binind],
false);
} else if (unlikely(opt_zero)) {
memset(ret, 0, usize);
}
}
} else {
if (slow_path && config_fill && unlikely(opt_junk_alloc)) {
arena_alloc_junk_small(ret, &arena_bin_info[binind],
true);
arena_alloc_junk_small(ret, &bin_infos[binind], true);
}
memset(ret, 0, usize);
}
if (config_stats) {
tbin->tstats.nrequests++;
bin->tstats.nrequests++;
}
if (config_prof) {
tcache->prof_accumbytes += usize;
@ -131,12 +104,12 @@ JEMALLOC_ALWAYS_INLINE void *
tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
szind_t binind, bool zero, bool slow_path) {
void *ret;
tcache_bin_t *tbin;
cache_bin_t *bin;
bool tcache_success;
assert(binind >= NBINS && binind < nhbins);
tbin = tcache_large_bin_get(tcache, binind);
ret = tcache_alloc_easy(tbin, &tcache_success);
bin = tcache_large_bin_get(tcache, binind);
ret = cache_bin_alloc_easy(bin, &tcache_success);
assert(tcache_success == (ret != NULL));
if (unlikely(!tcache_success)) {
/*
@ -176,7 +149,7 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
}
if (config_stats) {
tbin->tstats.nrequests++;
bin->tstats.nrequests++;
}
if (config_prof) {
tcache->prof_accumbytes += usize;
@ -190,24 +163,24 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
JEMALLOC_ALWAYS_INLINE void
tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
bool slow_path) {
tcache_bin_t *tbin;
tcache_bin_info_t *tbin_info;
cache_bin_t *bin;
cache_bin_info_t *bin_info;
assert(tcache_salloc(tsd_tsdn(tsd), ptr) <= SMALL_MAXCLASS);
if (slow_path && config_fill && unlikely(opt_junk_free)) {
arena_dalloc_junk_small(ptr, &arena_bin_info[binind]);
arena_dalloc_junk_small(ptr, &bin_infos[binind]);
}
tbin = tcache_small_bin_get(tcache, binind);
tbin_info = &tcache_bin_info[binind];
if (unlikely(tbin->ncached == tbin_info->ncached_max)) {
tcache_bin_flush_small(tsd, tcache, tbin, binind,
(tbin_info->ncached_max >> 1));
bin = tcache_small_bin_get(tcache, binind);
bin_info = &tcache_bin_info[binind];
if (unlikely(bin->ncached == bin_info->ncached_max)) {
tcache_bin_flush_small(tsd, tcache, bin, binind,
(bin_info->ncached_max >> 1));
}
assert(tbin->ncached < tbin_info->ncached_max);
tbin->ncached++;
*(tbin->avail - tbin->ncached) = ptr;
assert(bin->ncached < bin_info->ncached_max);
bin->ncached++;
*(bin->avail - bin->ncached) = ptr;
tcache_event(tsd, tcache);
}
@ -215,8 +188,8 @@ tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
JEMALLOC_ALWAYS_INLINE void
tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
bool slow_path) {
tcache_bin_t *tbin;
tcache_bin_info_t *tbin_info;
cache_bin_t *bin;
cache_bin_info_t *bin_info;
assert(tcache_salloc(tsd_tsdn(tsd), ptr) > SMALL_MAXCLASS);
assert(tcache_salloc(tsd_tsdn(tsd), ptr) <= tcache_maxclass);
@ -225,15 +198,15 @@ tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
large_dalloc_junk(ptr, sz_index2size(binind));
}
tbin = tcache_large_bin_get(tcache, binind);
tbin_info = &tcache_bin_info[binind];
if (unlikely(tbin->ncached == tbin_info->ncached_max)) {
tcache_bin_flush_large(tsd, tbin, binind,
(tbin_info->ncached_max >> 1), tcache);
bin = tcache_large_bin_get(tcache, binind);
bin_info = &tcache_bin_info[binind];
if (unlikely(bin->ncached == bin_info->ncached_max)) {
tcache_bin_flush_large(tsd, bin, binind,
(bin_info->ncached_max >> 1), tcache);
}
assert(tbin->ncached < tbin_info->ncached_max);
tbin->ncached++;
*(tbin->avail - tbin->ncached) = ptr;
assert(bin->ncached < bin_info->ncached_max);
bin->ncached++;
*(bin->avail - bin->ncached) = ptr;
tcache_event(tsd, tcache);
}

View File

@ -3,54 +3,51 @@
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/size_classes.h"
#include "jemalloc/internal/stats_tsd.h"
#include "jemalloc/internal/cache_bin.h"
#include "jemalloc/internal/ticker.h"
/*
* Read-only information associated with each element of tcache_t's tbins array
* is stored separately, mainly to reduce memory usage.
*/
struct tcache_bin_info_s {
unsigned ncached_max; /* Upper limit on ncached. */
};
struct tcache_bin_s {
low_water_t low_water; /* Min # cached since last GC. */
uint32_t ncached; /* # of cached objects. */
/*
* ncached and stats are both modified frequently. Let's keep them
* close so that they have a higher chance of being on the same
* cacheline, thus less write-backs.
*/
tcache_bin_stats_t tstats;
/*
* To make use of adjacent cacheline prefetch, the items in the avail
* stack goes to higher address for newer allocations. avail points
* just above the available space, which means that
* avail[-ncached, ... -1] are available items and the lowest item will
* be allocated first.
*/
void **avail; /* Stack of available objects. */
};
struct tcache_s {
/* Data accessed frequently first: prof, ticker and small bins. */
uint64_t prof_accumbytes;/* Cleared after arena_prof_accum(). */
ticker_t gc_ticker; /* Drives incremental GC. */
/*
* The pointer stacks associated with tbins follow as a contiguous
* array. During tcache initialization, the avail pointer in each
* element of tbins is initialized to point to the proper offset within
* this array.
* To minimize our cache-footprint, we put the frequently accessed data
* together at the start of this struct.
*/
tcache_bin_t tbins_small[NBINS];
/* Data accessed less often below. */
ql_elm(tcache_t) link; /* Used for aggregating stats. */
arena_t *arena; /* Associated arena. */
szind_t next_gc_bin; /* Next bin to GC. */
/* Cleared after arena_prof_accum(). */
uint64_t prof_accumbytes;
/* Drives incremental GC. */
ticker_t gc_ticker;
/*
* The pointer stacks associated with bins follow as a contiguous array.
* During tcache initialization, the avail pointer in each element of
* tbins is initialized to point to the proper offset within this array.
*/
cache_bin_t bins_small[NBINS];
/*
* This data is less hot; we can be a little less careful with our
* footprint here.
*/
/* Lets us track all the tcaches in an arena. */
ql_elm(tcache_t) link;
/*
* The descriptor lets the arena find our cache bins without seeing the
* tcache definition. This enables arenas to aggregate stats across
* tcaches without having a tcache dependency.
*/
cache_bin_array_descriptor_t cache_bin_array_descriptor;
/* The arena this tcache is associated with. */
arena_t *arena;
/* Next bin to GC. */
szind_t next_gc_bin;
/* For small bins, fill (ncached_max >> lg_fill_div). */
uint8_t lg_fill_div[NBINS];
tcache_bin_t tbins_large[NSIZES-NBINS];
/*
* We put the cache bins for large size classes at the end of the
* struct, since some of them might not get used. This might end up
* letting us avoid touching an extra page if we don't have to.
*/
cache_bin_t bins_large[NSIZES-NBINS];
};
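The cache_bin_array_descriptor indirection above is what lets arena_stats_merge (later in this diff) walk per-thread cache bins without including the tcache definition. A hedged sketch of the idea, with simplified stand-in types rather than jemalloc's actual structs:

#include <stddef.h>

/* Stand-in for a cache bin; only what the arena-side accounting needs. */
typedef struct { unsigned ncached; void **avail; } toy_cache_bin_t;

/* The descriptor exposes the bin arrays without exposing the tcache layout. */
typedef struct toy_descriptor_s toy_descriptor_t;
struct toy_descriptor_s {
	toy_descriptor_t *next;       /* arena-side linkage */
	toy_cache_bin_t *bins_small;  /* points into some thread's tcache */
	toy_cache_bin_t *bins_large;
};

/* Arena-side stats can sum cached objects with no tcache_t dependency. */
static size_t
toy_count_cached(const toy_descriptor_t *head, unsigned nsmall, unsigned nlarge) {
	size_t total = 0;
	for (const toy_descriptor_t *d = head; d != NULL; d = d->next) {
		for (unsigned i = 0; i < nsmall; i++) {
			total += d->bins_small[i].ncached;
		}
		for (unsigned i = 0; i < nlarge; i++) {
			total += d->bins_large[i].ncached;
		}
	}
	return total;
}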
/* Linkage for list of available (previously used) explicit tcache IDs. */

@ -3,14 +3,9 @@
#include "jemalloc/internal/size_classes.h"
typedef struct tcache_bin_info_s tcache_bin_info_t;
typedef struct tcache_bin_s tcache_bin_t;
typedef struct tcache_s tcache_t;
typedef struct tcaches_s tcaches_t;
/* ncached is cast to this type for comparison. */
typedef int32_t low_water_t;
/*
* tcache pointers close to NULL are used to encode state information that is
* used for two purposes: preventing thread caching on a per thread basis and

@ -32,14 +32,42 @@ ticker_read(const ticker_t *ticker) {
return ticker->tick;
}
/*
* Not intended to be a public API. Unfortunately, on x86, neither gcc nor
* clang seems smart enough to turn
* ticker->tick -= nticks;
* if (unlikely(ticker->tick < 0)) {
* fixup ticker
* return true;
* }
* return false;
* into
* subq %nticks_reg, (%ticker_reg)
* js fixup ticker
*
* unless we force "fixup ticker" out of line. In that case, gcc gets it right,
* but clang now does worse than before. So, on x86 with gcc, we force it out
* of line, but otherwise let the inlining occur. Ordinarily this wouldn't be
* worth the hassle, but this is on the fast path of both malloc and free (via
* tcache_event).
*/
#if defined(__GNUC__) && !defined(__clang__) \
&& (defined(__x86_64__) || defined(__i386__))
JEMALLOC_NOINLINE
#endif
static bool
ticker_fixup(ticker_t *ticker) {
ticker->tick = ticker->nticks;
return true;
}
static inline bool
ticker_ticks(ticker_t *ticker, int32_t nticks) {
if (unlikely(ticker->tick < nticks)) {
ticker->tick = ticker->nticks;
return true;
}
ticker->tick -= nticks;
return(false);
if (unlikely(ticker->tick < 0)) {
return ticker_fixup(ticker);
}
return false;
}
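To make the codegen concern above concrete, here is a self-contained version of the same decrement-and-out-of-line-fixup pattern that can be compiled and inspected on its own; it mirrors the logic but is not jemalloc's ticker_t, and the noinline attribute syntax assumes GCC/Clang.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
	int32_t tick;    /* counts down past zero */
	int32_t nticks;  /* reload value */
} toy_ticker_t;

/* Keeping the reload path out of line keeps the common path branch-light. */
__attribute__((noinline)) static bool
toy_ticker_fixup(toy_ticker_t *t) {
	t->tick = t->nticks;
	return true;  /* signal: time to do the periodic work */
}

static inline bool
toy_ticker_ticks(toy_ticker_t *t, int32_t nticks) {
	t->tick -= nticks;
	if (t->tick < 0) {
		return toy_ticker_fixup(t);
	}
	return false;
}

int
main(void) {
	toy_ticker_t t = {3, 3};
	for (int i = 0; i < 8; i++) {
		if (toy_ticker_ticks(&t, 1)) {
			printf("periodic event at iteration %d\n", i);
		}
	}
	return 0;
}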
static inline bool

@ -65,6 +65,7 @@ typedef void (*test_callback_t)(int *);
O(arenas_tdata_bypass, bool, bool) \
O(reentrancy_level, int8_t, int8_t) \
O(narenas_tdata, uint32_t, uint32_t) \
O(offset_state, uint64_t, uint64_t) \
O(thread_allocated, uint64_t, uint64_t) \
O(thread_deallocated, uint64_t, uint64_t) \
O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \
@ -84,6 +85,7 @@ typedef void (*test_callback_t)(int *);
0, \
0, \
0, \
0, \
NULL, \
RTREE_CTX_ZERO_INITIALIZER, \
NULL, \

@ -39,7 +39,7 @@ tsd_get_allocates(void) {
/* Get/set. */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_get(bool init) {
tsd_get(UNUSED bool init) {
assert(tsd_booted);
return &tsd_tls;
}

@ -51,7 +51,7 @@
#define WITNESS_RANK_ARENA_LARGE 19U
#define WITNESS_RANK_LEAF 0xffffffffU
#define WITNESS_RANK_ARENA_BIN WITNESS_RANK_LEAF
#define WITNESS_RANK_BIN WITNESS_RANK_LEAF
#define WITNESS_RANK_ARENA_STATS WITNESS_RANK_LEAF
#define WITNESS_RANK_DSS WITNESS_RANK_LEAF
#define WITNESS_RANK_PROF_ACTIVE WITNESS_RANK_LEAF

@ -87,12 +87,12 @@ extern "C" {
#include <limits.h>
#include <strings.h>
#define JEMALLOC_VERSION "5.0.1-0-g896ed3a8b3f41998d4fb4d625d30ac63ef2d51fb"
#define JEMALLOC_VERSION "5.1.0-0-g61efbda7098de6fe64c362d309824864308c36d4"
#define JEMALLOC_VERSION_MAJOR 5
#define JEMALLOC_VERSION_MINOR 0
#define JEMALLOC_VERSION_BUGFIX 1
#define JEMALLOC_VERSION_MINOR 1
#define JEMALLOC_VERSION_BUGFIX 0
#define JEMALLOC_VERSION_NREV 0
#define JEMALLOC_VERSION_GID "896ed3a8b3f41998d4fb4d625d30ac63ef2d51fb"
#define JEMALLOC_VERSION_GID "61efbda7098de6fe64c362d309824864308c36d4"
#define MALLOCX_LG_ALIGN(la) ((int)(la))
#if LG_SIZEOF_PTR == 2

@ -3,6 +3,7 @@
#include "jemalloc/internal/jemalloc_internal_includes.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/div.h"
#include "jemalloc/internal/extent_dss.h"
#include "jemalloc/internal/extent_mmap.h"
#include "jemalloc/internal/mutex.h"
@ -32,21 +33,6 @@ ssize_t opt_muzzy_decay_ms = MUZZY_DECAY_MS_DEFAULT;
static atomic_zd_t dirty_decay_ms_default;
static atomic_zd_t muzzy_decay_ms_default;
const arena_bin_info_t arena_bin_info[NBINS] = {
#define BIN_INFO_bin_yes(reg_size, slab_size, nregs) \
{reg_size, slab_size, nregs, BITMAP_INFO_INITIALIZER(nregs)},
#define BIN_INFO_bin_no(reg_size, slab_size, nregs)
#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs, \
lg_delta_lookup) \
BIN_INFO_bin_##bin((1U<<lg_grp) + (ndelta<<lg_delta), \
(pgs << LG_PAGE), (pgs << LG_PAGE) / ((1U<<lg_grp) + \
(ndelta<<lg_delta)))
SIZE_CLASSES
#undef BIN_INFO_bin_yes
#undef BIN_INFO_bin_no
#undef SC
};
const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = {
#define STEP(step, h, x, y) \
h,
@ -54,6 +40,8 @@ const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = {
#undef STEP
};
static div_info_t arena_binind_div_info[NBINS];
/******************************************************************************/
/*
* Function prototypes for static functions that are referenced prior to
@ -62,157 +50,18 @@ const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = {
static void arena_decay_to_limit(tsdn_t *tsdn, arena_t *arena,
arena_decay_t *decay, extents_t *extents, bool all, size_t npages_limit,
bool is_background_thread);
size_t npages_decay_max, bool is_background_thread);
static bool arena_decay_dirty(tsdn_t *tsdn, arena_t *arena,
bool is_background_thread, bool all);
static void arena_dalloc_bin_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
arena_bin_t *bin);
bin_t *bin);
static void arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
arena_bin_t *bin);
bin_t *bin);
/******************************************************************************/
static bool
arena_stats_init(tsdn_t *tsdn, arena_stats_t *arena_stats) {
if (config_debug) {
for (size_t i = 0; i < sizeof(arena_stats_t); i++) {
assert(((char *)arena_stats)[i] == 0);
}
}
#ifndef JEMALLOC_ATOMIC_U64
if (malloc_mutex_init(&arena_stats->mtx, "arena_stats",
WITNESS_RANK_ARENA_STATS, malloc_mutex_rank_exclusive)) {
return true;
}
#endif
/* Memory is zeroed, so there is no need to clear stats. */
return false;
}
static void
arena_stats_lock(tsdn_t *tsdn, arena_stats_t *arena_stats) {
#ifndef JEMALLOC_ATOMIC_U64
malloc_mutex_lock(tsdn, &arena_stats->mtx);
#endif
}
static void
arena_stats_unlock(tsdn_t *tsdn, arena_stats_t *arena_stats) {
#ifndef JEMALLOC_ATOMIC_U64
malloc_mutex_unlock(tsdn, &arena_stats->mtx);
#endif
}
static uint64_t
arena_stats_read_u64(tsdn_t *tsdn, arena_stats_t *arena_stats,
arena_stats_u64_t *p) {
#ifdef JEMALLOC_ATOMIC_U64
return atomic_load_u64(p, ATOMIC_RELAXED);
#else
malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
return *p;
#endif
}
static void
arena_stats_add_u64(tsdn_t *tsdn, arena_stats_t *arena_stats,
arena_stats_u64_t *p, uint64_t x) {
#ifdef JEMALLOC_ATOMIC_U64
atomic_fetch_add_u64(p, x, ATOMIC_RELAXED);
#else
malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
*p += x;
#endif
}
UNUSED static void
arena_stats_sub_u64(tsdn_t *tsdn, arena_stats_t *arena_stats,
arena_stats_u64_t *p, uint64_t x) {
#ifdef JEMALLOC_ATOMIC_U64
UNUSED uint64_t r = atomic_fetch_sub_u64(p, x, ATOMIC_RELAXED);
assert(r - x <= r);
#else
malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
*p -= x;
assert(*p + x >= *p);
#endif
}
/*
* Non-atomically sets *dst += src. *dst needs external synchronization.
* This lets us avoid the cost of a fetch_add when it's unnecessary (note that
* the types here are atomic).
*/
static void
arena_stats_accum_u64(arena_stats_u64_t *dst, uint64_t src) {
#ifdef JEMALLOC_ATOMIC_U64
uint64_t cur_dst = atomic_load_u64(dst, ATOMIC_RELAXED);
atomic_store_u64(dst, src + cur_dst, ATOMIC_RELAXED);
#else
*dst += src;
#endif
}
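A standalone C11 illustration of the comment above: when writers to *dst are already externally synchronized, a relaxed load plus a relaxed store suffices, and the read-modify-write cost of fetch_add can be skipped. This sketch uses <stdatomic.h> directly rather than jemalloc's atomic wrappers.

#include <stdatomic.h>
#include <stdint.h>

/*
 * Non-atomically accumulate src into *dst. The caller must externally
 * synchronize writers to *dst; the atomic type only guarantees that
 * concurrent relaxed readers never observe a torn value.
 */
static void
accum_u64_relaxed(_Atomic uint64_t *dst, uint64_t src) {
	uint64_t cur = atomic_load_explicit(dst, memory_order_relaxed);
	atomic_store_explicit(dst, cur + src, memory_order_relaxed);
}

/* The contended alternative this avoids: a full read-modify-write. */
static void
accum_u64_rmw(_Atomic uint64_t *dst, uint64_t src) {
	atomic_fetch_add_explicit(dst, src, memory_order_relaxed);
}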
static size_t
arena_stats_read_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p) {
#ifdef JEMALLOC_ATOMIC_U64
return atomic_load_zu(p, ATOMIC_RELAXED);
#else
malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
return atomic_load_zu(p, ATOMIC_RELAXED);
#endif
}
static void
arena_stats_add_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p,
size_t x) {
#ifdef JEMALLOC_ATOMIC_U64
atomic_fetch_add_zu(p, x, ATOMIC_RELAXED);
#else
malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
size_t cur = atomic_load_zu(p, ATOMIC_RELAXED);
atomic_store_zu(p, cur + x, ATOMIC_RELAXED);
#endif
}
static void
arena_stats_sub_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p,
size_t x) {
#ifdef JEMALLOC_ATOMIC_U64
UNUSED size_t r = atomic_fetch_sub_zu(p, x, ATOMIC_RELAXED);
assert(r - x <= r);
#else
malloc_mutex_assert_owner(tsdn, &arena_stats->mtx);
size_t cur = atomic_load_zu(p, ATOMIC_RELAXED);
atomic_store_zu(p, cur - x, ATOMIC_RELAXED);
#endif
}
/* Like the _u64 variant, needs an externally synchronized *dst. */
static void
arena_stats_accum_zu(atomic_zu_t *dst, size_t src) {
size_t cur_dst = atomic_load_zu(dst, ATOMIC_RELAXED);
atomic_store_zu(dst, src + cur_dst, ATOMIC_RELAXED);
}
void
arena_stats_large_nrequests_add(tsdn_t *tsdn, arena_stats_t *arena_stats,
szind_t szind, uint64_t nrequests) {
arena_stats_lock(tsdn, arena_stats);
arena_stats_add_u64(tsdn, arena_stats, &arena_stats->lstats[szind -
NBINS].nrequests, nrequests);
arena_stats_unlock(tsdn, arena_stats);
}
void
arena_stats_mapped_add(tsdn_t *tsdn, arena_stats_t *arena_stats, size_t size) {
arena_stats_lock(tsdn, arena_stats);
arena_stats_add_zu(tsdn, arena_stats, &arena_stats->mapped, size);
arena_stats_unlock(tsdn, arena_stats);
}
void
arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
arena_basic_stats_merge(UNUSED tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
size_t *nactive, size_t *ndirty, size_t *nmuzzy) {
*nthreads += arena_nthreads_get(arena, false);
@ -228,15 +77,15 @@ void
arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats,
malloc_bin_stats_t *bstats, malloc_large_stats_t *lstats) {
bin_stats_t *bstats, arena_stats_large_t *lstats) {
cassert(config_stats);
arena_basic_stats_merge(tsdn, arena, nthreads, dss, dirty_decay_ms,
muzzy_decay_ms, nactive, ndirty, nmuzzy);
size_t base_allocated, base_resident, base_mapped;
size_t base_allocated, base_resident, base_mapped, metadata_thp;
base_stats_get(tsdn, arena->base, &base_allocated, &base_resident,
&base_mapped);
&base_mapped, &metadata_thp);
arena_stats_lock(tsdn, &arena->stats);
@ -267,6 +116,7 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
arena_stats_accum_zu(&astats->base, base_allocated);
arena_stats_accum_zu(&astats->internal, arena_internal_get(arena));
arena_stats_accum_zu(&astats->metadata_thp, metadata_thp);
arena_stats_accum_zu(&astats->resident, base_resident +
(((atomic_load_zu(&arena->nactive, ATOMIC_RELAXED) +
extents_npages_get(&arena->extents_dirty) +
@ -303,16 +153,16 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
/* tcache_bytes counts currently cached bytes. */
atomic_store_zu(&astats->tcache_bytes, 0, ATOMIC_RELAXED);
malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx);
tcache_t *tcache;
ql_foreach(tcache, &arena->tcache_ql, link) {
cache_bin_array_descriptor_t *descriptor;
ql_foreach(descriptor, &arena->cache_bin_array_descriptor_ql, link) {
szind_t i = 0;
for (; i < NBINS; i++) {
tcache_bin_t *tbin = tcache_small_bin_get(tcache, i);
cache_bin_t *tbin = &descriptor->bins_small[i];
arena_stats_accum_zu(&astats->tcache_bytes,
tbin->ncached * sz_index2size(i));
}
for (; i < nhbins; i++) {
tcache_bin_t *tbin = tcache_large_bin_get(tcache, i);
cache_bin_t *tbin = &descriptor->bins_large[i];
arena_stats_accum_zu(&astats->tcache_bytes,
tbin->ncached * sz_index2size(i));
}
@ -351,20 +201,7 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
nstime_subtract(&astats->uptime, &arena->create_time);
for (szind_t i = 0; i < NBINS; i++) {
arena_bin_t *bin = &arena->bins[i];
malloc_mutex_lock(tsdn, &bin->lock);
malloc_mutex_prof_read(tsdn, &bstats[i].mutex_data, &bin->lock);
bstats[i].nmalloc += bin->stats.nmalloc;
bstats[i].ndalloc += bin->stats.ndalloc;
bstats[i].nrequests += bin->stats.nrequests;
bstats[i].curregs += bin->stats.curregs;
bstats[i].nfills += bin->stats.nfills;
bstats[i].nflushes += bin->stats.nflushes;
bstats[i].nslabs += bin->stats.nslabs;
bstats[i].reslabs += bin->stats.reslabs;
bstats[i].curslabs += bin->stats.curslabs;
malloc_mutex_unlock(tsdn, &bin->lock);
bin_stats_merge(tsdn, &bstats[i], &arena->bins[i]);
}
}
@ -384,8 +221,7 @@ arena_extents_dirty_dalloc(tsdn_t *tsdn, arena_t *arena,
}
static void *
arena_slab_reg_alloc(tsdn_t *tsdn, extent_t *slab,
const arena_bin_info_t *bin_info) {
arena_slab_reg_alloc(extent_t *slab, const bin_info_t *bin_info) {
void *ret;
arena_slab_data_t *slab_data = extent_slab_data_get(slab);
size_t regind;
@ -412,37 +248,22 @@ arena_slab_regind(extent_t *slab, szind_t binind, const void *ptr) {
assert((uintptr_t)ptr < (uintptr_t)extent_past_get(slab));
/* Freeing an interior pointer can cause assertion failure. */
assert(((uintptr_t)ptr - (uintptr_t)extent_addr_get(slab)) %
(uintptr_t)arena_bin_info[binind].reg_size == 0);
(uintptr_t)bin_infos[binind].reg_size == 0);
diff = (size_t)((uintptr_t)ptr - (uintptr_t)extent_addr_get(slab));
/* Avoid doing division with a variable divisor. */
diff = (size_t)((uintptr_t)ptr - (uintptr_t)extent_addr_get(slab));
switch (binind) {
#define REGIND_bin_yes(index, reg_size) \
case index: \
regind = diff / (reg_size); \
assert(diff == regind * (reg_size)); \
break;
#define REGIND_bin_no(index, reg_size)
#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs, \
lg_delta_lookup) \
REGIND_bin_##bin(index, (1U<<lg_grp) + (ndelta<<lg_delta))
SIZE_CLASSES
#undef REGIND_bin_yes
#undef REGIND_bin_no
#undef SC
default: not_reached();
}
regind = div_compute(&arena_binind_div_info[binind], diff);
assert(regind < arena_bin_info[binind].nregs);
assert(regind < bin_infos[binind].nregs);
return regind;
}
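The switch over size classes is replaced here by div_compute with a precomputed per-bin divisor. As a hedged illustration of the underlying trick (this is a sketch of the math, not jemalloc's div.c): for a divisor d >= 2 and a dividend n that fits in 32 bits and is an exact multiple of d, multiplying by ceil(2^32 / d) and shifting right by 32 recovers n / d without a division instruction.

#include <assert.h>
#include <stdint.h>

typedef struct {
	uint32_t magic;  /* ceil(2^32 / d) */
	uint32_t d;      /* kept only for the assertion below */
} toy_div_info_t;

static void
toy_div_init(toy_div_info_t *info, uint32_t d) {
	assert(d >= 2);
	info->magic = (uint32_t)((((uint64_t)1 << 32) + d - 1) / d);
	info->d = d;
}

/* Valid when n is a multiple of d and n fits in 32 bits. */
static uint32_t
toy_div_compute(const toy_div_info_t *info, uint32_t n) {
	uint32_t q = (uint32_t)(((uint64_t)n * info->magic) >> 32);
	assert(q * info->d == n);
	return q;
}

int
main(void) {
	toy_div_info_t info;
	toy_div_init(&info, 48);  /* e.g. a 48-byte region size */
	assert(toy_div_compute(&info, 48 * 1000) == 1000);
	return 0;
}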
static void
arena_slab_reg_dalloc(tsdn_t *tsdn, extent_t *slab,
arena_slab_data_t *slab_data, void *ptr) {
arena_slab_reg_dalloc(extent_t *slab, arena_slab_data_t *slab_data, void *ptr) {
szind_t binind = extent_szind_get(slab);
const arena_bin_info_t *bin_info = &arena_bin_info[binind];
const bin_info_t *bin_info = &bin_infos[binind];
size_t regind = arena_slab_regind(slab, binind, ptr);
assert(extent_nfree_get(slab) < bin_info->nregs);
@ -692,7 +513,8 @@ arena_decay_try_purge(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
bool is_background_thread) {
if (current_npages > npages_limit) {
arena_decay_to_limit(tsdn, arena, decay, extents, false,
npages_limit, is_background_thread);
npages_limit, current_npages - npages_limit,
is_background_thread);
}
}
@ -738,7 +560,7 @@ arena_decay_epoch_advance(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
}
static void
arena_decay_reinit(arena_decay_t *decay, extents_t *extents, ssize_t decay_ms) {
arena_decay_reinit(arena_decay_t *decay, ssize_t decay_ms) {
arena_decay_ms_write(decay, decay_ms);
if (decay_ms > 0) {
nstime_init(&decay->interval, (uint64_t)decay_ms *
@ -755,8 +577,8 @@ arena_decay_reinit(arena_decay_t *decay, extents_t *extents, ssize_t decay_ms) {
}
static bool
arena_decay_init(arena_decay_t *decay, extents_t *extents, ssize_t decay_ms,
decay_stats_t *stats) {
arena_decay_init(arena_decay_t *decay, ssize_t decay_ms,
arena_stats_decay_t *stats) {
if (config_debug) {
for (size_t i = 0; i < sizeof(arena_decay_t); i++) {
assert(((char *)decay)[i] == 0);
@ -768,7 +590,7 @@ arena_decay_init(arena_decay_t *decay, extents_t *extents, ssize_t decay_ms,
return true;
}
decay->purging = false;
arena_decay_reinit(decay, extents, decay_ms);
arena_decay_reinit(decay, decay_ms);
/* Memory is zeroed, so there is no need to clear stats. */
if (config_stats) {
decay->stats = stats;
@ -798,7 +620,8 @@ arena_maybe_decay(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
if (decay_ms <= 0) {
if (decay_ms == 0) {
arena_decay_to_limit(tsdn, arena, decay, extents, false,
0, is_background_thread);
0, extents_npages_get(extents),
is_background_thread);
}
return false;
}
@ -876,7 +699,7 @@ arena_decay_ms_set(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
* infrequent, either between the {-1, 0, >0} states, or a one-time
* arbitrary change during initial arena configuration.
*/
arena_decay_reinit(decay, extents, decay_ms);
arena_decay_reinit(decay, decay_ms);
arena_maybe_decay(tsdn, arena, decay, extents, false);
malloc_mutex_unlock(tsdn, &decay->mtx);
@ -900,14 +723,15 @@ arena_muzzy_decay_ms_set(tsdn_t *tsdn, arena_t *arena,
static size_t
arena_stash_decayed(tsdn_t *tsdn, arena_t *arena,
extent_hooks_t **r_extent_hooks, extents_t *extents, size_t npages_limit,
extent_list_t *decay_extents) {
size_t npages_decay_max, extent_list_t *decay_extents) {
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 0);
/* Stash extents according to npages_limit. */
size_t nstashed = 0;
extent_t *extent;
while ((extent = extents_evict(tsdn, arena, r_extent_hooks, extents,
while (nstashed < npages_decay_max &&
(extent = extents_evict(tsdn, arena, r_extent_hooks, extents,
npages_limit)) != NULL) {
extent_list_append(decay_extents, extent);
nstashed += extent_size_get(extent) >> LG_PAGE;
@ -982,12 +806,15 @@ arena_decay_stashed(tsdn_t *tsdn, arena_t *arena,
}
/*
* npages_limit: Decay as many dirty extents as possible without violating the
* invariant: (extents_npages_get(extents) >= npages_limit)
* npages_limit: Decay at most npages_decay_max pages without violating the
* invariant: (extents_npages_get(extents) >= npages_limit). We need an upper
* bound on number of pages in order to prevent unbounded growth (namely in
* stashed), otherwise unbounded new pages could be added to extents during the
* current decay run, so that the purging thread never finishes.
*/
static void
arena_decay_to_limit(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
extents_t *extents, bool all, size_t npages_limit,
extents_t *extents, bool all, size_t npages_limit, size_t npages_decay_max,
bool is_background_thread) {
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 1);
@ -1005,7 +832,7 @@ arena_decay_to_limit(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
extent_list_init(&decay_extents);
size_t npurge = arena_stash_decayed(tsdn, arena, &extent_hooks, extents,
npages_limit, &decay_extents);
npages_limit, npages_decay_max, &decay_extents);
if (npurge != 0) {
UNUSED size_t npurged = arena_decay_stashed(tsdn, arena,
&extent_hooks, decay, extents, all, &decay_extents,
@ -1023,7 +850,7 @@ arena_decay_impl(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
if (all) {
malloc_mutex_lock(tsdn, &decay->mtx);
arena_decay_to_limit(tsdn, arena, decay, extents, all, 0,
is_background_thread);
extents_npages_get(extents), is_background_thread);
malloc_mutex_unlock(tsdn, &decay->mtx);
return false;
@ -1036,7 +863,7 @@ arena_decay_impl(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
bool epoch_advanced = arena_maybe_decay(tsdn, arena, decay, extents,
is_background_thread);
size_t npages_new;
UNUSED size_t npages_new;
if (epoch_advanced) {
/* Backlog is updated on epoch advance. */
npages_new = decay->backlog[SMOOTHSTEP_NSTEPS-1];
@ -1045,7 +872,8 @@ arena_decay_impl(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay,
if (have_background_thread && background_thread_enabled() &&
epoch_advanced && !is_background_thread) {
background_thread_interval_check(tsdn, arena, decay, npages_new);
background_thread_interval_check(tsdn, arena, decay,
npages_new);
}
return false;
@ -1082,18 +910,18 @@ arena_slab_dalloc(tsdn_t *tsdn, arena_t *arena, extent_t *slab) {
}
static void
arena_bin_slabs_nonfull_insert(arena_bin_t *bin, extent_t *slab) {
arena_bin_slabs_nonfull_insert(bin_t *bin, extent_t *slab) {
assert(extent_nfree_get(slab) > 0);
extent_heap_insert(&bin->slabs_nonfull, slab);
}
static void
arena_bin_slabs_nonfull_remove(arena_bin_t *bin, extent_t *slab) {
arena_bin_slabs_nonfull_remove(bin_t *bin, extent_t *slab) {
extent_heap_remove(&bin->slabs_nonfull, slab);
}
static extent_t *
arena_bin_slabs_nonfull_tryget(arena_bin_t *bin) {
arena_bin_slabs_nonfull_tryget(bin_t *bin) {
extent_t *slab = extent_heap_remove_first(&bin->slabs_nonfull);
if (slab == NULL) {
return NULL;
@ -1105,7 +933,7 @@ arena_bin_slabs_nonfull_tryget(arena_bin_t *bin) {
}
static void
arena_bin_slabs_full_insert(arena_t *arena, arena_bin_t *bin, extent_t *slab) {
arena_bin_slabs_full_insert(arena_t *arena, bin_t *bin, extent_t *slab) {
assert(extent_nfree_get(slab) == 0);
/*
* Tracking extents is required by arena_reset, which is not allowed
@ -1119,7 +947,7 @@ arena_bin_slabs_full_insert(arena_t *arena, arena_bin_t *bin, extent_t *slab) {
}
static void
arena_bin_slabs_full_remove(arena_t *arena, arena_bin_t *bin, extent_t *slab) {
arena_bin_slabs_full_remove(arena_t *arena, bin_t *bin, extent_t *slab) {
if (arena_is_auto(arena)) {
return;
}
@ -1173,7 +1001,7 @@ arena_reset(tsd_t *tsd, arena_t *arena) {
/* Bins. */
for (unsigned i = 0; i < NBINS; i++) {
extent_t *slab;
arena_bin_t *bin = &arena->bins[i];
bin_t *bin = &arena->bins[i];
malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
if (bin->slabcur != NULL) {
slab = bin->slabcur;
@ -1262,7 +1090,7 @@ arena_destroy(tsd_t *tsd, arena_t *arena) {
static extent_t *
arena_slab_alloc_hard(tsdn_t *tsdn, arena_t *arena,
extent_hooks_t **r_extent_hooks, const arena_bin_info_t *bin_info,
extent_hooks_t **r_extent_hooks, const bin_info_t *bin_info,
szind_t szind) {
extent_t *slab;
bool zero, commit;
@ -1285,7 +1113,7 @@ arena_slab_alloc_hard(tsdn_t *tsdn, arena_t *arena,
static extent_t *
arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind,
const arena_bin_info_t *bin_info) {
const bin_info_t *bin_info) {
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 0);
@ -1321,10 +1149,10 @@ arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind,
}
static extent_t *
arena_bin_nonfull_slab_get(tsdn_t *tsdn, arena_t *arena, arena_bin_t *bin,
arena_bin_nonfull_slab_get(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
szind_t binind) {
extent_t *slab;
const arena_bin_info_t *bin_info;
const bin_info_t *bin_info;
/* Look for a usable slab. */
slab = arena_bin_slabs_nonfull_tryget(bin);
@ -1333,7 +1161,7 @@ arena_bin_nonfull_slab_get(tsdn_t *tsdn, arena_t *arena, arena_bin_t *bin,
}
/* No existing slabs have any space available. */
bin_info = &arena_bin_info[binind];
bin_info = &bin_infos[binind];
/* Allocate a new slab. */
malloc_mutex_unlock(tsdn, &bin->lock);
@ -1364,12 +1192,12 @@ arena_bin_nonfull_slab_get(tsdn_t *tsdn, arena_t *arena, arena_bin_t *bin,
/* Re-fill bin->slabcur, then call arena_slab_reg_alloc(). */
static void *
arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, arena_bin_t *bin,
arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
szind_t binind) {
const arena_bin_info_t *bin_info;
const bin_info_t *bin_info;
extent_t *slab;
bin_info = &arena_bin_info[binind];
bin_info = &bin_infos[binind];
if (!arena_is_auto(arena) && bin->slabcur != NULL) {
arena_bin_slabs_full_insert(arena, bin, bin->slabcur);
bin->slabcur = NULL;
@ -1381,7 +1209,7 @@ arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, arena_bin_t *bin,
* bin lock in arena_bin_nonfull_slab_get().
*/
if (extent_nfree_get(bin->slabcur) > 0) {
void *ret = arena_slab_reg_alloc(tsdn, bin->slabcur,
void *ret = arena_slab_reg_alloc(bin->slabcur,
bin_info);
if (slab != NULL) {
/*
@ -1415,14 +1243,14 @@ arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, arena_bin_t *bin,
assert(extent_nfree_get(bin->slabcur) > 0);
return arena_slab_reg_alloc(tsdn, slab, bin_info);
return arena_slab_reg_alloc(slab, bin_info);
}
void
arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
tcache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes) {
cache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes) {
unsigned i, nfill;
arena_bin_t *bin;
bin_t *bin;
assert(tbin->ncached == 0);
@ -1437,8 +1265,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
void *ptr;
if ((slab = bin->slabcur) != NULL && extent_nfree_get(slab) >
0) {
ptr = arena_slab_reg_alloc(tsdn, slab,
&arena_bin_info[binind]);
ptr = arena_slab_reg_alloc(slab, &bin_infos[binind]);
} else {
ptr = arena_bin_malloc_hard(tsdn, arena, bin, binind);
}
@ -1455,8 +1282,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
break;
}
if (config_fill && unlikely(opt_junk_alloc)) {
arena_alloc_junk_small(ptr, &arena_bin_info[binind],
true);
arena_alloc_junk_small(ptr, &bin_infos[binind], true);
}
/* Insert such that low regions get used first. */
*(tbin->avail - nfill + i) = ptr;
@ -1474,14 +1300,14 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
}
void
arena_alloc_junk_small(void *ptr, const arena_bin_info_t *bin_info, bool zero) {
arena_alloc_junk_small(void *ptr, const bin_info_t *bin_info, bool zero) {
if (!zero) {
memset(ptr, JEMALLOC_ALLOC_JUNK, bin_info->reg_size);
}
}
static void
arena_dalloc_junk_small_impl(void *ptr, const arena_bin_info_t *bin_info) {
arena_dalloc_junk_small_impl(void *ptr, const bin_info_t *bin_info) {
memset(ptr, JEMALLOC_FREE_JUNK, bin_info->reg_size);
}
arena_dalloc_junk_small_t *JET_MUTABLE arena_dalloc_junk_small =
@ -1490,7 +1316,7 @@ arena_dalloc_junk_small_t *JET_MUTABLE arena_dalloc_junk_small =
static void *
arena_malloc_small(tsdn_t *tsdn, arena_t *arena, szind_t binind, bool zero) {
void *ret;
arena_bin_t *bin;
bin_t *bin;
size_t usize;
extent_t *slab;
@ -1500,7 +1326,7 @@ arena_malloc_small(tsdn_t *tsdn, arena_t *arena, szind_t binind, bool zero) {
malloc_mutex_lock(tsdn, &bin->lock);
if ((slab = bin->slabcur) != NULL && extent_nfree_get(slab) > 0) {
ret = arena_slab_reg_alloc(tsdn, slab, &arena_bin_info[binind]);
ret = arena_slab_reg_alloc(slab, &bin_infos[binind]);
} else {
ret = arena_bin_malloc_hard(tsdn, arena, bin, binind);
}
@ -1524,14 +1350,14 @@ arena_malloc_small(tsdn_t *tsdn, arena_t *arena, szind_t binind, bool zero) {
if (config_fill) {
if (unlikely(opt_junk_alloc)) {
arena_alloc_junk_small(ret,
&arena_bin_info[binind], false);
&bin_infos[binind], false);
} else if (unlikely(opt_zero)) {
memset(ret, 0, usize);
}
}
} else {
if (config_fill && unlikely(opt_junk_alloc)) {
arena_alloc_junk_small(ret, &arena_bin_info[binind],
arena_alloc_junk_small(ret, &bin_infos[binind],
true);
}
memset(ret, 0, usize);
@ -1636,13 +1462,13 @@ arena_dalloc_promoted(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
}
static void
arena_dissociate_bin_slab(arena_t *arena, extent_t *slab, arena_bin_t *bin) {
arena_dissociate_bin_slab(arena_t *arena, extent_t *slab, bin_t *bin) {
/* Dissociate slab from bin. */
if (slab == bin->slabcur) {
bin->slabcur = NULL;
} else {
szind_t binind = extent_szind_get(slab);
const arena_bin_info_t *bin_info = &arena_bin_info[binind];
const bin_info_t *bin_info = &bin_infos[binind];
/*
* The following block's conditional is necessary because if the
@ -1659,7 +1485,7 @@ arena_dissociate_bin_slab(arena_t *arena, extent_t *slab, arena_bin_t *bin) {
static void
arena_dalloc_bin_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
arena_bin_t *bin) {
bin_t *bin) {
assert(slab != bin->slabcur);
malloc_mutex_unlock(tsdn, &bin->lock);
@ -1673,8 +1499,8 @@ arena_dalloc_bin_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
}
static void
arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
arena_bin_t *bin) {
arena_bin_lower_slab(UNUSED tsdn_t *tsdn, arena_t *arena, extent_t *slab,
bin_t *bin) {
assert(extent_nfree_get(slab) > 0);
/*
@ -1704,14 +1530,14 @@ arena_dalloc_bin_locked_impl(tsdn_t *tsdn, arena_t *arena, extent_t *slab,
void *ptr, bool junked) {
arena_slab_data_t *slab_data = extent_slab_data_get(slab);
szind_t binind = extent_szind_get(slab);
arena_bin_t *bin = &arena->bins[binind];
const arena_bin_info_t *bin_info = &arena_bin_info[binind];
bin_t *bin = &arena->bins[binind];
const bin_info_t *bin_info = &bin_infos[binind];
if (!junked && config_fill && unlikely(opt_junk_free)) {
arena_dalloc_junk_small(ptr, bin_info);
}
arena_slab_reg_dalloc(tsdn, slab, slab_data, ptr);
arena_slab_reg_dalloc(slab, slab_data, ptr);
unsigned nfree = extent_nfree_get(slab);
if (nfree == bin_info->nregs) {
arena_dissociate_bin_slab(arena, slab, bin);
@ -1736,7 +1562,7 @@ arena_dalloc_bin_junked_locked(tsdn_t *tsdn, arena_t *arena, extent_t *extent,
static void
arena_dalloc_bin(tsdn_t *tsdn, arena_t *arena, extent_t *extent, void *ptr) {
szind_t binind = extent_szind_get(extent);
arena_bin_t *bin = &arena->bins[binind];
bin_t *bin = &arena->bins[binind];
malloc_mutex_lock(tsdn, &bin->lock);
arena_dalloc_bin_locked_impl(tsdn, arena, extent, ptr, false);
@ -1770,7 +1596,7 @@ arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
* Avoid moving the allocation if the size class can be left the
* same.
*/
assert(arena_bin_info[sz_size2index(oldsize)].reg_size ==
assert(bin_infos[sz_size2index(oldsize)].reg_size ==
oldsize);
if ((usize_max > SMALL_MAXCLASS || sz_size2index(usize_max) !=
sz_size2index(oldsize)) && (size > oldsize || usize_max <
@ -1885,6 +1711,33 @@ arena_muzzy_decay_ms_default_set(ssize_t decay_ms) {
return false;
}
bool
arena_retain_grow_limit_get_set(tsd_t *tsd, arena_t *arena, size_t *old_limit,
size_t *new_limit) {
assert(opt_retain);
pszind_t new_ind JEMALLOC_CC_SILENCE_INIT(0);
if (new_limit != NULL) {
size_t limit = *new_limit;
/* Grow no more than the new limit. */
if ((new_ind = sz_psz2ind(limit + 1) - 1) >
EXTENT_GROW_MAX_PIND) {
return true;
}
}
malloc_mutex_lock(tsd_tsdn(tsd), &arena->extent_grow_mtx);
if (old_limit != NULL) {
*old_limit = sz_pind2sz(arena->retain_grow_limit);
}
if (new_limit != NULL) {
arena->retain_grow_limit = new_ind;
}
malloc_mutex_unlock(tsd_tsdn(tsd), &arena->extent_grow_mtx);
return false;
}
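arena_retain_grow_limit_get_set backs the arena.<i>.retain_grow_limit mallctl declared later in this diff (ctl.c). A hedged usage sketch follows; the arena index and the 1 GiB value are illustrative only.

#include <jemalloc/jemalloc.h>  /* on FreeBSD, mallctl is declared in <malloc_np.h> */
#include <stdio.h>

int
main(void) {
	size_t old_limit, new_limit = (size_t)1 << 30;  /* cap growth at 1 GiB */
	size_t old_len = sizeof(old_limit);

	/* Read the current limit for arena 0 and install a new one. */
	if (mallctl("arena.0.retain_grow_limit", &old_limit, &old_len,
	    &new_limit, sizeof(new_limit)) != 0) {
		/* Fails (returns non-zero) e.g. when opt.retain is disabled. */
		fprintf(stderr, "retain_grow_limit not available\n");
		return 1;
	}
	printf("previous limit: %zu bytes\n", old_limit);
	return 0;
}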
unsigned
arena_nthreads_get(arena_t *arena, bool internal) {
return atomic_load_u(&arena->nthreads[internal], ATOMIC_RELAXED);
@ -1935,6 +1788,7 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
}
ql_new(&arena->tcache_ql);
ql_new(&arena->cache_bin_array_descriptor_ql);
if (malloc_mutex_init(&arena->tcache_ql_mtx, "tcache_ql",
WITNESS_RANK_TCACHE_QL, malloc_mutex_rank_exclusive)) {
goto label_error;
@ -2001,16 +1855,17 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
goto label_error;
}
if (arena_decay_init(&arena->decay_dirty, &arena->extents_dirty,
if (arena_decay_init(&arena->decay_dirty,
arena_dirty_decay_ms_default_get(), &arena->stats.decay_dirty)) {
goto label_error;
}
if (arena_decay_init(&arena->decay_muzzy, &arena->extents_muzzy,
if (arena_decay_init(&arena->decay_muzzy,
arena_muzzy_decay_ms_default_get(), &arena->stats.decay_muzzy)) {
goto label_error;
}
arena->extent_grow_next = sz_psz2ind(HUGEPAGE);
arena->retain_grow_limit = EXTENT_GROW_MAX_PIND;
if (malloc_mutex_init(&arena->extent_grow_mtx, "extent_grow",
WITNESS_RANK_EXTENT_GROW, malloc_mutex_rank_exclusive)) {
goto label_error;
@ -2024,17 +1879,10 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
/* Initialize bins. */
for (i = 0; i < NBINS; i++) {
arena_bin_t *bin = &arena->bins[i];
if (malloc_mutex_init(&bin->lock, "arena_bin",
WITNESS_RANK_ARENA_BIN, malloc_mutex_rank_exclusive)) {
bool err = bin_init(&arena->bins[i]);
if (err) {
goto label_error;
}
bin->slabcur = NULL;
extent_heap_new(&bin->slabs_nonfull);
extent_list_init(&bin->slabs_full);
if (config_stats) {
memset(&bin->stats, 0, sizeof(malloc_bin_stats_t));
}
}
arena->base = base;
@ -2070,6 +1918,16 @@ void
arena_boot(void) {
arena_dirty_decay_ms_default_set(opt_dirty_decay_ms);
arena_muzzy_decay_ms_default_set(opt_muzzy_decay_ms);
#define REGIND_bin_yes(index, reg_size) \
div_init(&arena_binind_div_info[(index)], (reg_size));
#define REGIND_bin_no(index, reg_size)
#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs, \
lg_delta_lookup) \
REGIND_bin_##bin(index, (1U<<lg_grp) + (ndelta << lg_delta))
SIZE_CLASSES
#undef REGIND_bin_yes
#undef REGIND_bin_no
#undef SC
}
void
@ -2115,7 +1973,7 @@ arena_prefork6(tsdn_t *tsdn, arena_t *arena) {
void
arena_prefork7(tsdn_t *tsdn, arena_t *arena) {
for (unsigned i = 0; i < NBINS; i++) {
malloc_mutex_prefork(tsdn, &arena->bins[i].lock);
bin_prefork(tsdn, &arena->bins[i]);
}
}
@ -2124,7 +1982,7 @@ arena_postfork_parent(tsdn_t *tsdn, arena_t *arena) {
unsigned i;
for (i = 0; i < NBINS; i++) {
malloc_mutex_postfork_parent(tsdn, &arena->bins[i].lock);
bin_postfork_parent(tsdn, &arena->bins[i]);
}
malloc_mutex_postfork_parent(tsdn, &arena->large_mtx);
base_postfork_parent(tsdn, arena->base);
@ -2154,15 +2012,21 @@ arena_postfork_child(tsdn_t *tsdn, arena_t *arena) {
}
if (config_stats) {
ql_new(&arena->tcache_ql);
ql_new(&arena->cache_bin_array_descriptor_ql);
tcache_t *tcache = tcache_get(tsdn_tsd(tsdn));
if (tcache != NULL && tcache->arena == arena) {
ql_elm_new(tcache, link);
ql_tail_insert(&arena->tcache_ql, tcache, link);
cache_bin_array_descriptor_init(
&tcache->cache_bin_array_descriptor,
tcache->bins_small, tcache->bins_large);
ql_tail_insert(&arena->cache_bin_array_descriptor_ql,
&tcache->cache_bin_array_descriptor, link);
}
}
for (i = 0; i < NBINS; i++) {
malloc_mutex_postfork_child(tsdn, &arena->bins[i].lock);
bin_postfork_child(tsdn, &arena->bins[i]);
}
malloc_mutex_postfork_child(tsdn, &arena->large_mtx);
base_postfork_child(tsdn, arena->base);

@ -11,12 +11,14 @@
#define BACKGROUND_THREAD_DEFAULT false
/* Read-only after initialization. */
bool opt_background_thread = BACKGROUND_THREAD_DEFAULT;
size_t opt_max_background_threads = MAX_BACKGROUND_THREAD_LIMIT;
/* Used for thread creation, termination and stats. */
malloc_mutex_t background_thread_lock;
/* Indicates global state. Atomic because decay reads this w/o locking. */
atomic_b_t background_thread_enabled_state;
size_t n_background_threads;
size_t max_background_threads;
/* Thread info per-index. */
background_thread_info_t *background_thread_info;
@ -30,19 +32,20 @@ bool can_enable_background_thread;
static int (*pthread_create_fptr)(pthread_t *__restrict, const pthread_attr_t *,
void *(*)(void *), void *__restrict);
static pthread_once_t once_control = PTHREAD_ONCE_INIT;
static void
pthread_create_wrapper_once(void) {
pthread_create_wrapper_init(void) {
#ifdef JEMALLOC_LAZY_LOCK
isthreaded = true;
if (!isthreaded) {
isthreaded = true;
}
#endif
}
int
pthread_create_wrapper(pthread_t *__restrict thread, const pthread_attr_t *attr,
void *(*start_routine)(void *), void *__restrict arg) {
pthread_once(&once_control, pthread_create_wrapper_once);
pthread_create_wrapper_init();
return pthread_create_fptr(thread, attr, start_routine, arg);
}
@ -286,7 +289,7 @@ background_work_sleep_once(tsdn_t *tsdn, background_thread_info_t *info, unsigne
uint64_t min_interval = BACKGROUND_THREAD_INDEFINITE_SLEEP;
unsigned narenas = narenas_total_get();
for (unsigned i = ind; i < narenas; i += ncpus) {
for (unsigned i = ind; i < narenas; i += max_background_threads) {
arena_t *arena = arena_get(tsdn, i, false);
if (!arena) {
continue;
@ -379,35 +382,32 @@ background_thread_create_signals_masked(pthread_t *thread,
return create_err;
}
static void
static bool
check_background_thread_creation(tsd_t *tsd, unsigned *n_created,
bool *created_threads) {
bool ret = false;
if (likely(*n_created == n_background_threads)) {
return;
return ret;
}
malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_info[0].mtx);
label_restart:
malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock);
for (unsigned i = 1; i < ncpus; i++) {
tsdn_t *tsdn = tsd_tsdn(tsd);
malloc_mutex_unlock(tsdn, &background_thread_info[0].mtx);
for (unsigned i = 1; i < max_background_threads; i++) {
if (created_threads[i]) {
continue;
}
background_thread_info_t *info = &background_thread_info[i];
malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx);
assert(info->state != background_thread_paused);
malloc_mutex_lock(tsdn, &info->mtx);
/*
* In case of the background_thread_paused state because of
* arena reset, delay the creation.
*/
bool create = (info->state == background_thread_started);
malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx);
malloc_mutex_unlock(tsdn, &info->mtx);
if (!create) {
continue;
}
/*
* To avoid deadlock with prefork handlers (which waits for the
* mutex held here), unlock before calling pthread_create().
*/
malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock);
pre_reentrancy(tsd, NULL);
int err = background_thread_create_signals_masked(&info->thread,
NULL, background_thread_entry, (void *)(uintptr_t)i);
@ -423,19 +423,21 @@ check_background_thread_creation(tsd_t *tsd, unsigned *n_created,
abort();
}
}
/* Restart since we unlocked. */
goto label_restart;
/* Return to restart the loop since we unlocked. */
ret = true;
break;
}
malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_info[0].mtx);
malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock);
malloc_mutex_lock(tsdn, &background_thread_info[0].mtx);
return ret;
}
static void
background_thread0_work(tsd_t *tsd) {
/* Thread0 is also responsible for launching / terminating threads. */
VARIABLE_ARRAY(bool, created_threads, ncpus);
VARIABLE_ARRAY(bool, created_threads, max_background_threads);
unsigned i;
for (i = 1; i < ncpus; i++) {
for (i = 1; i < max_background_threads; i++) {
created_threads[i] = false;
}
/* Start working, and create more threads when asked. */
@ -445,8 +447,10 @@ background_thread0_work(tsd_t *tsd) {
&background_thread_info[0])) {
continue;
}
check_background_thread_creation(tsd, &n_created,
(bool *)&created_threads);
if (check_background_thread_creation(tsd, &n_created,
(bool *)&created_threads)) {
continue;
}
background_work_sleep_once(tsd_tsdn(tsd),
&background_thread_info[0], 0);
}
@ -456,15 +460,20 @@ background_thread0_work(tsd_t *tsd) {
* the global background_thread mutex (and is waiting) for us.
*/
assert(!background_thread_enabled());
for (i = 1; i < ncpus; i++) {
for (i = 1; i < max_background_threads; i++) {
background_thread_info_t *info = &background_thread_info[i];
assert(info->state != background_thread_paused);
if (created_threads[i]) {
background_threads_disable_single(tsd, info);
} else {
malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx);
/* Clear in case the thread wasn't created. */
info->state = background_thread_stopped;
if (info->state != background_thread_stopped) {
/* The thread was not created. */
assert(info->state ==
background_thread_started);
n_background_threads--;
info->state = background_thread_stopped;
}
malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx);
}
}
@ -498,7 +507,7 @@ background_work(tsd_t *tsd, unsigned ind) {
static void *
background_thread_entry(void *ind_arg) {
unsigned thread_ind = (unsigned)(uintptr_t)ind_arg;
assert(thread_ind < ncpus);
assert(thread_ind < max_background_threads);
#ifdef JEMALLOC_HAVE_PTHREAD_SETNAME_NP
pthread_setname_np(pthread_self(), "jemalloc_bg_thd");
#endif
@ -532,7 +541,7 @@ background_thread_create(tsd_t *tsd, unsigned arena_ind) {
malloc_mutex_assert_owner(tsd_tsdn(tsd), &background_thread_lock);
/* We create at most NCPUs threads. */
size_t thread_ind = arena_ind % ncpus;
size_t thread_ind = arena_ind % max_background_threads;
background_thread_info_t *info = &background_thread_info[thread_ind];
bool need_new_thread;
@ -586,26 +595,29 @@ background_threads_enable(tsd_t *tsd) {
assert(background_thread_enabled());
malloc_mutex_assert_owner(tsd_tsdn(tsd), &background_thread_lock);
VARIABLE_ARRAY(bool, marked, ncpus);
VARIABLE_ARRAY(bool, marked, max_background_threads);
unsigned i, nmarked;
for (i = 0; i < ncpus; i++) {
for (i = 0; i < max_background_threads; i++) {
marked[i] = false;
}
nmarked = 0;
/* Thread 0 is required and created at the end. */
marked[0] = true;
/* Mark the threads we need to create for thread 0. */
unsigned n = narenas_total_get();
for (i = 1; i < n; i++) {
if (marked[i % ncpus] ||
if (marked[i % max_background_threads] ||
arena_get(tsd_tsdn(tsd), i, false) == NULL) {
continue;
}
background_thread_info_t *info = &background_thread_info[i];
background_thread_info_t *info = &background_thread_info[
i % max_background_threads];
malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx);
assert(info->state == background_thread_stopped);
background_thread_init(tsd, info);
malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx);
marked[i % ncpus] = true;
if (++nmarked == ncpus) {
marked[i % max_background_threads] = true;
if (++nmarked == max_background_threads) {
break;
}
}
@ -720,14 +732,14 @@ background_thread_prefork0(tsdn_t *tsdn) {
void
background_thread_prefork1(tsdn_t *tsdn) {
for (unsigned i = 0; i < ncpus; i++) {
for (unsigned i = 0; i < max_background_threads; i++) {
malloc_mutex_prefork(tsdn, &background_thread_info[i].mtx);
}
}
void
background_thread_postfork_parent(tsdn_t *tsdn) {
for (unsigned i = 0; i < ncpus; i++) {
for (unsigned i = 0; i < max_background_threads; i++) {
malloc_mutex_postfork_parent(tsdn,
&background_thread_info[i].mtx);
}
@ -736,7 +748,7 @@ background_thread_postfork_parent(tsdn_t *tsdn) {
void
background_thread_postfork_child(tsdn_t *tsdn) {
for (unsigned i = 0; i < ncpus; i++) {
for (unsigned i = 0; i < max_background_threads; i++) {
malloc_mutex_postfork_child(tsdn,
&background_thread_info[i].mtx);
}
@ -749,7 +761,7 @@ background_thread_postfork_child(tsdn_t *tsdn) {
malloc_mutex_lock(tsdn, &background_thread_lock);
n_background_threads = 0;
background_thread_enabled_set(tsdn, false);
for (unsigned i = 0; i < ncpus; i++) {
for (unsigned i = 0; i < max_background_threads; i++) {
background_thread_info_t *info = &background_thread_info[i];
malloc_mutex_lock(tsdn, &info->mtx);
info->state = background_thread_stopped;
@ -773,7 +785,7 @@ background_thread_stats_read(tsdn_t *tsdn, background_thread_stats_t *stats) {
stats->num_threads = n_background_threads;
uint64_t num_runs = 0;
nstime_init(&stats->run_interval, 0);
for (unsigned i = 0; i < ncpus; i++) {
for (unsigned i = 0; i < max_background_threads; i++) {
background_thread_info_t *info = &background_thread_info[i];
malloc_mutex_lock(tsdn, &info->mtx);
if (info->state != background_thread_stopped) {
@ -795,6 +807,26 @@ background_thread_stats_read(tsdn_t *tsdn, background_thread_stats_t *stats) {
#undef BILLION
#undef BACKGROUND_THREAD_MIN_INTERVAL_NS
static bool
pthread_create_fptr_init(void) {
if (pthread_create_fptr != NULL) {
return false;
}
pthread_create_fptr = dlsym(RTLD_NEXT, "pthread_create");
if (pthread_create_fptr == NULL) {
can_enable_background_thread = false;
if (config_lazy_lock || opt_background_thread) {
malloc_write("<jemalloc>: Error in dlsym(RTLD_NEXT, "
"\"pthread_create\")\n");
abort();
}
} else {
can_enable_background_thread = true;
}
return false;
}
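pthread_create_fptr_init resolves the real pthread_create lazily through dlsym(RTLD_NEXT, ...). The following standalone sketch shows the same interposition pattern in isolation (a glibc-style environment is assumed; link with -ldl; the names are illustrative):

#define _GNU_SOURCE
#include <dlfcn.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

typedef int (*pthread_create_fn)(pthread_t *, const pthread_attr_t *,
    void *(*)(void *), void *);

static pthread_create_fn real_pthread_create;

/* Our wrapper: note that the process is now threaded, then forward. */
int
pthread_create(pthread_t *thread, const pthread_attr_t *attr,
    void *(*start_routine)(void *), void *arg) {
	if (real_pthread_create == NULL) {
		real_pthread_create = (pthread_create_fn)dlsym(RTLD_NEXT,
		    "pthread_create");
		if (real_pthread_create == NULL) {
			fprintf(stderr,
			    "dlsym(RTLD_NEXT, \"pthread_create\") failed\n");
			abort();
		}
	}
	/* ... flip an "isthreaded" flag here, as the wrapper above does ... */
	return real_pthread_create(thread, attr, start_routine, arg);
}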
/*
* When lazy lock is enabled, we need to make sure setting isthreaded before
* taking any background_thread locks. This is called early in ctl (instead of
@ -805,7 +837,8 @@ void
background_thread_ctl_init(tsdn_t *tsdn) {
malloc_mutex_assert_not_owner(tsdn, &background_thread_lock);
#ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER
pthread_once(&once_control, pthread_create_wrapper_once);
pthread_create_fptr_init();
pthread_create_wrapper_init();
#endif
}
@ -818,18 +851,10 @@ background_thread_boot0(void) {
"supports pthread only\n");
return true;
}
#ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER
pthread_create_fptr = dlsym(RTLD_NEXT, "pthread_create");
if (pthread_create_fptr == NULL) {
can_enable_background_thread = false;
if (config_lazy_lock || opt_background_thread) {
malloc_write("<jemalloc>: Error in dlsym(RTLD_NEXT, "
"\"pthread_create\")\n");
abort();
}
} else {
can_enable_background_thread = true;
if ((config_lazy_lock || opt_background_thread) &&
pthread_create_fptr_init()) {
return true;
}
#endif
return false;
@ -841,6 +866,12 @@ background_thread_boot1(tsdn_t *tsdn) {
assert(have_background_thread);
assert(narenas_total_get() > 0);
if (opt_max_background_threads == MAX_BACKGROUND_THREAD_LIMIT &&
ncpus < MAX_BACKGROUND_THREAD_LIMIT) {
opt_max_background_threads = ncpus;
}
max_background_threads = opt_max_background_threads;
background_thread_enabled_set(tsdn, opt_background_thread);
if (malloc_mutex_init(&background_thread_lock,
"background_thread_global",
@ -848,17 +879,15 @@ background_thread_boot1(tsdn_t *tsdn) {
malloc_mutex_rank_exclusive)) {
return true;
}
if (opt_background_thread) {
background_thread_ctl_init(tsdn);
}
background_thread_info = (background_thread_info_t *)base_alloc(tsdn,
b0get(), ncpus * sizeof(background_thread_info_t), CACHELINE);
b0get(), opt_max_background_threads *
sizeof(background_thread_info_t), CACHELINE);
if (background_thread_info == NULL) {
return true;
}
for (unsigned i = 0; i < ncpus; i++) {
for (unsigned i = 0; i < max_background_threads; i++) {
background_thread_info_t *info = &background_thread_info[i];
/* Thread mutex is rank_inclusive because of thread0. */
if (malloc_mutex_init(&info->mtx, "background_thread",

@ -10,25 +10,40 @@
/******************************************************************************/
/* Data. */
static base_t *b0;
static base_t *b0;
metadata_thp_mode_t opt_metadata_thp = METADATA_THP_DEFAULT;
const char *metadata_thp_mode_names[] = {
"disabled",
"auto",
"always"
};
/******************************************************************************/
static inline bool
metadata_thp_madvise(void) {
return (metadata_thp_enabled() &&
(init_system_thp_mode == thp_mode_default));
}
static void *
base_map(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, size_t size) {
void *addr;
bool zero = true;
bool commit = true;
/* Use huge page sizes and alignment regardless of opt_metadata_thp. */
assert(size == HUGEPAGE_CEILING(size));
size_t alignment = HUGEPAGE;
if (extent_hooks == &extent_hooks_default) {
addr = extent_alloc_mmap(NULL, size, PAGE, &zero, &commit);
addr = extent_alloc_mmap(NULL, size, alignment, &zero, &commit);
} else {
/* No arena context as we are creating new arenas. */
tsd_t *tsd = tsdn_null(tsdn) ? tsd_fetch() : tsdn_tsd(tsdn);
pre_reentrancy(tsd, NULL);
addr = extent_hooks->alloc(extent_hooks, NULL, size, PAGE,
addr = extent_hooks->alloc(extent_hooks, NULL, size, alignment,
&zero, &commit, ind);
post_reentrancy(tsd);
}
@ -51,16 +66,16 @@ base_unmap(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, void *addr,
*/
if (extent_hooks == &extent_hooks_default) {
if (!extent_dalloc_mmap(addr, size)) {
return;
goto label_done;
}
if (!pages_decommit(addr, size)) {
return;
goto label_done;
}
if (!pages_purge_forced(addr, size)) {
return;
goto label_done;
}
if (!pages_purge_lazy(addr, size)) {
return;
goto label_done;
}
/* Nothing worked. This should never happen. */
not_reached();
@ -70,27 +85,33 @@ base_unmap(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, void *addr,
if (extent_hooks->dalloc != NULL &&
!extent_hooks->dalloc(extent_hooks, addr, size, true,
ind)) {
goto label_done;
goto label_post_reentrancy;
}
if (extent_hooks->decommit != NULL &&
!extent_hooks->decommit(extent_hooks, addr, size, 0, size,
ind)) {
goto label_done;
goto label_post_reentrancy;
}
if (extent_hooks->purge_forced != NULL &&
!extent_hooks->purge_forced(extent_hooks, addr, size, 0,
size, ind)) {
goto label_done;
goto label_post_reentrancy;
}
if (extent_hooks->purge_lazy != NULL &&
!extent_hooks->purge_lazy(extent_hooks, addr, size, 0, size,
ind)) {
goto label_done;
goto label_post_reentrancy;
}
/* Nothing worked. That's the application's problem. */
label_done:
label_post_reentrancy:
post_reentrancy(tsd);
return;
}
label_done:
if (metadata_thp_madvise()) {
/* Set NOHUGEPAGE after unmap to avoid kernel defrag. */
assert(((uintptr_t)addr & HUGEPAGE_MASK) == 0 &&
(size & HUGEPAGE_MASK) == 0);
pages_nohuge(addr, size);
}
}
@ -105,6 +126,56 @@ base_extent_init(size_t *extent_sn_next, extent_t *extent, void *addr,
extent_binit(extent, addr, size, sn);
}
static size_t
base_get_num_blocks(base_t *base, bool with_new_block) {
base_block_t *b = base->blocks;
assert(b != NULL);
size_t n_blocks = with_new_block ? 2 : 1;
while (b->next != NULL) {
n_blocks++;
b = b->next;
}
return n_blocks;
}
static void
base_auto_thp_switch(tsdn_t *tsdn, base_t *base) {
assert(opt_metadata_thp == metadata_thp_auto);
malloc_mutex_assert_owner(tsdn, &base->mtx);
if (base->auto_thp_switched) {
return;
}
/* Called when adding a new block. */
bool should_switch;
if (base_ind_get(base) != 0) {
should_switch = (base_get_num_blocks(base, true) ==
BASE_AUTO_THP_THRESHOLD);
} else {
should_switch = (base_get_num_blocks(base, true) ==
BASE_AUTO_THP_THRESHOLD_A0);
}
if (!should_switch) {
return;
}
base->auto_thp_switched = true;
assert(!config_stats || base->n_thp == 0);
/* Make the initial blocks THP lazily. */
base_block_t *block = base->blocks;
while (block != NULL) {
assert((block->size & HUGEPAGE_MASK) == 0);
pages_huge(block, block->size);
if (config_stats) {
base->n_thp += HUGEPAGE_CEILING(block->size -
extent_bsize_get(&block->extent)) >> LG_HUGEPAGE;
}
block = block->next;
assert(block == NULL || (base_ind_get(base) == 0));
}
}
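base_auto_thp_switch retroactively applies transparent huge pages to the existing metadata blocks through pages_huge, which is not shown in this hunk; on Linux it presumably reduces to madvise(MADV_HUGEPAGE) on a hugepage-aligned range, roughly as sketched below (the 2 MiB page size is an assumption):

#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>

#define TOY_HUGEPAGE ((size_t)2 << 20)  /* assume 2 MiB huge pages */

/* Ask the kernel to back [addr, addr+size) with transparent huge pages. */
static int
toy_pages_huge(void *addr, size_t size) {
	/* Both addr and size are expected to be hugepage-aligned. */
	if (((uintptr_t)addr | size) & (TOY_HUGEPAGE - 1)) {
		return -1;
	}
#ifdef MADV_HUGEPAGE
	return madvise(addr, size, MADV_HUGEPAGE);
#else
	(void)addr; (void)size;
	return -1;  /* THP madvise not available on this platform */
#endif
}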
static void *
base_extent_bump_alloc_helper(extent_t *extent, size_t *gap_size, size_t size,
size_t alignment) {
@ -124,8 +195,8 @@ base_extent_bump_alloc_helper(extent_t *extent, size_t *gap_size, size_t size,
}
static void
base_extent_bump_alloc_post(tsdn_t *tsdn, base_t *base, extent_t *extent,
size_t gap_size, void *addr, size_t size) {
base_extent_bump_alloc_post(base_t *base, extent_t *extent, size_t gap_size,
void *addr, size_t size) {
if (extent_bsize_get(extent) > 0) {
/*
* Compute the index for the largest size class that does not
@ -140,23 +211,31 @@ base_extent_bump_alloc_post(tsdn_t *tsdn, base_t *base, extent_t *extent,
base->allocated += size;
/*
* Add one PAGE to base_resident for every page boundary that is
* crossed by the new allocation.
* crossed by the new allocation. Adjust n_thp similarly when
* metadata_thp is enabled.
*/
base->resident += PAGE_CEILING((uintptr_t)addr + size) -
PAGE_CEILING((uintptr_t)addr - gap_size);
assert(base->allocated <= base->resident);
assert(base->resident <= base->mapped);
if (metadata_thp_madvise() && (opt_metadata_thp ==
metadata_thp_always || base->auto_thp_switched)) {
base->n_thp += (HUGEPAGE_CEILING((uintptr_t)addr + size)
- HUGEPAGE_CEILING((uintptr_t)addr - gap_size)) >>
LG_HUGEPAGE;
assert(base->mapped >= base->n_thp << LG_HUGEPAGE);
}
}
}
static void *
base_extent_bump_alloc(tsdn_t *tsdn, base_t *base, extent_t *extent,
size_t size, size_t alignment) {
base_extent_bump_alloc(base_t *base, extent_t *extent, size_t size,
size_t alignment) {
void *ret;
size_t gap_size;
ret = base_extent_bump_alloc_helper(extent, &gap_size, size, alignment);
base_extent_bump_alloc_post(tsdn, base, extent, gap_size, ret, size);
base_extent_bump_alloc_post(base, extent, gap_size, ret, size);
return ret;
}
@ -166,8 +245,8 @@ base_extent_bump_alloc(tsdn_t *tsdn, base_t *base, extent_t *extent,
* On success a pointer to the initialized base_block_t header is returned.
*/
static base_block_t *
base_block_alloc(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind,
pszind_t *pind_last, size_t *extent_sn_next, size_t size,
base_block_alloc(tsdn_t *tsdn, base_t *base, extent_hooks_t *extent_hooks,
unsigned ind, pszind_t *pind_last, size_t *extent_sn_next, size_t size,
size_t alignment) {
alignment = ALIGNMENT_CEILING(alignment, QUANTUM);
size_t usize = ALIGNMENT_CEILING(size, alignment);
@ -193,6 +272,25 @@ base_block_alloc(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind,
if (block == NULL) {
return NULL;
}
if (metadata_thp_madvise()) {
void *addr = (void *)block;
assert(((uintptr_t)addr & HUGEPAGE_MASK) == 0 &&
(block_size & HUGEPAGE_MASK) == 0);
if (opt_metadata_thp == metadata_thp_always) {
pages_huge(addr, block_size);
} else if (opt_metadata_thp == metadata_thp_auto &&
base != NULL) {
/* base != NULL indicates this is not a new base. */
malloc_mutex_lock(tsdn, &base->mtx);
base_auto_thp_switch(tsdn, base);
if (base->auto_thp_switched) {
pages_huge(addr, block_size);
}
malloc_mutex_unlock(tsdn, &base->mtx);
}
}
*pind_last = sz_psz2ind(block_size);
block->size = block_size;
block->next = NULL;
@ -216,7 +314,7 @@ base_extent_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment) {
* called.
*/
malloc_mutex_unlock(tsdn, &base->mtx);
base_block_t *block = base_block_alloc(tsdn, extent_hooks,
base_block_t *block = base_block_alloc(tsdn, base, extent_hooks,
base_ind_get(base), &base->pind_last, &base->extent_sn_next, size,
alignment);
malloc_mutex_lock(tsdn, &base->mtx);
@ -229,8 +327,16 @@ base_extent_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment) {
base->allocated += sizeof(base_block_t);
base->resident += PAGE_CEILING(sizeof(base_block_t));
base->mapped += block->size;
if (metadata_thp_madvise() &&
!(opt_metadata_thp == metadata_thp_auto
&& !base->auto_thp_switched)) {
assert(base->n_thp > 0);
base->n_thp += HUGEPAGE_CEILING(sizeof(base_block_t)) >>
LG_HUGEPAGE;
}
assert(base->allocated <= base->resident);
assert(base->resident <= base->mapped);
assert(base->n_thp << LG_HUGEPAGE <= base->mapped);
}
return &block->extent;
}
@ -244,7 +350,7 @@ base_t *
base_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
pszind_t pind_last = 0;
size_t extent_sn_next = 0;
base_block_t *block = base_block_alloc(tsdn, extent_hooks, ind,
base_block_t *block = base_block_alloc(tsdn, NULL, extent_hooks, ind,
&pind_last, &extent_sn_next, sizeof(base_t), QUANTUM);
if (block == NULL) {
return NULL;
@ -265,6 +371,7 @@ base_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
base->pind_last = pind_last;
base->extent_sn_next = extent_sn_next;
base->blocks = block;
base->auto_thp_switched = false;
for (szind_t i = 0; i < NSIZES; i++) {
extent_heap_new(&base->avail[i]);
}
@ -272,10 +379,14 @@ base_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
base->allocated = sizeof(base_block_t);
base->resident = PAGE_CEILING(sizeof(base_block_t));
base->mapped = block->size;
base->n_thp = (opt_metadata_thp == metadata_thp_always) &&
metadata_thp_madvise() ? HUGEPAGE_CEILING(sizeof(base_block_t))
>> LG_HUGEPAGE : 0;
assert(base->allocated <= base->resident);
assert(base->resident <= base->mapped);
assert(base->n_thp << LG_HUGEPAGE <= base->mapped);
}
base_extent_bump_alloc_post(tsdn, base, &block->extent, gap_size, base,
base_extent_bump_alloc_post(base, &block->extent, gap_size, base,
base_size);
return base;
@ -332,7 +443,7 @@ base_alloc_impl(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment,
goto label_return;
}
ret = base_extent_bump_alloc(tsdn, base, extent, usize, alignment);
ret = base_extent_bump_alloc(base, extent, usize, alignment);
if (esn != NULL) {
*esn = extent_sn_get(extent);
}
@ -368,7 +479,7 @@ base_alloc_extent(tsdn_t *tsdn, base_t *base) {
void
base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated, size_t *resident,
size_t *mapped) {
size_t *mapped, size_t *n_thp) {
cassert(config_stats);
malloc_mutex_lock(tsdn, &base->mtx);
@ -377,6 +488,7 @@ base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated, size_t *resident,
*allocated = base->allocated;
*resident = base->resident;
*mapped = base->mapped;
*n_thp = base->n_thp;
malloc_mutex_unlock(tsdn, &base->mtx);
}

@ -0,0 +1,50 @@
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"
#include "jemalloc/internal/bin.h"
#include "jemalloc/internal/witness.h"
const bin_info_t bin_infos[NBINS] = {
#define BIN_INFO_bin_yes(reg_size, slab_size, nregs) \
{reg_size, slab_size, nregs, BITMAP_INFO_INITIALIZER(nregs)},
#define BIN_INFO_bin_no(reg_size, slab_size, nregs)
#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs, \
lg_delta_lookup) \
BIN_INFO_bin_##bin((1U<<lg_grp) + (ndelta<<lg_delta), \
(pgs << LG_PAGE), (pgs << LG_PAGE) / ((1U<<lg_grp) + \
(ndelta<<lg_delta)))
SIZE_CLASSES
#undef BIN_INFO_bin_yes
#undef BIN_INFO_bin_no
#undef SC
};
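bin_infos is generated from the SIZE_CLASSES X-macro, the same pattern the diff uses for the REGIND_* div_init registration in arena_boot. A toy, self-contained illustration of the X-macro technique, unrelated to jemalloc's real size-class list:

#include <stdio.h>

/* One central list of entries; each consumer redefines ROW to taste. */
#define TOY_SIZE_CLASSES \
	ROW(0,  8)           \
	ROW(1, 16)           \
	ROW(2, 32)

typedef struct { int index; int reg_size; } toy_bin_info_t;

/* Expansion 1: build a static table from the list. */
static const toy_bin_info_t toy_bin_infos[] = {
#define ROW(index, reg_size) {index, reg_size},
	TOY_SIZE_CLASSES
#undef ROW
};

/* Expansion 2: the same list drives boot-time code generation. */
static void
toy_boot(void) {
#define ROW(index, reg_size) \
	printf("init divisor for class %d (size %d)\n", index, reg_size);
	TOY_SIZE_CLASSES
#undef ROW
}

int
main(void) {
	toy_boot();
	printf("class 1 region size: %d\n", toy_bin_infos[1].reg_size);
	return 0;
}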
bool
bin_init(bin_t *bin) {
if (malloc_mutex_init(&bin->lock, "bin", WITNESS_RANK_BIN,
malloc_mutex_rank_exclusive)) {
return true;
}
bin->slabcur = NULL;
extent_heap_new(&bin->slabs_nonfull);
extent_list_init(&bin->slabs_full);
if (config_stats) {
memset(&bin->stats, 0, sizeof(bin_stats_t));
}
return false;
}
void
bin_prefork(tsdn_t *tsdn, bin_t *bin) {
malloc_mutex_prefork(tsdn, &bin->lock);
}
void
bin_postfork_parent(tsdn_t *tsdn, bin_t *bin) {
malloc_mutex_postfork_parent(tsdn, &bin->lock);
}
void
bin_postfork_child(tsdn_t *tsdn, bin_t *bin) {
malloc_mutex_postfork_child(tsdn, &bin->lock);
}

View File

@ -57,6 +57,7 @@ static const ctl_named_node_t *n##_index(tsdn_t *tsdn, \
CTL_PROTO(version)
CTL_PROTO(epoch)
CTL_PROTO(background_thread)
CTL_PROTO(max_background_threads)
CTL_PROTO(thread_tcache_enabled)
CTL_PROTO(thread_tcache_flush)
CTL_PROTO(thread_prof_name)
@ -75,16 +76,17 @@ CTL_PROTO(config_prof)
CTL_PROTO(config_prof_libgcc)
CTL_PROTO(config_prof_libunwind)
CTL_PROTO(config_stats)
CTL_PROTO(config_thp)
CTL_PROTO(config_utrace)
CTL_PROTO(config_xmalloc)
CTL_PROTO(opt_abort)
CTL_PROTO(opt_abort_conf)
CTL_PROTO(opt_metadata_thp)
CTL_PROTO(opt_retain)
CTL_PROTO(opt_dss)
CTL_PROTO(opt_narenas)
CTL_PROTO(opt_percpu_arena)
CTL_PROTO(opt_background_thread)
CTL_PROTO(opt_max_background_threads)
CTL_PROTO(opt_dirty_decay_ms)
CTL_PROTO(opt_muzzy_decay_ms)
CTL_PROTO(opt_stats_print)
@ -94,6 +96,8 @@ CTL_PROTO(opt_zero)
CTL_PROTO(opt_utrace)
CTL_PROTO(opt_xmalloc)
CTL_PROTO(opt_tcache)
CTL_PROTO(opt_thp)
CTL_PROTO(opt_lg_extent_max_active_fit)
CTL_PROTO(opt_lg_tcache_max)
CTL_PROTO(opt_prof)
CTL_PROTO(opt_prof_prefix)
@ -117,6 +121,7 @@ CTL_PROTO(arena_i_dss)
CTL_PROTO(arena_i_dirty_decay_ms)
CTL_PROTO(arena_i_muzzy_decay_ms)
CTL_PROTO(arena_i_extent_hooks)
CTL_PROTO(arena_i_retain_grow_limit)
INDEX_PROTO(arena_i)
CTL_PROTO(arenas_bin_i_size)
CTL_PROTO(arenas_bin_i_nregs)
@ -134,6 +139,7 @@ CTL_PROTO(arenas_nbins)
CTL_PROTO(arenas_nhbins)
CTL_PROTO(arenas_nlextents)
CTL_PROTO(arenas_create)
CTL_PROTO(arenas_lookup)
CTL_PROTO(prof_thread_active_init)
CTL_PROTO(prof_active)
CTL_PROTO(prof_dump)
@ -182,6 +188,7 @@ CTL_PROTO(stats_arenas_i_muzzy_nmadvise)
CTL_PROTO(stats_arenas_i_muzzy_purged)
CTL_PROTO(stats_arenas_i_base)
CTL_PROTO(stats_arenas_i_internal)
CTL_PROTO(stats_arenas_i_metadata_thp)
CTL_PROTO(stats_arenas_i_tcache_bytes)
CTL_PROTO(stats_arenas_i_resident)
INDEX_PROTO(stats_arenas_i)
@ -191,6 +198,7 @@ CTL_PROTO(stats_background_thread_num_threads)
CTL_PROTO(stats_background_thread_num_runs)
CTL_PROTO(stats_background_thread_run_interval)
CTL_PROTO(stats_metadata)
CTL_PROTO(stats_metadata_thp)
CTL_PROTO(stats_resident)
CTL_PROTO(stats_mapped)
CTL_PROTO(stats_retained)
@ -266,7 +274,6 @@ static const ctl_named_node_t config_node[] = {
{NAME("prof_libgcc"), CTL(config_prof_libgcc)},
{NAME("prof_libunwind"), CTL(config_prof_libunwind)},
{NAME("stats"), CTL(config_stats)},
{NAME("thp"), CTL(config_thp)},
{NAME("utrace"), CTL(config_utrace)},
{NAME("xmalloc"), CTL(config_xmalloc)}
};
@ -274,11 +281,13 @@ static const ctl_named_node_t config_node[] = {
static const ctl_named_node_t opt_node[] = {
{NAME("abort"), CTL(opt_abort)},
{NAME("abort_conf"), CTL(opt_abort_conf)},
{NAME("metadata_thp"), CTL(opt_metadata_thp)},
{NAME("retain"), CTL(opt_retain)},
{NAME("dss"), CTL(opt_dss)},
{NAME("narenas"), CTL(opt_narenas)},
{NAME("percpu_arena"), CTL(opt_percpu_arena)},
{NAME("background_thread"), CTL(opt_background_thread)},
{NAME("max_background_threads"), CTL(opt_max_background_threads)},
{NAME("dirty_decay_ms"), CTL(opt_dirty_decay_ms)},
{NAME("muzzy_decay_ms"), CTL(opt_muzzy_decay_ms)},
{NAME("stats_print"), CTL(opt_stats_print)},
@ -288,6 +297,8 @@ static const ctl_named_node_t opt_node[] = {
{NAME("utrace"), CTL(opt_utrace)},
{NAME("xmalloc"), CTL(opt_xmalloc)},
{NAME("tcache"), CTL(opt_tcache)},
{NAME("thp"), CTL(opt_thp)},
{NAME("lg_extent_max_active_fit"), CTL(opt_lg_extent_max_active_fit)},
{NAME("lg_tcache_max"), CTL(opt_lg_tcache_max)},
{NAME("prof"), CTL(opt_prof)},
{NAME("prof_prefix"), CTL(opt_prof_prefix)},
@ -316,7 +327,8 @@ static const ctl_named_node_t arena_i_node[] = {
{NAME("dss"), CTL(arena_i_dss)},
{NAME("dirty_decay_ms"), CTL(arena_i_dirty_decay_ms)},
{NAME("muzzy_decay_ms"), CTL(arena_i_muzzy_decay_ms)},
{NAME("extent_hooks"), CTL(arena_i_extent_hooks)}
{NAME("extent_hooks"), CTL(arena_i_extent_hooks)},
{NAME("retain_grow_limit"), CTL(arena_i_retain_grow_limit)}
};
static const ctl_named_node_t super_arena_i_node[] = {
{NAME(""), CHILD(named, arena_i)}
@ -362,7 +374,8 @@ static const ctl_named_node_t arenas_node[] = {
{NAME("bin"), CHILD(indexed, arenas_bin)},
{NAME("nlextents"), CTL(arenas_nlextents)},
{NAME("lextent"), CHILD(indexed, arenas_lextent)},
{NAME("create"), CTL(arenas_create)}
{NAME("create"), CTL(arenas_create)},
{NAME("lookup"), CTL(arenas_lookup)}
};
static const ctl_named_node_t prof_node[] = {
@ -474,6 +487,7 @@ static const ctl_named_node_t stats_arenas_i_node[] = {
{NAME("muzzy_purged"), CTL(stats_arenas_i_muzzy_purged)},
{NAME("base"), CTL(stats_arenas_i_base)},
{NAME("internal"), CTL(stats_arenas_i_internal)},
{NAME("metadata_thp"), CTL(stats_arenas_i_metadata_thp)},
{NAME("tcache_bytes"), CTL(stats_arenas_i_tcache_bytes)},
{NAME("resident"), CTL(stats_arenas_i_resident)},
{NAME("small"), CHILD(named, stats_arenas_i_small)},
@ -512,6 +526,7 @@ static const ctl_named_node_t stats_node[] = {
{NAME("allocated"), CTL(stats_allocated)},
{NAME("active"), CTL(stats_active)},
{NAME("metadata"), CTL(stats_metadata)},
{NAME("metadata_thp"), CTL(stats_metadata_thp)},
{NAME("resident"), CTL(stats_resident)},
{NAME("mapped"), CTL(stats_mapped)},
{NAME("retained"), CTL(stats_retained)},
@ -525,6 +540,7 @@ static const ctl_named_node_t root_node[] = {
{NAME("version"), CTL(version)},
{NAME("epoch"), CTL(epoch)},
{NAME("background_thread"), CTL(background_thread)},
{NAME("max_background_threads"), CTL(max_background_threads)},
{NAME("thread"), CHILD(named, thread)},
{NAME("config"), CHILD(named, config)},
{NAME("opt"), CHILD(named, opt)},
@ -550,7 +566,7 @@ static const ctl_named_node_t super_root_node[] = {
* synchronized by the ctl mutex.
*/
static void
accum_arena_stats_u64(arena_stats_u64_t *dst, arena_stats_u64_t *src) {
ctl_accum_arena_stats_u64(arena_stats_u64_t *dst, arena_stats_u64_t *src) {
#ifdef JEMALLOC_ATOMIC_U64
uint64_t cur_dst = atomic_load_u64(dst, ATOMIC_RELAXED);
uint64_t cur_src = atomic_load_u64(src, ATOMIC_RELAXED);
@ -562,7 +578,7 @@ accum_arena_stats_u64(arena_stats_u64_t *dst, arena_stats_u64_t *src) {
/* Likewise: with ctl mutex synchronization, reading is simple. */
static uint64_t
arena_stats_read_u64(arena_stats_u64_t *p) {
ctl_arena_stats_read_u64(arena_stats_u64_t *p) {
#ifdef JEMALLOC_ATOMIC_U64
return atomic_load_u64(p, ATOMIC_RELAXED);
#else
@ -570,7 +586,8 @@ arena_stats_read_u64(arena_stats_u64_t *p) {
#endif
}
static void accum_atomic_zu(atomic_zu_t *dst, atomic_zu_t *src) {
static void
accum_atomic_zu(atomic_zu_t *dst, atomic_zu_t *src) {
size_t cur_dst = atomic_load_zu(dst, ATOMIC_RELAXED);
size_t cur_src = atomic_load_zu(src, ATOMIC_RELAXED);
atomic_store_zu(dst, cur_dst + cur_src, ATOMIC_RELAXED);
@ -680,9 +697,9 @@ ctl_arena_clear(ctl_arena_t *ctl_arena) {
ctl_arena->astats->ndalloc_small = 0;
ctl_arena->astats->nrequests_small = 0;
memset(ctl_arena->astats->bstats, 0, NBINS *
sizeof(malloc_bin_stats_t));
sizeof(bin_stats_t));
memset(ctl_arena->astats->lstats, 0, (NSIZES - NBINS) *
sizeof(malloc_large_stats_t));
sizeof(arena_stats_large_t));
}
}
@ -745,18 +762,18 @@ ctl_arena_stats_sdmerge(ctl_arena_t *ctl_sdarena, ctl_arena_t *ctl_arena,
&astats->astats.retained);
}
accum_arena_stats_u64(&sdstats->astats.decay_dirty.npurge,
ctl_accum_arena_stats_u64(&sdstats->astats.decay_dirty.npurge,
&astats->astats.decay_dirty.npurge);
accum_arena_stats_u64(&sdstats->astats.decay_dirty.nmadvise,
ctl_accum_arena_stats_u64(&sdstats->astats.decay_dirty.nmadvise,
&astats->astats.decay_dirty.nmadvise);
accum_arena_stats_u64(&sdstats->astats.decay_dirty.purged,
ctl_accum_arena_stats_u64(&sdstats->astats.decay_dirty.purged,
&astats->astats.decay_dirty.purged);
accum_arena_stats_u64(&sdstats->astats.decay_muzzy.npurge,
ctl_accum_arena_stats_u64(&sdstats->astats.decay_muzzy.npurge,
&astats->astats.decay_muzzy.npurge);
accum_arena_stats_u64(&sdstats->astats.decay_muzzy.nmadvise,
ctl_accum_arena_stats_u64(&sdstats->astats.decay_muzzy.nmadvise,
&astats->astats.decay_muzzy.nmadvise);
accum_arena_stats_u64(&sdstats->astats.decay_muzzy.purged,
ctl_accum_arena_stats_u64(&sdstats->astats.decay_muzzy.purged,
&astats->astats.decay_muzzy.purged);
#define OP(mtx) malloc_mutex_prof_merge( \
@ -773,6 +790,8 @@ MUTEX_PROF_ARENA_MUTEXES
&astats->astats.internal);
accum_atomic_zu(&sdstats->astats.resident,
&astats->astats.resident);
accum_atomic_zu(&sdstats->astats.metadata_thp,
&astats->astats.metadata_thp);
} else {
assert(atomic_load_zu(
&astats->astats.internal, ATOMIC_RELAXED) == 0);
@ -794,11 +813,11 @@ MUTEX_PROF_ARENA_MUTEXES
assert(atomic_load_zu(&astats->astats.allocated_large,
ATOMIC_RELAXED) == 0);
}
accum_arena_stats_u64(&sdstats->astats.nmalloc_large,
ctl_accum_arena_stats_u64(&sdstats->astats.nmalloc_large,
&astats->astats.nmalloc_large);
accum_arena_stats_u64(&sdstats->astats.ndalloc_large,
ctl_accum_arena_stats_u64(&sdstats->astats.ndalloc_large,
&astats->astats.ndalloc_large);
accum_arena_stats_u64(&sdstats->astats.nrequests_large,
ctl_accum_arena_stats_u64(&sdstats->astats.nrequests_large,
&astats->astats.nrequests_large);
accum_atomic_zu(&sdstats->astats.tcache_bytes,
@ -835,11 +854,11 @@ MUTEX_PROF_ARENA_MUTEXES
}
for (i = 0; i < NSIZES - NBINS; i++) {
accum_arena_stats_u64(&sdstats->lstats[i].nmalloc,
ctl_accum_arena_stats_u64(&sdstats->lstats[i].nmalloc,
&astats->lstats[i].nmalloc);
accum_arena_stats_u64(&sdstats->lstats[i].ndalloc,
ctl_accum_arena_stats_u64(&sdstats->lstats[i].ndalloc,
&astats->lstats[i].ndalloc);
accum_arena_stats_u64(&sdstats->lstats[i].nrequests,
ctl_accum_arena_stats_u64(&sdstats->lstats[i].nrequests,
&astats->lstats[i].nrequests);
if (!destroyed) {
sdstats->lstats[i].curlextents +=
@ -938,6 +957,8 @@ ctl_refresh(tsdn_t *tsdn) {
&ctl_sarena->astats->astats.base, ATOMIC_RELAXED) +
atomic_load_zu(&ctl_sarena->astats->astats.internal,
ATOMIC_RELAXED);
ctl_stats->metadata_thp = atomic_load_zu(
&ctl_sarena->astats->astats.metadata_thp, ATOMIC_RELAXED);
ctl_stats->resident = atomic_load_zu(
&ctl_sarena->astats->astats.resident, ATOMIC_RELAXED);
ctl_stats->mapped = atomic_load_zu(
@ -1549,6 +1570,71 @@ background_thread_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
return ret;
}
static int
max_background_threads_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
int ret;
size_t oldval;
if (!have_background_thread) {
return ENOENT;
}
background_thread_ctl_init(tsd_tsdn(tsd));
malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx);
malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock);
if (newp == NULL) {
oldval = max_background_threads;
READ(oldval, size_t);
} else {
if (newlen != sizeof(size_t)) {
ret = EINVAL;
goto label_return;
}
oldval = max_background_threads;
READ(oldval, size_t);
size_t newval = *(size_t *)newp;
if (newval == oldval) {
ret = 0;
goto label_return;
}
if (newval > opt_max_background_threads) {
ret = EINVAL;
goto label_return;
}
if (background_thread_enabled()) {
if (!can_enable_background_thread) {
malloc_printf("<jemalloc>: Error in dlsym("
"RTLD_NEXT, \"pthread_create\"). Cannot "
"enable background_thread\n");
ret = EFAULT;
goto label_return;
}
background_thread_enabled_set(tsd_tsdn(tsd), false);
if (background_threads_disable(tsd)) {
ret = EFAULT;
goto label_return;
}
max_background_threads = newval;
background_thread_enabled_set(tsd_tsdn(tsd), true);
if (background_threads_enable(tsd)) {
ret = EFAULT;
goto label_return;
}
} else {
max_background_threads = newval;
}
}
ret = 0;
label_return:
malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock);
malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx);
return ret;
}
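For reference, a minimal usage sketch of the new max_background_threads control via mallctl(); it assumes an unprefixed jemalloc build, and the cap of 4 plus the error handling are illustrative only (the write is rejected if it exceeds opt.max_background_threads or if background threads are unsupported).

#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void) {
	size_t old_max, new_max = 4;	/* illustrative cap */
	size_t sz = sizeof(old_max);

	/* Read the current cap and lower it to new_max in one call. */
	if (mallctl("max_background_threads", &old_max, &sz, &new_max,
	    sizeof(new_max)) != 0) {
		/* ENOENT when built without background thread support. */
		fprintf(stderr, "max_background_threads unavailable\n");
		return 1;
	}
	printf("background thread cap: %zu -> %zu\n", old_max, new_max);
	return 0;
}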
/******************************************************************************/
CTL_RO_CONFIG_GEN(config_cache_oblivious, bool)
@ -1560,7 +1646,6 @@ CTL_RO_CONFIG_GEN(config_prof, bool)
CTL_RO_CONFIG_GEN(config_prof_libgcc, bool)
CTL_RO_CONFIG_GEN(config_prof_libunwind, bool)
CTL_RO_CONFIG_GEN(config_stats, bool)
CTL_RO_CONFIG_GEN(config_thp, bool)
CTL_RO_CONFIG_GEN(config_utrace, bool)
CTL_RO_CONFIG_GEN(config_xmalloc, bool)
@ -1568,12 +1653,15 @@ CTL_RO_CONFIG_GEN(config_xmalloc, bool)
CTL_RO_NL_GEN(opt_abort, opt_abort, bool)
CTL_RO_NL_GEN(opt_abort_conf, opt_abort_conf, bool)
CTL_RO_NL_GEN(opt_metadata_thp, metadata_thp_mode_names[opt_metadata_thp],
const char *)
CTL_RO_NL_GEN(opt_retain, opt_retain, bool)
CTL_RO_NL_GEN(opt_dss, opt_dss, const char *)
CTL_RO_NL_GEN(opt_narenas, opt_narenas, unsigned)
CTL_RO_NL_GEN(opt_percpu_arena, percpu_arena_mode_names[opt_percpu_arena],
const char *)
CTL_RO_NL_GEN(opt_background_thread, opt_background_thread, bool)
CTL_RO_NL_GEN(opt_max_background_threads, opt_max_background_threads, size_t)
CTL_RO_NL_GEN(opt_dirty_decay_ms, opt_dirty_decay_ms, ssize_t)
CTL_RO_NL_GEN(opt_muzzy_decay_ms, opt_muzzy_decay_ms, ssize_t)
CTL_RO_NL_GEN(opt_stats_print, opt_stats_print, bool)
@ -1583,6 +1671,9 @@ CTL_RO_NL_CGEN(config_fill, opt_zero, opt_zero, bool)
CTL_RO_NL_CGEN(config_utrace, opt_utrace, opt_utrace, bool)
CTL_RO_NL_CGEN(config_xmalloc, opt_xmalloc, opt_xmalloc, bool)
CTL_RO_NL_GEN(opt_tcache, opt_tcache, bool)
CTL_RO_NL_GEN(opt_thp, thp_mode_names[opt_thp], const char *)
CTL_RO_NL_GEN(opt_lg_extent_max_active_fit, opt_lg_extent_max_active_fit,
size_t)
CTL_RO_NL_GEN(opt_lg_tcache_max, opt_lg_tcache_max, ssize_t)
CTL_RO_NL_CGEN(config_prof, opt_prof, opt_prof, bool)
CTL_RO_NL_CGEN(config_prof, opt_prof_prefix, opt_prof_prefix, const char *)
@ -2162,20 +2253,41 @@ arena_i_extent_hooks_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx);
MIB_UNSIGNED(arena_ind, 1);
if (arena_ind < narenas_total_get() && (arena =
arena_get(tsd_tsdn(tsd), arena_ind, false)) != NULL) {
if (newp != NULL) {
extent_hooks_t *old_extent_hooks;
extent_hooks_t *new_extent_hooks
JEMALLOC_CC_SILENCE_INIT(NULL);
WRITE(new_extent_hooks, extent_hooks_t *);
old_extent_hooks = extent_hooks_set(tsd, arena,
new_extent_hooks);
if (arena_ind < narenas_total_get()) {
extent_hooks_t *old_extent_hooks;
arena = arena_get(tsd_tsdn(tsd), arena_ind, false);
if (arena == NULL) {
if (arena_ind >= narenas_auto) {
ret = EFAULT;
goto label_return;
}
old_extent_hooks =
(extent_hooks_t *)&extent_hooks_default;
READ(old_extent_hooks, extent_hooks_t *);
if (newp != NULL) {
/* Initialize a new arena as a side effect. */
extent_hooks_t *new_extent_hooks
JEMALLOC_CC_SILENCE_INIT(NULL);
WRITE(new_extent_hooks, extent_hooks_t *);
arena = arena_init(tsd_tsdn(tsd), arena_ind,
new_extent_hooks);
if (arena == NULL) {
ret = EFAULT;
goto label_return;
}
}
} else {
extent_hooks_t *old_extent_hooks =
extent_hooks_get(arena);
READ(old_extent_hooks, extent_hooks_t *);
if (newp != NULL) {
extent_hooks_t *new_extent_hooks
JEMALLOC_CC_SILENCE_INIT(NULL);
WRITE(new_extent_hooks, extent_hooks_t *);
old_extent_hooks = extent_hooks_set(tsd, arena,
new_extent_hooks);
READ(old_extent_hooks, extent_hooks_t *);
} else {
old_extent_hooks = extent_hooks_get(arena);
READ(old_extent_hooks, extent_hooks_t *);
}
}
} else {
ret = EFAULT;
@ -2187,6 +2299,42 @@ arena_i_extent_hooks_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
return ret;
}
static int
arena_i_retain_grow_limit_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
void *oldp, size_t *oldlenp, void *newp, size_t newlen) {
int ret;
unsigned arena_ind;
arena_t *arena;
if (!opt_retain) {
/* Only relevant when retain is enabled. */
return ENOENT;
}
malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx);
MIB_UNSIGNED(arena_ind, 1);
if (arena_ind < narenas_total_get() && (arena =
arena_get(tsd_tsdn(tsd), arena_ind, false)) != NULL) {
size_t old_limit, new_limit;
if (newp != NULL) {
WRITE(new_limit, size_t);
}
bool err = arena_retain_grow_limit_get_set(tsd, arena,
&old_limit, newp != NULL ? &new_limit : NULL);
if (!err) {
READ(old_limit, size_t);
ret = 0;
} else {
ret = EFAULT;
}
} else {
ret = EFAULT;
}
label_return:
malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx);
return ret;
}
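A hedged sketch of driving the new arena.<i>.retain_grow_limit control; the arena index, the 1 GiB limit, and the name buffer size are made-up values, and the call only succeeds when retain is enabled.

#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void) {
	unsigned arena_ind = 0;			/* illustrative arena index */
	size_t limit = (size_t)1 << 30;		/* cap virtual growth at 1 GiB */
	char name[64];

	snprintf(name, sizeof(name), "arena.%u.retain_grow_limit", arena_ind);
	/* Write the new limit; passing an oldp would read back the previous one. */
	if (mallctl(name, NULL, NULL, &limit, sizeof(limit)) != 0) {
		fprintf(stderr, "retain_grow_limit rejected (retain disabled?)\n");
		return 1;
	}
	return 0;
}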
static const ctl_named_node_t *
arena_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t i) {
const ctl_named_node_t *ret;
@ -2248,7 +2396,7 @@ arenas_decay_ms_ctl_impl(tsd_t *tsd, const size_t *mib, size_t miblen,
ret = EINVAL;
goto label_return;
}
if (dirty ? arena_dirty_decay_ms_default_set(*(ssize_t *)newp)
if (dirty ? arena_dirty_decay_ms_default_set(*(ssize_t *)newp)
: arena_muzzy_decay_ms_default_set(*(ssize_t *)newp)) {
ret = EFAULT;
goto label_return;
@ -2279,9 +2427,9 @@ CTL_RO_NL_GEN(arenas_page, PAGE, size_t)
CTL_RO_NL_GEN(arenas_tcache_max, tcache_maxclass, size_t)
CTL_RO_NL_GEN(arenas_nbins, NBINS, unsigned)
CTL_RO_NL_GEN(arenas_nhbins, nhbins, unsigned)
CTL_RO_NL_GEN(arenas_bin_i_size, arena_bin_info[mib[2]].reg_size, size_t)
CTL_RO_NL_GEN(arenas_bin_i_nregs, arena_bin_info[mib[2]].nregs, uint32_t)
CTL_RO_NL_GEN(arenas_bin_i_slab_size, arena_bin_info[mib[2]].slab_size, size_t)
CTL_RO_NL_GEN(arenas_bin_i_size, bin_infos[mib[2]].reg_size, size_t)
CTL_RO_NL_GEN(arenas_bin_i_nregs, bin_infos[mib[2]].nregs, uint32_t)
CTL_RO_NL_GEN(arenas_bin_i_slab_size, bin_infos[mib[2]].slab_size, size_t)
static const ctl_named_node_t *
arenas_bin_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t i) {
if (i > NBINS) {
@ -2325,6 +2473,36 @@ arenas_create_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
return ret;
}
static int
arenas_lookup_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
size_t *oldlenp, void *newp, size_t newlen) {
int ret;
unsigned arena_ind;
void *ptr;
extent_t *extent;
arena_t *arena;
ptr = NULL;
ret = EINVAL;
malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx);
WRITE(ptr, void *);
extent = iealloc(tsd_tsdn(tsd), ptr);
if (extent == NULL) {
goto label_return;
}
arena = extent_arena_get(extent);
if (arena == NULL) {
goto label_return;
}
arena_ind = arena_ind_get(arena);
READ(arena_ind, unsigned);
ret = 0;
label_return:
malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx);
return ret;
}
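A minimal sketch of the new arenas.lookup control, which maps an allocation's address back to its owning arena index; the pointer comes from a plain malloc() here purely for illustration, and an unprefixed jemalloc build is assumed.

#include <stdio.h>
#include <stdlib.h>
#include <jemalloc/jemalloc.h>

int
main(void) {
	void *p = malloc(64);
	unsigned arena_ind;
	size_t sz = sizeof(arena_ind);

	/* The address is written through newp; the arena index is read back. */
	if (mallctl("arenas.lookup", &arena_ind, &sz, &p, sizeof(p)) == 0) {
		printf("%p belongs to arena %u\n", p, arena_ind);
	}
	free(p);
	return 0;
}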
/******************************************************************************/
static int
@ -2460,6 +2638,7 @@ CTL_RO_NL_CGEN(config_prof, lg_prof_sample, lg_prof_sample, size_t)
CTL_RO_CGEN(config_stats, stats_allocated, ctl_stats->allocated, size_t)
CTL_RO_CGEN(config_stats, stats_active, ctl_stats->active, size_t)
CTL_RO_CGEN(config_stats, stats_metadata, ctl_stats->metadata, size_t)
CTL_RO_CGEN(config_stats, stats_metadata_thp, ctl_stats->metadata_thp, size_t)
CTL_RO_CGEN(config_stats, stats_resident, ctl_stats->resident, size_t)
CTL_RO_CGEN(config_stats, stats_mapped, ctl_stats->mapped, size_t)
CTL_RO_CGEN(config_stats, stats_retained, ctl_stats->retained, size_t)
@ -2490,24 +2669,24 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_retained,
size_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_dirty_npurge,
arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.decay_dirty.npurge),
uint64_t)
ctl_arena_stats_read_u64(
&arenas_i(mib[2])->astats->astats.decay_dirty.npurge), uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_dirty_nmadvise,
arena_stats_read_u64(
ctl_arena_stats_read_u64(
&arenas_i(mib[2])->astats->astats.decay_dirty.nmadvise), uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_dirty_purged,
arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.decay_dirty.purged),
uint64_t)
ctl_arena_stats_read_u64(
&arenas_i(mib[2])->astats->astats.decay_dirty.purged), uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_muzzy_npurge,
arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.decay_muzzy.npurge),
uint64_t)
ctl_arena_stats_read_u64(
&arenas_i(mib[2])->astats->astats.decay_muzzy.npurge), uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_muzzy_nmadvise,
arena_stats_read_u64(
ctl_arena_stats_read_u64(
&arenas_i(mib[2])->astats->astats.decay_muzzy.nmadvise), uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_muzzy_purged,
arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.decay_muzzy.purged),
uint64_t)
ctl_arena_stats_read_u64(
&arenas_i(mib[2])->astats->astats.decay_muzzy.purged), uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_base,
atomic_load_zu(&arenas_i(mib[2])->astats->astats.base, ATOMIC_RELAXED),
@ -2515,6 +2694,9 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_base,
CTL_RO_CGEN(config_stats, stats_arenas_i_internal,
atomic_load_zu(&arenas_i(mib[2])->astats->astats.internal, ATOMIC_RELAXED),
size_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_metadata_thp,
atomic_load_zu(&arenas_i(mib[2])->astats->astats.metadata_thp,
ATOMIC_RELAXED), size_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_tcache_bytes,
atomic_load_zu(&arenas_i(mib[2])->astats->astats.tcache_bytes,
ATOMIC_RELAXED), size_t)
@ -2534,14 +2716,17 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_large_allocated,
atomic_load_zu(&arenas_i(mib[2])->astats->astats.allocated_large,
ATOMIC_RELAXED), size_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_large_nmalloc,
arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.nmalloc_large),
uint64_t)
ctl_arena_stats_read_u64(
&arenas_i(mib[2])->astats->astats.nmalloc_large), uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_large_ndalloc,
arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.ndalloc_large),
uint64_t)
ctl_arena_stats_read_u64(
&arenas_i(mib[2])->astats->astats.ndalloc_large), uint64_t)
/*
* Note: "nmalloc" here instead of "nrequests" in the read. This is intentional.
*/
CTL_RO_CGEN(config_stats, stats_arenas_i_large_nrequests,
arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.nmalloc_large),
uint64_t) /* Intentional. */
ctl_arena_stats_read_u64(
&arenas_i(mib[2])->astats->astats.nmalloc_large), uint64_t) /* Intentional. */
/* Lock profiling related APIs below. */
#define RO_MUTEX_CTL_GEN(n, l) \
@ -2622,7 +2807,7 @@ stats_mutexes_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
MUTEX_PROF_RESET(arena->base->mtx);
for (szind_t i = 0; i < NBINS; i++) {
arena_bin_t *bin = &arena->bins[i];
bin_t *bin = &arena->bins[i];
MUTEX_PROF_RESET(bin->lock);
}
}
@ -2659,14 +2844,14 @@ stats_arenas_i_bins_j_index(tsdn_t *tsdn, const size_t *mib, size_t miblen,
}
CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_nmalloc,
arena_stats_read_u64(&arenas_i(mib[2])->astats->lstats[mib[4]].nmalloc),
uint64_t)
ctl_arena_stats_read_u64(
&arenas_i(mib[2])->astats->lstats[mib[4]].nmalloc), uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_ndalloc,
arena_stats_read_u64(&arenas_i(mib[2])->astats->lstats[mib[4]].ndalloc),
uint64_t)
ctl_arena_stats_read_u64(
&arenas_i(mib[2])->astats->lstats[mib[4]].ndalloc), uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_nrequests,
arena_stats_read_u64(&arenas_i(mib[2])->astats->lstats[mib[4]].nrequests),
uint64_t)
ctl_arena_stats_read_u64(
&arenas_i(mib[2])->astats->lstats[mib[4]].nrequests), uint64_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_curlextents,
arenas_i(mib[2])->astats->lstats[mib[4]].curlextents, size_t)

View File

@ -0,0 +1,55 @@
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/div.h"
#include "jemalloc/internal/assert.h"
/*
* Suppose we have n = q * d, all integers. We know n and d, and want q = n / d.
*
* For any k, we have (here, all division is exact; not C-style rounding):
* floor(ceil(2^k / d) * n / 2^k) = floor((2^k + r) / d * n / 2^k), where
* r = (-2^k) mod d.
*
* Expanding this out:
* ... = floor(2^k / d * n / 2^k + r / d * n / 2^k)
* = floor(n / d + (r / d) * (n / 2^k)).
*
* The fractional part of n / d is 0 (because of the assumption that d divides n
* exactly), so we have:
* ... = n / d + floor((r / d) * (n / 2^k))
*
* So that our initial expression is equal to the quantity we seek, so long as
* (r / d) * (n / 2^k) < 1.
*
* r is a remainder mod d, so r < d and r / d < 1 always. We can make
* n / 2^k < 1 by setting k = 32. This gets us a value of magic that works.
*/
void
div_init(div_info_t *div_info, size_t d) {
/* Nonsensical. */
assert(d != 0);
/*
* This would make the value of magic too high to fit into a uint32_t
* (we would want magic = 2^32 exactly). This would mess with code gen
* on 32-bit machines.
*/
assert(d != 1);
uint64_t two_to_k = ((uint64_t)1 << 32);
uint32_t magic = (uint32_t)(two_to_k / d);
/*
* We want magic = ceil(2^k / d), but C gives us floor. We have to
* increment it unless the result was exact (i.e. unless d is a power of
* two).
*/
if (two_to_k % d != 0) {
magic++;
}
div_info->magic = magic;
#ifdef JEMALLOC_DEBUG
div_info->d = d;
#endif
}
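A self-contained sketch (not part of the patch) checking the magic-number identity derived above: for d other than 0 or 1, and n an exact multiple of d below 2^32, (magic * n) >> 32 equals n / d. The helper name compute_magic and the value d = 48 are made up for illustration.

#include <assert.h>
#include <stdint.h>
#include <stddef.h>

static uint32_t
compute_magic(size_t d) {
	uint64_t two_to_k = (uint64_t)1 << 32;
	uint32_t magic = (uint32_t)(two_to_k / d);
	if (two_to_k % d != 0) {
		magic++;		/* magic = ceil(2^32 / d) */
	}
	return magic;
}

int
main(void) {
	size_t d = 48;			/* e.g. a slab region size */
	uint32_t magic = compute_magic(d);
	for (size_t q = 0; q < 1000000; q++) {
		size_t n = q * d;	/* d divides n exactly */
		size_t quot = (size_t)(((uint64_t)magic * n) >> 32);
		assert(quot == q);
	}
	return 0;
}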

View File

@ -17,6 +17,8 @@ rtree_t extents_rtree;
/* Keyed by the address of the extent_t being protected. */
mutex_pool_t extent_mutex_pool;
size_t opt_lg_extent_max_active_fit = LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT;
static const bitmap_info_t extents_bitmap_info =
BITMAP_INFO_INITIALIZER(NPSIZES+1);
@ -117,7 +119,7 @@ static void extent_record(tsdn_t *tsdn, arena_t *arena,
/******************************************************************************/
rb_gen(UNUSED, extent_avail_, extent_tree_t, extent_t, rb_link,
ph_gen(UNUSED, extent_avail_, extent_tree_t, extent_t, ph_link,
extent_esnead_comp)
typedef enum {
@ -304,8 +306,7 @@ extents_npages_get(extents_t *extents) {
}
static void
extents_insert_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent,
bool preserve_lru) {
extents_insert_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent) {
malloc_mutex_assert_owner(tsdn, &extents->mtx);
assert(extent_state_get(extent) == extents->state);
@ -317,9 +318,7 @@ extents_insert_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent,
(size_t)pind);
}
extent_heap_insert(&extents->heaps[pind], extent);
if (!preserve_lru) {
extent_list_append(&extents->lru, extent);
}
extent_list_append(&extents->lru, extent);
size_t npages = size >> LG_PAGE;
/*
* All modifications to npages hold the mutex (as asserted above), so we
@ -333,8 +332,7 @@ extents_insert_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent,
}
static void
extents_remove_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent,
bool preserve_lru) {
extents_remove_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent) {
malloc_mutex_assert_owner(tsdn, &extents->mtx);
assert(extent_state_get(extent) == extents->state);
@ -346,9 +344,7 @@ extents_remove_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent,
bitmap_set(extents->bitmap, &extents_bitmap_info,
(size_t)pind);
}
if (!preserve_lru) {
extent_list_remove(&extents->lru, extent);
}
extent_list_remove(&extents->lru, extent);
size_t npages = size >> LG_PAGE;
/*
* As in extents_insert_locked, we hold extents->mtx and so don't need
@ -361,6 +357,43 @@ extents_remove_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent,
cur_extents_npages - (size >> LG_PAGE), ATOMIC_RELAXED);
}
/*
* Find an extent with size [min_size, max_size) to satisfy the alignment
* requirement. For each size, try only the first extent in the heap.
*/
static extent_t *
extents_fit_alignment(extents_t *extents, size_t min_size, size_t max_size,
size_t alignment) {
pszind_t pind = sz_psz2ind(extent_size_quantize_ceil(min_size));
pszind_t pind_max = sz_psz2ind(extent_size_quantize_ceil(max_size));
for (pszind_t i = (pszind_t)bitmap_ffu(extents->bitmap,
&extents_bitmap_info, (size_t)pind); i < pind_max; i =
(pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info,
(size_t)i+1)) {
assert(i < NPSIZES);
assert(!extent_heap_empty(&extents->heaps[i]));
extent_t *extent = extent_heap_first(&extents->heaps[i]);
uintptr_t base = (uintptr_t)extent_base_get(extent);
size_t candidate_size = extent_size_get(extent);
assert(candidate_size >= min_size);
uintptr_t next_align = ALIGNMENT_CEILING((uintptr_t)base,
PAGE_CEILING(alignment));
if (base > next_align || base + candidate_size <= next_align) {
/* Overflow or not crossing the next alignment. */
continue;
}
size_t leadsize = next_align - base;
if (candidate_size - leadsize >= min_size) {
return extent;
}
}
return NULL;
}
/* Do any-best-fit extent selection, i.e. select any extent that best fits. */
static extent_t *
extents_best_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents,
@ -369,8 +402,15 @@ extents_best_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents,
pszind_t i = (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info,
(size_t)pind);
if (i < NPSIZES+1) {
/*
* In order to reduce fragmentation, avoid reusing and splitting
* large extents for much smaller sizes.
*/
if ((sz_pind2sz(i) >> opt_lg_extent_max_active_fit) > size) {
return NULL;
}
assert(!extent_heap_empty(&extents->heaps[i]));
extent_t *extent = extent_heap_any(&extents->heaps[i]);
extent_t *extent = extent_heap_first(&extents->heaps[i]);
assert(extent_size_get(extent) >= size);
return extent;
}
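To make the new fragmentation check concrete, a tiny standalone sketch with made-up sizes; the shift of 6 is assumed to be the default opt.lg_extent_max_active_fit.

#include <stdbool.h>
#include <stddef.h>

/* Mirrors the (candidate >> lg_max_fit) > size test above; numbers illustrative. */
static bool
too_large_to_reuse(size_t candidate, size_t request, unsigned lg_max_fit) {
	return (candidate >> lg_max_fit) > request;
}

int
main(void) {
	/*
	 * A 2 MiB cached extent is rejected for a 16 KiB fit: 2 MiB >> 6 =
	 * 32 KiB > 16 KiB, whereas a 1 MiB extent (1 MiB >> 6 = 16 KiB) is
	 * still eligible for splitting.
	 */
	return too_large_to_reuse(2 << 20, 16 << 10, 6) ? 0 : 1;
}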
@ -415,12 +455,30 @@ extents_first_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents,
*/
static extent_t *
extents_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents,
size_t size) {
size_t esize, size_t alignment) {
malloc_mutex_assert_owner(tsdn, &extents->mtx);
return extents->delay_coalesce ? extents_best_fit_locked(tsdn, arena,
extents, size) : extents_first_fit_locked(tsdn, arena, extents,
size);
size_t max_size = esize + PAGE_CEILING(alignment) - PAGE;
/* Beware size_t wrap-around. */
if (max_size < esize) {
return NULL;
}
extent_t *extent = extents->delay_coalesce ?
extents_best_fit_locked(tsdn, arena, extents, max_size) :
extents_first_fit_locked(tsdn, arena, extents, max_size);
if (alignment > PAGE && extent == NULL) {
/*
* max_size guarantees the alignment requirement but is rather
* pessimistic. Next we try to satisfy the aligned allocation
* with sizes in [esize, max_size).
*/
extent = extents_fit_alignment(extents, esize, max_size,
alignment);
}
return extent;
}
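A worked instance of the max_size bound with made-up sizes, assuming a 4 KiB PAGE:

/*
 * esize = 16 KiB, alignment = 64 KiB:
 *   max_size = 16 KiB + PAGE_CEILING(64 KiB) - 4 KiB = 76 KiB
 * Any extent of at least 76 KiB contains a 64 KiB-aligned 16 KiB range, so
 * the ordinary best/first-fit search asks for that size. If it finds
 * nothing and alignment > PAGE, extents_fit_alignment() retries with sizes
 * in [16 KiB, 76 KiB), accepting an extent only when its base happens to
 * permit the alignment.
 */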
static bool
@ -436,7 +494,7 @@ extent_try_delayed_coalesce(tsdn_t *tsdn, arena_t *arena,
if (!coalesced) {
return true;
}
extents_insert_locked(tsdn, extents, extent, true);
extents_insert_locked(tsdn, extents, extent);
return false;
}
@ -449,8 +507,10 @@ extents_alloc(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 0);
return extent_recycle(tsdn, arena, r_extent_hooks, extents, new_addr,
size, pad, alignment, slab, szind, zero, commit, false);
extent_t *extent = extent_recycle(tsdn, arena, r_extent_hooks, extents,
new_addr, size, pad, alignment, slab, szind, zero, commit, false);
assert(extent == NULL || extent_dumpable_get(extent));
return extent;
}
void
@ -458,6 +518,7 @@ extents_dalloc(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
extents_t *extents, extent_t *extent) {
assert(extent_base_get(extent) != NULL);
assert(extent_size_get(extent) != 0);
assert(extent_dumpable_get(extent));
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 0);
@ -487,14 +548,13 @@ extents_evict(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
goto label_return;
}
/* Check the eviction limit. */
size_t npages = extent_size_get(extent) >> LG_PAGE;
size_t extents_npages = atomic_load_zu(&extents->npages,
ATOMIC_RELAXED);
if (extents_npages - npages < npages_min) {
if (extents_npages <= npages_min) {
extent = NULL;
goto label_return;
}
extents_remove_locked(tsdn, extents, extent, false);
extents_remove_locked(tsdn, extents, extent);
if (!extents->delay_coalesce) {
break;
}
@ -567,29 +627,29 @@ extents_postfork_child(tsdn_t *tsdn, extents_t *extents) {
static void
extent_deactivate_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents,
extent_t *extent, bool preserve_lru) {
extent_t *extent) {
assert(extent_arena_get(extent) == arena);
assert(extent_state_get(extent) == extent_state_active);
extent_state_set(extent, extents_state_get(extents));
extents_insert_locked(tsdn, extents, extent, preserve_lru);
extents_insert_locked(tsdn, extents, extent);
}
static void
extent_deactivate(tsdn_t *tsdn, arena_t *arena, extents_t *extents,
extent_t *extent, bool preserve_lru) {
extent_t *extent) {
malloc_mutex_lock(tsdn, &extents->mtx);
extent_deactivate_locked(tsdn, arena, extents, extent, preserve_lru);
extent_deactivate_locked(tsdn, arena, extents, extent);
malloc_mutex_unlock(tsdn, &extents->mtx);
}
static void
extent_activate_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents,
extent_t *extent, bool preserve_lru) {
extent_t *extent) {
assert(extent_arena_get(extent) == arena);
assert(extent_state_get(extent) == extents_state_get(extents));
extents_remove_locked(tsdn, extents, extent, preserve_lru);
extents_remove_locked(tsdn, extents, extent);
extent_state_set(extent, extent_state_active);
}
@ -723,6 +783,13 @@ extent_reregister(tsdn_t *tsdn, extent_t *extent) {
assert(!err);
}
/*
* Removes all pointers to the given extent from the global rtree indices for
* its interior. This is relevant for slab extents, for which we need to do
* metadata lookups at places other than the head of the extent. We deregister
* on the interior, then, when an extent moves from being an active slab to an
* inactive state.
*/
static void
extent_interior_deregister(tsdn_t *tsdn, rtree_ctx_t *rtree_ctx,
extent_t *extent) {
@ -737,8 +804,11 @@ extent_interior_deregister(tsdn_t *tsdn, rtree_ctx_t *rtree_ctx,
}
}
/*
* Removes all pointers to the given extent from the global rtree.
*/
static void
extent_deregister(tsdn_t *tsdn, extent_t *extent) {
extent_deregister_impl(tsdn_t *tsdn, extent_t *extent, bool gdump) {
rtree_ctx_t rtree_ctx_fallback;
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
rtree_leaf_elm_t *elm_a, *elm_b;
@ -755,16 +825,30 @@ extent_deregister(tsdn_t *tsdn, extent_t *extent) {
extent_unlock(tsdn, extent);
if (config_prof) {
if (config_prof && gdump) {
extent_gdump_sub(tsdn, extent);
}
}
static void
extent_deregister(tsdn_t *tsdn, extent_t *extent) {
extent_deregister_impl(tsdn, extent, true);
}
static void
extent_deregister_no_gdump_sub(tsdn_t *tsdn, extent_t *extent) {
extent_deregister_impl(tsdn, extent, false);
}
/*
* Tries to find and remove an extent from extents that can be used for the
* given allocation request.
*/
static extent_t *
extent_recycle_extract(tsdn_t *tsdn, arena_t *arena,
extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents,
void *new_addr, size_t size, size_t pad, size_t alignment, bool slab,
bool *zero, bool *commit, bool growing_retained) {
bool growing_retained) {
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, growing_retained ? 1 : 0);
assert(alignment > 0);
@ -786,11 +870,6 @@ extent_recycle_extract(tsdn_t *tsdn, arena_t *arena,
}
size_t esize = size + pad;
size_t alloc_size = esize + PAGE_CEILING(alignment) - PAGE;
/* Beware size_t wrap-around. */
if (alloc_size < esize) {
return NULL;
}
malloc_mutex_lock(tsdn, &extents->mtx);
extent_hooks_assure_initialized(arena, r_extent_hooks);
extent_t *extent;
@ -812,86 +891,172 @@ extent_recycle_extract(tsdn_t *tsdn, arena_t *arena,
extent_unlock(tsdn, unlock_extent);
}
} else {
extent = extents_fit_locked(tsdn, arena, extents, alloc_size);
extent = extents_fit_locked(tsdn, arena, extents, esize,
alignment);
}
if (extent == NULL) {
malloc_mutex_unlock(tsdn, &extents->mtx);
return NULL;
}
extent_activate_locked(tsdn, arena, extents, extent, false);
extent_activate_locked(tsdn, arena, extents, extent);
malloc_mutex_unlock(tsdn, &extents->mtx);
if (extent_zeroed_get(extent)) {
*zero = true;
}
if (extent_committed_get(extent)) {
*commit = true;
}
return extent;
}
static extent_t *
extent_recycle_split(tsdn_t *tsdn, arena_t *arena,
extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents,
/*
* Given an allocation request and an extent guaranteed to be able to satisfy
* it, this splits off lead and trail extents, leaving extent pointing to an
* extent satisfying the allocation.
* This function doesn't put lead or trail into any extents_t; it's the caller's
* job to ensure that they can be reused.
*/
typedef enum {
/*
* Split successfully. lead, extent, and trail are modified to extents
* describing the ranges before, in, and after the given allocation.
*/
extent_split_interior_ok,
/*
* The extent can't satisfy the given allocation request. None of the
* input extent_t *s are touched.
*/
extent_split_interior_cant_alloc,
/*
* In a potentially invalid state. Must leak (if *to_leak is non-NULL),
* and salvage what's still salvageable (if *to_salvage is non-NULL).
* None of lead, extent, or trail are valid.
*/
extent_split_interior_error
} extent_split_interior_result_t;
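A worked example with made-up addresses showing how the three pieces come out of a successful split:

/*
 * Suppose the extracted extent spans [0x7f0000003000, 0x7f0000013000)
 * (64 KiB) and the request is size = 16 KiB, pad = 0, alignment = 16 KiB:
 *   leadsize  = ALIGNMENT_CEILING(0x7f0000003000, 16 KiB) - 0x7f0000003000
 *             = 0x7f0000004000 - 0x7f0000003000 = 4 KiB
 *   esize     = 16 KiB
 *   trailsize = 64 KiB - 4 KiB - 16 KiB = 44 KiB
 * On extent_split_interior_ok, *lead holds the 4 KiB prefix, *extent the
 * aligned 16 KiB allocation, and *trail the 44 KiB suffix; the caller then
 * deactivates or records lead and trail as appropriate.
 */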
static extent_split_interior_result_t
extent_split_interior(tsdn_t *tsdn, arena_t *arena,
extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx,
/* The result of splitting, in case of success. */
extent_t **extent, extent_t **lead, extent_t **trail,
/* The mess to clean up, in case of error. */
extent_t **to_leak, extent_t **to_salvage,
void *new_addr, size_t size, size_t pad, size_t alignment, bool slab,
szind_t szind, extent_t *extent, bool growing_retained) {
szind_t szind, bool growing_retained) {
size_t esize = size + pad;
size_t leadsize = ALIGNMENT_CEILING((uintptr_t)extent_base_get(extent),
PAGE_CEILING(alignment)) - (uintptr_t)extent_base_get(extent);
size_t leadsize = ALIGNMENT_CEILING((uintptr_t)extent_base_get(*extent),
PAGE_CEILING(alignment)) - (uintptr_t)extent_base_get(*extent);
assert(new_addr == NULL || leadsize == 0);
assert(extent_size_get(extent) >= leadsize + esize);
size_t trailsize = extent_size_get(extent) - leadsize - esize;
if (extent_size_get(*extent) < leadsize + esize) {
return extent_split_interior_cant_alloc;
}
size_t trailsize = extent_size_get(*extent) - leadsize - esize;
*lead = NULL;
*trail = NULL;
*to_leak = NULL;
*to_salvage = NULL;
/* Split the lead. */
if (leadsize != 0) {
extent_t *lead = extent;
extent = extent_split_impl(tsdn, arena, r_extent_hooks,
lead, leadsize, NSIZES, false, esize + trailsize, szind,
*lead = *extent;
*extent = extent_split_impl(tsdn, arena, r_extent_hooks,
*lead, leadsize, NSIZES, false, esize + trailsize, szind,
slab, growing_retained);
if (extent == NULL) {
extent_deregister(tsdn, lead);
extents_leak(tsdn, arena, r_extent_hooks, extents,
lead, growing_retained);
return NULL;
if (*extent == NULL) {
*to_leak = *lead;
*lead = NULL;
return extent_split_interior_error;
}
extent_deactivate(tsdn, arena, extents, lead, false);
}
/* Split the trail. */
if (trailsize != 0) {
extent_t *trail = extent_split_impl(tsdn, arena,
r_extent_hooks, extent, esize, szind, slab, trailsize,
NSIZES, false, growing_retained);
if (trail == NULL) {
extent_deregister(tsdn, extent);
extents_leak(tsdn, arena, r_extent_hooks, extents,
extent, growing_retained);
return NULL;
*trail = extent_split_impl(tsdn, arena, r_extent_hooks, *extent,
esize, szind, slab, trailsize, NSIZES, false,
growing_retained);
if (*trail == NULL) {
*to_leak = *extent;
*to_salvage = *lead;
*lead = NULL;
*extent = NULL;
return extent_split_interior_error;
}
extent_deactivate(tsdn, arena, extents, trail, false);
} else if (leadsize == 0) {
}
if (leadsize == 0 && trailsize == 0) {
/*
* Splitting causes szind to be set as a side effect, but no
* splitting occurred.
*/
extent_szind_set(extent, szind);
extent_szind_set(*extent, szind);
if (szind != NSIZES) {
rtree_szind_slab_update(tsdn, &extents_rtree, rtree_ctx,
(uintptr_t)extent_addr_get(extent), szind, slab);
if (slab && extent_size_get(extent) > PAGE) {
(uintptr_t)extent_addr_get(*extent), szind, slab);
if (slab && extent_size_get(*extent) > PAGE) {
rtree_szind_slab_update(tsdn, &extents_rtree,
rtree_ctx,
(uintptr_t)extent_past_get(extent) -
(uintptr_t)extent_past_get(*extent) -
(uintptr_t)PAGE, szind, slab);
}
}
}
return extent;
return extent_split_interior_ok;
}
/*
* This fulfills the indicated allocation request out of the given extent (which
* the caller should have ensured was big enough). If there's any unused space
* before or after the resulting allocation, that space is given its own extent
* and put back into extents.
*/
static extent_t *
extent_recycle_split(tsdn_t *tsdn, arena_t *arena,
extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents,
void *new_addr, size_t size, size_t pad, size_t alignment, bool slab,
szind_t szind, extent_t *extent, bool growing_retained) {
extent_t *lead;
extent_t *trail;
extent_t *to_leak;
extent_t *to_salvage;
extent_split_interior_result_t result = extent_split_interior(
tsdn, arena, r_extent_hooks, rtree_ctx, &extent, &lead, &trail,
&to_leak, &to_salvage, new_addr, size, pad, alignment, slab, szind,
growing_retained);
if (result == extent_split_interior_ok) {
if (lead != NULL) {
extent_deactivate(tsdn, arena, extents, lead);
}
if (trail != NULL) {
extent_deactivate(tsdn, arena, extents, trail);
}
return extent;
} else {
/*
* We should have picked an extent that was large enough to
* fulfill our allocation request.
*/
assert(result == extent_split_interior_error);
if (to_salvage != NULL) {
extent_deregister(tsdn, to_salvage);
}
if (to_leak != NULL) {
void *leak = extent_base_get(to_leak);
extent_deregister_no_gdump_sub(tsdn, to_leak);
extents_leak(tsdn, arena, r_extent_hooks, extents,
to_leak, growing_retained);
assert(extent_lock_from_addr(tsdn, rtree_ctx, leak)
== NULL);
}
return NULL;
}
unreachable();
}
/*
* Tries to satisfy the given allocation request by reusing one of the extents
* in the given extents_t.
*/
static extent_t *
extent_recycle(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
extents_t *extents, void *new_addr, size_t size, size_t pad,
@ -906,16 +1071,12 @@ extent_recycle(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
rtree_ctx_t rtree_ctx_fallback;
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
bool committed = false;
extent_t *extent = extent_recycle_extract(tsdn, arena, r_extent_hooks,
rtree_ctx, extents, new_addr, size, pad, alignment, slab, zero,
&committed, growing_retained);
rtree_ctx, extents, new_addr, size, pad, alignment, slab,
growing_retained);
if (extent == NULL) {
return NULL;
}
if (committed) {
*commit = true;
}
extent = extent_recycle_split(tsdn, arena, r_extent_hooks, rtree_ctx,
extents, new_addr, size, pad, alignment, slab, szind, extent,
@ -934,6 +1095,13 @@ extent_recycle(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
extent_zeroed_set(extent, true);
}
if (extent_committed_get(extent)) {
*commit = true;
}
if (extent_zeroed_get(extent)) {
*zero = true;
}
if (pad != 0) {
extent_addr_randomize(tsdn, extent, alignment);
}
@ -999,11 +1167,12 @@ extent_alloc_core(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size,
static void *
extent_alloc_default_impl(tsdn_t *tsdn, arena_t *arena, void *new_addr,
size_t size, size_t alignment, bool *zero, bool *commit) {
void *ret;
ret = extent_alloc_core(tsdn, arena, new_addr, size, alignment, zero,
void *ret = extent_alloc_core(tsdn, arena, new_addr, size, alignment, zero,
commit, (dss_prec_t)atomic_load_u(&arena->dss_prec,
ATOMIC_RELAXED));
if (have_madvise_huge && ret) {
pages_set_thp_state(ret, size);
}
return ret;
}
@ -1028,7 +1197,18 @@ extent_alloc_default(extent_hooks_t *extent_hooks, void *new_addr, size_t size,
static void
extent_hook_pre_reentrancy(tsdn_t *tsdn, arena_t *arena) {
tsd_t *tsd = tsdn_null(tsdn) ? tsd_fetch() : tsdn_tsd(tsdn);
pre_reentrancy(tsd, arena);
if (arena == arena_get(tsd_tsdn(tsd), 0, false)) {
/*
* The only legitimate case of customized extent hooks for a0 is
* hooks with no allocation activities. One such example is to
* place metadata on pre-allocated resources such as huge pages.
* In that case, rely on reentrancy_level checks to catch
* infinite recursions.
*/
pre_reentrancy(tsd, NULL);
} else {
pre_reentrancy(tsd, arena);
}
}
static void
@ -1081,9 +1261,8 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena,
void *ptr;
if (*r_extent_hooks == &extent_hooks_default) {
ptr = extent_alloc_core(tsdn, arena, NULL, alloc_size, PAGE,
&zeroed, &committed, (dss_prec_t)atomic_load_u(
&arena->dss_prec, ATOMIC_RELAXED));
ptr = extent_alloc_default_impl(tsdn, arena, NULL,
alloc_size, PAGE, &zeroed, &committed);
} else {
extent_hook_pre_reentrancy(tsdn, arena);
ptr = (*r_extent_hooks)->alloc(*r_extent_hooks, NULL,
@ -1094,21 +1273,18 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena,
extent_init(extent, arena, ptr, alloc_size, false, NSIZES,
arena_extent_sn_next(arena), extent_state_active, zeroed,
committed);
committed, true);
if (ptr == NULL) {
extent_dalloc(tsdn, arena, extent);
goto label_err;
}
if (extent_register_no_gdump_add(tsdn, extent)) {
extents_leak(tsdn, arena, r_extent_hooks,
&arena->extents_retained, extent, true);
goto label_err;
}
size_t leadsize = ALIGNMENT_CEILING((uintptr_t)ptr,
PAGE_CEILING(alignment)) - (uintptr_t)ptr;
assert(alloc_size >= leadsize + esize);
size_t trailsize = alloc_size - leadsize - esize;
if (extent_zeroed_get(extent) && extent_committed_get(extent)) {
*zero = true;
}
@ -1116,54 +1292,46 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena,
*commit = true;
}
/* Split the lead. */
if (leadsize != 0) {
extent_t *lead = extent;
extent = extent_split_impl(tsdn, arena, r_extent_hooks, lead,
leadsize, NSIZES, false, esize + trailsize, szind, slab,
true);
if (extent == NULL) {
extent_deregister(tsdn, lead);
extents_leak(tsdn, arena, r_extent_hooks,
rtree_ctx_t rtree_ctx_fallback;
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
extent_t *lead;
extent_t *trail;
extent_t *to_leak;
extent_t *to_salvage;
extent_split_interior_result_t result = extent_split_interior(
tsdn, arena, r_extent_hooks, rtree_ctx, &extent, &lead, &trail,
&to_leak, &to_salvage, NULL, size, pad, alignment, slab, szind,
true);
if (result == extent_split_interior_ok) {
if (lead != NULL) {
extent_record(tsdn, arena, r_extent_hooks,
&arena->extents_retained, lead, true);
goto label_err;
}
extent_record(tsdn, arena, r_extent_hooks,
&arena->extents_retained, lead, true);
}
/* Split the trail. */
if (trailsize != 0) {
extent_t *trail = extent_split_impl(tsdn, arena, r_extent_hooks,
extent, esize, szind, slab, trailsize, NSIZES, false, true);
if (trail == NULL) {
extent_deregister(tsdn, extent);
extents_leak(tsdn, arena, r_extent_hooks,
&arena->extents_retained, extent, true);
goto label_err;
if (trail != NULL) {
extent_record(tsdn, arena, r_extent_hooks,
&arena->extents_retained, trail, true);
}
extent_record(tsdn, arena, r_extent_hooks,
&arena->extents_retained, trail, true);
} else if (leadsize == 0) {
} else {
/*
* Splitting causes szind to be set as a side effect, but no
* splitting occurred.
* We should have allocated a sufficiently large extent; the
* cant_alloc case should not occur.
*/
rtree_ctx_t rtree_ctx_fallback;
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn,
&rtree_ctx_fallback);
extent_szind_set(extent, szind);
if (szind != NSIZES) {
rtree_szind_slab_update(tsdn, &extents_rtree, rtree_ctx,
(uintptr_t)extent_addr_get(extent), szind, slab);
if (slab && extent_size_get(extent) > PAGE) {
rtree_szind_slab_update(tsdn, &extents_rtree,
rtree_ctx,
(uintptr_t)extent_past_get(extent) -
(uintptr_t)PAGE, szind, slab);
assert(result == extent_split_interior_error);
if (to_salvage != NULL) {
if (config_prof) {
extent_gdump_add(tsdn, to_salvage);
}
extent_record(tsdn, arena, r_extent_hooks,
&arena->extents_retained, to_salvage, true);
}
if (to_leak != NULL) {
extent_deregister_no_gdump_sub(tsdn, to_leak);
extents_leak(tsdn, arena, r_extent_hooks,
&arena->extents_retained, to_leak, true);
}
goto label_err;
}
if (*commit && !extent_committed_get(extent)) {
@ -1177,13 +1345,14 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena,
}
/*
* Increment extent_grow_next if doing so wouldn't exceed the legal
* Increment extent_grow_next if doing so wouldn't exceed the allowed
* range.
*/
if (arena->extent_grow_next + egn_skip + 1 < NPSIZES) {
if (arena->extent_grow_next + egn_skip + 1 <=
arena->retain_grow_limit) {
arena->extent_grow_next += egn_skip + 1;
} else {
arena->extent_grow_next = NPSIZES - 1;
arena->extent_grow_next = arena->retain_grow_limit;
}
/* All opportunities for failure are past. */
malloc_mutex_unlock(tsdn, &arena->extent_grow_mtx);
@ -1271,7 +1440,8 @@ extent_alloc_wrapper_hard(tsdn_t *tsdn, arena_t *arena,
return NULL;
}
extent_init(extent, arena, addr, esize, slab, szind,
arena_extent_sn_next(arena), extent_state_active, zero, commit);
arena_extent_sn_next(arena), extent_state_active, *zero, *commit,
true);
if (pad != 0) {
extent_addr_randomize(tsdn, extent, alignment);
}
@ -1296,10 +1466,20 @@ extent_alloc_wrapper(tsdn_t *tsdn, arena_t *arena,
extent_t *extent = extent_alloc_retained(tsdn, arena, r_extent_hooks,
new_addr, size, pad, alignment, slab, szind, zero, commit);
if (extent == NULL) {
if (opt_retain && new_addr != NULL) {
/*
* When retain is enabled and new_addr is set, do not
* attempt extent_alloc_wrapper_hard: it maps fresh memory
* that is very unlikely to land at new_addr (unless the
* request happens to be at the end of the existing mappings).
*/
return NULL;
}
extent = extent_alloc_wrapper_hard(tsdn, arena, r_extent_hooks,
new_addr, size, pad, alignment, slab, szind, zero, commit);
}
assert(extent == NULL || extent_dumpable_get(extent));
return extent;
}
@ -1329,16 +1509,7 @@ extent_coalesce(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
bool growing_retained) {
assert(extent_can_coalesce(arena, extents, inner, outer));
if (forward && extents->delay_coalesce) {
/*
* The extent that remains after coalescing must occupy the
* outer extent's position in the LRU. For forward coalescing,
* swap the inner extent into the LRU.
*/
extent_list_replace(&extents->lru, outer, inner);
}
extent_activate_locked(tsdn, arena, extents, outer,
extents->delay_coalesce);
extent_activate_locked(tsdn, arena, extents, outer);
malloc_mutex_unlock(tsdn, &extents->mtx);
bool err = extent_merge_impl(tsdn, arena, r_extent_hooks,
@ -1346,11 +1517,7 @@ extent_coalesce(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
malloc_mutex_lock(tsdn, &extents->mtx);
if (err) {
if (forward && extents->delay_coalesce) {
extent_list_replace(&extents->lru, inner, outer);
}
extent_deactivate_locked(tsdn, arena, extents, outer,
extents->delay_coalesce);
extent_deactivate_locked(tsdn, arena, extents, outer);
}
return err;
@ -1422,6 +1589,10 @@ extent_try_coalesce(tsdn_t *tsdn, arena_t *arena,
return extent;
}
/*
* Handles the metadata-management portion of putting an unused extent into the
* given extents_t (coalescing, deregistering slab interiors, the heap operations).
*/
static void
extent_record(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
extents_t *extents, extent_t *extent, bool growing_retained) {
@ -1447,9 +1618,20 @@ extent_record(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks,
if (!extents->delay_coalesce) {
extent = extent_try_coalesce(tsdn, arena, r_extent_hooks,
rtree_ctx, extents, extent, NULL, growing_retained);
} else if (extent_size_get(extent) >= LARGE_MINCLASS) {
/* Always coalesce large extents eagerly. */
bool coalesced;
size_t prev_size;
do {
prev_size = extent_size_get(extent);
assert(extent_state_get(extent) == extent_state_active);
extent = extent_try_coalesce(tsdn, arena,
r_extent_hooks, rtree_ctx, extents, extent,
&coalesced, growing_retained);
} while (coalesced &&
extent_size_get(extent) >= prev_size + LARGE_MINCLASS);
}
extent_deactivate_locked(tsdn, arena, extents, extent, false);
extent_deactivate_locked(tsdn, arena, extents, extent);
malloc_mutex_unlock(tsdn, &extents->mtx);
}
@ -1520,6 +1702,7 @@ extent_dalloc_wrapper_try(tsdn_t *tsdn, arena_t *arena,
void
extent_dalloc_wrapper(tsdn_t *tsdn, arena_t *arena,
extent_hooks_t **r_extent_hooks, extent_t *extent) {
assert(extent_dumpable_get(extent));
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 0);
@ -1780,6 +1963,13 @@ extent_split_default(extent_hooks_t *extent_hooks, void *addr, size_t size,
}
#endif
/*
* Accepts the extent to split, and the characteristics of each side of the
* split. The 'a' parameters go with the 'lead' of the resulting pair of
* extents (the lower addressed portion of the split), and the 'b' parameters go
* with the trail (the higher addressed portion). This makes 'extent' the lead,
* and returns the trail (except in case of error).
*/
static extent_t *
extent_split_impl(tsdn_t *tsdn, arena_t *arena,
extent_hooks_t **r_extent_hooks, extent_t *extent, size_t size_a,
@ -1803,7 +1993,7 @@ extent_split_impl(tsdn_t *tsdn, arena_t *arena,
extent_init(trail, arena, (void *)((uintptr_t)extent_base_get(extent) +
size_a), size_b, slab_b, szind_b, extent_sn_get(extent),
extent_state_get(extent), extent_zeroed_get(extent),
extent_committed_get(extent));
extent_committed_get(extent), extent_dumpable_get(extent));
rtree_ctx_t rtree_ctx_fallback;
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
@ -1814,7 +2004,7 @@ extent_split_impl(tsdn_t *tsdn, arena_t *arena,
extent_init(&lead, arena, extent_addr_get(extent), size_a,
slab_a, szind_a, extent_sn_get(extent),
extent_state_get(extent), extent_zeroed_get(extent),
extent_committed_get(extent));
extent_committed_get(extent), extent_dumpable_get(extent));
extent_rtree_leaf_elms_lookup(tsdn, rtree_ctx, &lead, false,
true, &lead_elm_a, &lead_elm_b);

View File

@ -156,7 +156,7 @@ extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size,
extent_init(gap, arena, gap_addr_page,
gap_size_page, false, NSIZES,
arena_extent_sn_next(arena),
extent_state_active, false, true);
extent_state_active, false, true, true);
}
/*
* Compute the address just past the end of the desired
@ -199,7 +199,8 @@ extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size,
extent_init(&extent, arena, ret, size,
size, false, NSIZES,
extent_state_active, false, true);
extent_state_active, false, true,
true);
if (extent_purge_forced_wrapper(tsdn,
arena, &extent_hooks, &extent, 0,
size)) {

View File

@ -8,6 +8,7 @@
#include "jemalloc/internal/extent_dss.h"
#include "jemalloc/internal/extent_mmap.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/log.h"
#include "jemalloc/internal/malloc_io.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/rtree.h"
@ -852,10 +853,8 @@ malloc_conf_error(const char *msg, const char *k, size_t klen, const char *v,
size_t vlen) {
malloc_printf("<jemalloc>: %s: %.*s:%.*s\n", msg, (int)klen, k,
(int)vlen, v);
/* If abort_conf is set, error out after processing all options. */
had_conf_error = true;
if (opt_abort_conf) {
malloc_abort_invalid_conf();
}
}
static void
@ -1055,8 +1054,22 @@ malloc_conf_init(void) {
CONF_HANDLE_BOOL(opt_abort, "abort")
CONF_HANDLE_BOOL(opt_abort_conf, "abort_conf")
if (opt_abort_conf && had_conf_error) {
malloc_abort_invalid_conf();
if (strncmp("metadata_thp", k, klen) == 0) {
int i;
bool match = false;
for (i = 0; i < metadata_thp_mode_limit; i++) {
if (strncmp(metadata_thp_mode_names[i],
v, vlen) == 0) {
opt_metadata_thp = i;
match = true;
break;
}
}
if (!match) {
malloc_conf_error("Invalid conf value",
k, klen, v, vlen);
}
continue;
}
CONF_HANDLE_BOOL(opt_retain, "retain")
if (strncmp("dss", k, klen) == 0) {
@ -1132,12 +1145,14 @@ malloc_conf_init(void) {
CONF_HANDLE_BOOL(opt_xmalloc, "xmalloc")
}
CONF_HANDLE_BOOL(opt_tcache, "tcache")
CONF_HANDLE_SIZE_T(opt_lg_extent_max_active_fit,
"lg_extent_max_active_fit", 0,
(sizeof(size_t) << 3), yes, yes, false)
CONF_HANDLE_SSIZE_T(opt_lg_tcache_max, "lg_tcache_max",
-1, (sizeof(size_t) << 3) - 1)
if (strncmp("percpu_arena", k, klen) == 0) {
int i;
bool match = false;
for (i = percpu_arena_mode_names_base; i <
for (int i = percpu_arena_mode_names_base; i <
percpu_arena_mode_names_limit; i++) {
if (strncmp(percpu_arena_mode_names[i],
v, vlen) == 0) {
@ -1159,6 +1174,10 @@ malloc_conf_init(void) {
}
CONF_HANDLE_BOOL(opt_background_thread,
"background_thread");
CONF_HANDLE_SIZE_T(opt_max_background_threads,
"max_background_threads", 1,
opt_max_background_threads, yes, yes,
true);
if (config_prof) {
CONF_HANDLE_BOOL(opt_prof, "prof")
CONF_HANDLE_CHAR_P(opt_prof_prefix,
@ -1177,6 +1196,37 @@ malloc_conf_init(void) {
CONF_HANDLE_BOOL(opt_prof_final, "prof_final")
CONF_HANDLE_BOOL(opt_prof_leak, "prof_leak")
}
if (config_log) {
if (CONF_MATCH("log")) {
size_t cpylen = (
vlen < sizeof(log_var_names) ?
vlen : sizeof(log_var_names) - 1);
strncpy(log_var_names, v, cpylen);
log_var_names[cpylen] = '\0';
continue;
}
}
if (CONF_MATCH("thp")) {
bool match = false;
for (int i = 0; i < thp_mode_names_limit; i++) {
if (strncmp(thp_mode_names[i], v, vlen)
== 0) {
if (!have_madvise_huge) {
malloc_conf_error(
"No THP support",
k, klen, v, vlen);
}
opt_thp = i;
match = true;
break;
}
}
if (!match) {
malloc_conf_error("Invalid conf value",
k, klen, v, vlen);
}
continue;
}
malloc_conf_error("Invalid conf pair", k, klen, v,
vlen);
#undef CONF_MATCH
@ -1192,7 +1242,11 @@ malloc_conf_init(void) {
#undef CONF_HANDLE_SSIZE_T
#undef CONF_HANDLE_CHAR_P
}
if (opt_abort_conf && had_conf_error) {
malloc_abort_invalid_conf();
}
}
atomic_store_b(&log_init_done, true, ATOMIC_RELEASE);
}
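A hedged sketch of exercising several of the newly parsed options at startup through the documented malloc_conf string (the same settings can be passed via the MALLOC_CONF environment variable). The option values are arbitrary; the global is named malloc_conf in upstream jemalloc builds, and thp/metadata_thp settings only take effect where MADV_HUGEPAGE is available.

#include <stdlib.h>
#include <jemalloc/jemalloc.h>

/* Read before the first allocation is served. */
const char *malloc_conf = "metadata_thp:auto,"
    "background_thread:true,max_background_threads:4,"
    "lg_extent_max_active_fit:6";

int
main(void) {
	void *p = malloc(1);	/* first use triggers option parsing */
	free(p);
	return 0;
}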
static bool
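The options wired into the parser above (metadata_thp, thp, lg_extent_max_active_fit, max_background_threads, and the log filter) are selected like any other jemalloc option, for example through the string pointed to by the global malloc_conf variable or the MALLOC_CONF environment variable as documented by upstream jemalloc; with abort_conf:true, the relocated check above reports every invalid entry before aborting. A minimal illustration follows — the numeric values are arbitrary, and the metadata_thp mode name "auto" comes from the jemalloc documentation rather than from this excerpt:

/*
 * Illustrative only: exercises the options handled by the new conf
 * cases above. The numeric values and mode choices are arbitrary.
 */
const char *malloc_conf =
    "metadata_thp:auto,thp:never,max_background_threads:4,"
    "lg_extent_max_active_fit:6,log:core.malloc.entry|core.free.exit";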
@ -1497,6 +1551,8 @@ malloc_init_hard(void) {
post_reentrancy(tsd);
malloc_mutex_unlock(tsd_tsdn(tsd), &init_lock);
witness_assert_lockless(witness_tsd_tsdn(
tsd_witness_tsdp_get_unsafe(tsd)));
malloc_tsd_boot1();
/* Update TSD after tsd_boot1. */
tsd = tsd_fetch();
@ -1504,8 +1560,11 @@ malloc_init_hard(void) {
assert(have_background_thread);
/*
* Need to finish init & unlock first before creating background
* threads (pthread_create depends on malloc).
* threads (pthread_create depends on malloc). ctl_init (which
* sets isthreaded) needs to be called without holding any lock.
*/
background_thread_ctl_init(tsd_tsdn(tsd));
malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock);
bool err = background_thread_create(tsd, 0);
malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock);
@ -1705,7 +1764,7 @@ compute_size_with_overflow(bool may_overflow, dynamic_opts_t *dopts,
}
/* A size_t with its high-half bits all set to 1. */
const static size_t high_bits = SIZE_T_MAX << (sizeof(size_t) * 8 / 2);
static const size_t high_bits = SIZE_T_MAX << (sizeof(size_t) * 8 / 2);
*size = dopts->item_size * dopts->num_items;
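The constant rewritten just above backs a cheap pre-check in compute_size_with_overflow (the rest of the function is outside this hunk): if neither operand has a bit set in the upper half of a size_t, their product cannot overflow. A self-contained sketch of that reasoning, with a hypothetical helper name and SIZE_MAX standing in for jemalloc's SIZE_T_MAX:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Returns true when num_items * item_size might not fit in a size_t. */
static bool
mul_may_overflow(size_t num_items, size_t item_size) {
	/* A size_t with its high-half bits all set to 1. */
	const size_t high_bits = SIZE_MAX << (sizeof(size_t) * 8 / 2);
	/* If both operands are < 2^(N/2), the product is < 2^N. */
	return ((num_items | item_size) & high_bits) != 0;
}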
@ -1966,6 +2025,8 @@ je_malloc(size_t size) {
static_opts_t sopts;
dynamic_opts_t dopts;
LOG("core.malloc.entry", "size: %zu", size);
static_opts_init(&sopts);
dynamic_opts_init(&dopts);
@ -1980,6 +2041,8 @@ je_malloc(size_t size) {
imalloc(&sopts, &dopts);
LOG("core.malloc.exit", "result: %p", ret);
return ret;
}
@ -1990,6 +2053,9 @@ je_posix_memalign(void **memptr, size_t alignment, size_t size) {
static_opts_t sopts;
dynamic_opts_t dopts;
LOG("core.posix_memalign.entry", "mem ptr: %p, alignment: %zu, "
"size: %zu", memptr, alignment, size);
static_opts_init(&sopts);
dynamic_opts_init(&dopts);
@ -2006,6 +2072,10 @@ je_posix_memalign(void **memptr, size_t alignment, size_t size) {
dopts.alignment = alignment;
ret = imalloc(&sopts, &dopts);
LOG("core.posix_memalign.exit", "result: %d, alloc ptr: %p", ret,
*memptr);
return ret;
}
@ -2018,6 +2088,9 @@ je_aligned_alloc(size_t alignment, size_t size) {
static_opts_t sopts;
dynamic_opts_t dopts;
LOG("core.aligned_alloc.entry", "alignment: %zu, size: %zu\n",
alignment, size);
static_opts_init(&sopts);
dynamic_opts_init(&dopts);
@ -2036,6 +2109,9 @@ je_aligned_alloc(size_t alignment, size_t size) {
dopts.alignment = alignment;
imalloc(&sopts, &dopts);
LOG("core.aligned_alloc.exit", "result: %p", ret);
return ret;
}
@ -2047,6 +2123,8 @@ je_calloc(size_t num, size_t size) {
static_opts_t sopts;
dynamic_opts_t dopts;
LOG("core.calloc.entry", "num: %zu, size: %zu\n", num, size);
static_opts_init(&sopts);
dynamic_opts_init(&dopts);
@ -2063,6 +2141,8 @@ je_calloc(size_t num, size_t size) {
imalloc(&sopts, &dopts);
LOG("core.calloc.exit", "result: %p", ret);
return ret;
}
@ -2165,17 +2245,37 @@ isfree(tsd_t *tsd, void *ptr, size_t usize, tcache_t *tcache, bool slow_path) {
assert(malloc_initialized() || IS_INITIALIZER);
alloc_ctx_t alloc_ctx, *ctx;
if (config_prof && opt_prof) {
if (!config_cache_oblivious && ((uintptr_t)ptr & PAGE_MASK) != 0) {
/*
* When cache_oblivious is disabled and ptr is not page aligned,
* the allocation was not sampled -- usize can be used to
* determine szind directly.
*/
alloc_ctx.szind = sz_size2index(usize);
alloc_ctx.slab = true;
ctx = &alloc_ctx;
if (config_debug) {
alloc_ctx_t dbg_ctx;
rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd);
rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree,
rtree_ctx, (uintptr_t)ptr, true, &dbg_ctx.szind,
&dbg_ctx.slab);
assert(dbg_ctx.szind == alloc_ctx.szind);
assert(dbg_ctx.slab == alloc_ctx.slab);
}
} else if (config_prof && opt_prof) {
rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd);
rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx,
(uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab);
assert(alloc_ctx.szind == sz_size2index(usize));
ctx = &alloc_ctx;
prof_free(tsd, ptr, usize, ctx);
} else {
ctx = NULL;
}
if (config_prof && opt_prof) {
prof_free(tsd, ptr, usize, ctx);
}
if (config_stats) {
*tsd_thread_deallocatedp_get(tsd) += usize;
}
@ -2196,6 +2296,8 @@ je_realloc(void *ptr, size_t size) {
size_t usize JEMALLOC_CC_SILENCE_INIT(0);
size_t old_usize = 0;
LOG("core.realloc.entry", "ptr: %p, size: %zu\n", ptr, size);
if (unlikely(size == 0)) {
if (ptr != NULL) {
/* realloc(ptr, 0) is equivalent to free(ptr). */
@ -2208,6 +2310,8 @@ je_realloc(void *ptr, size_t size) {
tcache = NULL;
}
ifree(tsd, ptr, tcache, true);
LOG("core.realloc.exit", "result: %p", NULL);
return NULL;
}
size = 1;
@ -2240,7 +2344,9 @@ je_realloc(void *ptr, size_t size) {
tsdn = tsd_tsdn(tsd);
} else {
/* realloc(NULL, size) is equivalent to malloc(size). */
return je_malloc(size);
void *ret = je_malloc(size);
LOG("core.realloc.exit", "result: %p", ret);
return ret;
}
if (unlikely(ret == NULL)) {
@ -2261,11 +2367,15 @@ je_realloc(void *ptr, size_t size) {
}
UTRACE(ptr, size, ret);
check_entry_exit_locking(tsdn);
LOG("core.realloc.exit", "result: %p", ret);
return ret;
}
JEMALLOC_EXPORT void JEMALLOC_NOTHROW
je_free(void *ptr) {
LOG("core.free.entry", "ptr: %p", ptr);
UTRACE(ptr, 0, 0);
if (likely(ptr != NULL)) {
/*
@ -2295,6 +2405,7 @@ je_free(void *ptr) {
}
check_entry_exit_locking(tsd_tsdn(tsd));
}
LOG("core.free.exit", "");
}
/*
@ -2314,6 +2425,9 @@ je_memalign(size_t alignment, size_t size) {
static_opts_t sopts;
dynamic_opts_t dopts;
LOG("core.memalign.entry", "alignment: %zu, size: %zu\n", alignment,
size);
static_opts_init(&sopts);
dynamic_opts_init(&dopts);
@ -2331,6 +2445,8 @@ je_memalign(size_t alignment, size_t size) {
dopts.alignment = alignment;
imalloc(&sopts, &dopts);
LOG("core.memalign.exit", "result: %p", ret);
return ret;
}
#endif
@ -2345,6 +2461,8 @@ je_valloc(size_t size) {
static_opts_t sopts;
dynamic_opts_t dopts;
LOG("core.valloc.entry", "size: %zu\n", size);
static_opts_init(&sopts);
dynamic_opts_init(&dopts);
@ -2363,6 +2481,7 @@ je_valloc(size_t size) {
imalloc(&sopts, &dopts);
LOG("core.valloc.exit", "result: %p\n", ret);
return ret;
}
#endif
@ -2436,6 +2555,8 @@ je_mallocx(size_t size, int flags) {
static_opts_t sopts;
dynamic_opts_t dopts;
LOG("core.mallocx.entry", "size: %zu, flags: %d", size, flags);
static_opts_init(&sopts);
dynamic_opts_init(&dopts);
@ -2469,6 +2590,8 @@ je_mallocx(size_t size, int flags) {
}
imalloc(&sopts, &dopts);
LOG("core.mallocx.exit", "result: %p", ret);
return ret;
}
@ -2549,6 +2672,10 @@ je_rallocx(void *ptr, size_t size, int flags) {
arena_t *arena;
tcache_t *tcache;
LOG("core.rallocx.entry", "ptr: %p, size: %zu, flags: %d", ptr,
size, flags);
assert(ptr != NULL);
assert(size != 0);
assert(malloc_initialized() || IS_INITIALIZER);
@ -2611,6 +2738,8 @@ je_rallocx(void *ptr, size_t size, int flags) {
}
UTRACE(ptr, size, p);
check_entry_exit_locking(tsd_tsdn(tsd));
LOG("core.rallocx.exit", "result: %p", p);
return p;
label_oom:
if (config_xmalloc && unlikely(opt_xmalloc)) {
@ -2619,6 +2748,8 @@ je_rallocx(void *ptr, size_t size, int flags) {
}
UTRACE(ptr, size, 0);
check_entry_exit_locking(tsd_tsdn(tsd));
LOG("core.rallocx.exit", "result: %p", NULL);
return NULL;
}
@ -2705,6 +2836,9 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) {
size_t alignment = MALLOCX_ALIGN_GET(flags);
bool zero = flags & MALLOCX_ZERO;
LOG("core.xallocx.entry", "ptr: %p, size: %zu, extra: %zu, "
"flags: %d", ptr, size, extra, flags);
assert(ptr != NULL);
assert(size != 0);
assert(SIZE_T_MAX - size >= extra);
@ -2754,15 +2888,19 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) {
label_not_resized:
UTRACE(ptr, size, ptr);
check_entry_exit_locking(tsd_tsdn(tsd));
LOG("core.xallocx.exit", "result: %zu", usize);
return usize;
}
JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW
JEMALLOC_ATTR(pure)
je_sallocx(const void *ptr, int flags) {
je_sallocx(const void *ptr, UNUSED int flags) {
size_t usize;
tsdn_t *tsdn;
LOG("core.sallocx.entry", "ptr: %p, flags: %d", ptr, flags);
assert(malloc_initialized() || IS_INITIALIZER);
assert(ptr != NULL);
@ -2777,11 +2915,15 @@ je_sallocx(const void *ptr, int flags) {
}
check_entry_exit_locking(tsdn);
LOG("core.sallocx.exit", "result: %zu", usize);
return usize;
}
JEMALLOC_EXPORT void JEMALLOC_NOTHROW
je_dallocx(void *ptr, int flags) {
LOG("core.dallocx.entry", "ptr: %p, flags: %d", ptr, flags);
assert(ptr != NULL);
assert(malloc_initialized() || IS_INITIALIZER);
@ -2819,6 +2961,8 @@ je_dallocx(void *ptr, int flags) {
ifree(tsd, ptr, tcache, true);
}
check_entry_exit_locking(tsd_tsdn(tsd));
LOG("core.dallocx.exit", "");
}
JEMALLOC_ALWAYS_INLINE size_t
@ -2840,6 +2984,9 @@ je_sdallocx(void *ptr, size_t size, int flags) {
assert(ptr != NULL);
assert(malloc_initialized() || IS_INITIALIZER);
LOG("core.sdallocx.entry", "ptr: %p, size: %zu, flags: %d", ptr,
size, flags);
tsd_t *tsd = tsd_fetch();
bool fast = tsd_fast(tsd);
size_t usize = inallocx(tsd_tsdn(tsd), size, flags);
@ -2876,6 +3023,8 @@ je_sdallocx(void *ptr, size_t size, int flags) {
isfree(tsd, ptr, usize, tcache, true);
}
check_entry_exit_locking(tsd_tsdn(tsd));
LOG("core.sdallocx.exit", "");
}
JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW
@ -2887,6 +3036,7 @@ je_nallocx(size_t size, int flags) {
assert(size != 0);
if (unlikely(malloc_init())) {
LOG("core.nallocx.exit", "result: %zu", ZU(0));
return 0;
}
@ -2895,10 +3045,12 @@ je_nallocx(size_t size, int flags) {
usize = inallocx(tsdn, size, flags);
if (unlikely(usize > LARGE_MAXCLASS)) {
LOG("core.nallocx.exit", "result: %zu", ZU(0));
return 0;
}
check_entry_exit_locking(tsdn);
LOG("core.nallocx.exit", "result: %zu", usize);
return usize;
}
@ -2908,7 +3060,10 @@ je_mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp,
int ret;
tsd_t *tsd;
LOG("core.mallctl.entry", "name: %s", name);
if (unlikely(malloc_init())) {
LOG("core.mallctl.exit", "result: %d", EAGAIN);
return EAGAIN;
}
@ -2916,6 +3071,8 @@ je_mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp,
check_entry_exit_locking(tsd_tsdn(tsd));
ret = ctl_byname(tsd, name, oldp, oldlenp, newp, newlen);
check_entry_exit_locking(tsd_tsdn(tsd));
LOG("core.mallctl.exit", "result: %d", ret);
return ret;
}
@ -2923,7 +3080,10 @@ JEMALLOC_EXPORT int JEMALLOC_NOTHROW
je_mallctlnametomib(const char *name, size_t *mibp, size_t *miblenp) {
int ret;
LOG("core.mallctlnametomib.entry", "name: %s", name);
if (unlikely(malloc_init())) {
LOG("core.mallctlnametomib.exit", "result: %d", EAGAIN);
return EAGAIN;
}
@ -2931,6 +3091,8 @@ je_mallctlnametomib(const char *name, size_t *mibp, size_t *miblenp) {
check_entry_exit_locking(tsd_tsdn(tsd));
ret = ctl_nametomib(tsd, name, mibp, miblenp);
check_entry_exit_locking(tsd_tsdn(tsd));
LOG("core.mallctlnametomib.exit", "result: %d", ret);
return ret;
}
@ -2940,7 +3102,10 @@ je_mallctlbymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
int ret;
tsd_t *tsd;
LOG("core.mallctlbymib.entry", "");
if (unlikely(malloc_init())) {
LOG("core.mallctlbymib.exit", "result: %d", EAGAIN);
return EAGAIN;
}
@ -2948,6 +3113,7 @@ je_mallctlbymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
check_entry_exit_locking(tsd_tsdn(tsd));
ret = ctl_bymib(tsd, mib, miblen, oldp, oldlenp, newp, newlen);
check_entry_exit_locking(tsd_tsdn(tsd));
LOG("core.mallctlbymib.exit", "result: %d", ret);
return ret;
}
@ -2956,10 +3122,13 @@ je_malloc_stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
const char *opts) {
tsdn_t *tsdn;
LOG("core.malloc_stats_print.entry", "");
tsdn = tsdn_fetch();
check_entry_exit_locking(tsdn);
stats_print(write_cb, cbopaque, opts);
check_entry_exit_locking(tsdn);
LOG("core.malloc_stats_print.exit", "");
}
JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW
@ -2967,6 +3136,8 @@ je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) {
size_t ret;
tsdn_t *tsdn;
LOG("core.malloc_usable_size.entry", "ptr: %p", ptr);
assert(malloc_initialized() || IS_INITIALIZER);
tsdn = tsdn_fetch();
@ -2984,6 +3155,7 @@ je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) {
}
check_entry_exit_locking(tsdn);
LOG("core.malloc_usable_size.exit", "result: %zu", ret);
return ret;
}

View File

@ -0,0 +1,78 @@
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"
#include "jemalloc/internal/log.h"
char log_var_names[JEMALLOC_LOG_VAR_BUFSIZE];
atomic_b_t log_init_done = ATOMIC_INIT(false);
/*
* Returns a pointer to the first character just past the end of the current
* segment, i.e. the next '|' delimiter or the terminating '\0'.
*/
static const char *
log_var_extract_segment(const char* segment_begin) {
const char *end;
for (end = segment_begin; *end != '\0' && *end != '|'; end++) {
}
return end;
}
static bool
log_var_matches_segment(const char *segment_begin, const char *segment_end,
const char *log_var_begin, const char *log_var_end) {
assert(segment_begin <= segment_end);
assert(log_var_begin < log_var_end);
ptrdiff_t segment_len = segment_end - segment_begin;
ptrdiff_t log_var_len = log_var_end - log_var_begin;
/* The special '.' segment matches everything. */
if (segment_len == 1 && *segment_begin == '.') {
return true;
}
if (segment_len == log_var_len) {
return strncmp(segment_begin, log_var_begin, segment_len) == 0;
} else if (segment_len < log_var_len) {
return strncmp(segment_begin, log_var_begin, segment_len) == 0
&& log_var_begin[segment_len] == '.';
} else {
return false;
}
}
unsigned
log_var_update_state(log_var_t *log_var) {
const char *log_var_begin = log_var->name;
const char *log_var_end = log_var->name + strlen(log_var->name);
/* Pointer to one before the beginning of the current segment. */
const char *segment_begin = log_var_names;
/*
* If log_init_done is false, we haven't parsed the malloc conf yet. To
* avoid log-spew, we default to not displaying anything.
*/
if (!atomic_load_b(&log_init_done, ATOMIC_ACQUIRE)) {
return LOG_INITIALIZED_NOT_ENABLED;
}
while (true) {
const char *segment_end = log_var_extract_segment(
segment_begin);
assert(segment_end < log_var_names + JEMALLOC_LOG_VAR_BUFSIZE);
if (log_var_matches_segment(segment_begin, segment_end,
log_var_begin, log_var_end)) {
atomic_store_u(&log_var->state, LOG_ENABLED,
ATOMIC_RELAXED);
return LOG_ENABLED;
}
if (*segment_end == '\0') {
/* Hit the end of the segment string with no match. */
atomic_store_u(&log_var->state,
LOG_INITIALIZED_NOT_ENABLED, ATOMIC_RELAXED);
return LOG_INITIALIZED_NOT_ENABLED;
}
/* Otherwise, skip the delimiter and continue. */
segment_begin = segment_end + 1;
}
}
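The list that log_var_update_state() walks is the raw value of the "log" option copied verbatim by the CONF_MATCH("log") case in jemalloc.c above: entries are separated by '|', a lone '.' matches every log variable, and a segment also matches any variable for which it is a prefix ending at a '.' boundary. A standalone restatement of that matching rule (illustrative helper names, not the jemalloc API):

#include <stdbool.h>
#include <string.h>

static bool
segment_matches(const char *seg, size_t seg_len, const char *name) {
	size_t name_len = strlen(name);
	if (seg_len == 1 && seg[0] == '.') {
		return true;	/* The '.' wildcard enables everything. */
	}
	if (seg_len == name_len) {
		return strncmp(seg, name, seg_len) == 0;
	}
	if (seg_len < name_len) {
		/* Prefix match, but only at a '.' boundary. */
		return strncmp(seg, name, seg_len) == 0 &&
		    name[seg_len] == '.';
	}
	return false;
}

static bool
log_name_enabled(const char *conf, const char *name) {
	const char *seg = conf;
	for (;;) {
		const char *end = seg;
		while (*end != '\0' && *end != '|') {
			end++;
		}
		if (segment_matches(seg, (size_t)(end - seg), name)) {
			return true;
		}
		if (*end == '\0') {
			return false;
		}
		seg = end + 1;	/* Skip the '|' delimiter. */
	}
}

Under this rule a conf value such as log:core.malloc|. enables everything, while log:core.malloc enables core.malloc.entry and core.malloc.exit but not core.free.exit.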

View File

@ -70,20 +70,7 @@ static char *x2s(uintmax_t x, bool alt_form, bool uppercase, char *s,
/* malloc_message() setup. */
static void
wrtmessage(void *cbopaque, const char *s) {
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_write)
/*
* Use syscall(2) rather than write(2) when possible in order to avoid
* the possibility of memory allocation within libc. This is necessary
* on FreeBSD; most operating systems do not have this problem though.
*
* syscall() returns long or int, depending on platform, so capture the
* unused result in the widest plausible type to avoid compiler
* warnings.
*/
UNUSED long result = syscall(SYS_write, STDERR_FILENO, s, strlen(s));
#else
UNUSED ssize_t result = write(STDERR_FILENO, s, strlen(s));
#endif
malloc_write_fd(STDERR_FILENO, s, strlen(s));
}
JEMALLOC_EXPORT void (*je_malloc_message)(void *, const char *s);
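malloc_write_fd() used here, and malloc_read_fd() which replaces raw read() calls later in this diff, are declared elsewhere and not shown. Assuming they simply fold in the syscall-first behavior the removed comment described, the write-side helper would look roughly like:

#include <unistd.h>
#if defined(JEMALLOC_USE_SYSCALL)
#include <sys/syscall.h>
#endif

static inline ssize_t
malloc_write_fd(int fd, const void *buf, size_t count) {
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_write)
	/* Prefer the raw syscall so no allocating libc path is entered. */
	return (ssize_t)syscall(SYS_write, fd, buf, count);
#else
	return write(fd, buf, count);
#endif
}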
@ -125,7 +112,7 @@ buferror(int err, char *buf, size_t buflen) {
FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, NULL, err, 0,
(LPSTR)buf, (DWORD)buflen, NULL);
return 0;
#elif defined(__GLIBC__) && defined(_GNU_SOURCE)
#elif defined(JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE) && defined(_GNU_SOURCE)
char *b = strerror_r(err, buf, buflen);
if (b != buf) {
strncpy(buf, b, buflen);

View File

@ -4,6 +4,7 @@
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/malloc_io.h"
#include "jemalloc/internal/spin.h"
#ifndef _CRT_SPINCOUNT
#define _CRT_SPINCOUNT 4000
@ -64,7 +65,7 @@ malloc_mutex_lock_slow(malloc_mutex_t *mutex) {
int cnt = 0, max_cnt = MALLOC_MUTEX_MAX_SPIN;
do {
CPU_SPINWAIT;
spin_cpu_spinwait();
if (!malloc_mutex_trylock_final(mutex)) {
data->n_spin_acquired++;
return;
@ -194,7 +195,7 @@ malloc_mutex_init(malloc_mutex_t *mutex, const char *name,
mutex->lock_order = lock_order;
if (lock_order == malloc_mutex_address_ordered) {
witness_init(&mutex->witness, name, rank,
mutex_addr_comp, &mutex);
mutex_addr_comp, mutex);
} else {
witness_init(&mutex->witness, name, rank, NULL, NULL);
}

View File

@ -10,6 +10,9 @@
#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
#include <sys/sysctl.h>
#ifdef __FreeBSD__
#include <vm/vm_param.h>
#endif
#endif
/******************************************************************************/
@ -25,6 +28,18 @@ static int mmap_flags;
#endif
static bool os_overcommits;
const char *thp_mode_names[] = {
"default",
"always",
"never",
"not supported"
};
thp_mode_t opt_thp = THP_MODE_DEFAULT;
thp_mode_t init_system_thp_mode;
/* Runtime support for lazy purge. Irrelevant when !pages_can_purge_lazy. */
static bool pages_can_purge_lazy_runtime = true;
/******************************************************************************/
/*
* Function prototypes for static functions that are referenced prior to
@ -252,12 +267,25 @@ pages_purge_lazy(void *addr, size_t size) {
if (!pages_can_purge_lazy) {
return true;
}
if (!pages_can_purge_lazy_runtime) {
/*
* Built with lazy purge enabled, but detected it was not
* supported on the current system.
*/
return true;
}
#ifdef _WIN32
VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);
return false;
#elif defined(JEMALLOC_PURGE_MADVISE_FREE)
return (madvise(addr, size, MADV_FREE) != 0);
return (madvise(addr, size,
# ifdef MADV_FREE
MADV_FREE
# else
JEMALLOC_MADV_FREE
# endif
) != 0);
#elif defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
!defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
return (madvise(addr, size, MADV_DONTNEED) != 0);
@ -286,12 +314,13 @@ pages_purge_forced(void *addr, size_t size) {
#endif
}
bool
pages_huge(void *addr, size_t size) {
assert(HUGEPAGE_ADDR2BASE(addr) == addr);
assert(HUGEPAGE_CEILING(size) == size);
#ifdef JEMALLOC_THP
static bool
pages_huge_impl(void *addr, size_t size, bool aligned) {
if (aligned) {
assert(HUGEPAGE_ADDR2BASE(addr) == addr);
assert(HUGEPAGE_CEILING(size) == size);
}
#ifdef JEMALLOC_HAVE_MADVISE_HUGE
return (madvise(addr, size, MADV_HUGEPAGE) != 0);
#else
return true;
@ -299,23 +328,70 @@ pages_huge(void *addr, size_t size) {
}
bool
pages_nohuge(void *addr, size_t size) {
assert(HUGEPAGE_ADDR2BASE(addr) == addr);
assert(HUGEPAGE_CEILING(size) == size);
pages_huge(void *addr, size_t size) {
return pages_huge_impl(addr, size, true);
}
#ifdef JEMALLOC_THP
static bool
pages_huge_unaligned(void *addr, size_t size) {
return pages_huge_impl(addr, size, false);
}
static bool
pages_nohuge_impl(void *addr, size_t size, bool aligned) {
if (aligned) {
assert(HUGEPAGE_ADDR2BASE(addr) == addr);
assert(HUGEPAGE_CEILING(size) == size);
}
#ifdef JEMALLOC_HAVE_MADVISE_HUGE
return (madvise(addr, size, MADV_NOHUGEPAGE) != 0);
#else
return false;
#endif
}
bool
pages_nohuge(void *addr, size_t size) {
return pages_nohuge_impl(addr, size, true);
}
static bool
pages_nohuge_unaligned(void *addr, size_t size) {
return pages_nohuge_impl(addr, size, false);
}
bool
pages_dontdump(void *addr, size_t size) {
assert(PAGE_ADDR2BASE(addr) == addr);
assert(PAGE_CEILING(size) == size);
#ifdef JEMALLOC_MADVISE_DONTDUMP
return madvise(addr, size, MADV_DONTDUMP) != 0;
#else
return false;
#endif
}
bool
pages_dodump(void *addr, size_t size) {
assert(PAGE_ADDR2BASE(addr) == addr);
assert(PAGE_CEILING(size) == size);
#ifdef JEMALLOC_MADVISE_DONTDUMP
return madvise(addr, size, MADV_DODUMP) != 0;
#else
return false;
#endif
}
static size_t
os_page_detect(void) {
#ifdef _WIN32
SYSTEM_INFO si;
GetSystemInfo(&si);
return si.dwPageSize;
#elif defined(__FreeBSD__)
return getpagesize();
#else
long result = sysconf(_SC_PAGESIZE);
if (result == -1) {
@ -332,9 +408,19 @@ os_overcommits_sysctl(void) {
size_t sz;
sz = sizeof(vm_overcommit);
#if defined(__FreeBSD__) && defined(VM_OVERCOMMIT)
int mib[2];
mib[0] = CTL_VM;
mib[1] = VM_OVERCOMMIT;
if (sysctl(mib, 2, &vm_overcommit, &sz, NULL, 0) != 0) {
return false; /* Error. */
}
#else
if (sysctlbyname("vm.overcommit", &vm_overcommit, &sz, NULL, 0) != 0) {
return false; /* Error. */
}
#endif
return ((vm_overcommit & 0x3) == 0);
}
@ -350,27 +436,44 @@ static bool
os_overcommits_proc(void) {
int fd;
char buf[1];
ssize_t nread;
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY |
O_CLOEXEC);
#if defined(O_CLOEXEC)
fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY |
O_CLOEXEC);
#else
fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY);
if (fd != -1) {
fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
}
#endif
#elif defined(JEMALLOC_USE_SYSCALL) && defined(SYS_openat)
fd = (int)syscall(SYS_openat,
AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
#if defined(O_CLOEXEC)
fd = (int)syscall(SYS_openat,
AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
#else
fd = (int)syscall(SYS_openat,
AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY);
if (fd != -1) {
fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
}
#endif
#else
fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
#if defined(O_CLOEXEC)
fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
#else
fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY);
if (fd != -1) {
fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
}
#endif
#endif
if (fd == -1) {
return false; /* Error. */
}
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_read)
nread = (ssize_t)syscall(SYS_read, fd, &buf, sizeof(buf));
#else
nread = read(fd, &buf, sizeof(buf));
#endif
ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
syscall(SYS_close, fd);
#else
@ -390,6 +493,71 @@ os_overcommits_proc(void) {
}
#endif
void
pages_set_thp_state (void *ptr, size_t size) {
if (opt_thp == thp_mode_default || opt_thp == init_system_thp_mode) {
return;
}
assert(opt_thp != thp_mode_not_supported &&
init_system_thp_mode != thp_mode_not_supported);
if (opt_thp == thp_mode_always
&& init_system_thp_mode != thp_mode_never) {
assert(init_system_thp_mode == thp_mode_default);
pages_huge_unaligned(ptr, size);
} else if (opt_thp == thp_mode_never) {
assert(init_system_thp_mode == thp_mode_default ||
init_system_thp_mode == thp_mode_always);
pages_nohuge_unaligned(ptr, size);
}
}
static void
init_thp_state(void) {
if (!have_madvise_huge) {
if (metadata_thp_enabled() && opt_abort) {
malloc_write("<jemalloc>: no MADV_HUGEPAGE support\n");
abort();
}
goto label_error;
}
static const char sys_state_madvise[] = "always [madvise] never\n";
static const char sys_state_always[] = "[always] madvise never\n";
static const char sys_state_never[] = "always madvise [never]\n";
char buf[sizeof(sys_state_madvise)];
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
int fd = (int)syscall(SYS_open,
"/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
#else
int fd = open("/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
#endif
if (fd == -1) {
goto label_error;
}
ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
syscall(SYS_close, fd);
#else
close(fd);
#endif
if (strncmp(buf, sys_state_madvise, (size_t)nread) == 0) {
init_system_thp_mode = thp_mode_default;
} else if (strncmp(buf, sys_state_always, (size_t)nread) == 0) {
init_system_thp_mode = thp_mode_always;
} else if (strncmp(buf, sys_state_never, (size_t)nread) == 0) {
init_system_thp_mode = thp_mode_never;
} else {
goto label_error;
}
return;
label_error:
opt_thp = init_system_thp_mode = thp_mode_not_supported;
}
bool
pages_boot(void) {
os_page = os_page_detect();
@ -418,5 +586,21 @@ pages_boot(void) {
os_overcommits = false;
#endif
init_thp_state();
/* Detect lazy purge runtime support. */
if (pages_can_purge_lazy) {
bool committed = false;
void *madv_free_page = os_pages_map(NULL, PAGE, PAGE, &committed);
if (madv_free_page == NULL) {
return true;
}
assert(pages_can_purge_lazy_runtime);
if (pages_purge_lazy(madv_free_page, PAGE)) {
pages_can_purge_lazy_runtime = false;
}
os_pages_unmap(madv_free_page, PAGE);
}
return false;
}
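The probe added at the end of pages_boot() exists because a binary built with MADV_FREE support may run on a kernel that rejects it, in which case lazy purging must be disabled at runtime. A standalone sketch of the same idea, using plain mmap()/munmap() in place of os_pages_map()/os_pages_unmap() and assuming MAP_ANONYMOUS is available:

#include <stdbool.h>
#include <sys/mman.h>
#include <unistd.h>

static bool
madv_free_supported(void) {
#if defined(MADV_FREE)
	size_t page = (size_t)sysconf(_SC_PAGESIZE);
	void *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		return false;	/* Cannot probe; treat as unsupported. */
	}
	bool supported = (madvise(p, page, MADV_FREE) == 0);
	munmap(p, page);
	return supported;
#else
	return false;
#endif
}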

View File

@ -978,7 +978,7 @@ prof_dump_flush(bool propagate_err) {
cassert(config_prof);
err = write(prof_dump_fd, prof_dump_buf, prof_dump_buf_end);
err = malloc_write_fd(prof_dump_fd, prof_dump_buf, prof_dump_buf_end);
if (err == -1) {
if (!propagate_err) {
malloc_write("<jemalloc>: write() failed during heap "
@ -1409,7 +1409,15 @@ prof_open_maps(const char *format, ...) {
va_start(ap, format);
malloc_vsnprintf(filename, sizeof(filename), format, ap);
va_end(ap);
#if defined(O_CLOEXEC)
mfd = open(filename, O_RDONLY | O_CLOEXEC);
#else
mfd = open(filename, O_RDONLY);
if (mfd != -1) {
fcntl(mfd, F_SETFD, fcntl(mfd, F_GETFD) | FD_CLOEXEC);
}
#endif
return mfd;
}
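This is the same open()-then-fcntl() fallback used twice in pages.c above for systems whose open() lacks O_CLOEXEC. Condensed into a helper for clarity (hypothetical name, not part of the diff):

#include <fcntl.h>

static int
open_read_cloexec(const char *path) {
#if defined(O_CLOEXEC)
	return open(path, O_RDONLY | O_CLOEXEC);
#else
	int fd = open(path, O_RDONLY);
	if (fd != -1) {
		fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
	}
	return fd;
#endif
}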
@ -1463,8 +1471,9 @@ prof_dump_maps(bool propagate_err) {
goto label_return;
}
}
nread = read(mfd, &prof_dump_buf[prof_dump_buf_end],
PROF_DUMP_BUFSIZE - prof_dump_buf_end);
nread = malloc_read_fd(mfd,
&prof_dump_buf[prof_dump_buf_end], PROF_DUMP_BUFSIZE
- prof_dump_buf_end);
} while (nread > 0);
} else {
ret = true;
@ -1772,7 +1781,7 @@ prof_idump(tsdn_t *tsdn) {
cassert(config_prof);
if (!prof_booted || tsdn_null(tsdn)) {
if (!prof_booted || tsdn_null(tsdn) || !prof_active_get_unlocked()) {
return;
}
tsd = tsdn_tsd(tsdn);
@ -1829,7 +1838,7 @@ prof_gdump(tsdn_t *tsdn) {
cassert(config_prof);
if (!prof_booted || tsdn_null(tsdn)) {
if (!prof_booted || tsdn_null(tsdn) || !prof_active_get_unlocked()) {
return;
}
tsd = tsdn_tsd(tsdn);

File diff suppressed because it is too large

View File

@ -26,7 +26,8 @@ const size_t sz_index2size_tab[NSIZES] = {
JEMALLOC_ALIGNED(CACHELINE)
const uint8_t sz_size2index_tab[] = {
#if LG_TINY_MIN == 0
#warning "Dangerous LG_TINY_MIN"
/* The div module doesn't support division by 1. */
#error "Unsupported LG_TINY_MIN"
#define S2B_0(i) i,
#elif LG_TINY_MIN == 1
#warning "Dangerous LG_TINY_MIN"

View File

@ -12,7 +12,7 @@
bool opt_tcache = true;
ssize_t opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT;
tcache_bin_info_t *tcache_bin_info;
cache_bin_info_t *tcache_bin_info;
static unsigned stack_nelms; /* Total stack elms per tcache. */
unsigned nhbins;
@ -40,7 +40,7 @@ void
tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
szind_t binind = tcache->next_gc_bin;
tcache_bin_t *tbin;
cache_bin_t *tbin;
if (binind < NBINS) {
tbin = tcache_small_bin_get(tcache, binind);
} else {
@ -58,7 +58,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
* Reduce fill count by 2X. Limit lg_fill_div such that
* the fill count is always at least 1.
*/
tcache_bin_info_t *tbin_info = &tcache_bin_info[binind];
cache_bin_info_t *tbin_info = &tcache_bin_info[binind];
if ((tbin_info->ncached_max >>
(tcache->lg_fill_div[binind] + 1)) >= 1) {
tcache->lg_fill_div[binind]++;
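To make the fill-count guard above concrete, a worked example with an arbitrary ncached_max:

/*
 * Worked example (ncached_max == 200 chosen arbitrarily): the fill count
 * ncached_max >> lg_fill_div shrinks through 100, 50, 25, 12, 6, 3, 1 as
 * lg_fill_div grows from 1 to 7; the >= 1 test blocks further increments
 * so the fill count never reaches 0.
 */
unsigned ncached_max = 200;
unsigned lg_fill_div = 1;
while ((ncached_max >> (lg_fill_div + 1)) >= 1) {
	lg_fill_div++;
}
/* Here lg_fill_div == 7 and ncached_max >> lg_fill_div == 1. */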
@ -86,7 +86,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
void *
tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
tcache_bin_t *tbin, szind_t binind, bool *tcache_success) {
cache_bin_t *tbin, szind_t binind, bool *tcache_success) {
void *ret;
assert(tcache->arena != NULL);
@ -95,18 +95,18 @@ tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
if (config_prof) {
tcache->prof_accumbytes = 0;
}
ret = tcache_alloc_easy(tbin, tcache_success);
ret = cache_bin_alloc_easy(tbin, tcache_success);
return ret;
}
void
tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
szind_t binind, unsigned rem) {
bool merged_stats = false;
assert(binind < NBINS);
assert(rem <= tbin->ncached);
assert((cache_bin_sz_t)rem <= tbin->ncached);
arena_t *arena = tcache->arena;
assert(arena != NULL);
@ -121,7 +121,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
/* Lock the arena bin associated with the first object. */
extent_t *extent = item_extent[0];
arena_t *bin_arena = extent_arena_get(extent);
arena_bin_t *bin = &bin_arena->bins[binind];
bin_t *bin = &bin_arena->bins[binind];
if (config_prof && bin_arena == arena) {
if (arena_prof_accum(tsd_tsdn(tsd), arena,
@ -169,7 +169,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
* The flush loop didn't happen to flush to this thread's
* arena, so the stats didn't get merged. Manually do so now.
*/
arena_bin_t *bin = &arena->bins[binind];
bin_t *bin = &arena->bins[binind];
malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
bin->stats.nflushes++;
bin->stats.nrequests += tbin->tstats.nrequests;
@ -180,18 +180,18 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
sizeof(void *));
tbin->ncached = rem;
if ((low_water_t)tbin->ncached < tbin->low_water) {
if (tbin->ncached < tbin->low_water) {
tbin->low_water = tbin->ncached;
}
}
void
tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind,
unsigned rem, tcache_t *tcache) {
bool merged_stats = false;
assert(binind < nhbins);
assert(rem <= tbin->ncached);
assert((cache_bin_sz_t)rem <= tbin->ncached);
arena_t *arena = tcache->arena;
assert(arena != NULL);
@ -278,7 +278,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
sizeof(void *));
tbin->ncached = rem;
if ((low_water_t)tbin->ncached < tbin->low_water) {
if (tbin->ncached < tbin->low_water) {
tbin->low_water = tbin->ncached;
}
}
@ -291,8 +291,15 @@ tcache_arena_associate(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) {
if (config_stats) {
/* Link into list of extant tcaches. */
malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx);
ql_elm_new(tcache, link);
ql_tail_insert(&arena->tcache_ql, tcache, link);
cache_bin_array_descriptor_init(
&tcache->cache_bin_array_descriptor, tcache->bins_small,
tcache->bins_large);
ql_tail_insert(&arena->cache_bin_array_descriptor_ql,
&tcache->cache_bin_array_descriptor, link);
malloc_mutex_unlock(tsdn, &arena->tcache_ql_mtx);
}
}
@ -316,6 +323,8 @@ tcache_arena_dissociate(tsdn_t *tsdn, tcache_t *tcache) {
assert(in_ql);
}
ql_remove(&arena->tcache_ql, tcache, link);
ql_remove(&arena->cache_bin_array_descriptor_ql,
&tcache->cache_bin_array_descriptor, link);
tcache_stats_merge(tsdn, tcache, arena);
malloc_mutex_unlock(tsdn, &arena->tcache_ql_mtx);
}
@ -354,8 +363,8 @@ tcache_init(tsd_t *tsd, tcache_t *tcache, void *avail_stack) {
size_t stack_offset = 0;
assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0);
memset(tcache->tbins_small, 0, sizeof(tcache_bin_t) * NBINS);
memset(tcache->tbins_large, 0, sizeof(tcache_bin_t) * (nhbins - NBINS));
memset(tcache->bins_small, 0, sizeof(cache_bin_t) * NBINS);
memset(tcache->bins_large, 0, sizeof(cache_bin_t) * (nhbins - NBINS));
unsigned i = 0;
for (; i < NBINS; i++) {
tcache->lg_fill_div[i] = 1;
@ -450,7 +459,7 @@ tcache_flush_cache(tsd_t *tsd, tcache_t *tcache) {
assert(tcache->arena != NULL);
for (unsigned i = 0; i < NBINS; i++) {
tcache_bin_t *tbin = tcache_small_bin_get(tcache, i);
cache_bin_t *tbin = tcache_small_bin_get(tcache, i);
tcache_bin_flush_small(tsd, tcache, tbin, i, 0);
if (config_stats) {
@ -458,7 +467,7 @@ tcache_flush_cache(tsd_t *tsd, tcache_t *tcache) {
}
}
for (unsigned i = NBINS; i < nhbins; i++) {
tcache_bin_t *tbin = tcache_large_bin_get(tcache, i);
cache_bin_t *tbin = tcache_large_bin_get(tcache, i);
tcache_bin_flush_large(tsd, tbin, i, 0, tcache);
if (config_stats) {
@ -524,8 +533,8 @@ tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) {
/* Merge and reset tcache stats. */
for (i = 0; i < NBINS; i++) {
arena_bin_t *bin = &arena->bins[i];
tcache_bin_t *tbin = tcache_small_bin_get(tcache, i);
bin_t *bin = &arena->bins[i];
cache_bin_t *tbin = tcache_small_bin_get(tcache, i);
malloc_mutex_lock(tsdn, &bin->lock);
bin->stats.nrequests += tbin->tstats.nrequests;
malloc_mutex_unlock(tsdn, &bin->lock);
@ -533,7 +542,7 @@ tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) {
}
for (; i < nhbins; i++) {
tcache_bin_t *tbin = tcache_large_bin_get(tcache, i);
cache_bin_t *tbin = tcache_large_bin_get(tcache, i);
arena_stats_large_nrequests_add(tsdn, &arena->stats, i,
tbin->tstats.nrequests);
tbin->tstats.nrequests = 0;
@ -657,21 +666,21 @@ tcache_boot(tsdn_t *tsdn) {
nhbins = sz_size2index(tcache_maxclass) + 1;
/* Initialize tcache_bin_info. */
tcache_bin_info = (tcache_bin_info_t *)base_alloc(tsdn, b0get(), nhbins
* sizeof(tcache_bin_info_t), CACHELINE);
tcache_bin_info = (cache_bin_info_t *)base_alloc(tsdn, b0get(), nhbins
* sizeof(cache_bin_info_t), CACHELINE);
if (tcache_bin_info == NULL) {
return true;
}
stack_nelms = 0;
unsigned i;
for (i = 0; i < NBINS; i++) {
if ((arena_bin_info[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MIN) {
if ((bin_infos[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MIN) {
tcache_bin_info[i].ncached_max =
TCACHE_NSLOTS_SMALL_MIN;
} else if ((arena_bin_info[i].nregs << 1) <=
} else if ((bin_infos[i].nregs << 1) <=
TCACHE_NSLOTS_SMALL_MAX) {
tcache_bin_info[i].ncached_max =
(arena_bin_info[i].nregs << 1);
(bin_infos[i].nregs << 1);
} else {
tcache_bin_info[i].ncached_max =
TCACHE_NSLOTS_SMALL_MAX;

View File

@ -71,6 +71,16 @@ tsd_data_init(tsd_t *tsd) {
*/
rtree_ctx_data_init(tsd_rtree_ctxp_get_unsafe(tsd));
/*
* A nondeterministic seed based on the address of tsd reduces
* the likelihood of lockstep non-uniform cache index
* utilization among identical concurrent processes, but at the
* cost of test repeatability. For debug builds, instead use a
* deterministic seed.
*/
*tsd_offset_statep_get(tsd) = config_debug ? 0 :
(uint64_t)(uintptr_t)tsd;
return tsd_tcache_enabled_data_init(tsd);
}

View File

@ -2,10 +2,10 @@
.PATH: ${LIBC_SRCTOP}/stdlib/jemalloc
JEMALLOCSRCS:= jemalloc.c arena.c background_thread.c base.c bitmap.c ckh.c \
ctl.c extent.c extent_dss.c extent_mmap.c hash.c hooks.c large.c \
malloc_io.c mutex.c mutex_pool.c nstime.c pages.c prng.c prof.c \
rtree.c stats.c sz.c tcache.c ticker.c tsd.c witness.c
JEMALLOCSRCS:= jemalloc.c arena.c background_thread.c base.c bin.c bitmap.c \
ckh.c ctl.c div.c extent.c extent_dss.c extent_mmap.c hash.c hooks.c \
large.c log.c malloc_io.c mutex.c mutex_pool.c nstime.c pages.c \
prng.c prof.c rtree.c stats.c sz.c tcache.c ticker.c tsd.c witness.c
SYM_MAPS+=${LIBC_SRCTOP}/stdlib/jemalloc/Symbol.map