Merge llvm, clang, lld, lldb, compiler-rt and libc++ r303197, and update build glue.
Dimitry Andric 2017-05-16 21:50:29 +00:00
commit 5517e702c0
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/projects/clang500-import/; revision=318384
1214 changed files with 23778 additions and 11589 deletions

View File

@ -1,4 +1,4 @@
//===-- xray_interface.h ----------------------------------------*- C++ -*-===//
//===- xray_interface.h -----------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@ -11,11 +11,12 @@
//
// APIs for controlling XRay functionality explicitly.
//===----------------------------------------------------------------------===//
#ifndef XRAY_XRAY_INTERFACE_H
#define XRAY_XRAY_INTERFACE_H
#include <cstddef>
#include <cstdint>
#include <stddef.h>
extern "C" {
@ -25,6 +26,7 @@ enum XRayEntryType {
EXIT = 1,
TAIL = 2,
LOG_ARGS_ENTRY = 3,
CUSTOM_EVENT = 4,
};
/// Provide a function to invoke for when instrumentation points are hit. This
@ -64,6 +66,9 @@ extern int __xray_set_handler_arg1(void (*)(int32_t, XRayEntryType, uint64_t));
/// Returns 1 on success, 0 on error.
extern int __xray_remove_handler_arg1();
/// Provide a function to invoke when XRay encounters a custom event.
extern int __xray_set_customevent_handler(void (*entry)(void*, std::size_t));
enum XRayPatchingStatus {
NOT_INITIALIZED = 0,
SUCCESS = 1,
@ -96,6 +101,6 @@ extern uintptr_t __xray_function_address(int32_t FuncId);
/// encounter errors (when there are no instrumented functions, etc.).
extern size_t __xray_max_function_id();
}
} // end extern "C"
#endif
#endif // XRAY_XRAY_INTERFACE_H
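
To illustrate the new custom event hook added above, here is a minimal sketch of installing a handler. The <xray/xray_interface.h> include path and the surrounding program are assumptions; only __xray_set_customevent_handler() and its signature come from the header in this diff.

#include <cstdio>
#include <xray/xray_interface.h>  // assumed install location of the header above

// Handler the runtime will call for each custom event record.
static void HandleCustomEvent(void *Event, std::size_t Size) {
  std::fprintf(stderr, "xray: custom event, %zu bytes at %p\n", Size, Event);
}

int main() {
  // Like the other __xray_set_* entry points, returns 1 on success, 0 on error.
  if (!__xray_set_customevent_handler(HandleCustomEvent))
    std::fprintf(stderr, "xray: failed to install custom event handler\n");
  // ... run XRay-instrumented code here ...
  return 0;
}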

View File

@ -161,10 +161,17 @@ typedef FlatByteMap<kNumRegions> ByteMap;
typedef TwoLevelByteMap<(kNumRegions >> 12), 1 << 12> ByteMap;
# endif
typedef CompactSizeClassMap SizeClassMap;
typedef SizeClassAllocator32<0, SANITIZER_MMAP_RANGE_SIZE, 16,
SizeClassMap, kRegionSizeLog,
ByteMap,
AsanMapUnmapCallback> PrimaryAllocator;
struct AP32 {
static const uptr kSpaceBeg = 0;
static const u64 kSpaceSize = SANITIZER_MMAP_RANGE_SIZE;
static const uptr kMetadataSize = 16;
typedef __asan::SizeClassMap SizeClassMap;
static const uptr kRegionSizeLog = __asan::kRegionSizeLog;
typedef __asan::ByteMap ByteMap;
typedef AsanMapUnmapCallback MapUnmapCallback;
static const uptr kFlags = 0;
};
typedef SizeClassAllocator32<AP32> PrimaryAllocator;
#endif // SANITIZER_CAN_USE_ALLOCATOR64
static const uptr kNumberOfSizeClasses = SizeClassMap::kNumClasses;

View File

@ -194,6 +194,10 @@ void InitializeFlags() {
Report("WARNING: strchr* interceptors are enabled even though "
"replace_str=0. Use intercept_strchr=0 to disable them.");
}
if (!f->replace_str && common_flags()->intercept_strndup) {
Report("WARNING: strndup* interceptors are enabled even though "
"replace_str=0. Use intercept_strndup=0 to disable them.");
}
}
} // namespace __asan

View File

@ -15,8 +15,13 @@
#define DOUBLE_PRECISION
#include "fp_add_impl.inc"
ARM_EABI_FNALIAS(dadd, adddf3)
COMPILER_RT_ABI double __adddf3(double a, double b){
return __addXf3__(a, b);
}
#if defined(__ARM_EABI__)
AEABI_RTABI double __aeabi_dadd(double a, double b) {
return __adddf3(a, b);
}
#endif

View File

@ -15,8 +15,13 @@
#define SINGLE_PRECISION
#include "fp_add_impl.inc"
ARM_EABI_FNALIAS(fadd, addsf3)
COMPILER_RT_ABI float __addsf3(float a, float b) {
return __addXf3__(a, b);
}
#if defined(__ARM_EABI__)
AEABI_RTABI float __aeabi_fadd(float a, float b) {
return __addsf3(a, b);
}
#endif

View File

@ -8,9 +8,9 @@
//===----------------------------------------------------------------------===//
#include <stdint.h>
#include "../int_lib.h"
__attribute__((pcs("aapcs")))
__attribute__((visibility("hidden")))
AEABI_RTABI __attribute__((visibility("hidden")))
int __aeabi_cdcmpeq_check_nan(double a, double b) {
return __builtin_isnan(a) || __builtin_isnan(b);
}

View File

@ -8,9 +8,9 @@
//===----------------------------------------------------------------------===//
#include <stdint.h>
#include "../int_lib.h"
__attribute__((pcs("aapcs")))
__attribute__((visibility("hidden")))
AEABI_RTABI __attribute__((visibility("hidden")))
int __aeabi_cfcmpeq_check_nan(float a, float b) {
return __builtin_isnan(a) || __builtin_isnan(b);
}

View File

@ -26,16 +26,18 @@
* line.
*/
#include "../int_lib.h"
/* provide an unused declaration to pacify pedantic compilation */
extern unsigned char declaration;
#if defined(__ARM_EABI__)
int __attribute__((weak)) __attribute__((visibility("hidden")))
AEABI_RTABI int __attribute__((weak)) __attribute__((visibility("hidden")))
__aeabi_idiv0(int return_value) {
return return_value;
}
long long __attribute__((weak)) __attribute__((visibility("hidden")))
AEABI_RTABI long long __attribute__((weak)) __attribute__((visibility("hidden")))
__aeabi_ldiv0(long long return_value) {
return return_value;
}

View File

@ -10,10 +10,10 @@
#define DOUBLE_PRECISION
#include "../fp_lib.h"
COMPILER_RT_ABI fp_t
AEABI_RTABI fp_t
__aeabi_dsub(fp_t, fp_t);
COMPILER_RT_ABI fp_t
AEABI_RTABI fp_t
__aeabi_drsub(fp_t a, fp_t b) {
return __aeabi_dsub(b, a);
}

View File

@ -10,10 +10,10 @@
#define SINGLE_PRECISION
#include "../fp_lib.h"
COMPILER_RT_ABI fp_t
AEABI_RTABI fp_t
__aeabi_fsub(fp_t, fp_t);
COMPILER_RT_ABI fp_t
AEABI_RTABI fp_t
__aeabi_frsub(fp_t a, fp_t b) {
return __aeabi_fsub(b, a);
}

View File

@ -18,8 +18,6 @@
/* Precondition: 0 <= b < bits_in_dword */
ARM_EABI_FNALIAS(llsl, ashldi3)
COMPILER_RT_ABI di_int
__ashldi3(di_int a, si_int b)
{
@ -41,3 +39,10 @@ __ashldi3(di_int a, si_int b)
}
return result.all;
}
#if defined(__ARM_EABI__)
AEABI_RTABI di_int __aeabi_llsl(di_int a, si_int b) {
return __ashldi3(a, b);
}
#endif

View File

@ -18,8 +18,6 @@
/* Precondition: 0 <= b < bits_in_dword */
ARM_EABI_FNALIAS(lasr, ashrdi3)
COMPILER_RT_ABI di_int
__ashrdi3(di_int a, si_int b)
{
@ -42,3 +40,10 @@ __ashrdi3(di_int a, si_int b)
}
return result.all;
}
#if defined(__ARM_EABI__)
AEABI_RTABI di_int __aeabi_lasr(di_int a, si_int b) {
return __ashrdi3(a, b);
}
#endif

View File

@ -44,7 +44,8 @@
#endif
#define CONST_SECTION .section .rodata
#if defined(__GNU__) || defined(__ANDROID__) || defined(__FreeBSD__)
#if defined(__GNU__) || defined(__FreeBSD__) || defined(__Fuchsia__) || \
defined(__linux__)
#define NO_EXEC_STACK_DIRECTIVE .section .note.GNU-stack,"",%progbits
#else
#define NO_EXEC_STACK_DIRECTIVE

View File

@ -113,8 +113,6 @@ __gedf2(fp_t a, fp_t b) {
}
}
ARM_EABI_FNALIAS(dcmpun, unorddf2)
COMPILER_RT_ABI int
__unorddf2(fp_t a, fp_t b) {
const rep_t aAbs = toRep(a) & absMask;
@ -144,3 +142,9 @@ __gtdf2(fp_t a, fp_t b) {
return __gedf2(a, b);
}
#if defined(__ARM_EABI__)
AEABI_RTABI int __aeabi_dcmpun(fp_t a, fp_t b) {
return __unorddf2(a, b);
}
#endif

View File

@ -113,8 +113,6 @@ __gesf2(fp_t a, fp_t b) {
}
}
ARM_EABI_FNALIAS(fcmpun, unordsf2)
COMPILER_RT_ABI int
__unordsf2(fp_t a, fp_t b) {
const rep_t aAbs = toRep(a) & absMask;
@ -143,3 +141,10 @@ COMPILER_RT_ABI enum GE_RESULT
__gtsf2(fp_t a, fp_t b) {
return __gesf2(a, b);
}
#if defined(__ARM_EABI__)
AEABI_RTABI int __aeabi_fcmpun(fp_t a, fp_t b) {
return __unordsf2(a, b);
}
#endif

View File

@ -19,8 +19,6 @@
#define DOUBLE_PRECISION
#include "fp_lib.h"
ARM_EABI_FNALIAS(ddiv, divdf3)
COMPILER_RT_ABI fp_t
__divdf3(fp_t a, fp_t b) {
@ -183,3 +181,10 @@ __divdf3(fp_t a, fp_t b) {
return result;
}
}
#if defined(__ARM_EABI__)
AEABI_RTABI fp_t __aeabi_ddiv(fp_t a, fp_t b) {
return __divdf3(a, b);
}
#endif

View File

@ -19,8 +19,6 @@
#define SINGLE_PRECISION
#include "fp_lib.h"
ARM_EABI_FNALIAS(fdiv, divsf3)
COMPILER_RT_ABI fp_t
__divsf3(fp_t a, fp_t b) {
@ -167,3 +165,10 @@ __divsf3(fp_t a, fp_t b) {
return fromRep(absResult | quotientSign);
}
}
#if defined(__ARM_EABI__)
AEABI_RTABI fp_t __aeabi_fdiv(fp_t a, fp_t b) {
return __divsf3(a, b);
}
#endif

View File

@ -16,8 +16,6 @@
/* Returns: a / b */
ARM_EABI_FNALIAS(idiv, divsi3)
COMPILER_RT_ABI si_int
__divsi3(si_int a, si_int b)
{
@ -35,3 +33,10 @@ __divsi3(si_int a, si_int b)
*/
return ((su_int)a/(su_int)b ^ s_a) - s_a; /* negate if s_a == -1 */
}
#if defined(__ARM_EABI__)
AEABI_RTABI si_int __aeabi_idiv(si_int a, si_int b) {
return __divsi3(a, b);
}
#endif

View File

@ -12,8 +12,6 @@
#define DST_SINGLE
#include "fp_extend_impl.inc"
ARM_EABI_FNALIAS(h2f, extendhfsf2)
// Use a forwarding definition and noinline to implement a poor man's alias,
// as there isn't a good cross-platform way of defining one.
COMPILER_RT_ABI NOINLINE float __extendhfsf2(uint16_t a) {
@ -23,3 +21,10 @@ COMPILER_RT_ABI NOINLINE float __extendhfsf2(uint16_t a) {
COMPILER_RT_ABI float __gnu_h2f_ieee(uint16_t a) {
return __extendhfsf2(a);
}
#if defined(__ARM_EABI__)
AEABI_RTABI float __aeabi_h2f(uint16_t a) {
return __extendhfsf2(a);
}
#endif

View File

@ -12,8 +12,13 @@
#define DST_DOUBLE
#include "fp_extend_impl.inc"
ARM_EABI_FNALIAS(f2d, extendsfdf2)
COMPILER_RT_ABI double __extendsfdf2(float a) {
return __extendXfYf2__(a);
}
#if defined(__ARM_EABI__)
AEABI_RTABI double __aeabi_f2d(float a) {
return __extendsfdf2(a);
}
#endif

View File

@ -10,7 +10,6 @@
#define DOUBLE_PRECISION
#include "fp_lib.h"
ARM_EABI_FNALIAS(d2lz, fixdfdi)
#ifndef __SOFT_FP__
/* Support for systems that have hardware floating-point; can set the invalid
@ -44,3 +43,15 @@ __fixdfdi(fp_t a) {
}
#endif
#if defined(__ARM_EABI__)
AEABI_RTABI di_int
#if defined(__SOFT_FP__)
__aeabi_d2lz(fp_t a) {
#else
__aeabi_d2lz(double a) {
#endif
return __fixdfdi(a);
}
#endif

View File

@ -14,9 +14,14 @@ typedef si_int fixint_t;
typedef su_int fixuint_t;
#include "fp_fixint_impl.inc"
ARM_EABI_FNALIAS(d2iz, fixdfsi)
COMPILER_RT_ABI si_int
__fixdfsi(fp_t a) {
return __fixint(a);
}
#if defined(__ARM_EABI__)
AEABI_RTABI si_int __aeabi_d2iz(fp_t a) {
return __fixdfsi(a);
}
#endif

View File

@ -11,8 +11,6 @@
#define SINGLE_PRECISION
#include "fp_lib.h"
ARM_EABI_FNALIAS(f2lz, fixsfdi)
#ifndef __SOFT_FP__
/* Support for systems that have hardware floating-point; can set the invalid
* flag as a side-effect of computation.
@ -45,3 +43,15 @@ __fixsfdi(fp_t a) {
}
#endif
#if defined(__ARM_EABI__)
AEABI_RTABI di_int
#if defined(__SOFT_FP__)
__aeabi_f2lz(fp_t a) {
#else
__aeabi_f2lz(float a) {
#endif
return __fixsfdi(a);
}
#endif

View File

@ -14,9 +14,14 @@ typedef si_int fixint_t;
typedef su_int fixuint_t;
#include "fp_fixint_impl.inc"
ARM_EABI_FNALIAS(f2iz, fixsfsi)
COMPILER_RT_ABI si_int
__fixsfsi(fp_t a) {
return __fixint(a);
}
#if defined(__ARM_EABI__)
AEABI_RTABI si_int __aeabi_f2iz(fp_t a) {
return __fixsfsi(a);
}
#endif

View File

@ -11,8 +11,6 @@
#define DOUBLE_PRECISION
#include "fp_lib.h"
ARM_EABI_FNALIAS(d2ulz, fixunsdfdi)
#ifndef __SOFT_FP__
/* Support for systems that have hardware floating-point; can set the invalid
* flag as a side-effect of computation.
@ -42,3 +40,15 @@ __fixunsdfdi(fp_t a) {
}
#endif
#if defined(__ARM_EABI__)
AEABI_RTABI du_int
#if defined(__SOFT_FP__)
__aeabi_d2ulz(fp_t a) {
#else
__aeabi_d2ulz(double a) {
#endif
return __fixunsdfdi(a);
}
#endif

View File

@ -13,9 +13,14 @@
typedef su_int fixuint_t;
#include "fp_fixuint_impl.inc"
ARM_EABI_FNALIAS(d2uiz, fixunsdfsi)
COMPILER_RT_ABI su_int
__fixunsdfsi(fp_t a) {
return __fixuint(a);
}
#if defined(__ARM_EABI__)
AEABI_RTABI su_int __aeabi_d2uiz(fp_t a) {
return __fixunsdfsi(a);
}
#endif

View File

@ -11,8 +11,6 @@
#define SINGLE_PRECISION
#include "fp_lib.h"
ARM_EABI_FNALIAS(f2ulz, fixunssfdi)
#ifndef __SOFT_FP__
/* Support for systems that have hardware floating-point; can set the invalid
* flag as a side-effect of computation.
@ -43,3 +41,15 @@ __fixunssfdi(fp_t a) {
}
#endif
#if defined(__ARM_EABI__)
AEABI_RTABI du_int
#if defined(__SOFT_FP__)
__aeabi_f2ulz(fp_t a) {
#else
__aeabi_f2ulz(float a) {
#endif
return __fixunssfdi(a);
}
#endif

View File

@ -17,9 +17,14 @@
typedef su_int fixuint_t;
#include "fp_fixuint_impl.inc"
ARM_EABI_FNALIAS(f2uiz, fixunssfsi)
COMPILER_RT_ABI su_int
__fixunssfsi(fp_t a) {
return __fixuint(a);
}
#if defined(__ARM_EABI__)
AEABI_RTABI su_int __aeabi_f2uiz(fp_t a) {
return __fixunssfsi(a);
}
#endif

View File

@ -22,8 +22,6 @@
/* seee eeee eeee mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm */
ARM_EABI_FNALIAS(l2d, floatdidf)
#ifndef __SOFT_FP__
/* Support for systems that have hardware floating-point; we'll set the inexact flag
* as a side-effect of this computation.
@ -105,3 +103,10 @@ __floatdidf(di_int a)
return fb.f;
}
#endif
#if defined(__ARM_EABI__)
AEABI_RTABI double __aeabi_l2d(di_int a) {
return __floatdidf(a);
}
#endif

View File

@ -22,8 +22,6 @@
#include "int_lib.h"
ARM_EABI_FNALIAS(l2f, floatdisf)
COMPILER_RT_ABI float
__floatdisf(di_int a)
{
@ -78,3 +76,10 @@ __floatdisf(di_int a)
((su_int)a & 0x007FFFFF); /* mantissa */
return fb.f;
}
#if defined(__ARM_EABI__)
AEABI_RTABI float __aeabi_l2f(di_int a) {
return __floatdisf(a);
}
#endif

View File

@ -18,8 +18,6 @@
#include "int_lib.h"
ARM_EABI_FNALIAS(i2d, floatsidf)
COMPILER_RT_ABI fp_t
__floatsidf(int a) {
@ -51,3 +49,10 @@ __floatsidf(int a) {
// Insert the sign bit and return
return fromRep(result | sign);
}
#if defined(__ARM_EABI__)
AEABI_RTABI fp_t __aeabi_i2d(int a) {
return __floatsidf(a);
}
#endif

View File

@ -18,8 +18,6 @@
#include "int_lib.h"
ARM_EABI_FNALIAS(i2f, floatsisf)
COMPILER_RT_ABI fp_t
__floatsisf(int a) {
@ -57,3 +55,10 @@ __floatsisf(int a) {
// Insert the sign bit and return
return fromRep(result | sign);
}
#if defined(__ARM_EABI__)
AEABI_RTABI fp_t __aeabi_i2f(int a) {
return __floatsisf(a);
}
#endif

View File

@ -22,8 +22,6 @@
#include "int_lib.h"
ARM_EABI_FNALIAS(ul2d, floatundidf)
#ifndef __SOFT_FP__
/* Support for systems that have hardware floating-point; we'll set the inexact flag
* as a side-effect of this computation.
@ -104,3 +102,10 @@ __floatundidf(du_int a)
return fb.f;
}
#endif
#if defined(__ARM_EABI__)
AEABI_RTABI double __aeabi_ul2d(du_int a) {
return __floatundidf(a);
}
#endif

View File

@ -22,8 +22,6 @@
#include "int_lib.h"
ARM_EABI_FNALIAS(ul2f, floatundisf)
COMPILER_RT_ABI float
__floatundisf(du_int a)
{
@ -75,3 +73,10 @@ __floatundisf(du_int a)
((su_int)a & 0x007FFFFF); /* mantissa */
return fb.f;
}
#if defined(__ARM_EABI__)
AEABI_RTABI float __aeabi_ul2f(du_int a) {
return __floatundisf(a);
}
#endif

View File

@ -18,8 +18,6 @@
#include "int_lib.h"
ARM_EABI_FNALIAS(ui2d, floatunsidf)
COMPILER_RT_ABI fp_t
__floatunsidf(unsigned int a) {
@ -40,3 +38,10 @@ __floatunsidf(unsigned int a) {
result += (rep_t)(exponent + exponentBias) << significandBits;
return fromRep(result);
}
#if defined(__ARM_EABI__)
AEABI_RTABI fp_t __aeabi_ui2d(unsigned int a) {
return __floatunsidf(a);
}
#endif

View File

@ -18,8 +18,6 @@
#include "int_lib.h"
ARM_EABI_FNALIAS(ui2f, floatunsisf)
COMPILER_RT_ABI fp_t
__floatunsisf(unsigned int a) {
@ -48,3 +46,10 @@ __floatunsisf(unsigned int a) {
result += (rep_t)(exponent + exponentBias) << significandBits;
return fromRep(result);
}
#if defined(__ARM_EABI__)
AEABI_RTABI fp_t __aeabi_ui2f(unsigned int a) {
return __floatunsisf(a);
}
#endif

View File

@ -30,20 +30,19 @@
/* ABI macro definitions */
#if __ARM_EABI__
# define ARM_EABI_FNALIAS(aeabi_name, name) \
void __aeabi_##aeabi_name() __attribute__((alias("__" #name)));
# if defined(COMPILER_RT_ARMHF_TARGET) || (!defined(__clang__) && \
defined(__GNUC__) && (__GNUC__ < 4 || __GNUC__ == 4 && __GNUC_MINOR__ < 5))
/* The pcs attribute was introduced in GCC 4.5.0 */
# define COMPILER_RT_ABI
# else
# define COMPILER_RT_ABI __attribute__((pcs("aapcs")))
# define COMPILER_RT_ABI __attribute__((__pcs__("aapcs")))
# endif
#else
# define ARM_EABI_FNALIAS(aeabi_name, name)
# define COMPILER_RT_ABI
#endif
#define AEABI_RTABI __attribute__((__pcs__("aapcs")))
#ifdef _MSC_VER
#define ALWAYS_INLINE __forceinline
#define NOINLINE __declspec(noinline)

View File

@ -18,8 +18,6 @@
/* Precondition: 0 <= b < bits_in_dword */
ARM_EABI_FNALIAS(llsr, lshrdi3)
COMPILER_RT_ABI di_int
__lshrdi3(di_int a, si_int b)
{
@ -41,3 +39,10 @@ __lshrdi3(di_int a, si_int b)
}
return result.all;
}
#if defined(__ARM_EABI__)
AEABI_RTABI di_int __aeabi_llsr(di_int a, si_int b) {
return __lshrdi3(a, b);
}
#endif

View File

@ -15,8 +15,13 @@
#define DOUBLE_PRECISION
#include "fp_mul_impl.inc"
ARM_EABI_FNALIAS(dmul, muldf3)
COMPILER_RT_ABI fp_t __muldf3(fp_t a, fp_t b) {
return __mulXf3__(a, b);
}
#if defined(__ARM_EABI__)
AEABI_RTABI fp_t __aeabi_dmul(fp_t a, fp_t b) {
return __muldf3(a, b);
}
#endif

View File

@ -40,8 +40,6 @@ __muldsi3(su_int a, su_int b)
/* Returns: a * b */
ARM_EABI_FNALIAS(lmul, muldi3)
COMPILER_RT_ABI di_int
__muldi3(di_int a, di_int b)
{
@ -54,3 +52,10 @@ __muldi3(di_int a, di_int b)
r.s.high += x.s.high * y.s.low + x.s.low * y.s.high;
return r.all;
}
#if defined(__ARM_EABI__)
AEABI_RTABI di_int __aeabi_lmul(di_int a, di_int b) {
return __muldi3(a, b);
}
#endif

View File

@ -15,8 +15,13 @@
#define SINGLE_PRECISION
#include "fp_mul_impl.inc"
ARM_EABI_FNALIAS(fmul, mulsf3)
COMPILER_RT_ABI fp_t __mulsf3(fp_t a, fp_t b) {
return __mulXf3__(a, b);
}
#if defined(__ARM_EABI__)
AEABI_RTABI fp_t __aeabi_fmul(fp_t a, fp_t b) {
return __mulsf3(a, b);
}
#endif

View File

@ -14,9 +14,14 @@
#define DOUBLE_PRECISION
#include "fp_lib.h"
ARM_EABI_FNALIAS(dneg, negdf2)
COMPILER_RT_ABI fp_t
__negdf2(fp_t a) {
return fromRep(toRep(a) ^ signBit);
}
#if defined(__ARM_EABI__)
AEABI_RTABI fp_t __aeabi_dneg(fp_t a) {
return __negdf2(a);
}
#endif

View File

@ -14,9 +14,14 @@
#define SINGLE_PRECISION
#include "fp_lib.h"
ARM_EABI_FNALIAS(fneg, negsf2)
COMPILER_RT_ABI fp_t
__negsf2(fp_t a) {
return fromRep(toRep(a) ^ signBit);
}
#if defined(__ARM_EABI__)
AEABI_RTABI fp_t __aeabi_fneg(fp_t a) {
return __negsf2(a);
}
#endif

View File

@ -15,11 +15,15 @@
#define DOUBLE_PRECISION
#include "fp_lib.h"
ARM_EABI_FNALIAS(dsub, subdf3)
// Subtraction; flip the sign bit of b and add.
COMPILER_RT_ABI fp_t
__subdf3(fp_t a, fp_t b) {
return __adddf3(a, fromRep(toRep(b) ^ signBit));
}
#if defined(__ARM_EABI__)
AEABI_RTABI fp_t __aeabi_dsub(fp_t a, fp_t b) {
return __subdf3(a, b);
}
#endif

View File

@ -15,11 +15,15 @@
#define SINGLE_PRECISION
#include "fp_lib.h"
ARM_EABI_FNALIAS(fsub, subsf3)
// Subtraction; flip the sign bit of b and add.
COMPILER_RT_ABI fp_t
__subsf3(fp_t a, fp_t b) {
return __addsf3(a, fromRep(toRep(b) ^ signBit));
}
#if defined(__ARM_EABI__)
AEABI_RTABI fp_t __aeabi_fsub(fp_t a, fp_t b) {
return __subsf3(a, b);
}
#endif

View File

@ -11,8 +11,13 @@
#define DST_HALF
#include "fp_trunc_impl.inc"
ARM_EABI_FNALIAS(d2h, truncdfhf2)
COMPILER_RT_ABI uint16_t __truncdfhf2(double a) {
return __truncXfYf2__(a);
}
#if defined(__ARM_EABI__)
AEABI_RTABI uint16_t __aeabi_d2h(double a) {
return __truncdfhf2(a);
}
#endif

View File

@ -11,8 +11,13 @@
#define DST_SINGLE
#include "fp_trunc_impl.inc"
ARM_EABI_FNALIAS(d2f, truncdfsf2)
COMPILER_RT_ABI float __truncdfsf2(double a) {
return __truncXfYf2__(a);
}
#if defined(__ARM_EABI__)
AEABI_RTABI float __aeabi_d2f(double a) {
return __truncdfsf2(a);
}
#endif

View File

@ -11,8 +11,6 @@
#define DST_HALF
#include "fp_trunc_impl.inc"
ARM_EABI_FNALIAS(f2h, truncsfhf2)
// Use a forwarding definition and noinline to implement a poor man's alias,
// as there isn't a good cross-platform way of defining one.
COMPILER_RT_ABI NOINLINE uint16_t __truncsfhf2(float a) {
@ -22,3 +20,10 @@ COMPILER_RT_ABI NOINLINE uint16_t __truncsfhf2(float a) {
COMPILER_RT_ABI uint16_t __gnu_f2h_ieee(float a) {
return __truncsfhf2(a);
}
#if defined(__ARM_EABI__)
AEABI_RTABI uint16_t __aeabi_f2h(float a) {
return __truncsfhf2(a);
}
#endif

View File

@ -18,8 +18,6 @@
/* Translated from Figure 3-40 of The PowerPC Compiler Writer's Guide */
ARM_EABI_FNALIAS(uidiv, udivsi3)
/* This function should not call __divsi3! */
COMPILER_RT_ABI su_int
__udivsi3(su_int n, su_int d)
@ -64,3 +62,10 @@ __udivsi3(su_int n, su_int d)
q = (q << 1) | carry;
return q;
}
#if defined(__ARM_EABI__)
AEABI_RTABI su_int __aeabi_uidiv(su_int n, su_int d) {
return __udivsi3(n, d);
}
#endif

View File

@ -31,6 +31,8 @@ using namespace __esan; // NOLINT
// Get the per-platform defines for what is possible to intercept
#include "sanitizer_common/sanitizer_platform_interceptors.h"
DECLARE_REAL_AND_INTERCEPTOR(void *, malloc, uptr)
// TODO(bruening): tsan disables several interceptors (getpwent, etc.) claiming
// that interception is a perf hit: should we do the same?

View File

@ -55,10 +55,18 @@ struct ChunkMetadata {
static const uptr kRegionSizeLog = 20;
static const uptr kNumRegions = SANITIZER_MMAP_RANGE_SIZE >> kRegionSizeLog;
typedef TwoLevelByteMap<(kNumRegions >> 12), 1 << 12> ByteMap;
typedef CompactSizeClassMap SizeClassMap;
typedef SizeClassAllocator32<0, SANITIZER_MMAP_RANGE_SIZE,
sizeof(ChunkMetadata), SizeClassMap, kRegionSizeLog, ByteMap>
PrimaryAllocator;
struct AP32 {
static const uptr kSpaceBeg = 0;
static const u64 kSpaceSize = SANITIZER_MMAP_RANGE_SIZE;
static const uptr kMetadataSize = sizeof(ChunkMetadata);
typedef __sanitizer::CompactSizeClassMap SizeClassMap;
static const uptr kRegionSizeLog = __lsan::kRegionSizeLog;
typedef __lsan::ByteMap ByteMap;
typedef NoOpMapUnmapCallback MapUnmapCallback;
static const uptr kFlags = 0;
};
typedef SizeClassAllocator32<AP32> PrimaryAllocator;
#elif defined(__x86_64__) || defined(__powerpc64__)
struct AP64 { // Allocator64 parameters. Deliberately using a short name.
static const uptr kSpaceBeg = 0x600000000000ULL;

View File

@ -62,8 +62,10 @@ void InitializePlatformSpecificModules() {
return;
}
}
VReport(1, "LeakSanitizer: Dynamic linker not found. "
"TLS will not be handled correctly.\n");
if (linker == nullptr) {
VReport(1, "LeakSanitizer: Dynamic linker not found. "
"TLS will not be handled correctly.\n");
}
}
static int ProcessGlobalRegionsCallback(struct dl_phdr_info *info, size_t size,

View File

@ -144,6 +144,11 @@ void ProcessPlatformSpecificAllocations(Frontier *frontier) {
if (info.user_tag == VM_MEMORY_OS_ALLOC_ONCE) {
ScanRangeForPointers(address, end_address, frontier, "GLOBAL",
kReachable);
// Recursing over the full memory map is very slow, break out
// early if we don't need the full iteration.
if (!flags()->use_root_regions || !root_regions->size())
break;
}
// This additional root region scan is required on Darwin in order to

View File

@ -47,12 +47,18 @@ struct MsanMapUnmapCallback {
static const uptr kRegionSizeLog = 20;
static const uptr kNumRegions = SANITIZER_MMAP_RANGE_SIZE >> kRegionSizeLog;
typedef TwoLevelByteMap<(kNumRegions >> 12), 1 << 12> ByteMap;
typedef CompactSizeClassMap SizeClassMap;
typedef SizeClassAllocator32<0, SANITIZER_MMAP_RANGE_SIZE, sizeof(Metadata),
SizeClassMap, kRegionSizeLog, ByteMap,
MsanMapUnmapCallback> PrimaryAllocator;
struct AP32 {
static const uptr kSpaceBeg = 0;
static const u64 kSpaceSize = SANITIZER_MMAP_RANGE_SIZE;
static const uptr kMetadataSize = sizeof(Metadata);
typedef __sanitizer::CompactSizeClassMap SizeClassMap;
static const uptr kRegionSizeLog = __msan::kRegionSizeLog;
typedef __msan::ByteMap ByteMap;
typedef MsanMapUnmapCallback MapUnmapCallback;
static const uptr kFlags = 0;
};
typedef SizeClassAllocator32<AP32> PrimaryAllocator;
#elif defined(__x86_64__)
#if SANITIZER_LINUX && !defined(MSAN_LINUX_X86_64_OLD_MAPPING)
static const uptr kAllocatorSpace = 0x700000000000ULL;
@ -90,11 +96,18 @@ struct MsanMapUnmapCallback {
static const uptr kRegionSizeLog = 20;
static const uptr kNumRegions = SANITIZER_MMAP_RANGE_SIZE >> kRegionSizeLog;
typedef TwoLevelByteMap<(kNumRegions >> 12), 1 << 12> ByteMap;
typedef CompactSizeClassMap SizeClassMap;
typedef SizeClassAllocator32<0, SANITIZER_MMAP_RANGE_SIZE, sizeof(Metadata),
SizeClassMap, kRegionSizeLog, ByteMap,
MsanMapUnmapCallback> PrimaryAllocator;
struct AP32 {
static const uptr kSpaceBeg = 0;
static const u64 kSpaceSize = SANITIZER_MMAP_RANGE_SIZE;
static const uptr kMetadataSize = sizeof(Metadata);
typedef __sanitizer::CompactSizeClassMap SizeClassMap;
static const uptr kRegionSizeLog = __msan::kRegionSizeLog;
typedef __msan::ByteMap ByteMap;
typedef MsanMapUnmapCallback MapUnmapCallback;
static const uptr kFlags = 0;
};
typedef SizeClassAllocator32<AP32> PrimaryAllocator;
#endif
typedef SizeClassAllocatorLocalCache<PrimaryAllocator> AllocatorCache;
typedef LargeMmapAllocator<MsanMapUnmapCallback> SecondaryAllocator;

View File

@ -341,33 +341,6 @@ INTERCEPTOR(char *, __strdup, char *src) {
#define MSAN_MAYBE_INTERCEPT___STRDUP
#endif
INTERCEPTOR(char *, strndup, char *src, SIZE_T n) {
ENSURE_MSAN_INITED();
GET_STORE_STACK_TRACE;
// On FreeBSD strndup() leverages strnlen().
InterceptorScope interceptor_scope;
SIZE_T copy_size = REAL(strnlen)(src, n);
char *res = REAL(strndup)(src, n);
CopyShadowAndOrigin(res, src, copy_size, &stack);
__msan_unpoison(res + copy_size, 1); // \0
return res;
}
#if !SANITIZER_FREEBSD
INTERCEPTOR(char *, __strndup, char *src, SIZE_T n) {
ENSURE_MSAN_INITED();
GET_STORE_STACK_TRACE;
SIZE_T copy_size = REAL(strnlen)(src, n);
char *res = REAL(__strndup)(src, n);
CopyShadowAndOrigin(res, src, copy_size, &stack);
__msan_unpoison(res + copy_size, 1); // \0
return res;
}
#define MSAN_MAYBE_INTERCEPT___STRNDUP INTERCEPT_FUNCTION(__strndup)
#else
#define MSAN_MAYBE_INTERCEPT___STRNDUP
#endif
INTERCEPTOR(char *, gcvt, double number, SIZE_T ndigit, char *buf) {
ENSURE_MSAN_INITED();
char *res = REAL(gcvt)(number, ndigit, buf);
@ -1371,6 +1344,13 @@ int OnExit() {
return __msan_memcpy(to, from, size); \
}
#define COMMON_INTERCEPTOR_COPY_STRING(ctx, to, from, size) \
do { \
GET_STORE_STACK_TRACE; \
CopyShadowAndOrigin(to, from, size, &stack); \
__msan_unpoison(to + size, 1); \
} while (false)
#include "sanitizer_common/sanitizer_platform_interceptors.h"
#include "sanitizer_common/sanitizer_common_interceptors.inc"
@ -1538,8 +1518,6 @@ void InitializeInterceptors() {
INTERCEPT_FUNCTION(stpcpy); // NOLINT
INTERCEPT_FUNCTION(strdup);
MSAN_MAYBE_INTERCEPT___STRDUP;
INTERCEPT_FUNCTION(strndup);
MSAN_MAYBE_INTERCEPT___STRNDUP;
INTERCEPT_FUNCTION(strncpy); // NOLINT
INTERCEPT_FUNCTION(gcvt);
INTERCEPT_FUNCTION(strcat); // NOLINT

View File

@ -23,21 +23,25 @@ namespace __sanitizer {
// purposes.
typedef CompactSizeClassMap InternalSizeClassMap;
static const uptr kInternalAllocatorSpace = 0;
static const u64 kInternalAllocatorSize = SANITIZER_MMAP_RANGE_SIZE;
static const uptr kInternalAllocatorRegionSizeLog = 20;
#if SANITIZER_WORDSIZE == 32
static const uptr kInternalAllocatorNumRegions =
kInternalAllocatorSize >> kInternalAllocatorRegionSizeLog;
SANITIZER_MMAP_RANGE_SIZE >> kInternalAllocatorRegionSizeLog;
#if SANITIZER_WORDSIZE == 32
typedef FlatByteMap<kInternalAllocatorNumRegions> ByteMap;
#else
static const uptr kInternalAllocatorNumRegions =
kInternalAllocatorSize >> kInternalAllocatorRegionSizeLog;
typedef TwoLevelByteMap<(kInternalAllocatorNumRegions >> 12), 1 << 12> ByteMap;
#endif
typedef SizeClassAllocator32<
kInternalAllocatorSpace, kInternalAllocatorSize, 0, InternalSizeClassMap,
kInternalAllocatorRegionSizeLog, ByteMap> PrimaryInternalAllocator;
struct AP32 {
static const uptr kSpaceBeg = 0;
static const u64 kSpaceSize = SANITIZER_MMAP_RANGE_SIZE;
static const uptr kMetadataSize = 0;
typedef InternalSizeClassMap SizeClassMap;
static const uptr kRegionSizeLog = kInternalAllocatorRegionSizeLog;
typedef __sanitizer::ByteMap ByteMap;
typedef NoOpMapUnmapCallback MapUnmapCallback;
static const uptr kFlags = 0;
};
typedef SizeClassAllocator32<AP32> PrimaryInternalAllocator;
typedef SizeClassAllocatorLocalCache<PrimaryInternalAllocator>
InternalAllocatorCache;

View File

@ -36,13 +36,27 @@ template<class SizeClassAllocator> struct SizeClassAllocator32LocalCache;
//
// In order to avoid false sharing the objects of this class should be
// chache-line aligned.
template <const uptr kSpaceBeg, const u64 kSpaceSize,
const uptr kMetadataSize, class SizeClassMap,
const uptr kRegionSizeLog,
class ByteMap,
class MapUnmapCallback = NoOpMapUnmapCallback>
struct SizeClassAllocator32FlagMasks { // Bit masks.
enum {
kRandomShuffleChunks = 1,
};
};
template <class Params>
class SizeClassAllocator32 {
public:
static const uptr kSpaceBeg = Params::kSpaceBeg;
static const u64 kSpaceSize = Params::kSpaceSize;
static const uptr kMetadataSize = Params::kMetadataSize;
typedef typename Params::SizeClassMap SizeClassMap;
static const uptr kRegionSizeLog = Params::kRegionSizeLog;
typedef typename Params::ByteMap ByteMap;
typedef typename Params::MapUnmapCallback MapUnmapCallback;
static const bool kRandomShuffleChunks =
Params::kFlags & SizeClassAllocator32FlagMasks::kRandomShuffleChunks;
struct TransferBatch {
static const uptr kMaxNumCached = SizeClassMap::kMaxNumCachedHint - 2;
void SetFromArray(uptr region_beg_unused, void *batch[], uptr count) {
@ -86,8 +100,7 @@ class SizeClassAllocator32 {
return SizeClassMap::Size(class_id);
}
typedef SizeClassAllocator32<kSpaceBeg, kSpaceSize, kMetadataSize,
SizeClassMap, kRegionSizeLog, ByteMap, MapUnmapCallback> ThisT;
typedef SizeClassAllocator32<Params> ThisT;
typedef SizeClassAllocator32LocalCache<ThisT> AllocatorCache;
void Init(s32 release_to_os_interval_ms) {
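
Since the 32-bit primary allocator now takes all of its configuration from a single Params class, each tool in this commit replaces the long template parameter list with an AP32 struct. A minimal sketch of such a struct, using only names visible in this diff; the region size and flag choice are illustrative, and the sanitizer_common allocator headers and __sanitizer namespace are assumed to be in scope.

// Illustrative only: a Params struct satisfying the new SizeClassAllocator32
// interface; mirrors the tool-specific AP32 structs elsewhere in this commit.
static const uptr kExampleRegionSizeLog = 20;
static const uptr kExampleNumRegions =
    SANITIZER_MMAP_RANGE_SIZE >> kExampleRegionSizeLog;
typedef TwoLevelByteMap<(kExampleNumRegions >> 12), 1 << 12> ExampleByteMap;
struct ExampleAP32 {
  static const uptr kSpaceBeg = 0;
  static const u64 kSpaceSize = SANITIZER_MMAP_RANGE_SIZE;
  static const uptr kMetadataSize = 0;
  typedef CompactSizeClassMap SizeClassMap;
  static const uptr kRegionSizeLog = kExampleRegionSizeLog;
  typedef ExampleByteMap ByteMap;
  typedef NoOpMapUnmapCallback MapUnmapCallback;
  // Opt in to randomized chunk order via the new flag masks.
  static const uptr kFlags =
      SizeClassAllocator32FlagMasks::kRandomShuffleChunks;
};
typedef SizeClassAllocator32<ExampleAP32> ExamplePrimaryAllocator;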

View File

@ -34,6 +34,8 @@
// COMMON_INTERCEPTOR_MEMSET_IMPL
// COMMON_INTERCEPTOR_MEMMOVE_IMPL
// COMMON_INTERCEPTOR_MEMCPY_IMPL
// COMMON_INTERCEPTOR_COPY_STRING
// COMMON_INTERCEPTOR_STRNDUP_IMPL
//===----------------------------------------------------------------------===//
#include "interception/interception.h"
@ -217,6 +219,25 @@ bool PlatformHasDifferentMemcpyAndMemmove();
}
#endif
#ifndef COMMON_INTERCEPTOR_COPY_STRING
#define COMMON_INTERCEPTOR_COPY_STRING(ctx, to, from, size) {}
#endif
#ifndef COMMON_INTERCEPTOR_STRNDUP_IMPL
#define COMMON_INTERCEPTOR_STRNDUP_IMPL(ctx, s, size) \
COMMON_INTERCEPTOR_ENTER(ctx, strndup, s, size); \
uptr from_length = internal_strnlen(s, size); \
uptr copy_length = Min(size, from_length); \
char *new_mem = (char *)WRAP(malloc)(copy_length + 1); \
if (common_flags()->intercept_strndup) { \
COMMON_INTERCEPTOR_READ_RANGE(ctx, s, copy_length + 1); \
} \
COMMON_INTERCEPTOR_COPY_STRING(ctx, new_mem, s, copy_length); \
internal_memcpy(new_mem, s, copy_length); \
new_mem[copy_length] = '\0'; \
return new_mem;
#endif
struct FileMetadata {
// For open_memstream().
char **addr;
@ -300,6 +321,26 @@ INTERCEPTOR(SIZE_T, strnlen, const char *s, SIZE_T maxlen) {
#define INIT_STRNLEN
#endif
#if SANITIZER_INTERCEPT_STRNDUP
INTERCEPTOR(char*, strndup, const char *s, uptr size) {
void *ctx;
COMMON_INTERCEPTOR_STRNDUP_IMPL(ctx, s, size);
}
#define INIT_STRNDUP COMMON_INTERCEPT_FUNCTION(strndup)
#else
#define INIT_STRNDUP
#endif // SANITIZER_INTERCEPT_STRNDUP
#if SANITIZER_INTERCEPT___STRNDUP
INTERCEPTOR(char*, __strndup, const char *s, uptr size) {
void *ctx;
COMMON_INTERCEPTOR_STRNDUP_IMPL(ctx, s, size);
}
#define INIT___STRNDUP COMMON_INTERCEPT_FUNCTION(__strndup)
#else
#define INIT___STRNDUP
#endif // SANITIZER_INTERCEPT___STRNDUP
#if SANITIZER_INTERCEPT_TEXTDOMAIN
INTERCEPTOR(char*, textdomain, const char *domainname) {
void *ctx;
@ -6163,6 +6204,8 @@ static void InitializeCommonInterceptors() {
INIT_TEXTDOMAIN;
INIT_STRLEN;
INIT_STRNLEN;
INIT_STRNDUP;
INIT___STRNDUP;
INIT_STRCMP;
INIT_STRNCMP;
INIT_STRCASECMP;

View File

@ -195,6 +195,9 @@ COMMON_FLAG(bool, intercept_strpbrk, true,
COMMON_FLAG(bool, intercept_strlen, true,
"If set, uses custom wrappers for strlen and strnlen functions "
"to find more errors.")
COMMON_FLAG(bool, intercept_strndup, true,
"If set, uses custom wrappers for strndup functions "
"to find more errors.")
COMMON_FLAG(bool, intercept_strchr, true,
"If set, uses custom wrappers for strchr, strchrnul, and strrchr "
"functions to find more errors.")

View File

@ -25,6 +25,12 @@
# define SI_NOT_WINDOWS 0
#endif
#if SANITIZER_POSIX
# define SI_POSIX 1
#else
# define SI_POSIX 0
#endif
#if SANITIZER_LINUX && !SANITIZER_ANDROID
# define SI_LINUX_NOT_ANDROID 1
#else
@ -69,6 +75,12 @@
# define SI_UNIX_NOT_MAC 0
#endif
#if SANITIZER_LINUX && !SANITIZER_FREEBSD
# define SI_LINUX_NOT_FREEBSD 1
# else
# define SI_LINUX_NOT_FREEBSD 0
#endif
#define SANITIZER_INTERCEPT_STRLEN 1
#define SANITIZER_INTERCEPT_STRNLEN SI_NOT_MAC
#define SANITIZER_INTERCEPT_STRCMP 1
@ -86,6 +98,8 @@
#define SANITIZER_INTERCEPT_MEMMOVE 1
#define SANITIZER_INTERCEPT_MEMCPY 1
#define SANITIZER_INTERCEPT_MEMCMP 1
#define SANITIZER_INTERCEPT_STRNDUP SI_POSIX
#define SANITIZER_INTERCEPT___STRNDUP SI_LINUX_NOT_FREEBSD
#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && \
__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 1070
# define SI_MAC_DEPLOYMENT_BELOW_10_7 1

View File

@ -70,6 +70,7 @@ class MemoryMappingLayout {
bool NextSegmentLoad(uptr *start, uptr *end, uptr *offset, char filename[],
uptr filename_size, ModuleArch *arch, u8 *uuid,
uptr *protection);
void GetSegmentAddrRange(uptr *start, uptr *end, uptr vmaddr, uptr vmsize);
int current_image_;
u32 current_magic_;
u32 current_filetype_;

View File

@ -18,8 +18,8 @@
namespace __sanitizer {
void ReadProcMaps(ProcSelfMapsBuff *proc_maps) {
CHECK(ReadFileToBuffer("/proc/self/maps", &proc_maps->data,
&proc_maps->mmaped_size, &proc_maps->len));
ReadFileToBuffer("/proc/self/maps", &proc_maps->data, &proc_maps->mmaped_size,
&proc_maps->len);
}
static bool IsOneOf(char c, char c1, char c2) {

View File

@ -18,6 +18,7 @@
#include <mach-o/dyld.h>
#include <mach-o/loader.h>
#include <mach/mach.h>
// These are not available in older macOS SDKs.
#ifndef CPU_SUBTYPE_X86_64_H
@ -71,6 +72,13 @@ void MemoryMappingLayout::Reset() {
internal_memset(current_uuid_, 0, kModuleUUIDSize);
}
// The dyld load address should be unchanged throughout process execution,
// and it is expensive to compute once many libraries have been loaded,
// so cache it here and do not reset.
static mach_header *dyld_hdr = 0;
static const char kDyldPath[] = "/usr/lib/dyld";
static const int kDyldImageIdx = -1;
// static
void MemoryMappingLayout::CacheMemoryMappings() {
// No-op on Mac for now.
@ -95,14 +103,12 @@ bool MemoryMappingLayout::NextSegmentLoad(uptr *start, uptr *end, uptr *offset,
const char *lc = current_load_cmd_addr_;
current_load_cmd_addr_ += ((const load_command *)lc)->cmdsize;
if (((const load_command *)lc)->cmd == kLCSegment) {
const sptr dlloff = _dyld_get_image_vmaddr_slide(current_image_);
const SegmentCommand* sc = (const SegmentCommand *)lc;
if (start) *start = sc->vmaddr + dlloff;
GetSegmentAddrRange(start, end, sc->vmaddr, sc->vmsize);
if (protection) {
// Return the initial protection.
*protection = sc->initprot;
}
if (end) *end = sc->vmaddr + sc->vmsize + dlloff;
if (offset) {
if (current_filetype_ == /*MH_EXECUTE*/ 0x2) {
*offset = sc->vmaddr;
@ -111,8 +117,12 @@ bool MemoryMappingLayout::NextSegmentLoad(uptr *start, uptr *end, uptr *offset,
}
}
if (filename) {
internal_strncpy(filename, _dyld_get_image_name(current_image_),
filename_size);
if (current_image_ == kDyldImageIdx) {
internal_strncpy(filename, kDyldPath, filename_size);
} else {
internal_strncpy(filename, _dyld_get_image_name(current_image_),
filename_size);
}
}
if (arch) {
*arch = current_arch_;
@ -180,11 +190,74 @@ static bool IsModuleInstrumented(const load_command *first_lc) {
return false;
}
// _dyld_get_image_header() and related APIs don't report dyld itself.
// We work around this by manually recursing through the memory map
// until we hit a Mach header matching dyld instead. These recurse
// calls are expensive, but the first memory map generation occurs
// early in the process, when dyld is one of the only images loaded,
// so it will be hit after only a few iterations.
static mach_header *get_dyld_image_header() {
mach_port_name_t port;
if (task_for_pid(mach_task_self(), internal_getpid(), &port) !=
KERN_SUCCESS) {
return nullptr;
}
unsigned depth = 1;
vm_size_t size = 0;
vm_address_t address = 0;
kern_return_t err = KERN_SUCCESS;
mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64;
while (true) {
struct vm_region_submap_info_64 info;
err = vm_region_recurse_64(port, &address, &size, &depth,
(vm_region_info_t)&info, &count);
if (err != KERN_SUCCESS) return nullptr;
if (size >= sizeof(mach_header) &&
info.protection & MemoryMappingLayout::kProtectionRead) {
mach_header *hdr = (mach_header *)address;
if ((hdr->magic == MH_MAGIC || hdr->magic == MH_MAGIC_64) &&
hdr->filetype == MH_DYLINKER) {
return hdr;
}
}
address += size;
}
}
const mach_header *get_dyld_hdr() {
if (!dyld_hdr) dyld_hdr = get_dyld_image_header();
return dyld_hdr;
}
void MemoryMappingLayout::GetSegmentAddrRange(uptr *start, uptr *end,
uptr vmaddr, uptr vmsize) {
if (current_image_ == kDyldImageIdx) {
// vmaddr is masked with 0xfffff because on macOS versions < 10.12,
// it contains an absolute address rather than an offset for dyld.
// To make matters even more complicated, this absolute address
// isn't actually the absolute segment address, but the offset portion
// of the address is accurate when combined with the dyld base address,
// and the mask will give just this offset.
if (start) *start = (vmaddr & 0xfffff) + (uptr)get_dyld_hdr();
if (end) *end = (vmaddr & 0xfffff) + vmsize + (uptr)get_dyld_hdr();
} else {
const sptr dlloff = _dyld_get_image_vmaddr_slide(current_image_);
if (start) *start = vmaddr + dlloff;
if (end) *end = vmaddr + vmsize + dlloff;
}
}
bool MemoryMappingLayout::Next(uptr *start, uptr *end, uptr *offset,
char filename[], uptr filename_size,
uptr *protection, ModuleArch *arch, u8 *uuid) {
for (; current_image_ >= 0; current_image_--) {
const mach_header* hdr = _dyld_get_image_header(current_image_);
for (; current_image_ >= kDyldImageIdx; current_image_--) {
const mach_header *hdr = (current_image_ == kDyldImageIdx)
? get_dyld_hdr()
: _dyld_get_image_header(current_image_);
if (!hdr) continue;
if (current_load_cmd_count_ < 0) {
// Set up for this image;

View File

@ -170,6 +170,10 @@ PtraceRegistersStatus SuspendedThreadsListMac::GetRegistersAndSP(
internal_memcpy(buffer, &regs, sizeof(regs));
*sp = regs.SP_REG;
// On x86_64 and aarch64, we must account for the stack redzone, which is 128
// bytes.
if (SANITIZER_WORDSIZE == 64) *sp -= 128;
return REGISTERS_AVAILABLE;
}

View File

@ -15,6 +15,7 @@
//===----------------------------------------------------------------------===//
#include "scudo_allocator.h"
#include "scudo_crc32.h"
#include "scudo_tls.h"
#include "scudo_utils.h"
@ -34,21 +35,28 @@ static uptr Cookie;
// at compilation or at runtime.
static atomic_uint8_t HashAlgorithm = { CRC32Software };
SANITIZER_WEAK_ATTRIBUTE u32 computeHardwareCRC32(u32 Crc, uptr Data);
INLINE u32 computeCRC32(u32 Crc, uptr Data, u8 HashType) {
// If SSE4.2 is defined here, it was enabled everywhere, as opposed to only
// for scudo_crc32.cpp. This means that other SSE instructions were likely
// emitted at other places, and as a result there is no reason to not use
// the hardware version of the CRC32.
INLINE u32 computeCRC32(uptr Crc, uptr Value, uptr *Array, uptr ArraySize) {
// If the hardware CRC32 feature is defined here, it was enabled everywhere,
// as opposed to only for scudo_crc32.cpp. This means that other hardware
// specific instructions were likely emitted at other places, and as a
// result there is no reason to not use it here.
#if defined(__SSE4_2__) || defined(__ARM_FEATURE_CRC32)
return computeHardwareCRC32(Crc, Data);
Crc = CRC32_INTRINSIC(Crc, Value);
for (uptr i = 0; i < ArraySize; i++)
Crc = CRC32_INTRINSIC(Crc, Array[i]);
return Crc;
#else
if (computeHardwareCRC32 && HashType == CRC32Hardware)
return computeHardwareCRC32(Crc, Data);
else
return computeSoftwareCRC32(Crc, Data);
#endif // defined(__SSE4_2__)
if (atomic_load_relaxed(&HashAlgorithm) == CRC32Hardware) {
Crc = computeHardwareCRC32(Crc, Value);
for (uptr i = 0; i < ArraySize; i++)
Crc = computeHardwareCRC32(Crc, Array[i]);
return Crc;
}
Crc = computeSoftwareCRC32(Crc, Value);
for (uptr i = 0; i < ArraySize; i++)
Crc = computeSoftwareCRC32(Crc, Array[i]);
return Crc;
#endif // defined(__SSE4_2__) || defined(__ARM_FEATURE_CRC32)
}
static ScudoBackendAllocator &getBackendAllocator();
@ -65,8 +73,9 @@ struct ScudoChunk : UnpackedHeader {
// Returns the usable size for a chunk, meaning the amount of bytes from the
// beginning of the user data to the end of the backend allocated chunk.
uptr getUsableSize(UnpackedHeader *Header) {
uptr Size = getBackendAllocator().GetActuallyAllocatedSize(
getAllocBeg(Header));
uptr Size =
getBackendAllocator().GetActuallyAllocatedSize(getAllocBeg(Header),
Header->FromPrimary);
if (Size == 0)
return 0;
return Size - AlignedChunkHeaderSize - (Header->Offset << MinAlignmentLog);
@ -78,10 +87,8 @@ struct ScudoChunk : UnpackedHeader {
ZeroChecksumHeader.Checksum = 0;
uptr HeaderHolder[sizeof(UnpackedHeader) / sizeof(uptr)];
memcpy(&HeaderHolder, &ZeroChecksumHeader, sizeof(HeaderHolder));
u8 HashType = atomic_load_relaxed(&HashAlgorithm);
u32 Crc = computeCRC32(Cookie, reinterpret_cast<uptr>(this), HashType);
for (uptr i = 0; i < ARRAY_SIZE(HeaderHolder); i++)
Crc = computeCRC32(Crc, HeaderHolder[i], HashType);
u32 Crc = computeCRC32(Cookie, reinterpret_cast<uptr>(this), HeaderHolder,
ARRAY_SIZE(HeaderHolder));
return static_cast<u16>(Crc);
}
@ -195,10 +202,10 @@ void initScudo() {
CHECK(!ScudoInitIsRunning && "Scudo init calls itself!");
ScudoInitIsRunning = true;
// Check is SSE4.2 is supported, if so, opt for the CRC32 hardware version.
if (testCPUFeature(CRC32CPUFeature)) {
// Check if hardware CRC32 is supported in the binary and by the platform, if
// so, opt for the CRC32 hardware version of the checksum.
if (computeHardwareCRC32 && testCPUFeature(CRC32CPUFeature))
atomic_store_relaxed(&HashAlgorithm, CRC32Hardware);
}
initFlags();
@ -215,7 +222,8 @@ struct QuarantineCallback {
explicit QuarantineCallback(AllocatorCache *Cache)
: Cache_(Cache) {}
// Chunk recycling function, returns a quarantined chunk to the backend.
// Chunk recycling function, returns a quarantined chunk to the backend,
// first making sure it hasn't been tampered with.
void Recycle(ScudoChunk *Chunk) {
UnpackedHeader Header;
Chunk->loadHeader(&Header);
@ -225,17 +233,19 @@ struct QuarantineCallback {
}
Chunk->eraseHeader();
void *Ptr = Chunk->getAllocBeg(&Header);
getBackendAllocator().Deallocate(Cache_, Ptr);
getBackendAllocator().Deallocate(Cache_, Ptr, Header.FromPrimary);
}
/// Internal quarantine allocation and deallocation functions.
// Internal quarantine allocation and deallocation functions. We first check
// that the batches are indeed serviced by the Primary.
// TODO(kostyak): figure out the best way to protect the batches.
COMPILER_CHECK(sizeof(QuarantineBatch) < SizeClassMap::kMaxSize);
void *Allocate(uptr Size) {
// TODO(kostyak): figure out the best way to protect the batches.
return getBackendAllocator().Allocate(Cache_, Size, MinAlignment);
return getBackendAllocator().Allocate(Cache_, Size, MinAlignment, true);
}
void Deallocate(void *Ptr) {
getBackendAllocator().Deallocate(Cache_, Ptr);
getBackendAllocator().Deallocate(Cache_, Ptr, true);
}
AllocatorCache *Cache_;
@ -353,58 +363,55 @@ struct ScudoAllocator {
Size = 1;
uptr NeededSize = RoundUpTo(Size, MinAlignment) + AlignedChunkHeaderSize;
if (Alignment > MinAlignment)
NeededSize += Alignment;
if (NeededSize >= MaxAllowedMallocSize)
uptr AlignedSize = (Alignment > MinAlignment) ?
NeededSize + (Alignment - AlignedChunkHeaderSize) : NeededSize;
if (AlignedSize >= MaxAllowedMallocSize)
return BackendAllocator.ReturnNullOrDieOnBadRequest();
// Primary backed and Secondary backed allocations have a different
// treatment. We deal with alignment requirements of Primary serviced
// allocations here, but the Secondary will take care of its own alignment
// needs, which means we also have to work around some limitations of the
// combined allocator to accommodate the situation.
bool FromPrimary = PrimaryAllocator::CanAllocate(NeededSize, MinAlignment);
// Primary and Secondary backed allocations have a different treatment. We
// deal with alignment requirements of Primary serviced allocations here,
// but the Secondary will take care of its own alignment needs.
bool FromPrimary = PrimaryAllocator::CanAllocate(AlignedSize, MinAlignment);
void *Ptr;
uptr Salt;
uptr AllocationSize = FromPrimary ? AlignedSize : NeededSize;
uptr AllocationAlignment = FromPrimary ? MinAlignment : Alignment;
ScudoThreadContext *ThreadContext = getThreadContextAndLock();
if (LIKELY(ThreadContext)) {
Salt = getPrng(ThreadContext)->getNext();
Ptr = BackendAllocator.Allocate(getAllocatorCache(ThreadContext),
NeededSize, AllocationAlignment);
AllocationSize, AllocationAlignment,
FromPrimary);
ThreadContext->unlock();
} else {
SpinMutexLock l(&FallbackMutex);
Salt = FallbackPrng.getNext();
Ptr = BackendAllocator.Allocate(&FallbackAllocatorCache, NeededSize,
AllocationAlignment);
Ptr = BackendAllocator.Allocate(&FallbackAllocatorCache, AllocationSize,
AllocationAlignment, FromPrimary);
}
if (!Ptr)
return BackendAllocator.ReturnNullOrDieOnOOM();
uptr AllocBeg = reinterpret_cast<uptr>(Ptr);
// If the allocation was serviced by the secondary, the returned pointer
// accounts for ChunkHeaderSize to pass the alignment check of the combined
// allocator. Adjust it here.
if (!FromPrimary) {
AllocBeg -= AlignedChunkHeaderSize;
if (Alignment > MinAlignment)
NeededSize -= Alignment;
}
// If requested, we will zero out the entire contents of the returned chunk.
if ((ForceZeroContents || ZeroContents) && FromPrimary)
memset(Ptr, 0, BackendAllocator.GetActuallyAllocatedSize(Ptr));
memset(Ptr, 0,
BackendAllocator.GetActuallyAllocatedSize(Ptr, FromPrimary));
uptr UserBeg = AllocBeg + AlignedChunkHeaderSize;
if (!IsAligned(UserBeg, Alignment))
UserBeg = RoundUpTo(UserBeg, Alignment);
CHECK_LE(UserBeg + Size, AllocBeg + NeededSize);
UnpackedHeader Header = {};
uptr AllocBeg = reinterpret_cast<uptr>(Ptr);
uptr UserBeg = AllocBeg + AlignedChunkHeaderSize;
if (!IsAligned(UserBeg, Alignment)) {
// Since the Secondary takes care of alignment, a non-aligned pointer
// means it is from the Primary. It is also the only case where the offset
// field of the header would be non-zero.
CHECK(FromPrimary);
UserBeg = RoundUpTo(UserBeg, Alignment);
uptr Offset = UserBeg - AlignedChunkHeaderSize - AllocBeg;
Header.Offset = Offset >> MinAlignmentLog;
}
CHECK_LE(UserBeg + Size, AllocBeg + AllocationSize);
Header.State = ChunkAllocated;
uptr Offset = UserBeg - AlignedChunkHeaderSize - AllocBeg;
Header.Offset = Offset >> MinAlignmentLog;
Header.AllocType = Type;
if (FromPrimary) {
Header.FromPrimary = FromPrimary;
@ -431,17 +438,20 @@ struct ScudoAllocator {
// with no additional security value.
void quarantineOrDeallocateChunk(ScudoChunk *Chunk, UnpackedHeader *Header,
uptr Size) {
bool FromPrimary = Header->FromPrimary;
bool BypassQuarantine = (AllocatorQuarantine.GetCacheSize() == 0);
if (BypassQuarantine) {
Chunk->eraseHeader();
void *Ptr = Chunk->getAllocBeg(Header);
ScudoThreadContext *ThreadContext = getThreadContextAndLock();
if (LIKELY(ThreadContext)) {
getBackendAllocator().Deallocate(getAllocatorCache(ThreadContext), Ptr);
getBackendAllocator().Deallocate(getAllocatorCache(ThreadContext), Ptr,
FromPrimary);
ThreadContext->unlock();
} else {
SpinMutexLock Lock(&FallbackMutex);
getBackendAllocator().Deallocate(&FallbackAllocatorCache, Ptr);
getBackendAllocator().Deallocate(&FallbackAllocatorCache, Ptr,
FromPrimary);
}
} else {
UnpackedHeader NewHeader = *Header;

View File

@ -80,7 +80,7 @@ const uptr AllocatorSize = 0x10000000000ULL; // 1T.
const uptr AllocatorSize = 0x40000000000ULL; // 4T.
# endif
typedef DefaultSizeClassMap SizeClassMap;
struct AP {
struct AP64 {
static const uptr kSpaceBeg = AllocatorSpace;
static const uptr kSpaceSize = AllocatorSize;
static const uptr kMetadataSize = 0;
@ -89,7 +89,7 @@ struct AP {
static const uptr kFlags =
SizeClassAllocator64FlagMasks::kRandomShuffleChunks;
};
typedef SizeClassAllocator64<AP> PrimaryAllocator;
typedef SizeClassAllocator64<AP64> PrimaryAllocator;
#else
// Currently, the 32-bit Sanitizer allocator has not yet benefited from all the
// security improvements brought to the 64-bit one. This makes the 32-bit
@ -102,16 +102,27 @@ typedef FlatByteMap<NumRegions> ByteMap;
typedef TwoLevelByteMap<(NumRegions >> 12), 1 << 12> ByteMap;
# endif // SANITIZER_WORDSIZE
typedef DefaultSizeClassMap SizeClassMap;
typedef SizeClassAllocator32<0, SANITIZER_MMAP_RANGE_SIZE, 0, SizeClassMap,
RegionSizeLog, ByteMap> PrimaryAllocator;
struct AP32 {
static const uptr kSpaceBeg = 0;
static const u64 kSpaceSize = SANITIZER_MMAP_RANGE_SIZE;
static const uptr kMetadataSize = 0;
typedef __scudo::SizeClassMap SizeClassMap;
static const uptr kRegionSizeLog = RegionSizeLog;
typedef __scudo::ByteMap ByteMap;
typedef NoOpMapUnmapCallback MapUnmapCallback;
static const uptr kFlags =
SizeClassAllocator32FlagMasks::kRandomShuffleChunks;
};
typedef SizeClassAllocator32<AP32> PrimaryAllocator;
#endif // SANITIZER_CAN_USE_ALLOCATOR64
#include "scudo_allocator_secondary.h"
#include "scudo_allocator_combined.h"
typedef SizeClassAllocatorLocalCache<PrimaryAllocator> AllocatorCache;
typedef ScudoLargeMmapAllocator SecondaryAllocator;
typedef CombinedAllocator<PrimaryAllocator, AllocatorCache, SecondaryAllocator>
ScudoBackendAllocator;
typedef ScudoCombinedAllocator<PrimaryAllocator, AllocatorCache,
SecondaryAllocator> ScudoBackendAllocator;
void initScudo();

View File

@ -0,0 +1,84 @@
//===-- scudo_allocator_combined.h ------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// Scudo Combined Allocator, dispatches allocation & deallocation requests to
/// the Primary or the Secondary backend allocators.
///
//===----------------------------------------------------------------------===//
#ifndef SCUDO_ALLOCATOR_COMBINED_H_
#define SCUDO_ALLOCATOR_COMBINED_H_
#ifndef SCUDO_ALLOCATOR_H_
#error "This file must be included inside scudo_allocator.h."
#endif
template <class PrimaryAllocator, class AllocatorCache,
class SecondaryAllocator>
class ScudoCombinedAllocator {
public:
void Init(bool AllocatorMayReturnNull, s32 ReleaseToOSIntervalMs) {
Primary.Init(ReleaseToOSIntervalMs);
Secondary.Init(AllocatorMayReturnNull);
Stats.Init();
atomic_store_relaxed(&MayReturnNull, AllocatorMayReturnNull);
}
void *Allocate(AllocatorCache *Cache, uptr Size, uptr Alignment,
bool FromPrimary) {
if (FromPrimary)
return Cache->Allocate(&Primary, Primary.ClassID(Size));
return Secondary.Allocate(&Stats, Size, Alignment);
}
void *ReturnNullOrDieOnBadRequest() {
if (atomic_load_relaxed(&MayReturnNull))
return nullptr;
ReportAllocatorCannotReturnNull(false);
}
void *ReturnNullOrDieOnOOM() {
if (atomic_load_relaxed(&MayReturnNull))
return nullptr;
ReportAllocatorCannotReturnNull(true);
}
void Deallocate(AllocatorCache *Cache, void *Ptr, bool FromPrimary) {
if (FromPrimary)
Cache->Deallocate(&Primary, Primary.GetSizeClass(Ptr), Ptr);
else
Secondary.Deallocate(&Stats, Ptr);
}
uptr GetActuallyAllocatedSize(void *Ptr, bool FromPrimary) {
if (FromPrimary)
return Primary.GetActuallyAllocatedSize(Ptr);
return Secondary.GetActuallyAllocatedSize(Ptr);
}
void InitCache(AllocatorCache *Cache) {
Cache->Init(&Stats);
}
void DestroyCache(AllocatorCache *Cache) {
Cache->Destroy(&Primary, &Stats);
}
void GetStats(AllocatorStatCounters StatType) const {
Stats.Get(StatType);
}
private:
PrimaryAllocator Primary;
SecondaryAllocator Secondary;
AllocatorGlobalStats Stats;
atomic_uint8_t MayReturnNull;
};
#endif // SCUDO_ALLOCATOR_COMBINED_H_

View File

@ -26,20 +26,19 @@ class ScudoLargeMmapAllocator {
void Init(bool AllocatorMayReturnNull) {
PageSize = GetPageSizeCached();
atomic_store(&MayReturnNull, AllocatorMayReturnNull, memory_order_relaxed);
atomic_store_relaxed(&MayReturnNull, AllocatorMayReturnNull);
}
void *Allocate(AllocatorStats *Stats, uptr Size, uptr Alignment) {
uptr UserSize = Size - AlignedChunkHeaderSize;
// The Scudo frontend prevents us from allocating more than
// MaxAllowedMallocSize, so integer overflow checks would be superfluous.
uptr MapSize = Size + SecondaryHeaderSize;
if (Alignment > MinAlignment)
MapSize += Alignment;
MapSize = RoundUpTo(MapSize, PageSize);
// Account for 2 guard pages, one before and one after the chunk.
MapSize += 2 * PageSize;
// The size passed to the Secondary comprises the alignment, if large
// enough. Subtract it here to get the requested size, including header.
if (Alignment > MinAlignment)
Size -= Alignment;
uptr MapBeg = reinterpret_cast<uptr>(MmapNoAccess(MapSize));
if (MapBeg == ~static_cast<uptr>(0))
@ -51,32 +50,32 @@ class ScudoLargeMmapAllocator {
// initial guard page, and both headers. This is the pointer that has to
// abide by alignment requirements.
uptr UserBeg = MapBeg + PageSize + HeadersSize;
uptr UserEnd = UserBeg + UserSize;
// In the rare event of larger alignments, we will attempt to fit the mmap
// area better and unmap extraneous memory. This will also ensure that the
// offset and unused bytes field of the header stay small.
if (Alignment > MinAlignment) {
if (UserBeg & (Alignment - 1))
UserBeg += Alignment - (UserBeg & (Alignment - 1));
CHECK_GE(UserBeg, MapBeg);
uptr NewMapBeg = RoundDownTo(UserBeg - HeadersSize, PageSize) - PageSize;
CHECK_GE(NewMapBeg, MapBeg);
uptr NewMapEnd = RoundUpTo(UserBeg + (Size - AlignedChunkHeaderSize),
PageSize) + PageSize;
CHECK_LE(NewMapEnd, MapEnd);
// Unmap the extra memory if it's large enough, on both sides.
uptr Diff = NewMapBeg - MapBeg;
if (Diff > PageSize)
UnmapOrDie(reinterpret_cast<void *>(MapBeg), Diff);
Diff = MapEnd - NewMapEnd;
if (Diff > PageSize)
UnmapOrDie(reinterpret_cast<void *>(NewMapEnd), Diff);
MapBeg = NewMapBeg;
MapEnd = NewMapEnd;
MapSize = NewMapEnd - NewMapBeg;
if (!IsAligned(UserBeg, Alignment)) {
UserBeg = RoundUpTo(UserBeg, Alignment);
CHECK_GE(UserBeg, MapBeg);
uptr NewMapBeg = RoundDownTo(UserBeg - HeadersSize, PageSize) -
PageSize;
CHECK_GE(NewMapBeg, MapBeg);
if (NewMapBeg != MapBeg) {
UnmapOrDie(reinterpret_cast<void *>(MapBeg), NewMapBeg - MapBeg);
MapBeg = NewMapBeg;
}
UserEnd = UserBeg + UserSize;
}
uptr NewMapEnd = RoundUpTo(UserEnd, PageSize) + PageSize;
if (NewMapEnd != MapEnd) {
UnmapOrDie(reinterpret_cast<void *>(NewMapEnd), MapEnd - NewMapEnd);
MapEnd = NewMapEnd;
}
MapSize = MapEnd - MapBeg;
}
uptr UserEnd = UserBeg + (Size - AlignedChunkHeaderSize);
CHECK_LE(UserEnd, MapEnd - PageSize);
// Actually mmap the memory, preserving the guard pages on either side.
CHECK_EQ(MapBeg + PageSize, reinterpret_cast<uptr>(
@ -94,25 +93,15 @@ class ScudoLargeMmapAllocator {
Stats->Add(AllocatorStatMapped, MapSize - 2 * PageSize);
}
return reinterpret_cast<void *>(UserBeg);
}
void *ReturnNullOrDieOnBadRequest() {
if (atomic_load(&MayReturnNull, memory_order_acquire))
return nullptr;
ReportAllocatorCannotReturnNull(false);
return reinterpret_cast<void *>(Ptr);
}
void *ReturnNullOrDieOnOOM() {
if (atomic_load(&MayReturnNull, memory_order_acquire))
if (atomic_load_relaxed(&MayReturnNull))
return nullptr;
ReportAllocatorCannotReturnNull(true);
}
void SetMayReturnNull(bool AllocatorMayReturnNull) {
atomic_store(&MayReturnNull, AllocatorMayReturnNull, memory_order_release);
}
void Deallocate(AllocatorStats *Stats, void *Ptr) {
SecondaryHeader *Header = getHeader(Ptr);
{
@ -123,14 +112,6 @@ class ScudoLargeMmapAllocator {
UnmapOrDie(reinterpret_cast<void *>(Header->MapBeg), Header->MapSize);
}
uptr TotalMemoryUsed() {
UNIMPLEMENTED();
}
bool PointerIsMine(const void *Ptr) {
UNIMPLEMENTED();
}
uptr GetActuallyAllocatedSize(void *Ptr) {
SecondaryHeader *Header = getHeader(Ptr);
// Deduct PageSize as MapSize includes the trailing guard page.
@ -138,39 +119,9 @@ class ScudoLargeMmapAllocator {
return MapEnd - reinterpret_cast<uptr>(Ptr);
}
void *GetMetaData(const void *Ptr) {
UNIMPLEMENTED();
}
void *GetBlockBegin(const void *Ptr) {
UNIMPLEMENTED();
}
void *GetBlockBeginFastLocked(void *Ptr) {
UNIMPLEMENTED();
}
void PrintStats() {
UNIMPLEMENTED();
}
void ForceLock() {
UNIMPLEMENTED();
}
void ForceUnlock() {
UNIMPLEMENTED();
}
void ForEachChunk(ForEachChunkCallback Callback, void *Arg) {
UNIMPLEMENTED();
}
private:
// A Secondary allocated chunk header contains the base of the mapping and
// its size. Currently, the base is always a page before the header, but
// we might want to extend that number in the future based on the size of
// the allocation.
// its size, which comprises the guard pages.
struct SecondaryHeader {
uptr MapBeg;
uptr MapSize;

View File

@ -12,24 +12,7 @@
///
//===----------------------------------------------------------------------===//
#include "sanitizer_common/sanitizer_internal_defs.h"
// Hardware CRC32 is supported at compilation via the following:
// - for i386 & x86_64: -msse4.2
// - for ARM & AArch64: -march=armv8-a+crc or -mcrc
// An additional check must be performed at runtime as well to make sure the
// emitted instructions are valid on the target host.
#if defined(__SSE4_2__) || defined(__ARM_FEATURE_CRC32)
# ifdef __SSE4_2__
# include <smmintrin.h>
# define CRC32_INTRINSIC FIRST_32_SECOND_64(_mm_crc32_u32, _mm_crc32_u64)
# endif
# ifdef __ARM_FEATURE_CRC32
# include <arm_acle.h>
# define CRC32_INTRINSIC FIRST_32_SECOND_64(__crc32cw, __crc32cd)
# endif
#endif // defined(__SSE4_2__) || defined(__ARM_FEATURE_CRC32)
#include "scudo_crc32.h"
namespace __scudo {

View File

@ -0,0 +1,101 @@
//===-- scudo_crc32.h -------------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// Scudo chunk header checksum related definitions.
///
//===----------------------------------------------------------------------===//
#ifndef SCUDO_CRC32_H_
#define SCUDO_CRC32_H_
#include "sanitizer_common/sanitizer_internal_defs.h"
// Hardware CRC32 is supported at compilation via the following:
// - for i386 & x86_64: -msse4.2
// - for ARM & AArch64: -march=armv8-a+crc or -mcrc
// An additional check must be performed at runtime as well to make sure the
// emitted instructions are valid on the target host.
#if defined(__SSE4_2__) || defined(__ARM_FEATURE_CRC32)
# ifdef __SSE4_2__
# include <smmintrin.h>
# define CRC32_INTRINSIC FIRST_32_SECOND_64(_mm_crc32_u32, _mm_crc32_u64)
# endif
# ifdef __ARM_FEATURE_CRC32
# include <arm_acle.h>
# define CRC32_INTRINSIC FIRST_32_SECOND_64(__crc32cw, __crc32cd)
# endif
#endif // defined(__SSE4_2__) || defined(__ARM_FEATURE_CRC32)
namespace __scudo {
enum : u8 {
CRC32Software = 0,
CRC32Hardware = 1,
};
const static u32 CRC32Table[] = {
0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c,
0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106,
0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
};
INLINE u32 computeSoftwareCRC32(u32 Crc, uptr Data) {
for (uptr i = 0; i < sizeof(Data); i++) {
Crc = CRC32Table[(Crc ^ Data) & 0xff] ^ (Crc >> 8);
Data >>= 8;
}
return Crc;
}
SANITIZER_WEAK_ATTRIBUTE u32 computeHardwareCRC32(u32 Crc, uptr Data);
} // namespace __scudo
#endif // SCUDO_CRC32_H_
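
The table above is the standard reflected CRC-32 lookup table for polynomial 0xEDB88320. Below is a standalone sketch, not part of the diff, of the same byte-at-a-time update that computeSoftwareCRC32 performs; the table is generated at startup instead of spelled out, and the input value is a made-up stand-in for a chunk header.

#include <cstddef>
#include <cstdint>
#include <cstdio>

static std::uint32_t Table[256];

// Build the standard reflected CRC-32 table (polynomial 0xEDB88320); the
// literal CRC32Table above is this table written out.
static void initTable() {
  for (std::uint32_t I = 0; I < 256; ++I) {
    std::uint32_t C = I;
    for (int K = 0; K < 8; ++K)
      C = (C & 1) ? (0xEDB88320u ^ (C >> 1)) : (C >> 1);
    Table[I] = C;
  }
}

// Mirrors computeSoftwareCRC32: fold a pointer-sized value into the running
// CRC one byte at a time, least-significant byte first.
static std::uint32_t softwareCRC32(std::uint32_t Crc, std::uintptr_t Data) {
  for (std::size_t I = 0; I < sizeof(Data); ++I) {
    Crc = Table[(Crc ^ Data) & 0xff] ^ (Crc >> 8);
    Data >>= 8;
  }
  return Crc;
}

int main() {
  initTable();
  std::uintptr_t Cookie = 0xDEADBEEF; // hypothetical header value being checksummed
  std::printf("crc = %08x\n", static_cast<unsigned>(softwareCRC32(0xFFFFFFFFu, Cookie)));
  return 0;
}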

View File

@ -53,65 +53,6 @@ struct Xorshift128Plus {
u64 State[2];
};
enum : u8 {
CRC32Software = 0,
CRC32Hardware = 1,
};
const static u32 CRC32Table[] = {
0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c,
0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106,
0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
};
INLINE u32 computeSoftwareCRC32(u32 Crc, uptr Data) {
for (uptr i = 0; i < sizeof(Data); i++) {
Crc = CRC32Table[(Crc ^ Data) & 0xff] ^ (Crc >> 8);
Data >>= 8;
}
return Crc;
}
} // namespace __scudo
#endif // SCUDO_UTILS_H_

View File

@ -55,16 +55,22 @@ namespace __tsan {
#if !SANITIZER_GO
struct MapUnmapCallback;
#if defined(__mips64) || defined(__aarch64__) || defined(__powerpc__)
static const uptr kAllocatorSpace = 0;
static const uptr kAllocatorSize = SANITIZER_MMAP_RANGE_SIZE;
static const uptr kAllocatorRegionSizeLog = 20;
static const uptr kAllocatorNumRegions =
kAllocatorSize >> kAllocatorRegionSizeLog;
SANITIZER_MMAP_RANGE_SIZE >> kAllocatorRegionSizeLog;
typedef TwoLevelByteMap<(kAllocatorNumRegions >> 12), 1 << 12,
MapUnmapCallback> ByteMap;
typedef SizeClassAllocator32<kAllocatorSpace, kAllocatorSize, 0,
CompactSizeClassMap, kAllocatorRegionSizeLog, ByteMap,
MapUnmapCallback> PrimaryAllocator;
struct AP32 {
static const uptr kSpaceBeg = 0;
static const u64 kSpaceSize = SANITIZER_MMAP_RANGE_SIZE;
static const uptr kMetadataSize = 0;
typedef __sanitizer::CompactSizeClassMap SizeClassMap;
static const uptr kRegionSizeLog = kAllocatorRegionSizeLog;
typedef __tsan::ByteMap ByteMap;
typedef __tsan::MapUnmapCallback MapUnmapCallback;
static const uptr kFlags = 0;
};
typedef SizeClassAllocator32<AP32> PrimaryAllocator;
#else
struct AP64 { // Allocator64 parameters. Deliberately using a short name.
static const uptr kSpaceBeg = Mapping::kHeapMemBeg;
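
Like the ASan hunk earlier in the diff, this change replaces the long positional template-argument list of SizeClassAllocator32 with a single AP32 parameter struct. A toy sketch of that pattern, with made-up names and no dependency on the sanitizer headers:

#include <cstdint>
#include <cstdio>

// Toy stand-in for the allocator template: every knob arrives through one
// parameter struct instead of many positional template arguments.
template <class Params> struct SizeClassAllocator32Sketch {
  static void describe() {
    std::printf("space begins at %llu, spans %llu bytes, %llu metadata bytes per chunk\n",
                static_cast<unsigned long long>(Params::kSpaceBeg),
                static_cast<unsigned long long>(Params::kSpaceSize),
                static_cast<unsigned long long>(Params::kMetadataSize));
  }
};

// Made-up parameter block mirroring the shape of AP32 above.
struct AP32Sketch {
  static constexpr std::uint64_t kSpaceBeg = 0;
  static constexpr std::uint64_t kSpaceSize = 1ULL << 32;
  static constexpr std::uint64_t kMetadataSize = 0;
};

typedef SizeClassAllocator32Sketch<AP32Sketch> PrimaryAllocatorSketch;

int main() {
  PrimaryAllocatorSketch::describe();
  return 0;
}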

View File

@ -18,8 +18,7 @@
#include <atomic>
#include <cassert>
extern "C" void __clear_cache(void* start, void* end);
extern "C" void __clear_cache(void *start, void *end);
namespace __xray {
@ -86,8 +85,8 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId,
reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
uint32_t(PatchOpcodes::PO_B32), std::memory_order_release);
}
__clear_cache(reinterpret_cast<char*>(FirstAddress),
reinterpret_cast<char*>(CurAddress));
__clear_cache(reinterpret_cast<char *>(FirstAddress),
reinterpret_cast<char *>(CurAddress));
return true;
}
@ -107,6 +106,12 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
return patchSled(Enable, FuncId, Sled, __xray_FunctionTailExit);
}
bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
const XRaySledEntry &Sled)
XRAY_NEVER_INSTRUMENT { // FIXME: Implement in aarch64?
return false;
}
// FIXME: Maybe implement this better?
bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }

View File

@ -18,7 +18,7 @@
#include <atomic>
#include <cassert>
extern "C" void __clear_cache(void* start, void* end);
extern "C" void __clear_cache(void *start, void *end);
namespace __xray {
@ -122,8 +122,8 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId,
reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
uint32_t(PatchOpcodes::PO_B20), std::memory_order_release);
}
__clear_cache(reinterpret_cast<char*>(FirstAddress),
reinterpret_cast<char*>(CurAddress));
__clear_cache(reinterpret_cast<char *>(FirstAddress),
reinterpret_cast<char *>(CurAddress));
return true;
}
@ -143,6 +143,12 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
return patchSled(Enable, FuncId, Sled, __xray_FunctionTailExit);
}
bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
const XRaySledEntry &Sled)
XRAY_NEVER_INSTRUMENT { // FIXME: Implement in arm?
return false;
}
// FIXME: Maybe implement this better?
bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }

View File

@ -29,6 +29,7 @@ struct alignas(16) MetadataRecord {
NewCPUId,
TSCWrap,
WalltimeMarker,
CustomEventMarker,
};
// Use 7 bits to identify this record type.
/* RecordKinds */ uint8_t RecordKind : 7;

View File

@ -41,45 +41,12 @@ namespace __xray {
// Global BufferQueue.
std::shared_ptr<BufferQueue> BQ;
__sanitizer::atomic_sint32_t LoggingStatus = {
XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
__sanitizer::atomic_sint32_t LogFlushStatus = {
XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
std::unique_ptr<FDRLoggingOptions> FDROptions;
FDRLoggingOptions FDROptions;
XRayLogInitStatus fdrLoggingInit(std::size_t BufferSize, std::size_t BufferMax,
void *Options,
size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
if (OptionsSize != sizeof(FDRLoggingOptions))
return static_cast<XRayLogInitStatus>(__sanitizer::atomic_load(
&LoggingStatus, __sanitizer::memory_order_acquire));
s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
if (!__sanitizer::atomic_compare_exchange_strong(
&LoggingStatus, &CurrentStatus,
XRayLogInitStatus::XRAY_LOG_INITIALIZING,
__sanitizer::memory_order_release))
return static_cast<XRayLogInitStatus>(CurrentStatus);
FDROptions.reset(new FDRLoggingOptions());
memcpy(FDROptions.get(), Options, OptionsSize);
bool Success = false;
BQ = std::make_shared<BufferQueue>(BufferSize, BufferMax, Success);
if (!Success) {
Report("BufferQueue init failed.\n");
return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
}
// Install the actual handleArg0 handler after initialising the buffers.
__xray_set_handler(fdrLoggingHandleArg0);
__sanitizer::atomic_store(&LoggingStatus,
XRayLogInitStatus::XRAY_LOG_INITIALIZED,
__sanitizer::memory_order_release);
Report("XRay FDR init successful.\n");
return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
}
__sanitizer::SpinMutex FDROptionsMutex;
// Must finalize before flushing.
XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
@ -108,7 +75,11 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
// (fixed-sized) and let the tools reading the buffers deal with the data
// afterwards.
//
int Fd = FDROptions->Fd;
int Fd = -1;
{
__sanitizer::SpinMutexLock Guard(&FDROptionsMutex);
Fd = FDROptions.Fd;
}
if (Fd == -1)
Fd = getLogFD();
if (Fd == -1) {
@ -120,8 +91,8 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
// Test for required CPU features and cache the cycle frequency
static bool TSCSupported = probeRequiredCPUFeatures();
static uint64_t CycleFrequency = TSCSupported ? getTSCFrequency()
: __xray::NanosecondsPerSecond;
static uint64_t CycleFrequency =
TSCSupported ? getTSCFrequency() : __xray::NanosecondsPerSecond;
XRayFileHeader Header;
Header.Version = 1;
@ -192,8 +163,8 @@ XRayLogInitStatus fdrLoggingReset() XRAY_NEVER_INSTRUMENT {
return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
}
void fdrLoggingHandleArg0(int32_t FuncId,
XRayEntryType Entry) XRAY_NEVER_INSTRUMENT {
static std::tuple<uint64_t, unsigned char>
getTimestamp() XRAY_NEVER_INSTRUMENT {
// We want to get the TSC as early as possible, so that we can check whether
// we've seen this CPU before. We also do it before we load anything else, to
// allow for forward progress with the scheduling.
@ -203,7 +174,7 @@ void fdrLoggingHandleArg0(int32_t FuncId,
// Test once for required CPU features
static bool TSCSupported = probeRequiredCPUFeatures();
if(TSCSupported) {
if (TSCSupported) {
TSC = __xray::readTSC(CPU);
} else {
// FIXME: This code needs refactoring as it appears in multiple locations
@ -216,9 +187,102 @@ void fdrLoggingHandleArg0(int32_t FuncId,
CPU = 0;
TSC = TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec;
}
return std::make_tuple(TSC, CPU);
}
__xray_fdr_internal::processFunctionHook(FuncId, Entry, TSC, CPU,
clock_gettime, LoggingStatus, BQ);
void fdrLoggingHandleArg0(int32_t FuncId,
XRayEntryType Entry) XRAY_NEVER_INSTRUMENT {
auto TSC_CPU = getTimestamp();
__xray_fdr_internal::processFunctionHook(FuncId, Entry, std::get<0>(TSC_CPU),
std::get<1>(TSC_CPU), clock_gettime,
LoggingStatus, BQ);
}
void fdrLoggingHandleCustomEvent(void *Event,
std::size_t EventSize) XRAY_NEVER_INSTRUMENT {
using namespace __xray_fdr_internal;
auto TSC_CPU = getTimestamp();
auto &TSC = std::get<0>(TSC_CPU);
auto &CPU = std::get<1>(TSC_CPU);
thread_local bool Running = false;
RecursionGuard Guard{Running};
if (!Guard) {
assert(Running && "RecursionGuard is buggy!");
return;
}
if (EventSize > std::numeric_limits<int32_t>::max()) {
using Empty = struct {};
static Empty Once = [&] {
Report("Event size too large = %zu ; > max = %d\n", EventSize,
std::numeric_limits<int32_t>::max());
return Empty();
}();
(void)Once;
}
int32_t ReducedEventSize = static_cast<int32_t>(EventSize);
if (!isLogInitializedAndReady(LocalBQ, TSC, CPU, clock_gettime))
return;
// Here we need to prepare the log to handle:
// - The metadata record we're going to write. (16 bytes)
// - The additional data we're going to write. Currently, that's the size of
// the event we're going to dump into the log as free-form bytes.
if (!prepareBuffer(clock_gettime, MetadataRecSize + EventSize)) {
LocalBQ = nullptr;
return;
}
// Write the custom event metadata record, which consists of the following
// information:
// - 8 bytes (64-bits) for the full TSC when the event started.
// - 4 bytes (32-bits) for the length of the data.
MetadataRecord CustomEvent;
CustomEvent.Type = uint8_t(RecordType::Metadata);
CustomEvent.RecordKind =
uint8_t(MetadataRecord::RecordKinds::CustomEventMarker);
constexpr auto TSCSize = sizeof(std::get<0>(TSC_CPU));
std::memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t));
std::memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize);
std::memcpy(RecordPtr, &CustomEvent, sizeof(CustomEvent));
RecordPtr += sizeof(CustomEvent);
std::memcpy(RecordPtr, Event, ReducedEventSize);
endBufferIfFull();
}
XRayLogInitStatus fdrLoggingInit(std::size_t BufferSize, std::size_t BufferMax,
void *Options,
size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
if (OptionsSize != sizeof(FDRLoggingOptions))
return static_cast<XRayLogInitStatus>(__sanitizer::atomic_load(
&LoggingStatus, __sanitizer::memory_order_acquire));
s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
if (!__sanitizer::atomic_compare_exchange_strong(
&LoggingStatus, &CurrentStatus,
XRayLogInitStatus::XRAY_LOG_INITIALIZING,
__sanitizer::memory_order_release))
return static_cast<XRayLogInitStatus>(CurrentStatus);
{
__sanitizer::SpinMutexLock Guard(&FDROptionsMutex);
memcpy(&FDROptions, Options, OptionsSize);
}
bool Success = false;
BQ = std::make_shared<BufferQueue>(BufferSize, BufferMax, Success);
if (!Success) {
Report("BufferQueue init failed.\n");
return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
}
// Install the actual handleArg0 handler after initialising the buffers.
__xray_set_handler(fdrLoggingHandleArg0);
__xray_set_customevent_handler(fdrLoggingHandleCustomEvent);
__sanitizer::atomic_store(&LoggingStatus,
XRayLogInitStatus::XRAY_LOG_INITIALIZED,
__sanitizer::memory_order_release);
Report("XRay FDR init successful.\n");
return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
}
} // namespace __xray
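
fdrLoggingInit above installs fdrLoggingHandleCustomEvent via __xray_set_customevent_handler. A sketch of how a program built with -fxray-instrument and linked against the XRay runtime might install its own handler with the same signature; the handler body here is purely illustrative.

#include <cstddef>
#include <cstdio>

// Declared by the XRay runtime (xray_interface.h); returns 1 on success,
// 0 if XRay has not been initialized.
extern "C" int __xray_set_customevent_handler(void (*entry)(void *, std::size_t));

// Illustrative handler; the FDR implementation above instead copies EventSize
// bytes of free-form payload into the log after a CustomEventMarker record.
static void printCustomEvent(void *Event, std::size_t EventSize) {
  std::printf("custom event: %zu bytes at %p\n", EventSize, Event);
}

int main() {
  if (!__xray_set_customevent_handler(printCustomEvent))
    std::printf("XRay not initialized; handler was not installed\n");
  return 0;
}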

View File

@ -37,6 +37,9 @@
namespace __xray {
__sanitizer::atomic_sint32_t LoggingStatus = {
XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
/// We expose some of the state transitions when FDR logging mode is operating
/// such that we can simulate a series of log events that may occur, and test
/// with determinism without worrying about the real CPU time.
@ -123,12 +126,21 @@ thread_local uint8_t NumTailCalls = 0;
constexpr auto MetadataRecSize = sizeof(MetadataRecord);
constexpr auto FunctionRecSize = sizeof(FunctionRecord);
// We use a thread_local variable to keep track of which CPUs we've already
// run, and the TSC times for these CPUs. This allows us to stop repeating the
// CPU field in the function records.
//
// We assume that we'll support only 65536 CPUs for x86_64.
thread_local uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max();
thread_local uint64_t LastTSC = 0;
thread_local uint64_t LastFunctionEntryTSC = 0;
class ThreadExitBufferCleanup {
std::weak_ptr<BufferQueue> Buffers;
std::shared_ptr<BufferQueue> &Buffers;
BufferQueue::Buffer &Buffer;
public:
explicit ThreadExitBufferCleanup(std::weak_ptr<BufferQueue> BQ,
explicit ThreadExitBufferCleanup(std::shared_ptr<BufferQueue> &BQ,
BufferQueue::Buffer &Buffer)
XRAY_NEVER_INSTRUMENT : Buffers(BQ),
Buffer(Buffer) {}
@ -142,17 +154,24 @@ class ThreadExitBufferCleanup {
// the queue.
assert((RecordPtr + MetadataRecSize) - static_cast<char *>(Buffer.Buffer) >=
static_cast<ptrdiff_t>(MetadataRecSize));
if (auto BQ = Buffers.lock()) {
if (Buffers) {
writeEOBMetadata();
auto EC = BQ->releaseBuffer(Buffer);
auto EC = Buffers->releaseBuffer(Buffer);
if (EC != BufferQueue::ErrorCode::Ok)
Report("Failed to release buffer at %p; error=%s\n", Buffer.Buffer,
BufferQueue::getErrorString(EC));
Buffers = nullptr;
return;
}
}
};
// Make sure a thread that's ever called handleArg0 has a thread-local
// live reference to the buffer queue for this particular instance of
// FDRLogging, and that we're going to clean it up when the thread exits.
thread_local std::shared_ptr<BufferQueue> LocalBQ = nullptr;
thread_local ThreadExitBufferCleanup Cleanup(LocalBQ, Buffer);
class RecursionGuard {
bool &Running;
const bool Valid;
@ -176,7 +195,7 @@ class RecursionGuard {
}
};
static inline bool loggingInitialized(
inline bool loggingInitialized(
const __sanitizer::atomic_sint32_t &LoggingStatus) XRAY_NEVER_INSTRUMENT {
return __sanitizer::atomic_load(&LoggingStatus,
__sanitizer::memory_order_acquire) ==
@ -185,8 +204,8 @@ static inline bool loggingInitialized(
} // namespace
static inline void writeNewBufferPreamble(pid_t Tid, timespec TS,
char *&MemPtr) XRAY_NEVER_INSTRUMENT {
inline void writeNewBufferPreamble(pid_t Tid, timespec TS,
char *&MemPtr) XRAY_NEVER_INSTRUMENT {
static constexpr int InitRecordsCount = 2;
std::aligned_storage<sizeof(MetadataRecord)>::type Records[InitRecordsCount];
{
@ -222,9 +241,8 @@ static inline void writeNewBufferPreamble(pid_t Tid, timespec TS,
NumTailCalls = 0;
}
static inline void setupNewBuffer(int (*wall_clock_reader)(clockid_t,
struct timespec *))
XRAY_NEVER_INSTRUMENT {
inline void setupNewBuffer(int (*wall_clock_reader)(
clockid_t, struct timespec *)) XRAY_NEVER_INSTRUMENT {
RecordPtr = static_cast<char *>(Buffer.Buffer);
pid_t Tid = syscall(SYS_gettid);
timespec TS{0, 0};
@ -235,8 +253,8 @@ static inline void setupNewBuffer(int (*wall_clock_reader)(clockid_t,
NumTailCalls = 0;
}
static inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC,
char *&MemPtr) XRAY_NEVER_INSTRUMENT {
inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC,
char *&MemPtr) XRAY_NEVER_INSTRUMENT {
MetadataRecord NewCPUId;
NewCPUId.Type = uint8_t(RecordType::Metadata);
NewCPUId.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewCPUId);
@ -253,12 +271,12 @@ static inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC,
NumTailCalls = 0;
}
static inline void writeNewCPUIdMetadata(uint16_t CPU,
uint64_t TSC) XRAY_NEVER_INSTRUMENT {
inline void writeNewCPUIdMetadata(uint16_t CPU,
uint64_t TSC) XRAY_NEVER_INSTRUMENT {
writeNewCPUIdMetadata(CPU, TSC, RecordPtr);
}
static inline void writeEOBMetadata(char *&MemPtr) XRAY_NEVER_INSTRUMENT {
inline void writeEOBMetadata(char *&MemPtr) XRAY_NEVER_INSTRUMENT {
MetadataRecord EOBMeta;
EOBMeta.Type = uint8_t(RecordType::Metadata);
EOBMeta.RecordKind = uint8_t(MetadataRecord::RecordKinds::EndOfBuffer);
@ -269,12 +287,12 @@ static inline void writeEOBMetadata(char *&MemPtr) XRAY_NEVER_INSTRUMENT {
NumTailCalls = 0;
}
static inline void writeEOBMetadata() XRAY_NEVER_INSTRUMENT {
inline void writeEOBMetadata() XRAY_NEVER_INSTRUMENT {
writeEOBMetadata(RecordPtr);
}
static inline void writeTSCWrapMetadata(uint64_t TSC,
char *&MemPtr) XRAY_NEVER_INSTRUMENT {
inline void writeTSCWrapMetadata(uint64_t TSC,
char *&MemPtr) XRAY_NEVER_INSTRUMENT {
MetadataRecord TSCWrap;
TSCWrap.Type = uint8_t(RecordType::Metadata);
TSCWrap.RecordKind = uint8_t(MetadataRecord::RecordKinds::TSCWrap);
@ -289,13 +307,13 @@ static inline void writeTSCWrapMetadata(uint64_t TSC,
NumTailCalls = 0;
}
static inline void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT {
inline void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT {
writeTSCWrapMetadata(TSC, RecordPtr);
}
static inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
XRayEntryType EntryType,
char *&MemPtr) XRAY_NEVER_INSTRUMENT {
inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
XRayEntryType EntryType,
char *&MemPtr) XRAY_NEVER_INSTRUMENT {
std::aligned_storage<sizeof(FunctionRecord), alignof(FunctionRecord)>::type
AlignedFuncRecordBuffer;
auto &FuncRecord =
@ -339,6 +357,17 @@ static inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
FuncRecord.RecordKind =
uint8_t(FunctionRecord::RecordKinds::FunctionTailExit);
break;
case XRayEntryType::CUSTOM_EVENT: {
// This is a bug in patching, so we'll report it once and move on.
static bool Once = [&] {
Report("Internal error: patched an XRay custom event call as a function; "
"func id = %d\n",
FuncId);
return true;
}();
(void)Once;
return;
}
}
std::memcpy(MemPtr, &AlignedFuncRecordBuffer, sizeof(FunctionRecord));
@ -346,8 +375,9 @@ static inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
}
static uint64_t thresholdTicks() {
static uint64_t TicksPerSec = probeRequiredCPUFeatures() ? getTSCFrequency() :
__xray::NanosecondsPerSecond;
static uint64_t TicksPerSec = probeRequiredCPUFeatures()
? getTSCFrequency()
: __xray::NanosecondsPerSecond;
static const uint64_t ThresholdTicks =
TicksPerSec * flags()->xray_fdr_log_func_duration_threshold_us / 1000000;
return ThresholdTicks;
@ -397,9 +427,8 @@ static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC,
RewindingRecordPtr -= FunctionRecSize;
RewindingTSC -= ExpectedTailExit.TSCDelta;
AlignedFuncStorage FunctionEntryBuffer;
const auto &ExpectedFunctionEntry =
*reinterpret_cast<FunctionRecord *>(std::memcpy(
&FunctionEntryBuffer, RewindingRecordPtr, FunctionRecSize));
const auto &ExpectedFunctionEntry = *reinterpret_cast<FunctionRecord *>(
std::memcpy(&FunctionEntryBuffer, RewindingRecordPtr, FunctionRecSize));
assert(ExpectedFunctionEntry.RecordKind ==
uint8_t(FunctionRecord::RecordKinds::FunctionEnter) &&
"Expected to find function entry when rewinding tail call.");
@ -422,7 +451,7 @@ static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC,
}
}
static inline bool releaseThreadLocalBuffer(BufferQueue *BQ) {
inline bool releaseThreadLocalBuffer(BufferQueue *BQ) {
auto EC = BQ->releaseBuffer(Buffer);
if (EC != BufferQueue::ErrorCode::Ok) {
Report("Failed to release buffer at %p; error=%s\n", Buffer.Buffer,
@ -432,11 +461,29 @@ static inline bool releaseThreadLocalBuffer(BufferQueue *BQ) {
return true;
}
static inline void processFunctionHook(
int32_t FuncId, XRayEntryType Entry, uint64_t TSC, unsigned char CPU,
int (*wall_clock_reader)(clockid_t, struct timespec *),
__sanitizer::atomic_sint32_t &LoggingStatus,
const std::shared_ptr<BufferQueue> &BQ) XRAY_NEVER_INSTRUMENT {
inline bool prepareBuffer(int (*wall_clock_reader)(clockid_t,
struct timespec *),
size_t MaxSize) XRAY_NEVER_INSTRUMENT {
char *BufferStart = static_cast<char *>(Buffer.Buffer);
if ((RecordPtr + MaxSize) > (BufferStart + Buffer.Size - MetadataRecSize)) {
writeEOBMetadata();
if (!releaseThreadLocalBuffer(LocalBQ.get()))
return false;
auto EC = LocalBQ->getBuffer(Buffer);
if (EC != BufferQueue::ErrorCode::Ok) {
Report("Failed to acquire a buffer; error=%s\n",
BufferQueue::getErrorString(EC));
return false;
}
setupNewBuffer(wall_clock_reader);
}
return true;
}
inline bool isLogInitializedAndReady(
std::shared_ptr<BufferQueue> &LocalBQ, uint64_t TSC, unsigned char CPU,
int (*wall_clock_reader)(clockid_t,
struct timespec *)) XRAY_NEVER_INSTRUMENT {
// Bail out right away if logging is not initialized yet.
// We should take the opportunity to release the buffer though.
auto Status = __sanitizer::atomic_load(&LoggingStatus,
@ -446,44 +493,19 @@ static inline void processFunctionHook(
(Status == XRayLogInitStatus::XRAY_LOG_FINALIZING ||
Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)) {
writeEOBMetadata();
if (!releaseThreadLocalBuffer(BQ.get()))
return;
if (!releaseThreadLocalBuffer(LocalBQ.get()))
return false;
RecordPtr = nullptr;
LocalBQ = nullptr;
return false;
}
return;
}
// We use a thread_local variable to keep track of which CPUs we've already
// run, and the TSC times for these CPUs. This allows us to stop repeating the
// CPU field in the function records.
//
// We assume that we'll support only 65536 CPUs for x86_64.
thread_local uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max();
thread_local uint64_t LastTSC = 0;
thread_local uint64_t LastFunctionEntryTSC = 0;
// Make sure a thread that's ever called handleArg0 has a thread-local
// live reference to the buffer queue for this particular instance of
// FDRLogging, and that we're going to clean it up when the thread exits.
thread_local auto LocalBQ = BQ;
thread_local ThreadExitBufferCleanup Cleanup(LocalBQ, Buffer);
// Prevent signal handler recursion, so in case we're already in a log writing
// mode and the signal handler comes in (and is also instrumented) then we
// don't want to be clobbering potentially partial writes already happening in
// the thread. We use a simple thread_local latch to only allow one on-going
// handleArg0 to happen at any given time.
thread_local bool Running = false;
RecursionGuard Guard{Running};
if (!Guard) {
assert(Running == true && "RecursionGuard is buggy!");
return;
return false;
}
if (!loggingInitialized(LoggingStatus) || LocalBQ->finalizing()) {
writeEOBMetadata();
if (!releaseThreadLocalBuffer(BQ.get()))
return;
if (!releaseThreadLocalBuffer(LocalBQ.get()))
return false;
RecordPtr = nullptr;
}
@ -496,19 +518,57 @@ static inline void processFunctionHook(
LS != XRayLogInitStatus::XRAY_LOG_FINALIZED)
Report("Failed to acquire a buffer; error=%s\n",
BufferQueue::getErrorString(EC));
return;
return false;
}
setupNewBuffer(wall_clock_reader);
}
if (CurrentCPU == std::numeric_limits<uint16_t>::max()) {
// This means this is the first CPU this thread has ever run on. We set the
// current CPU and record this as the first TSC we've seen.
// This means this is the first CPU this thread has ever run on. We set
// the current CPU and record this as the first TSC we've seen.
CurrentCPU = CPU;
writeNewCPUIdMetadata(CPU, TSC);
}
return true;
} // namespace __xray_fdr_internal
inline void endBufferIfFull() XRAY_NEVER_INSTRUMENT {
auto BufferStart = static_cast<char *>(Buffer.Buffer);
if ((RecordPtr + MetadataRecSize) - BufferStart == MetadataRecSize) {
writeEOBMetadata();
if (!releaseThreadLocalBuffer(LocalBQ.get()))
return;
RecordPtr = nullptr;
}
}
inline void processFunctionHook(
int32_t FuncId, XRayEntryType Entry, uint64_t TSC, unsigned char CPU,
int (*wall_clock_reader)(clockid_t, struct timespec *),
__sanitizer::atomic_sint32_t &LoggingStatus,
const std::shared_ptr<BufferQueue> &BQ) XRAY_NEVER_INSTRUMENT {
// Prevent signal handler recursion, so in case we're already in a log writing
// mode and the signal handler comes in (and is also instrumented) then we
// don't want to be clobbering potentially partial writes already happening in
// the thread. We use a simple thread_local latch to only allow one on-going
// handleArg0 to happen at any given time.
thread_local bool Running = false;
RecursionGuard Guard{Running};
if (!Guard) {
assert(Running == true && "RecursionGuard is buggy!");
return;
}
// In case the reference has been cleaned up before, we make sure we
// initialize it to the provided BufferQueue.
if (LocalBQ == nullptr)
LocalBQ = BQ;
if (!isLogInitializedAndReady(LocalBQ, TSC, CPU, wall_clock_reader))
return;
// Before we go setting up writing new function entries, we need to be really
// careful about the pointer math we're doing. This means we need to ensure
// that the record we are about to write is going to fit into the buffer,
@ -545,25 +605,15 @@ static inline void processFunctionHook(
// bytes in the end of the buffer, we need to write out the EOB, get a new
// Buffer, set it up properly before doing any further writing.
//
char *BufferStart = static_cast<char *>(Buffer.Buffer);
if ((RecordPtr + (MetadataRecSize + FunctionRecSize)) - BufferStart <
static_cast<ptrdiff_t>(MetadataRecSize)) {
writeEOBMetadata();
if (!releaseThreadLocalBuffer(LocalBQ.get()))
return;
auto EC = LocalBQ->getBuffer(Buffer);
if (EC != BufferQueue::ErrorCode::Ok) {
Report("Failed to acquire a buffer; error=%s\n",
BufferQueue::getErrorString(EC));
return;
}
setupNewBuffer(wall_clock_reader);
if (!prepareBuffer(wall_clock_reader, FunctionRecSize + MetadataRecSize)) {
LocalBQ = nullptr;
return;
}
// By this point, we are now ready to write at most 24 bytes (one metadata
// record and one function record).
BufferStart = static_cast<char *>(Buffer.Buffer);
assert((RecordPtr + (MetadataRecSize + FunctionRecSize)) - BufferStart >=
assert((RecordPtr + (MetadataRecSize + FunctionRecSize)) -
static_cast<char *>(Buffer.Buffer) >=
static_cast<ptrdiff_t>(MetadataRecSize) &&
"Misconfigured BufferQueue provided; Buffer size not large enough.");
@ -586,7 +636,6 @@ static inline void processFunctionHook(
// FunctionRecord. In this case we write down just a FunctionRecord with
// the correct TSC delta.
//
uint32_t RecordTSCDelta = 0;
if (CPU != CurrentCPU) {
// We've moved to a new CPU.
@ -619,21 +668,27 @@ static inline void processFunctionHook(
break;
rewindRecentCall(TSC, LastTSC, LastFunctionEntryTSC, FuncId);
return; // without writing log.
case XRayEntryType::CUSTOM_EVENT: {
// This is a bug in patching, so we'll report it once and move on.
static bool Once = [&] {
Report("Internal error: patched an XRay custom event call as a function; "
"func id = %d",
FuncId);
return true;
}();
(void)Once;
return;
}
}
writeFunctionRecord(FuncId, RecordTSCDelta, Entry, RecordPtr);
// If we've exhausted the buffer by this time, we then release the buffer to
// make sure that other threads may start using this buffer.
if ((RecordPtr + MetadataRecSize) - BufferStart == MetadataRecSize) {
writeEOBMetadata();
if (!releaseThreadLocalBuffer(LocalBQ.get()))
return;
RecordPtr = nullptr;
}
endBufferIfFull();
}
} // namespace __xray_fdr_internal
} // namespace __xray
#endif // XRAY_XRAY_FDR_LOGGING_IMPL_H
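
The RecursionGuard used by processFunctionHook and fdrLoggingHandleCustomEvent above is a thread-local re-entrancy latch: if an instrumented signal handler fires while a record is being written, the nested call sees the latch already set and backs out. A self-contained sketch of the same idea, with the nested call simulated by ordinary recursion rather than a real signal:

#include <cstdio>

static thread_local bool Running = false;

// Same shape as RecursionGuard above: claim the latch if it is free, release
// it on destruction, and report through operator bool whether we got it.
struct RecursionGuardSketch {
  bool &Flag;
  const bool Valid;
  explicit RecursionGuardSketch(bool &F) : Flag(F), Valid(!F) {
    if (Valid)
      Flag = true;
  }
  ~RecursionGuardSketch() {
    if (Valid)
      Flag = false;
  }
  explicit operator bool() const { return Valid; }
};

static void writeRecord(int Depth) {
  RecursionGuardSketch Guard(Running);
  if (!Guard) {
    std::printf("re-entered at depth %d; bailing out\n", Depth);
    return;
  }
  std::printf("writing record at depth %d\n", Depth);
  if (Depth == 0)
    writeRecord(1); // stands in for a signal handler interrupting the write
}

int main() {
  writeRecord(0);
  return 0;
}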

View File

@ -50,6 +50,9 @@ __sanitizer::atomic_uintptr_t XRayPatchedFunction{0};
// This is the function to call from the arg1-enabled sleds/trampolines.
__sanitizer::atomic_uintptr_t XRayArgLogger{0};
// This is the function to call when we encounter a custom event log call.
__sanitizer::atomic_uintptr_t XRayPatchedCustomEvent{0};
// MProtectHelper is an RAII wrapper for calls to mprotect(...) that will undo
// any successful mprotect(...) changes. This is used to make a page writeable
// and executable, and upon destruction if it was successful in doing so returns
@ -97,7 +100,19 @@ int __xray_set_handler(void (*entry)(int32_t,
__sanitizer::memory_order_acquire)) {
__sanitizer::atomic_store(&__xray::XRayPatchedFunction,
reinterpret_cast<uint64_t>(entry),
reinterpret_cast<uintptr_t>(entry),
__sanitizer::memory_order_release);
return 1;
}
return 0;
}
int __xray_set_customevent_handler(void (*entry)(void *, size_t))
XRAY_NEVER_INSTRUMENT {
if (__sanitizer::atomic_load(&XRayInitialized,
__sanitizer::memory_order_acquire)) {
__sanitizer::atomic_store(&__xray::XRayPatchedCustomEvent,
reinterpret_cast<uintptr_t>(entry),
__sanitizer::memory_order_release);
return 1;
}
@ -161,6 +176,9 @@ inline bool patchSled(const XRaySledEntry &Sled, bool Enable,
case XRayEntryType::LOG_ARGS_ENTRY:
Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_ArgLoggerEntry);
break;
case XRayEntryType::CUSTOM_EVENT:
Success = patchCustomEvent(Enable, FuncId, Sled);
break;
default:
Report("Unsupported sled kind '%d' @%04x\n", Sled.Address, int(Sled.Kind));
return false;
@ -301,6 +319,7 @@ int __xray_set_handler_arg1(void (*Handler)(int32_t, XRayEntryType, uint64_t)) {
__sanitizer::memory_order_release);
return 1;
}
int __xray_remove_handler_arg1() { return __xray_set_handler_arg1(nullptr); }
uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT {

View File

@ -60,6 +60,7 @@ bool patchFunctionEntry(bool Enable, uint32_t FuncId,
bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled);
bool patchFunctionTailExit(bool Enable, uint32_t FuncId,
const XRaySledEntry &Sled);
bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled);
} // namespace __xray
@ -70,6 +71,7 @@ extern void __xray_FunctionEntry();
extern void __xray_FunctionExit();
extern void __xray_FunctionTailExit();
extern void __xray_ArgLoggerEntry();
extern void __xray_CustomEvent();
}
#endif

View File

@ -95,7 +95,8 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId,
// B #44
if (Enable) {
uint32_t LoTracingHookAddr = reinterpret_cast<int32_t>(TracingHook) & 0xffff;
uint32_t LoTracingHookAddr =
reinterpret_cast<int32_t>(TracingHook) & 0xffff;
uint32_t HiTracingHookAddr =
(reinterpret_cast<int32_t>(TracingHook) >> 16) & 0xffff;
uint32_t LoFunctionID = FuncId & 0xffff;
@ -151,6 +152,12 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
}
bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
// FIXME: Implement in mips?
return false;
}
} // namespace __xray
extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {

View File

@ -93,7 +93,8 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId,
if (Enable) {
uint32_t LoTracingHookAddr =
reinterpret_cast<int64_t>(TracingHook) & 0xffff;
uint32_t HiTracingHookAddr = (reinterpret_cast<int64_t>(TracingHook) >> 16) & 0xffff;
uint32_t HiTracingHookAddr =
(reinterpret_cast<int64_t>(TracingHook) >> 16) & 0xffff;
uint32_t HigherTracingHookAddr =
(reinterpret_cast<int64_t>(TracingHook) >> 32) & 0xffff;
uint32_t HighestTracingHookAddr =
@ -160,6 +161,11 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
}
bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
// FIXME: Implement in mips64?
return false;
}
} // namespace __xray
extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {

View File

@ -93,6 +93,12 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
// FIXME: Maybe implement this better?
bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }
bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
// FIXME: Implement in powerpc64?
return false;
}
} // namespace __xray
extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {

View File

@ -145,27 +145,91 @@ __xray_FunctionEntry:
.p2align 4
__xray_FunctionExit:
std 0, 16(1)
ld 0, -8(1) # FuncId
stdu 1, -72(1)
# Spill r3, f1, and vsr34, the return value registers.
stdu 1, -256(1)
# Spill r3-r4, f1-f8, and vsr34-vsr41, which are return registers.
# If this appears to be slow, the caller needs to pass in number of generic,
# floating point, and vector parameters, so that we only spill those live ones.
std 3, 32(1)
mr 3, 0
addi 4, 1, 40
stxsdx 1, 0, 4
ld 3, 248(1) # FuncId
std 4, 40(1)
addi 4, 1, 48
stxsdx 1, 0, 4
addi 4, 1, 56
stxsdx 2, 0, 4
addi 4, 1, 64
stxsdx 3, 0, 4
addi 4, 1, 72
stxsdx 4, 0, 4
addi 4, 1, 80
stxsdx 5, 0, 4
addi 4, 1, 88
stxsdx 6, 0, 4
addi 4, 1, 96
stxsdx 7, 0, 4
addi 4, 1, 104
stxsdx 8, 0, 4
addi 4, 1, 112
stxvd2x 34, 0, 4
addi 4, 1, 128
stxvd2x 35, 0, 4
addi 4, 1, 144
stxvd2x 36, 0, 4
addi 4, 1, 160
stxvd2x 37, 0, 4
addi 4, 1, 176
stxvd2x 38, 0, 4
addi 4, 1, 192
stxvd2x 39, 0, 4
addi 4, 1, 208
stxvd2x 40, 0, 4
addi 4, 1, 224
stxvd2x 41, 0, 4
std 2, 240(1)
mflr 0
std 0, 64(1)
std 0, 248(1)
li 4, 1
bl _ZN6__xray23CallXRayPatchedFunctionEi13XRayEntryType
nop
ld 0, 64(1)
mtlr 0
ld 3, 32(1)
addi 4, 1, 40
lxsdx 1, 0, 4
addi 4, 1, 48
lxsdx 1, 0, 4
addi 4, 1, 56
lxsdx 2, 0, 4
addi 4, 1, 64
lxsdx 3, 0, 4
addi 4, 1, 72
lxsdx 4, 0, 4
addi 4, 1, 80
lxsdx 5, 0, 4
addi 4, 1, 88
lxsdx 6, 0, 4
addi 4, 1, 96
lxsdx 7, 0, 4
addi 4, 1, 104
lxsdx 8, 0, 4
addi 4, 1, 112
lxvd2x 34, 0, 4
addi 1, 1, 72
addi 4, 1, 128
lxvd2x 35, 0, 4
addi 4, 1, 144
lxvd2x 36, 0, 4
addi 4, 1, 160
lxvd2x 37, 0, 4
addi 4, 1, 176
lxvd2x 38, 0, 4
addi 4, 1, 192
lxvd2x 39, 0, 4
addi 4, 1, 208
lxvd2x 40, 0, 4
addi 4, 1, 224
lxvd2x 41, 0, 4
ld 0, 248(1)
mtlr 0
ld 2, 240(1)
ld 3, 32(1)
ld 4, 40(1)
addi 1, 1, 256
ld 0, 16(1)
blr

View File

@ -176,9 +176,15 @@ __xray_ArgLoggerEntry:
je .Larg1entryFail
.Larg1entryLog:
movq %rdi, %rdx // first argument will become the third
xorq %rsi, %rsi // XRayEntryType::ENTRY into the second
movl %r10d, %edi // 32-bit function ID becomes the first
// First argument will become the third
movq %rdi, %rdx
// XRayEntryType::ENTRY into the second
xorq %rsi, %rsi
// 32-bit function ID becomes the first
movl %r10d, %edi
callq *%rax
.Larg1entryFail:
@ -189,4 +195,38 @@ __xray_ArgLoggerEntry:
.size __xray_ArgLoggerEntry, .Larg1entryEnd-__xray_ArgLoggerEntry
.cfi_endproc
//===----------------------------------------------------------------------===//
.global __xray_CustomEvent
.align 16, 0x90
.type __xray_CustomEvent,@function
__xray_CustomEvent:
.cfi_startproc
subq $16, %rsp
.cfi_def_cfa_offset 24
movq %rbp, 8(%rsp)
movq %rax, 0(%rsp)
// We take two arguments to this trampoline, which should be in rdi and rsi
// already. We also make sure that we stash %rax because we use that register
// to call the logging handler.
movq _ZN6__xray22XRayPatchedCustomEventE(%rip), %rax
testq %rax,%rax
je .LcustomEventCleanup
// At this point we know that rcx and rdx already has the data, so we just
// call the logging handler.
callq *%rax
.LcustomEventCleanup:
movq 0(%rsp), %rax
movq 8(%rsp), %rbp
addq $16, %rsp
.cfi_def_cfa_offset 8
retq
.Ltmp8:
.size __xray_CustomEvent, .Ltmp8-__xray_CustomEvent
.cfi_endproc
NO_EXEC_STACK_DIRECTIVE

View File

@ -75,8 +75,10 @@ uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
static constexpr uint8_t CallOpCode = 0xe8;
static constexpr uint16_t MovR10Seq = 0xba41;
static constexpr uint16_t Jmp9Seq = 0x09eb;
static constexpr uint16_t Jmp20Seq = 0x14eb;
static constexpr uint8_t JmpOpCode = 0xe9;
static constexpr uint8_t RetOpCode = 0xc3;
static constexpr uint16_t NopwSeq = 0x9066;
static constexpr int64_t MinOffset{std::numeric_limits<int32_t>::min()};
static constexpr int64_t MaxOffset{std::numeric_limits<int32_t>::max()};
@ -201,6 +203,40 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
return true;
}
bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
// Here we do the dance of replacing the following sled:
//
// xray_sled_n:
// jmp +19 // 2 bytes
// ...
//
// With the following:
//
// nopw // 2 bytes*
// ...
//
// We need to do this in the following order:
//
// 1. Overwrite the 5-byte nop with the call (relative), where (relative) is
// the relative offset to the __xray_CustomEvent trampoline.
// 2. Do a two-byte atomic write over the 'jmp +24' to turn it into a 'nopw'.
// This allows us to "enable" this code once the changes have committed.
//
// The "unpatch" should just turn the 'nopw' back to a 'jmp +24'.
//
if (Enable) {
std::atomic_store_explicit(
reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), NopwSeq,
std::memory_order_release);
} else {
std::atomic_store_explicit(
reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp20Seq,
std::memory_order_release);
}
return false;
}
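
A simplified, self-contained sketch of the two-byte flip described in the comment above, operating on an ordinary writable buffer rather than real instruction memory (the runtime makes the page writable first via MProtectHelper); the buffer and helper names are made up, while the byte sequences are the Jmp20Seq/NopwSeq constants from this file.

#include <atomic>
#include <cstdint>
#include <cstdio>

// Constants from the patching code above.
static constexpr std::uint16_t Jmp20Seq = 0x14eb; // short jmp over the sled body
static constexpr std::uint16_t NopwSeq = 0x9066;  // 66 90: two-byte nop

// Stand-in for the sled; real patching writes over executable text.
alignas(2) static std::uint8_t FakeSled[32] = {0xeb, 0x14};

static void setSledEnabled(bool Enable) {
  auto *First2Bytes = reinterpret_cast<std::atomic<std::uint16_t> *>(FakeSled);
  // A single 2-byte atomic store flips the sled between "skip the
  // instrumentation" (short jmp) and "fall through into it" (nopw), the same
  // store the real patchCustomEvent issues.
  std::atomic_store_explicit(First2Bytes, Enable ? NopwSeq : Jmp20Seq,
                             std::memory_order_release);
}

int main() {
  setSledEnabled(true);
  std::printf("enabled:  %02x %02x\n", FakeSled[0], FakeSled[1]);
  setSledEnabled(false);
  std::printf("disabled: %02x %02x\n", FakeSled[0], FakeSled[1]);
  return 0;
}
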
// We determine whether the CPU we're running on has the correct features we
// need. In x86_64 this will be rdtscp support.
bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT {

View File

@ -19,27 +19,24 @@
_LIBCPP_BEGIN_NAMESPACE_STD
typedef _VSTD::remove_pointer<locale_t>::type __use_locale_struct;
typedef _VSTD::unique_ptr<__use_locale_struct, decltype(&uselocale)> __locale_raii;
inline _LIBCPP_ALWAYS_INLINE
decltype(MB_CUR_MAX) __libcpp_mb_cur_max_l(locale_t __l)
{
__locale_raii __current( uselocale(__l), uselocale );
__libcpp_locale_guard __current(__l);
return MB_CUR_MAX;
}
inline _LIBCPP_ALWAYS_INLINE
wint_t __libcpp_btowc_l(int __c, locale_t __l)
{
__locale_raii __current( uselocale(__l), uselocale );
__libcpp_locale_guard __current(__l);
return btowc(__c);
}
inline _LIBCPP_ALWAYS_INLINE
int __libcpp_wctob_l(wint_t __c, locale_t __l)
{
__locale_raii __current( uselocale(__l), uselocale );
__libcpp_locale_guard __current(__l);
return wctob(__c);
}
@ -47,14 +44,14 @@ inline _LIBCPP_ALWAYS_INLINE
size_t __libcpp_wcsnrtombs_l(char *__dest, const wchar_t **__src, size_t __nwc,
size_t __len, mbstate_t *__ps, locale_t __l)
{
__locale_raii __current( uselocale(__l), uselocale );
__libcpp_locale_guard __current(__l);
return wcsnrtombs(__dest, __src, __nwc, __len, __ps);
}
inline _LIBCPP_ALWAYS_INLINE
size_t __libcpp_wcrtomb_l(char *__s, wchar_t __wc, mbstate_t *__ps, locale_t __l)
{
__locale_raii __current( uselocale(__l), uselocale );
__libcpp_locale_guard __current(__l);
return wcrtomb(__s, __wc, __ps);
}
@ -62,7 +59,7 @@ inline _LIBCPP_ALWAYS_INLINE
size_t __libcpp_mbsnrtowcs_l(wchar_t * __dest, const char **__src, size_t __nms,
size_t __len, mbstate_t *__ps, locale_t __l)
{
__locale_raii __current( uselocale(__l), uselocale );
__libcpp_locale_guard __current(__l);
return mbsnrtowcs(__dest, __src, __nms, __len, __ps);
}
@ -70,28 +67,28 @@ inline _LIBCPP_ALWAYS_INLINE
size_t __libcpp_mbrtowc_l(wchar_t *__pwc, const char *__s, size_t __n,
mbstate_t *__ps, locale_t __l)
{
__locale_raii __current( uselocale(__l), uselocale );
__libcpp_locale_guard __current(__l);
return mbrtowc(__pwc, __s, __n, __ps);
}
inline _LIBCPP_ALWAYS_INLINE
int __libcpp_mbtowc_l(wchar_t *__pwc, const char *__pmb, size_t __max, locale_t __l)
{
__locale_raii __current( uselocale(__l), uselocale );
__libcpp_locale_guard __current(__l);
return mbtowc(__pwc, __pmb, __max);
}
inline _LIBCPP_ALWAYS_INLINE
size_t __libcpp_mbrlen_l(const char *__s, size_t __n, mbstate_t *__ps, locale_t __l)
{
__locale_raii __current( uselocale(__l), uselocale );
__libcpp_locale_guard __current(__l);
return mbrlen(__s, __n, __ps);
}
inline _LIBCPP_ALWAYS_INLINE
lconv *__libcpp_localeconv_l(locale_t __l)
{
__locale_raii __current( uselocale(__l), uselocale );
__libcpp_locale_guard __current(__l);
return localeconv();
}
@ -99,7 +96,7 @@ inline _LIBCPP_ALWAYS_INLINE
size_t __libcpp_mbsrtowcs_l(wchar_t *__dest, const char **__src, size_t __len,
mbstate_t *__ps, locale_t __l)
{
__locale_raii __current( uselocale(__l), uselocale );
__libcpp_locale_guard __current(__l);
return mbsrtowcs(__dest, __src, __len, __ps);
}
@ -107,7 +104,7 @@ inline
int __libcpp_snprintf_l(char *__s, size_t __n, locale_t __l, const char *__format, ...) {
va_list __va;
va_start(__va, __format);
__locale_raii __current( uselocale(__l), uselocale );
__libcpp_locale_guard __current(__l);
int __res = vsnprintf(__s, __n, __format, __va);
va_end(__va);
return __res;
@ -117,7 +114,7 @@ inline
int __libcpp_asprintf_l(char **__s, locale_t __l, const char *__format, ...) {
va_list __va;
va_start(__va, __format);
__locale_raii __current( uselocale(__l), uselocale );
__libcpp_locale_guard __current(__l);
int __res = vasprintf(__s, __format, __va);
va_end(__va);
return __res;
@ -127,7 +124,7 @@ inline
int __libcpp_sscanf_l(const char *__s, locale_t __l, const char *__format, ...) {
va_list __va;
va_start(__va, __format);
__locale_raii __current( uselocale(__l), uselocale );
__libcpp_locale_guard __current(__l);
int __res = vsscanf(__s, __format, __va);
va_end(__va);
return __res;

View File

@ -129,6 +129,12 @@
#define __has_keyword(__x) !(__is_identifier(__x))
#ifdef __has_include
#define __libcpp_has_include(__x) __has_include(__x)
#else
#define __libcpp_has_include(__x) 0
#endif
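
The same guarded-__has_include pattern in isolation: if the compiler lacks __has_include entirely, the helper collapses to 0 and the probe (here for <pthread.h>, the case the MinGW change further down cares about) simply fails; the macro names below are made up.

#ifdef __has_include
#define HAS_INCLUDE(x) __has_include(x)
#else
#define HAS_INCLUDE(x) 0
#endif

#if HAS_INCLUDE(<pthread.h>)
#define HAVE_PTHREAD_H 1
#else
#define HAVE_PTHREAD_H 0
#endif

#include <cstdio>
int main() {
  std::printf("pthread.h available: %d\n", HAVE_PTHREAD_H);
  return 0;
}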
#if defined(__clang__)
#define _LIBCPP_COMPILER_CLANG
# ifndef __apple_build_version__
@ -968,7 +974,7 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container(
# if defined(__GNUC__) && ((__GNUC__ >= 5) || (__GNUC__ == 4 && \
(__GNUC_MINOR__ >= 3 || __GNUC_PATCHLEVEL__ >= 2))) && !defined(__GXX_RTTI)
# define _LIBCPP_NO_RTTI
# elif defined(_LIBCPP_MSVC) && !defined(_CPPRTTI)
# elif defined(_LIBCPP_COMPILER_MSVC) && !defined(_CPPRTTI)
# define _LIBCPP_NO_RTTI
# endif
#endif
@ -980,6 +986,7 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container(
// Thread API
#if !defined(_LIBCPP_HAS_NO_THREADS) && \
!defined(_LIBCPP_HAS_THREAD_API_PTHREAD) && \
!defined(_LIBCPP_HAS_THREAD_API_WIN32) && \
!defined(_LIBCPP_HAS_THREAD_API_EXTERNAL)
# if defined(__FreeBSD__) || \
defined(__Fuchsia__) || \
@ -987,7 +994,8 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container(
defined(__linux__) || \
defined(__APPLE__) || \
defined(__CloudABI__) || \
defined(__sun__)
defined(__sun__) || \
(defined(__MINGW32__) && __libcpp_has_include(<pthread.h>))
# define _LIBCPP_HAS_THREAD_API_PTHREAD
# elif defined(_LIBCPP_WIN32API)
# define _LIBCPP_HAS_THREAD_API_WIN32

View File

@ -49,6 +49,25 @@
_LIBCPP_BEGIN_NAMESPACE_STD
#if !defined(_LIBCPP_LOCALE__L_EXTENSIONS) || defined(_LIBCPP_MSVCRT)
struct __libcpp_locale_guard {
_LIBCPP_INLINE_VISIBILITY
__libcpp_locale_guard(locale_t& __loc) : __old_loc_(uselocale(__loc)) {}
_LIBCPP_INLINE_VISIBILITY
~__libcpp_locale_guard() {
if (__old_loc_)
uselocale(__old_loc_);
}
locale_t __old_loc_;
private:
__libcpp_locale_guard(__libcpp_locale_guard const&);
__libcpp_locale_guard& operator=(__libcpp_locale_guard const&);
};
#endif
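
A minimal usage sketch of the RAII idea above, assuming a POSIX environment that provides newlocale/uselocale/freelocale; the guard name is made up, but it has the same shape as __libcpp_locale_guard and the same role as the __locale_raii uses replaced earlier in the diff.

#include <cstdio>
#include <locale.h> // newlocale, uselocale, freelocale (POSIX)

struct LocaleGuardSketch {
  explicit LocaleGuardSketch(locale_t Loc) : Old(uselocale(Loc)) {}
  ~LocaleGuardSketch() {
    if (Old)
      uselocale(Old);
  }
  locale_t Old;
};

int main() {
  locale_t C = newlocale(LC_ALL_MASK, "C", (locale_t)0);
  if (!C)
    return 1;
  {
    LocaleGuardSketch Guard(C); // locale-sensitive calls in this scope use "C"
    std::printf("%.1f\n", 3.5); // decimal separator is guaranteed to be '.'
  } // previous thread locale restored here
  freelocale(C);
  return 0;
}
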
class _LIBCPP_TYPE_VIS locale;
template <class _Facet>

View File

@ -15,6 +15,7 @@
#include <chrono>
#include <system_error>
#include <__threading_support>
#include <__undef_min_max>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
#pragma GCC system_header

View File

@ -30,6 +30,7 @@
#include <Windows.h>
#include <process.h>
#include <fibersapi.h>
#include <__undef_min_max>
#endif
#if defined(_LIBCPP_HAS_THREAD_LIBRARY_EXTERNAL) || \

View File

@ -10,7 +10,7 @@
#ifdef min
#if !defined(_LIBCPP_DISABLE_MACRO_CONFLICT_WARNINGS)
#if defined(_LIBCPP_MSVC)
#if defined(_LIBCPP_WARNING)
_LIBCPP_WARNING("macro min is incompatible with C++. Try #define NOMINMAX "
"before any Windows header. #undefing min")
#else
@ -22,7 +22,7 @@ _LIBCPP_WARNING("macro min is incompatible with C++. Try #define NOMINMAX "
#ifdef max
#if !defined(_LIBCPP_DISABLE_MACRO_CONFLICT_WARNINGS)
#if defined(_LIBCPP_MSVC)
#if defined(_LIBCPP_WARNING)
_LIBCPP_WARNING("macro max is incompatible with C++. Try #define NOMINMAX "
"before any Windows header. #undefing max")
#else

View File

@ -644,8 +644,8 @@ template <class BidirectionalIterator, class Compare>
#if defined(__IBMCPP__)
#include "support/ibm/support.h"
#endif
#if defined(_LIBCPP_MSVCRT) || defined(__MINGW32__)
#include "support/win32/support.h"
#if defined(_LIBCPP_COMPILER_MSVC)
#include <intrin.h>
#endif
#include <__undef_min_max>
@ -783,51 +783,132 @@ struct __debug_less
// Precondition: __x != 0
inline _LIBCPP_INLINE_VISIBILITY
unsigned
__ctz(unsigned __x)
{
unsigned __ctz(unsigned __x) {
#ifndef _LIBCPP_COMPILER_MSVC
return static_cast<unsigned>(__builtin_ctz(__x));
#else
static_assert(sizeof(unsigned) == sizeof(unsigned long), "");
static_assert(sizeof(unsigned long) == 4, "");
unsigned long where;
// Search from LSB to MSB for first set bit.
// Returns zero if no set bit is found.
if (_BitScanForward(&where, __x))
return where;
return 32;
#endif
}
inline _LIBCPP_INLINE_VISIBILITY
unsigned long
__ctz(unsigned long __x)
{
unsigned long __ctz(unsigned long __x) {
#ifndef _LIBCPP_COMPILER_MSVC
return static_cast<unsigned long>(__builtin_ctzl(__x));
#else
static_assert(sizeof(unsigned long) == sizeof(unsigned), "");
return __ctz(static_cast<unsigned>(__x));
#endif
}
inline _LIBCPP_INLINE_VISIBILITY
unsigned long long
__ctz(unsigned long long __x)
{
unsigned long long __ctz(unsigned long long __x) {
#ifndef _LIBCPP_COMPILER_MSVC
return static_cast<unsigned long long>(__builtin_ctzll(__x));
#else
unsigned long where;
// Search from LSB to MSB for first set bit.
// Returns zero if no set bit is found.
#if defined(_LIBCPP_HAS_BITSCAN64) && \
(defined(_M_AMD64) || defined(__x86_64__))
if (_BitScanForward64(&where, __x))
return static_cast<int>(where);
#else
// Win32 doesn't have _BitScanForward64 so emulate it with two 32 bit calls.
// Scan the Low Word.
if (_BitScanForward(&where, static_cast<unsigned long>(__x)))
return where;
// Scan the High Word.
if (_BitScanForward(&where, static_cast<unsigned long>(__x >> 32)))
return where + 32; // Create a bit offset from the LSB.
#endif
return 64;
#endif // _LIBCPP_COMPILER_MSVC
}
// Precondition: __x != 0
inline _LIBCPP_INLINE_VISIBILITY
unsigned
__clz(unsigned __x)
{
unsigned __clz(unsigned __x) {
#ifndef _LIBCPP_COMPILER_MSVC
return static_cast<unsigned>(__builtin_clz(__x));
#else
static_assert(sizeof(unsigned) == sizeof(unsigned long), "");
static_assert(sizeof(unsigned long) == 4, "");
unsigned long where;
// Search from MSB to LSB for the first set bit.
// Returns zero if no set bit is found.
if (_BitScanReverse(&where, __x))
return 31 - where;
return 32; // Undefined Behavior.
#endif
}
inline _LIBCPP_INLINE_VISIBILITY
unsigned long
__clz(unsigned long __x)
{
unsigned long __clz(unsigned long __x) {
#ifndef _LIBCPP_COMPILER_MSVC
return static_cast<unsigned long>(__builtin_clzl (__x));
#else
static_assert(sizeof(unsigned) == sizeof(unsigned long), "");
return __clz(static_cast<unsigned>(__x));
#endif
}
inline _LIBCPP_INLINE_VISIBILITY
unsigned long long
__clz(unsigned long long __x)
{
unsigned long long __clz(unsigned long long __x) {
#ifndef _LIBCPP_COMPILER_MSVC
return static_cast<unsigned long long>(__builtin_clzll(__x));
#else
unsigned long where;
// BitScanReverse scans from MSB to LSB for first set bit.
// Returns 0 if no set bit is found.
#if defined(_LIBCPP_HAS_BITSCAN64)
if (_BitScanReverse64(&where, __x))
return static_cast<int>(63 - where);
#else
// Scan the high 32 bits.
if (_BitScanReverse(&where, static_cast<unsigned long>(__x >> 32)))
return 63 - (where + 32); // Create a bit offset from the MSB.
// Scan the low 32 bits.
if (_BitScanReverse(&where, static_cast<unsigned long>(__x)))
return 63 - where;
#endif
return 64; // Undefined Behavior.
#endif // _LIBCPP_COMPILER_MSVC
}
inline _LIBCPP_INLINE_VISIBILITY int __pop_count(unsigned __x) {return __builtin_popcount (__x);}
inline _LIBCPP_INLINE_VISIBILITY int __pop_count(unsigned long __x) {return __builtin_popcountl (__x);}
inline _LIBCPP_INLINE_VISIBILITY int __pop_count(unsigned long long __x) {return __builtin_popcountll(__x);}
inline _LIBCPP_INLINE_VISIBILITY int __pop_count(unsigned __x) {
#ifndef _LIBCPP_COMPILER_MSVC
return __builtin_popcount (__x);
#else
static_assert(sizeof(unsigned) == 4, "");
return __popcnt(__x);
#endif
}
inline _LIBCPP_INLINE_VISIBILITY int __pop_count(unsigned long __x) {
#ifndef _LIBCPP_COMPILER_MSVC
return __builtin_popcountl (__x);
#else
static_assert(sizeof(unsigned long) == 4, "");
return __popcnt(__x);
#endif
}
inline _LIBCPP_INLINE_VISIBILITY int __pop_count(unsigned long long __x) {
#ifndef _LIBCPP_COMPILER_MSVC
return __builtin_popcountll(__x);
#else
static_assert(sizeof(unsigned long long) == 8, "");
return __popcnt64(__x);
#endif
}
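
For reference, plain-loop versions of what the helpers above compute (precondition x != 0, assuming 32-bit unsigned); they should agree with the builtins on the non-MSVC path and with the BitScan/popcnt intrinsics on the MSVC path.

#include <cstdio>

// Index of the least-significant set bit (what __ctz returns).
static unsigned ctz_ref(unsigned x) {
  unsigned n = 0;
  while ((x & 1u) == 0) {
    x >>= 1;
    ++n;
  }
  return n;
}

// Number of leading zero bits (what __clz returns).
static unsigned clz_ref(unsigned x) {
  unsigned n = 0;
  for (unsigned bit = 1u << 31; (x & bit) == 0; bit >>= 1)
    ++n;
  return n;
}

// Number of set bits (what __pop_count returns).
static unsigned popcount_ref(unsigned x) {
  unsigned n = 0;
  for (; x; x &= x - 1)
    ++n;
  return n;
}

int main() {
  unsigned v = 0x00f0u;
  std::printf("ctz=%u clz=%u popcount=%u\n", ctz_ref(v), clz_ref(v),
              popcount_ref(v)); // expect ctz=4 clz=24 popcount=4
  return 0;
}
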
// all_of

View File

@ -40,15 +40,6 @@ int toupper(int c);
#ifdef __cplusplus
#if defined(_LIBCPP_MSVCRT)
// We support including .h headers inside 'extern "C"' contexts, so switch
// back to C++ linkage before including these C++ headers.
extern "C++" {
#include "support/win32/support.h"
#include "support/win32/locale_win32.h"
}
#endif // _LIBCPP_MSVCRT
#undef isalnum
#undef isalpha
#undef isblank

View File

@ -66,11 +66,11 @@ struct __abs<_Result, _Source, false> {
template<class _Tp>
_LIBCPP_CONSTEXPR _LIBCPP_INLINE_VISIBILITY
_Tp __gcd(_Tp __m, _Tp __n)
_LIBCPP_CONSTEXPR _LIBCPP_HIDDEN
inline _Tp __gcd(_Tp __m, _Tp __n)
{
static_assert((!is_signed<_Tp>::value), "" );
return __n == 0 ? __m : __gcd<_Tp>(__n, __m % __n);
return __n == 0 ? __m : _VSTD_LFTS_V2::__gcd<_Tp>(__n, __m % __n);
}
@ -84,8 +84,9 @@ gcd(_Tp __m, _Up __n)
static_assert((!is_same<typename remove_cv<_Up>::type, bool>::value), "Second argument to gcd cannot be bool" );
using _Rp = common_type_t<_Tp,_Up>;
using _Wp = make_unsigned_t<_Rp>;
return static_cast<_Rp>(__gcd(static_cast<_Wp>(__abs<_Rp, _Tp>()(__m)),
static_cast<_Wp>(__abs<_Rp, _Up>()(__n))));
return static_cast<_Rp>(_VSTD_LFTS_V2::__gcd(
static_cast<_Wp>(__abs<_Rp, _Tp>()(__m)),
static_cast<_Wp>(__abs<_Rp, _Up>()(__n))));
}
template<class _Tp, class _Up>
@ -100,7 +101,7 @@ lcm(_Tp __m, _Up __n)
return 0;
using _Rp = common_type_t<_Tp,_Up>;
_Rp __val1 = __abs<_Rp, _Tp>()(__m) / gcd(__m, __n);
_Rp __val1 = __abs<_Rp, _Tp>()(__m) / _VSTD_LFTS_V2::gcd(__m, __n);
_Rp __val2 = __abs<_Rp, _Up>()(__n);
_LIBCPP_ASSERT((numeric_limits<_Rp>::max() / __val1 > __val2), "Overflow in lcm");
return __val1 * __val2;

View File

@ -207,7 +207,7 @@ template <class Key, class T, class Hash, class Pred, class Alloc>
#include <ext/__hash>
#if __DEPRECATED
#if defined(_LIBCPP_MSVC)
#if defined(_LIBCPP_WARNING)
_LIBCPP_WARNING("Use of the header <ext/hash_map> is deprecated. Migrate to <unordered_map>")
#else
# warning Use of the header <ext/hash_map> is deprecated. Migrate to <unordered_map>

View File

@ -199,7 +199,7 @@ template <class Value, class Hash, class Pred, class Alloc>
#include <ext/__hash>
#if __DEPRECATED
#if defined(_LIBCPP_MSVC)
#if defined(_LIBCPP_WARNING)
_LIBCPP_WARNING("Use of the header <ext/hash_set> is deprecated. Migrate to <unordered_set>")
#else
# warning Use of the header <ext/hash_set> is deprecated. Migrate to <unordered_set>

View File

@ -111,8 +111,8 @@ template<> class numeric_limits<cv long double>;
#include <__undef_min_max>
#if defined(_LIBCPP_MSVCRT)
#include "support/win32/limits_win32.h"
#if defined(_LIBCPP_COMPILER_MSVC)
#include "support/win32/limits_msvc_win32.h"
#endif // _LIBCPP_MSVCRT
#if defined(__IBMCPP__)

View File

@ -233,9 +233,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD
#define __cloc_defined
#endif
typedef _VSTD::remove_pointer<locale_t>::type __locale_struct;
typedef _VSTD::unique_ptr<__locale_struct, decltype(&freelocale)> __locale_unique_ptr;
// __scan_keyword
// Scans [__b, __e) until a match is found in the basic_strings range
// [__kb, __ke) or until it can be shown that there is no match in [__kb, __ke).

View File

@ -996,11 +996,11 @@ struct __rebind_pointer {
// allocator_traits
namespace __has_pointer_type_imp
struct __has_pointer_type_imp
{
template <class _Up> static __two __test(...);
template <class _Up> static char __test(typename _Up::pointer* = 0);
}
};
template <class _Tp>
struct __has_pointer_type
@ -3924,7 +3924,10 @@ private:
template <class _Yp, class _OrigPtr>
_LIBCPP_INLINE_VISIBILITY
void
typename enable_if<is_convertible<_OrigPtr*,
const enable_shared_from_this<_Yp>*
>::value,
void>::type
__enable_weak_this(const enable_shared_from_this<_Yp>* __e,
_OrigPtr* __ptr) _NOEXCEPT
{
@ -3943,6 +3946,7 @@ private:
template <class _Up> friend class _LIBCPP_TEMPLATE_VIS weak_ptr;
};
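
As I read the hunk above, the added enable_if only wires up the enable_shared_from_this bookkeeping when the pointee accessibly and unambiguously derives from it. The user-visible behavior it supports, in a short standard-library-only example:

#include <cstdio>
#include <memory>

struct Widget : std::enable_shared_from_this<Widget> {
  // Safe only while *this is already owned by a shared_ptr.
  std::shared_ptr<Widget> self() { return shared_from_this(); }
};

int main() {
  auto W = std::make_shared<Widget>();
  auto Again = W->self(); // shares ownership with W
  std::printf("use_count = %ld\n", W.use_count());
  return 0;
}
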
template<class _Tp>
inline
_LIBCPP_CONSTEXPR

View File

@ -222,11 +222,11 @@ struct __abs<_Result, _Source, false> {
template<class _Tp>
_LIBCPP_CONSTEXPR _LIBCPP_INLINE_VISIBILITY
_LIBCPP_CONSTEXPR _LIBCPP_HIDDEN
_Tp __gcd(_Tp __m, _Tp __n)
{
static_assert((!is_signed<_Tp>::value), "");
return __n == 0 ? __m : __gcd<_Tp>(__n, __m % __n);
return __n == 0 ? __m : _VSTD::__gcd<_Tp>(__n, __m % __n);
}
@ -240,8 +240,9 @@ gcd(_Tp __m, _Up __n)
static_assert((!is_same<typename remove_cv<_Up>::type, bool>::value), "Second argument to gcd cannot be bool" );
using _Rp = common_type_t<_Tp,_Up>;
using _Wp = make_unsigned_t<_Rp>;
return static_cast<_Rp>(__gcd(static_cast<_Wp>(__abs<_Rp, _Tp>()(__m)),
static_cast<_Wp>(__abs<_Rp, _Up>()(__n))));
return static_cast<_Rp>(_VSTD::__gcd(
static_cast<_Wp>(__abs<_Rp, _Tp>()(__m)),
static_cast<_Wp>(__abs<_Rp, _Up>()(__n))));
}
template<class _Tp, class _Up>
@ -256,7 +257,7 @@ lcm(_Tp __m, _Up __n)
return 0;
using _Rp = common_type_t<_Tp,_Up>;
_Rp __val1 = __abs<_Rp, _Tp>()(__m) / gcd(__m, __n);
_Rp __val1 = __abs<_Rp, _Tp>()(__m) / _VSTD::gcd(__m, __n);
_Rp __val2 = __abs<_Rp, _Up>()(__n);
_LIBCPP_ASSERT((numeric_limits<_Rp>::max() / __val1 > __val2), "Overflow in lcm");
return __val1 * __val2;
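
The change above only qualifies the internal recursive __gcd/gcd calls (and adjusts visibility attributes); the public C++17 interface is unchanged. A quick usage check:

#include <cstdio>
#include <numeric>

int main() {
  std::printf("gcd(12, 18) = %d\n", std::gcd(12, 18)); // 6
  std::printf("lcm(4, 6)   = %d\n", std::lcm(4, 6));   // 12
  return 0;
}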

Some files were not shown because too many files have changed in this diff.