Vendor import of compiler-rt trunk r338150:

https://llvm.org/svn/llvm-project/compiler-rt/trunk@338150
Dimitry Andric 2018-07-28 11:06:48 +00:00
parent 0d8e7490d6
commit 93c1b73a09
870 changed files with 50343 additions and 6771 deletions


@ -9,6 +9,7 @@ cmake_minimum_required(VERSION 3.4.3)
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR COMPILER_RT_STANDALONE_BUILD)
project(CompilerRT C CXX ASM)
set(COMPILER_RT_STANDALONE_BUILD TRUE)
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
endif()
# Add path for custom compiler-rt modules.
@ -63,6 +64,11 @@ set(COMPILER_RT_BAREMETAL_BUILD OFF CACHE BOOLEAN
if (COMPILER_RT_STANDALONE_BUILD)
load_llvm_config()
if (TARGET intrinsics_gen)
# Loading the llvm config causes this target to be imported so place it
# under the appropriate folder in an IDE.
set_target_properties(intrinsics_gen PROPERTIES FOLDER "Compiler-RT Misc")
endif()
# Find Python interpreter.
set(Python_ADDITIONAL_VERSIONS 2.7 2.6 2.5)
@ -96,6 +102,8 @@ pythonize_bool(ANDROID)
set(COMPILER_RT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(COMPILER_RT_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
pythonize_bool(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR)
# We support running instrumented tests when we're not cross compiling
# and target a UNIX-like system or Windows.
# We can run tests on Android even when we are cross-compiling.
@ -112,9 +120,6 @@ option(COMPILER_RT_EXTERNALIZE_DEBUGINFO
# COMPILER_RT_DEBUG_PYBOOL is used by lit.common.configured.in.
pythonize_bool(COMPILER_RT_DEBUG)
include(HandleCompilerRT)
include(config-ix)
if(APPLE AND SANITIZER_MIN_OSX_VERSION AND SANITIZER_MIN_OSX_VERSION VERSION_LESS "10.9")
# Mac OS X prior to 10.9 had problems with exporting symbols from
# libc++/libc++abi.
@ -133,41 +138,34 @@ pythonize_bool(SANITIZER_CAN_USE_CXXABI)
set(SANITIZER_CXX_ABI "default" CACHE STRING
"Specify C++ ABI library to use.")
set(CXXABIS none default libcxxabi libstdc++ libc++)
set(CXXABIS none default libstdc++ libc++)
set_property(CACHE SANITIZER_CXX_ABI PROPERTY STRINGS ;${CXXABIS})
if (SANITIZER_CXX_ABI STREQUAL "default")
if (HAVE_LIBCXXABI AND COMPILER_RT_DEFAULT_TARGET_ONLY)
set(SANITIZER_CXX_ABI_LIBNAME "libcxxabi")
set(SANITIZER_CXX_ABI_INTREE 1)
elseif (APPLE)
set(SANITIZER_CXX_ABI_LIBNAME "libcxxabi")
if (APPLE)
set(SANITIZER_CXX_ABI_LIBNAME "libc++")
set(SANITIZER_CXX_ABI_SYSTEM 1)
elseif (FUCHSIA)
set(SANITIZER_CXX_ABI_LIBNAME "libc++")
set(SANITIZER_CXX_ABI_INTREE 1)
else()
set(SANITIZER_CXX_ABI_LIBNAME "libstdc++")
set(SANITIZER_CXX_ABI_SYSTEM 1)
endif()
else()
set(SANITIZER_CXX_ABI_LIBNAME "${SANITIZER_CXX_ABI}")
set(SANITIZER_CXX_ABI_SYSTEM 1)
endif()
if (SANITIZER_CXX_ABI_LIBNAME STREQUAL "libcxxabi")
if (SANITIZER_CXX_ABI_INTREE)
if (TARGET unwind_shared OR HAVE_LIBUNWIND)
list(APPEND SANITIZER_CXX_ABI_LIBRARY unwind_shared)
endif()
if (TARGET cxxabi_shared OR HAVE_LIBCXXABI)
list(APPEND SANITIZER_CXX_ABI_LIBRARY cxxabi_shared)
endif()
else()
list(APPEND SANITIZER_CXX_ABI_LIBRARY "c++abi")
endif()
elseif (SANITIZER_CXX_ABI_LIBNAME STREQUAL "libc++")
list(APPEND SANITIZER_CXX_ABI_LIBRARY "c++")
elseif (SANITIZER_CXX_ABI_LIBNAME STREQUAL "libstdc++")
append_list_if(COMPILER_RT_HAS_LIBSTDCXX stdc++ SANITIZER_CXX_ABI_LIBRARY)
set(DEFAULT_COMPILER_RT_USE_BUILTINS_LIBRARY OFF)
if (FUCHSIA)
set(DEFAULT_COMPILER_RT_USE_BUILTINS_LIBRARY ON)
endif()
option(SANITIZER_USE_COMPILER_RT "Use compiler-rt builtins instead of libgcc" OFF)
option(COMPILER_RT_USE_BUILTINS_LIBRARY
"Use compiler-rt builtins instead of libgcc" ${DEFAULT_COMPILER_RT_USE_BUILTINS_LIBRARY})
include(config-ix)
#================================
# Setup Compiler Flags
@ -274,12 +272,14 @@ else()
set(SANITIZER_LIMIT_FRAME_SIZE FALSE)
endif()
if(FUCHSIA OR UNIX)
set(SANITIZER_USE_SYMBOLS TRUE)
else()
set(SANITIZER_USE_SYMBOLS FALSE)
endif()
# Build sanitizer runtimes with debug info.
if(COMPILER_RT_HAS_GLINE_TABLES_ONLY_FLAG AND NOT COMPILER_RT_DEBUG)
list(APPEND SANITIZER_COMMON_CFLAGS -gline-tables-only)
elseif(COMPILER_RT_HAS_G_FLAG)
list(APPEND SANITIZER_COMMON_CFLAGS -g)
elseif(MSVC)
if(MSVC)
# Use /Z7 instead of /Zi for the asan runtime. This avoids the LNK4099
# warning from the MS linker complaining that it can't find the 'vc140.pdb'
# file used by our object library compilations.
@ -287,6 +287,10 @@ elseif(MSVC)
llvm_replace_compiler_option(CMAKE_CXX_FLAGS "/Z[i7I]" "/Z7")
llvm_replace_compiler_option(CMAKE_CXX_FLAGS_DEBUG "/Z[i7I]" "/Z7")
llvm_replace_compiler_option(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/Z[i7I]" "/Z7")
elseif(COMPILER_RT_HAS_GLINE_TABLES_ONLY_FLAG AND NOT COMPILER_RT_DEBUG)
list(APPEND SANITIZER_COMMON_CFLAGS -gline-tables-only)
elseif(COMPILER_RT_HAS_G_FLAG)
list(APPEND SANITIZER_COMMON_CFLAGS -g)
endif()
if(LLVM_ENABLE_MODULES)
@ -309,9 +313,7 @@ append_list_if(COMPILER_RT_HAS_WD4800_FLAG /wd4800 SANITIZER_COMMON_CFLAGS)
# Set common link flags.
append_list_if(COMPILER_RT_HAS_NODEFAULTLIBS_FLAG -nodefaultlibs SANITIZER_COMMON_LINK_FLAGS)
if (SANITIZER_USE_COMPILER_RT)
list(APPEND SANITIZER_COMMON_LINK_FLAGS -rtlib=compiler-rt)
find_compiler_rt_library(builtins COMPILER_RT_BUILTINS_LIBRARY)
if (COMPILER_RT_USE_BUILTINS_LIBRARY)
list(APPEND SANITIZER_COMMON_LINK_LIBS ${COMPILER_RT_BUILTINS_LIBRARY})
else()
if (ANDROID)
@ -323,11 +325,40 @@ endif()
append_list_if(COMPILER_RT_HAS_LIBC c SANITIZER_COMMON_LINK_LIBS)
if(ANDROID)
# Put the Sanitizer shared libraries in the global group. For more details, see
# android-changes-for-ndk-developers.md#changes-to-library-search-order
if (COMPILER_RT_HAS_Z_GLOBAL)
list(APPEND SANITIZER_COMMON_LINK_FLAGS -Wl,-z,global)
endif()
endif()
if("${CMAKE_SYSTEM_NAME}" STREQUAL "Fuchsia")
list(APPEND SANITIZER_COMMON_LINK_FLAGS -Wl,-z,defs,-z,now,-z,relro)
list(APPEND SANITIZER_COMMON_LINK_LIBS zircon)
endif()
if (SANITIZER_CXX_ABI_LIBNAME STREQUAL "libc++")
if (SANITIZER_CXX_ABI_INTREE)
if (NOT LIBCXXABI_ENABLE_STATIC_UNWINDER AND (TARGET unwind_shared OR HAVE_LIBUNWIND))
list(APPEND SANITIZER_CXX_ABI_LIBRARY unwind_shared)
elseif (LIBCXXABI_ENABLE_STATIC_UNWINDER AND (TARGET unwind_static OR HAVE_LIBUNWIND))
list(APPEND SANITIZER_CXX_ABI_LIBRARY unwind_static)
endif()
if (NOT LIBCXX_ENABLE_STATIC_ABI_LIBRARY AND (TARGET cxxabi_shared OR HAVE_LIBCXXABI))
list(APPEND SANITIZER_CXX_ABI_LIBRARY cxxabi_shared)
elseif (LIBCXX_ENABLE_STATIC_ABI_LIBRARY AND (TARGET cxxabi_static OR HAVE_LIBCXXABI))
list(APPEND SANITIZER_CXX_ABI_LIBRARY cxxabi_static)
endif()
else()
append_list_if(COMPILER_RT_HAS_LIBCXX c++ SANITIZER_CXX_ABI_LIBRARY)
endif()
elseif (SANITIZER_CXX_ABI_LIBNAME STREQUAL "libcxxabi")
list(APPEND SANITIZER_CXX_ABI_LIBRARY "c++abi")
elseif (SANITIZER_CXX_ABI_LIBNAME STREQUAL "libstdc++")
append_list_if(COMPILER_RT_HAS_LIBSTDCXX stdc++ SANITIZER_CXX_ABI_LIBRARY)
endif()
# Warnings to turn off for all libraries, not just sanitizers.
append_string_if(COMPILER_RT_HAS_WUNUSED_PARAMETER_FLAG -Wno-unused-parameter CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
@ -339,7 +370,7 @@ if (CMAKE_LINKER MATCHES "link.exe$")
# it, but CMake doesn't seem to have a way to set linker flags for
# individual static libraries, so we enable the suppression flag for
# the whole compiler-rt project.
append("/IGNORE:4221" CMAKE_STATIC_LINKER_FLAGS)
set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} /IGNORE:4221")
endif()
add_subdirectory(include)


@ -14,7 +14,7 @@ Full text of the relevant licenses is included below.
University of Illinois/NCSA
Open Source License
Copyright (c) 2009-2016 by the contributors listed in CREDITS.TXT
Copyright (c) 2009-2018 by the contributors listed in CREDITS.TXT
All rights reserved.


@ -31,9 +31,12 @@ endfunction()
# ARCHS <architectures>
# SOURCES <source files>
# CFLAGS <compile flags>
# DEFS <compile definitions>)
# DEFS <compile definitions>
# DEPS <dependencies>
# ADDITIONAL_HEADERS <header files>)
function(add_compiler_rt_object_libraries name)
cmake_parse_arguments(LIB "" "" "OS;ARCHS;SOURCES;CFLAGS;DEFS" ${ARGN})
cmake_parse_arguments(LIB "" "" "OS;ARCHS;SOURCES;CFLAGS;DEFS;DEPS;ADDITIONAL_HEADERS"
${ARGN})
set(libnames)
if(APPLE)
foreach(os ${LIB_OS})
@ -54,8 +57,18 @@ function(add_compiler_rt_object_libraries name)
endforeach()
endif()
# Add headers to LIB_SOURCES for IDEs
compiler_rt_process_sources(LIB_SOURCES
${LIB_SOURCES}
ADDITIONAL_HEADERS
${LIB_ADDITIONAL_HEADERS}
)
foreach(libname ${libnames})
add_library(${libname} OBJECT ${LIB_SOURCES})
if(LIB_DEPS)
add_dependencies(${libname} ${LIB_DEPS})
endif()
# Strip out -msse3 if this isn't macOS.
set(target_flags ${LIB_CFLAGS})
@ -105,10 +118,14 @@ function(add_asm_sources output)
endfunction()
macro(set_output_name output name arch)
if(ANDROID AND ${arch} STREQUAL "i386")
set(${output} "${name}-i686${COMPILER_RT_OS_SUFFIX}")
if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR)
set(${output} ${name})
else()
set(${output} "${name}-${arch}${COMPILER_RT_OS_SUFFIX}")
if(ANDROID AND ${arch} STREQUAL "i386")
set(${output} "${name}-i686${COMPILER_RT_OS_SUFFIX}")
else()
set(${output} "${name}-${arch}${COMPILER_RT_OS_SUFFIX}")
endif()
endif()
endmacro()
@ -124,7 +141,8 @@ endmacro()
# DEFS <compile definitions>
# LINK_LIBS <linked libraries> (only for shared library)
# OBJECT_LIBS <object libraries to use as sources>
# PARENT_TARGET <convenience parent target>)
# PARENT_TARGET <convenience parent target>
# ADDITIONAL_HEADERS <header files>)
function(add_compiler_rt_runtime name type)
if(NOT type MATCHES "^(STATIC|SHARED)$")
message(FATAL_ERROR "type argument must be STATIC or SHARED")
@ -133,7 +151,7 @@ function(add_compiler_rt_runtime name type)
cmake_parse_arguments(LIB
""
"PARENT_TARGET"
"OS;ARCHS;SOURCES;CFLAGS;LINK_FLAGS;DEFS;LINK_LIBS;OBJECT_LIBS"
"OS;ARCHS;SOURCES;CFLAGS;LINK_FLAGS;DEFS;LINK_LIBS;OBJECT_LIBS;ADDITIONAL_HEADERS"
${ARGN})
set(libnames)
# Until we support this some other way, build compiler-rt runtime without LTO
@ -144,6 +162,18 @@ function(add_compiler_rt_runtime name type)
set(NO_LTO_FLAGS "")
endif()
list(LENGTH LIB_SOURCES LIB_SOURCES_LENGTH)
if (${LIB_SOURCES_LENGTH} GREATER 0)
# Add headers to LIB_SOURCES for IDEs. It doesn't make sense to
# do this for a runtime library that only consists of OBJECT
# libraries, so only add the headers when source files are present.
compiler_rt_process_sources(LIB_SOURCES
${LIB_SOURCES}
ADDITIONAL_HEADERS
${LIB_ADDITIONAL_HEADERS}
)
endif()
if(APPLE)
foreach(os ${LIB_OS})
# Strip out -msse3 if this isn't macOS.
@ -164,6 +194,8 @@ function(add_compiler_rt_runtime name type)
set(output_name_${libname} ${libname}${COMPILER_RT_OS_SUFFIX})
set(sources_${libname} ${LIB_SOURCES})
format_object_libs(sources_${libname} ${os} ${LIB_OBJECT_LIBS})
get_compiler_rt_output_dir(${COMPILER_RT_DEFAULT_TARGET_ARCH} output_dir_${libname})
get_compiler_rt_install_dir(${COMPILER_RT_DEFAULT_TARGET_ARCH} install_dir_${libname})
endif()
endforeach()
else()
@ -189,6 +221,8 @@ function(add_compiler_rt_runtime name type)
format_object_libs(sources_${libname} ${arch} ${LIB_OBJECT_LIBS})
set(libnames ${libnames} ${libname})
set(extra_cflags_${libname} ${TARGET_${arch}_CFLAGS} ${NO_LTO_FLAGS} ${LIB_CFLAGS})
get_compiler_rt_output_dir(${arch} output_dir_${libname})
get_compiler_rt_install_dir(${arch} install_dir_${libname})
endforeach()
endif()
@ -200,6 +234,8 @@ function(add_compiler_rt_runtime name type)
# If the parent targets aren't created we should create them
if(NOT TARGET ${LIB_PARENT_TARGET})
add_custom_target(${LIB_PARENT_TARGET})
set_target_properties(${LIB_PARENT_TARGET} PROPERTIES
FOLDER "Compiler-RT Misc")
endif()
if(NOT TARGET install-${LIB_PARENT_TARGET})
# The parent install target specifies the parent component to scrape up
@ -239,7 +275,7 @@ function(add_compiler_rt_runtime name type)
set_target_link_flags(${libname} ${extra_link_flags_${libname}})
set_property(TARGET ${libname} APPEND PROPERTY
COMPILE_DEFINITIONS ${LIB_DEFS})
set_target_output_directories(${libname} ${COMPILER_RT_LIBRARY_OUTPUT_DIR})
set_target_output_directories(${libname} ${output_dir_${libname}})
set_target_properties(${libname} PROPERTIES
OUTPUT_NAME ${output_name_${libname}})
set_target_properties(${libname} PROPERTIES FOLDER "Compiler-RT Runtime")
@ -247,6 +283,9 @@ function(add_compiler_rt_runtime name type)
target_link_libraries(${libname} ${LIB_LINK_LIBS})
endif()
if(${type} STREQUAL "SHARED")
if(COMMAND llvm_setup_rpath)
llvm_setup_rpath(${libname})
endif()
if(WIN32 AND NOT CYGWIN AND NOT MINGW)
set_target_properties(${libname} PROPERTIES IMPORT_PREFIX "")
set_target_properties(${libname} PROPERTIES IMPORT_SUFFIX ".lib")
@ -261,11 +300,11 @@ function(add_compiler_rt_runtime name type)
endif()
endif()
install(TARGETS ${libname}
ARCHIVE DESTINATION ${COMPILER_RT_LIBRARY_INSTALL_DIR}
ARCHIVE DESTINATION ${install_dir_${libname}}
${COMPONENT_OPTION}
LIBRARY DESTINATION ${COMPILER_RT_LIBRARY_INSTALL_DIR}
LIBRARY DESTINATION ${install_dir_${libname}}
${COMPONENT_OPTION}
RUNTIME DESTINATION ${COMPILER_RT_LIBRARY_INSTALL_DIR}
RUNTIME DESTINATION ${install_dir_${libname}}
${COMPONENT_OPTION})
# We only want to generate per-library install targets if you aren't using
@ -431,7 +470,7 @@ endfunction()
macro(add_compiler_rt_resource_file target_name file_name component)
set(src_file "${CMAKE_CURRENT_SOURCE_DIR}/${file_name}")
set(dst_file "${COMPILER_RT_OUTPUT_DIR}/${file_name}")
set(dst_file "${COMPILER_RT_OUTPUT_DIR}/share/${file_name}")
add_custom_command(OUTPUT ${dst_file}
DEPENDS ${src_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src_file} ${dst_file}
@ -439,7 +478,7 @@ macro(add_compiler_rt_resource_file target_name file_name component)
add_custom_target(${target_name} DEPENDS ${dst_file})
# Install in Clang resource directory.
install(FILES ${file_name}
DESTINATION ${COMPILER_RT_INSTALL_PATH}
DESTINATION ${COMPILER_RT_INSTALL_PATH}/share
COMPONENT ${component})
add_dependencies(${component} ${target_name})
@ -463,53 +502,123 @@ endmacro(add_compiler_rt_script src name)
# Can be used to build sanitized versions of libc++ for running unit tests.
# add_custom_libcxx(<name> <prefix>
# DEPS <list of build deps>
# CFLAGS <list of compile flags>)
# CFLAGS <list of compile flags>
# USE_TOOLCHAIN)
macro(add_custom_libcxx name prefix)
if(NOT COMPILER_RT_LIBCXX_PATH)
message(FATAL_ERROR "libcxx not found!")
endif()
cmake_parse_arguments(LIBCXX "" "" "DEPS;CFLAGS;CMAKE_ARGS" ${ARGN})
foreach(flag ${LIBCXX_CFLAGS})
set(flagstr "${flagstr} ${flag}")
endforeach()
set(LIBCXX_CFLAGS ${flagstr})
cmake_parse_arguments(LIBCXX "USE_TOOLCHAIN" "" "DEPS;CFLAGS;CMAKE_ARGS" ${ARGN})
if(NOT COMPILER_RT_STANDALONE_BUILD)
list(APPEND LIBCXX_DEPS clang)
if(LIBCXX_USE_TOOLCHAIN)
set(compiler_args -DCMAKE_C_COMPILER=${COMPILER_RT_TEST_COMPILER}
-DCMAKE_CXX_COMPILER=${COMPILER_RT_TEST_CXX_COMPILER})
if(NOT COMPILER_RT_STANDALONE_BUILD)
set(toolchain_deps $<TARGET_FILE:clang>)
set(force_deps DEPENDS $<TARGET_FILE:clang>)
endif()
else()
set(compiler_args -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER})
endif()
set(STAMP_DIR ${prefix}-stamps/)
set(BINARY_DIR ${prefix}-bins/)
add_custom_target(${name}-clear
COMMAND ${CMAKE_COMMAND} -E remove_directory ${BINARY_DIR}
COMMAND ${CMAKE_COMMAND} -E remove_directory ${STAMP_DIR}
COMMENT "Clobbering ${name} build and stamp directories"
USES_TERMINAL
)
set_target_properties(${name}-clear PROPERTIES FOLDER "Compiler-RT Misc")
add_custom_command(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${name}-clobber-stamp
DEPENDS ${LIBCXX_DEPS} ${toolchain_deps}
COMMAND ${CMAKE_COMMAND} -E touch ${BINARY_DIR}/CMakeCache.txt
COMMAND ${CMAKE_COMMAND} -E touch ${STAMP_DIR}/${name}-mkdir
COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/${name}-clobber-stamp
COMMENT "Clobbering bootstrap build and stamp directories"
)
add_custom_target(${name}-clobber
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${name}-clobber-stamp)
set_target_properties(${name}-clobber PROPERTIES FOLDER "Compiler-RT Misc")
set(PASSTHROUGH_VARIABLES
CMAKE_C_COMPILER_TARGET
CMAKE_CXX_COMPILER_TARGET
CMAKE_INSTALL_PREFIX
CMAKE_MAKE_PROGRAM
CMAKE_LINKER
CMAKE_AR
CMAKE_RANLIB
CMAKE_NM
CMAKE_OBJCOPY
CMAKE_OBJDUMP
CMAKE_STRIP
CMAKE_SYSROOT
CMAKE_SYSTEM_NAME)
foreach(variable ${PASSTHROUGH_VARIABLES})
if(${variable})
list(APPEND CMAKE_PASSTHROUGH_VARIABLES -D${variable}=${${variable}})
endif()
endforeach()
string(REPLACE ";" " " FLAGS_STRING "${LIBCXX_CFLAGS}")
set(LIBCXX_C_FLAGS "${FLAGS_STRING}")
set(LIBCXX_CXX_FLAGS "${FLAGS_STRING}")
ExternalProject_Add(${name}
DEPENDS ${name}-clobber ${LIBCXX_DEPS}
PREFIX ${prefix}
SOURCE_DIR ${COMPILER_RT_LIBCXX_PATH}
CMAKE_ARGS -DCMAKE_MAKE_PROGRAM:STRING=${CMAKE_MAKE_PROGRAM}
-DCMAKE_C_COMPILER=${COMPILER_RT_TEST_COMPILER}
-DCMAKE_CXX_COMPILER=${COMPILER_RT_TEST_CXX_COMPILER}
-DCMAKE_C_FLAGS=${LIBCXX_CFLAGS}
-DCMAKE_CXX_FLAGS=${LIBCXX_CFLAGS}
STAMP_DIR ${STAMP_DIR}
BINARY_DIR ${BINARY_DIR}
CMAKE_ARGS ${CMAKE_PASSTHROUGH_VARIABLES}
${compiler_args}
-DCMAKE_C_FLAGS=${LIBCXX_C_FLAGS}
-DCMAKE_CXX_FLAGS=${LIBCXX_CXX_FLAGS}
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
-DLLVM_PATH=${LLVM_MAIN_SRC_DIR}
-DLIBCXX_STANDALONE_BUILD=On
-DLLVM_BINARY_DIR=${prefix}
-DLLVM_LIBRARY_OUTPUT_INTDIR=${prefix}/lib
-DLIBCXX_STANDALONE_BUILD=ON
${LIBCXX_CMAKE_ARGS}
LOG_BUILD 1
LOG_CONFIGURE 1
LOG_INSTALL 1
)
set_target_properties(${name} PROPERTIES EXCLUDE_FROM_ALL TRUE)
ExternalProject_Add_Step(${name} force-reconfigure
DEPENDERS configure
ALWAYS 1
INSTALL_COMMAND ""
STEP_TARGETS configure build
BUILD_ALWAYS 1
USES_TERMINAL_CONFIGURE 1
USES_TERMINAL_BUILD 1
USES_TERMINAL_INSTALL 1
EXCLUDE_FROM_ALL TRUE
)
ExternalProject_Add_Step(${name} clobber
COMMAND ${CMAKE_COMMAND} -E remove_directory <BINARY_DIR>
COMMAND ${CMAKE_COMMAND} -E make_directory <BINARY_DIR>
COMMENT "Clobberring ${name} build directory..."
DEPENDERS configure
DEPENDS ${LIBCXX_DEPS}
if (CMAKE_GENERATOR MATCHES "Make")
set(run_clean "$(MAKE)" "-C" "${BINARY_DIR}" "clean")
else()
set(run_clean ${CMAKE_COMMAND} --build ${BINARY_DIR} --target clean
--config "$<CONFIGURATION>")
endif()
ExternalProject_Add_Step(${name} clean
COMMAND ${run_clean}
COMMENT "Cleaning ${name}..."
DEPENDEES configure
${force_deps}
WORKING_DIRECTORY ${BINARY_DIR}
EXCLUDE_FROM_MAIN 1
USES_TERMINAL 1
)
ExternalProject_Add_StepTargets(${name} clean)
if(LIBCXX_USE_TOOLCHAIN)
add_dependencies(${name}-clean ${name}-clobber)
set_target_properties(${name}-clean PROPERTIES
SOURCES ${CMAKE_CURRENT_BINARY_DIR}/${name}-clobber-stamp)
endif()
endmacro()
function(rt_externalize_debuginfo name)
@ -542,8 +651,10 @@ endfunction()
function(configure_compiler_rt_lit_site_cfg input output)
set_llvm_build_mode()
get_compiler_rt_output_dir(${COMPILER_RT_DEFAULT_TARGET_ARCH} output_dir)
string(REPLACE ${CMAKE_CFG_INTDIR} ${LLVM_BUILD_MODE} COMPILER_RT_RESOLVED_TEST_COMPILER ${COMPILER_RT_TEST_COMPILER})
string(REPLACE ${CMAKE_CFG_INTDIR} ${LLVM_BUILD_MODE} COMPILER_RT_RESOLVED_LIBRARY_OUTPUT_DIR ${COMPILER_RT_LIBRARY_OUTPUT_DIR})
string(REPLACE ${CMAKE_CFG_INTDIR} ${LLVM_BUILD_MODE} COMPILER_RT_RESOLVED_LIBRARY_OUTPUT_DIR ${output_dir})
configure_lit_site_cfg(${input} ${output})
endfunction()


@ -43,7 +43,7 @@ endfunction()
# link for.
function(darwin_get_toolchain_supported_archs output_var)
execute_process(
COMMAND ld -v
COMMAND "${CMAKE_LINKER}" -v
ERROR_VARIABLE LINKER_VERSION)
string(REGEX MATCH "configured to support archs: ([^\n]+)"
@ -230,6 +230,7 @@ macro(darwin_add_builtin_library name suffix)
list(APPEND ${LIB_OS}_${suffix}_libs ${libname})
list(APPEND ${LIB_OS}_${suffix}_lipo_flags -arch ${arch} $<TARGET_FILE:${libname}>)
set_target_properties(${libname} PROPERTIES FOLDER "Compiler-RT Libraries")
endmacro()
function(darwin_lipo_libs name)
@ -251,6 +252,7 @@ function(darwin_lipo_libs name)
add_dependencies(${LIB_PARENT_TARGET} ${name})
install(FILES ${LIB_OUTPUT_DIR}/lib${name}.a
DESTINATION ${LIB_INSTALL_DIR})
set_target_properties(${name} PROPERTIES FOLDER "Compiler-RT Misc")
else()
message(WARNING "Not generating lipo target for ${name} because no input libraries exist.")
endif()


@ -168,6 +168,7 @@ macro(detect_target_arch)
check_symbol_exists(__mips64__ "" __MIPS64)
check_symbol_exists(__powerpc64__ "" __PPC64)
check_symbol_exists(__powerpc64le__ "" __PPC64LE)
check_symbol_exists(__riscv "" __RISCV)
check_symbol_exists(__s390x__ "" __S390X)
check_symbol_exists(__wasm32__ "" __WEBASSEMBLY32)
check_symbol_exists(__wasm64__ "" __WEBASSEMBLY64)
@ -187,6 +188,14 @@ macro(detect_target_arch)
add_default_target_arch(powerpc64)
elseif(__PPC64LE)
add_default_target_arch(powerpc64le)
elseif(__RISCV)
if(CMAKE_SIZEOF_VOID_P EQUAL "4")
add_default_target_arch(riscv32)
elseif(CMAKE_SIZEOF_VOID_P EQUAL "8")
add_default_target_arch(riscv64)
else()
message(FATAL_ERROR "Unsupport XLEN for RISC-V")
endif()
elseif(__S390X)
add_default_target_arch(s390x)
elseif(__WEBASSEMBLY32)
@ -305,3 +314,69 @@ function(filter_builtin_sources output_var exclude_or_include excluded_list)
endforeach ()
set(${output_var} ${intermediate} PARENT_SCOPE)
endfunction()
function(get_compiler_rt_target arch variable)
if(ANDROID AND ${arch} STREQUAL "i386")
set(target "i686${COMPILER_RT_OS_SUFFIX}-${COMPILER_RT_DEFAULT_TARGET_OS}")
else()
set(target "${arch}-${COMPILER_RT_DEFAULT_TARGET_OS}")
endif()
if(COMPILER_RT_DEFAULT_TARGET_ABI)
set(target "${target}-${COMPILER_RT_DEFAULT_TARGET_ABI}")
endif()
set(${variable} ${target} PARENT_SCOPE)
endfunction()
function(get_compiler_rt_install_dir arch install_dir)
if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND NOT APPLE)
get_compiler_rt_target(${arch} target)
set(${install_dir} ${COMPILER_RT_INSTALL_PATH}/${target}/lib PARENT_SCOPE)
else()
set(${install_dir} ${COMPILER_RT_LIBRARY_INSTALL_DIR} PARENT_SCOPE)
endif()
endfunction()
function(get_compiler_rt_output_dir arch output_dir)
if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND NOT APPLE)
get_compiler_rt_target(${arch} target)
set(${output_dir} ${COMPILER_RT_OUTPUT_DIR}/${target}/lib PARENT_SCOPE)
else()
set(${output_dir} ${COMPILER_RT_LIBRARY_OUTPUT_DIR} PARENT_SCOPE)
endif()
endfunction()
# compiler_rt_process_sources(
# <OUTPUT_VAR>
# <SOURCE_FILE> ...
# [ADDITIONAL_HEADERS <header> ...]
# )
#
# Process the provided sources and write the list of new sources
# into `<OUTPUT_VAR>`.
#
# ADDITIONAL_HEADERS - Adds the supplied header to list of sources for IDEs.
#
# This function is very similar to `llvm_process_sources()` but exists here
# because we need to support standalone builds of compiler-rt.
function(compiler_rt_process_sources OUTPUT_VAR)
cmake_parse_arguments(
ARG
""
""
"ADDITIONAL_HEADERS"
${ARGN}
)
set(sources ${ARG_UNPARSED_ARGUMENTS})
set(headers "")
if (XCODE OR MSVC_IDE OR CMAKE_EXTRA_GENERATOR)
# For IDEs we need to tell CMake about header files.
# Otherwise they won't show up in UI.
set(headers ${ARG_ADDITIONAL_HEADERS})
list(LENGTH headers headers_length)
if (${headers_length} GREATER 0)
set_source_files_properties(${headers}
PROPERTIES HEADER_FILE_ONLY ON)
endif()
endif()
set("${OUTPUT_VAR}" ${sources} ${headers} PARENT_SCOPE)
endfunction()


@ -1,5 +1,4 @@
function(find_compiler_rt_library name dest)
set(dest "" PARENT_SCOPE)
function(find_compiler_rt_library name variable)
set(CLANG_COMMAND ${CMAKE_CXX_COMPILER} ${SANITIZER_COMMON_CFLAGS}
"--rtlib=compiler-rt" "--print-libgcc-file-name")
if (CMAKE_CXX_COMPILER_ID MATCHES Clang AND CMAKE_CXX_COMPILER_TARGET)
@ -14,7 +13,7 @@ function(find_compiler_rt_library name dest)
string(REPLACE "builtins" "${name}" LIBRARY_FILE "${LIBRARY_FILE}")
if (NOT HAD_ERROR AND EXISTS "${LIBRARY_FILE}")
message(STATUS "Found compiler-rt ${name} library: ${LIBRARY_FILE}")
set(${dest} "${LIBRARY_FILE}" PARENT_SCOPE)
set(${variable} "${LIBRARY_FILE}" PARENT_SCOPE)
else()
message(STATUS "Failed to find compiler-rt ${name} library")
endif()


@ -1,3 +1,5 @@
include(CompilerRTUtils)
set(SANITIZER_GEN_DYNAMIC_LIST
${COMPILER_RT_SOURCE_DIR}/lib/sanitizer_common/scripts/gen_dynamic_list.py)
@ -37,9 +39,9 @@ macro(add_sanitizer_rt_symbols name)
add_custom_target(${target_name}-symbols ALL
DEPENDS ${stamp}
SOURCES ${SANITIZER_GEN_DYNAMIC_LIST} ${ARG_EXTRA})
get_compiler_rt_install_dir(${arch} install_dir)
install(FILES $<TARGET_FILE:${target_name}>.syms
DESTINATION ${COMPILER_RT_LIBRARY_INSTALL_DIR})
DESTINATION ${install_dir})
if(ARG_PARENT_TARGET)
add_dependencies(${ARG_PARENT_TARGET} ${target_name}-symbols)
endif()
@ -81,7 +83,7 @@ macro(add_sanitizer_rt_version_list name)
endmacro()
# Add target to check code style for sanitizer runtimes.
if(CMAKE_HOST_UNIX)
if(CMAKE_HOST_UNIX AND NOT OS_NAME MATCHES "OpenBSD")
add_custom_target(SanitizerLintCheck
COMMAND env LLVM_CHECKOUT=${LLVM_MAIN_SRC_DIR} SILENT=1 TMPDIR=
PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}
@ -90,5 +92,9 @@ if(CMAKE_HOST_UNIX)
DEPENDS ${SANITIZER_LINT_SCRIPT}
COMMENT "Running lint check for sanitizer sources..."
VERBATIM)
else()
add_custom_target(SanitizerLintCheck
COMMAND echo "No lint check")
endif()
set_target_properties(SanitizerLintCheck
PROPERTIES FOLDER "Compiler-RT Misc")


@ -12,7 +12,14 @@ check_include_file(unwind.h HAVE_UNWIND_H)
add_custom_target(compiler-rt ALL)
add_custom_target(install-compiler-rt)
add_custom_target(install-compiler-rt-stripped)
set_target_properties(compiler-rt PROPERTIES FOLDER "Compiler-RT Misc")
set_property(
TARGET
compiler-rt
install-compiler-rt
install-compiler-rt-stripped
PROPERTY
FOLDER "Compiler-RT Misc"
)
# Setting these variables from an LLVM build is sufficient that compiler-rt can
# construct the output paths, so it can behave as if it were in-tree here.
@ -69,10 +76,17 @@ endif()
if(NOT DEFINED COMPILER_RT_OS_DIR)
string(TOLOWER ${CMAKE_SYSTEM_NAME} COMPILER_RT_OS_DIR)
endif()
set(COMPILER_RT_LIBRARY_OUTPUT_DIR
${COMPILER_RT_OUTPUT_DIR}/lib/${COMPILER_RT_OS_DIR})
set(COMPILER_RT_LIBRARY_INSTALL_DIR
${COMPILER_RT_INSTALL_PATH}/lib/${COMPILER_RT_OS_DIR})
if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND NOT APPLE)
set(COMPILER_RT_LIBRARY_OUTPUT_DIR
${COMPILER_RT_OUTPUT_DIR})
set(COMPILER_RT_LIBRARY_INSTALL_DIR
${COMPILER_RT_INSTALL_PATH})
else(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR)
set(COMPILER_RT_LIBRARY_OUTPUT_DIR
${COMPILER_RT_OUTPUT_DIR}/lib/${COMPILER_RT_OS_DIR})
set(COMPILER_RT_LIBRARY_INSTALL_DIR
${COMPILER_RT_INSTALL_PATH}/lib/${COMPILER_RT_OS_DIR})
endif()
if(APPLE)
# On Darwin if /usr/include doesn't exist, the user probably has Xcode but not
@ -139,8 +153,16 @@ macro(test_targets)
add_default_target_arch(${COMPILER_RT_DEFAULT_TARGET_ARCH})
elseif("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "i[2-6]86|x86|amd64")
if(NOT MSVC)
test_target_arch(x86_64 "" "-m64")
test_target_arch(i386 __i386__ "-m32")
if(CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
if (CMAKE_SIZEOF_VOID_P EQUAL 4)
test_target_arch(i386 __i386__ "-m32")
else()
test_target_arch(x86_64 "" "-m64")
endif()
else()
test_target_arch(x86_64 "" "-m64")
test_target_arch(i386 __i386__ "-m32")
endif()
else()
if (CMAKE_SIZEOF_VOID_P EQUAL 4)
test_target_arch(i386 "" "")
@ -186,6 +208,10 @@ macro(test_targets)
test_target_arch(aarch32 "" "-march=armv8-a")
elseif("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "aarch64")
test_target_arch(aarch64 "" "-march=armv8-a")
elseif("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "riscv32")
test_target_arch(riscv32 "" "")
elseif("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "riscv64")
test_target_arch(riscv64 "" "")
elseif("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "wasm32")
test_target_arch(wasm32 "" "--target=wasm32-unknown-unknown")
elseif("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "wasm64")


@ -25,11 +25,14 @@ int foo(int x, int y) {
set(ARM64 aarch64)
set(ARM32 arm armhf armv6m armv7m armv7em armv7 armv7s armv7k)
set(HEXAGON hexagon)
set(X86 i386)
set(X86_64 x86_64)
set(MIPS32 mips mipsel)
set(MIPS64 mips64 mips64el)
set(PPC64 powerpc64 powerpc64le)
set(RISCV32 riscv32)
set(RISCV64 riscv64)
set(WASM32 wasm32)
set(WASM64 wasm64)
@ -40,7 +43,7 @@ if(APPLE)
endif()
set(ALL_BUILTIN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64}
${MIPS32} ${MIPS64} ${PPC64} ${WASM32} ${WASM64})
${HEXAGON} ${MIPS32} ${MIPS64} ${PPC64} ${RISCV32} ${RISCV64} ${WASM32} ${WASM64})
include(CompilerRTUtils)
include(CompilerRTDarwinUtils)


@ -13,7 +13,10 @@ function(check_linker_flag flag out_var)
endfunction()
check_library_exists(c fopen "" COMPILER_RT_HAS_LIBC)
if (NOT SANITIZER_USE_COMPILER_RT)
if (COMPILER_RT_USE_BUILTINS_LIBRARY)
include(HandleCompilerRT)
find_compiler_rt_library(builtins COMPILER_RT_BUILTINS_LIBRARY)
else()
if (ANDROID)
check_library_exists(gcc __gcc_personality_v0 "" COMPILER_RT_HAS_GCC_LIB)
else()
@ -27,9 +30,7 @@ if (COMPILER_RT_HAS_NODEFAULTLIBS_FLAG)
if (COMPILER_RT_HAS_LIBC)
list(APPEND CMAKE_REQUIRED_LIBRARIES c)
endif ()
if (SANITIZER_USE_COMPILER_RT)
list(APPEND CMAKE_REQUIRED_FLAGS -rtlib=compiler-rt)
find_compiler_rt_library(builtins COMPILER_RT_BUILTINS_LIBRARY)
if (COMPILER_RT_USE_BUILTINS_LIBRARY)
list(APPEND CMAKE_REQUIRED_LIBRARIES "${COMPILER_RT_BUILTINS_LIBRARY}")
elseif (COMPILER_RT_HAS_GCC_S_LIB)
list(APPEND CMAKE_REQUIRED_LIBRARIES gcc_s)
@ -108,6 +109,7 @@ if (ANDROID AND COMPILER_RT_HAS_LIBDL)
# Android's libstdc++ has a dependency on libdl.
list(APPEND CMAKE_REQUIRED_LIBRARIES dl)
endif()
check_library_exists(c++ __cxa_throw "" COMPILER_RT_HAS_LIBCXX)
check_library_exists(stdc++ __cxa_throw "" COMPILER_RT_HAS_LIBSTDCXX)
# Linker flags.
@ -174,11 +176,14 @@ endmacro()
set(ARM64 aarch64)
set(ARM32 arm armhf)
set(HEXAGON hexagon)
set(X86 i386)
set(X86_64 x86_64)
set(MIPS32 mips mipsel)
set(MIPS64 mips64 mips64el)
set(PPC64 powerpc64 powerpc64le)
set(RISCV32 riscv32)
set(RISCV64 riscv64)
set(S390X s390x)
set(WASM32 wasm32)
set(WASM64 wasm64)
@ -194,7 +199,7 @@ set(ALL_SANITIZER_COMMON_SUPPORTED_ARCH ${X86} ${X86_64} ${PPC64}
set(ALL_ASAN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64}
${MIPS32} ${MIPS64} ${PPC64} ${S390X})
set(ALL_DFSAN_SUPPORTED_ARCH ${X86_64} ${MIPS64} ${ARM64})
set(ALL_FUZZER_SUPPORTED_ARCH x86_64)
set(ALL_FUZZER_SUPPORTED_ARCH ${X86_64} ${ARM64})
if(APPLE)
set(ALL_LSAN_SUPPORTED_ARCH ${X86} ${X86_64} ${MIPS64} ${ARM64})
@ -202,7 +207,7 @@ else()
set(ALL_LSAN_SUPPORTED_ARCH ${X86} ${X86_64} ${MIPS64} ${ARM64} ${ARM32} ${PPC64})
endif()
set(ALL_MSAN_SUPPORTED_ARCH ${X86_64} ${MIPS64} ${ARM64} ${PPC64})
set(ALL_HWASAN_SUPPORTED_ARCH ${ARM64})
set(ALL_HWASAN_SUPPORTED_ARCH ${X86_64} ${ARM64})
set(ALL_PROFILE_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${PPC64}
${MIPS32} ${MIPS64} ${S390X})
set(ALL_TSAN_SUPPORTED_ARCH ${X86_64} ${MIPS64} ${ARM64} ${PPC64})
@ -211,12 +216,13 @@ set(ALL_UBSAN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64}
set(ALL_SAFESTACK_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM64} ${MIPS32} ${MIPS64})
set(ALL_CFI_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${MIPS64})
set(ALL_ESAN_SUPPORTED_ARCH ${X86_64} ${MIPS64})
set(ALL_SCUDO_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${MIPS32} ${MIPS64})
set(ALL_SCUDO_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${MIPS32} ${MIPS64} ${PPC64})
if(APPLE)
set(ALL_XRAY_SUPPORTED_ARCH ${X86_64})
else()
set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32} ${ARM64} ${MIPS32} ${MIPS64} powerpc64le)
endif()
set(ALL_SHADOWCALLSTACK_SUPPORTED_ARCH ${X86_64} ${ARM64})
if(APPLE)
include(CompilerRTDarwinUtils)
@ -365,7 +371,11 @@ if(APPLE)
if(DARWIN_${platform}_ARCHS)
list(APPEND SANITIZER_COMMON_SUPPORTED_OS ${platform})
list(APPEND PROFILE_SUPPORTED_OS ${platform})
list(APPEND TSAN_SUPPORTED_OS ${platform})
list_intersect(DARWIN_${platform}_TSAN_ARCHS DARWIN_${platform}_ARCHS ALL_TSAN_SUPPORTED_ARCH)
if(DARWIN_${platform}_TSAN_ARCHS)
list(APPEND TSAN_SUPPORTED_OS ${platform})
endif()
endif()
foreach(arch ${DARWIN_${platform}_ARCHS})
list(APPEND COMPILER_RT_SUPPORTED_ARCH ${arch})
@ -378,7 +388,6 @@ if(APPLE)
# for list_intersect
include(CompilerRTUtils)
list_intersect(SANITIZER_COMMON_SUPPORTED_ARCH
ALL_SANITIZER_COMMON_SUPPORTED_ARCH
COMPILER_RT_SUPPORTED_ARCH
@ -423,10 +432,13 @@ if(APPLE)
SANITIZER_COMMON_SUPPORTED_ARCH)
list_intersect(FUZZER_SUPPORTED_ARCH
ALL_FUZZER_SUPPORTED_ARCH
ALL_SANITIZER_COMMON_SUPPORTED_ARCH)
SANITIZER_COMMON_SUPPORTED_ARCH)
list_intersect(XRAY_SUPPORTED_ARCH
ALL_XRAY_SUPPORTED_ARCH
SANITIZER_COMMON_SUPPORTED_ARCH)
list_intersect(SHADOWCALLSTACK_SUPPORTED_ARCH
ALL_SHADOWCALLSTACK_SUPPORTED_ARCH
SANITIZER_COMMON_SUPPORTED_ARCH)
else()
# Architectures supported by compiler-rt libraries.
@ -453,6 +465,8 @@ else()
filter_available_targets(ESAN_SUPPORTED_ARCH ${ALL_ESAN_SUPPORTED_ARCH})
filter_available_targets(SCUDO_SUPPORTED_ARCH ${ALL_SCUDO_SUPPORTED_ARCH})
filter_available_targets(XRAY_SUPPORTED_ARCH ${ALL_XRAY_SUPPORTED_ARCH})
filter_available_targets(SHADOWCALLSTACK_SUPPORTED_ARCH
${ALL_SHADOWCALLSTACK_SUPPORTED_ARCH})
endif()
if (MSVC)
@ -486,7 +500,7 @@ set(COMPILER_RT_SANITIZERS_TO_BUILD all CACHE STRING
list_replace(COMPILER_RT_SANITIZERS_TO_BUILD all "${ALL_SANITIZERS}")
if (SANITIZER_COMMON_SUPPORTED_ARCH AND NOT LLVM_USE_SANITIZER AND
(OS_NAME MATCHES "Android|Darwin|Linux|FreeBSD|NetBSD|Fuchsia|SunOS" OR
(OS_NAME MATCHES "Android|Darwin|Linux|FreeBSD|NetBSD|OpenBSD|Fuchsia|SunOS" OR
(OS_NAME MATCHES "Windows" AND (NOT MINGW AND NOT CYGWIN))))
set(COMPILER_RT_HAS_SANITIZER_COMMON TRUE)
else()
@ -499,7 +513,8 @@ else()
set(COMPILER_RT_HAS_INTERCEPTION FALSE)
endif()
if (COMPILER_RT_HAS_SANITIZER_COMMON AND ASAN_SUPPORTED_ARCH)
if (COMPILER_RT_HAS_SANITIZER_COMMON AND ASAN_SUPPORTED_ARCH AND
NOT OS_NAME MATCHES "OpenBSD")
set(COMPILER_RT_HAS_ASAN TRUE)
else()
set(COMPILER_RT_HAS_ASAN FALSE)
@ -528,7 +543,7 @@ else()
endif()
if (COMPILER_RT_HAS_SANITIZER_COMMON AND MSAN_SUPPORTED_ARCH AND
OS_NAME MATCHES "Linux|NetBSD")
OS_NAME MATCHES "Linux|FreeBSD|NetBSD")
set(COMPILER_RT_HAS_MSAN TRUE)
else()
set(COMPILER_RT_HAS_MSAN FALSE)
@ -542,7 +557,7 @@ else()
endif()
if (PROFILE_SUPPORTED_ARCH AND NOT LLVM_USE_SANITIZER AND
OS_NAME MATCHES "Darwin|Linux|FreeBSD|Windows|Android|SunOS")
OS_NAME MATCHES "Darwin|Linux|FreeBSD|Windows|Android|Fuchsia|SunOS")
set(COMPILER_RT_HAS_PROFILE TRUE)
else()
set(COMPILER_RT_HAS_PROFILE FALSE)
@ -556,14 +571,14 @@ else()
endif()
if (COMPILER_RT_HAS_SANITIZER_COMMON AND UBSAN_SUPPORTED_ARCH AND
OS_NAME MATCHES "Darwin|Linux|FreeBSD|NetBSD|Windows|Android|Fuchsia|SunOS")
OS_NAME MATCHES "Darwin|Linux|FreeBSD|NetBSD|OpenBSD|Windows|Android|Fuchsia|SunOS")
set(COMPILER_RT_HAS_UBSAN TRUE)
else()
set(COMPILER_RT_HAS_UBSAN FALSE)
endif()
if (COMPILER_RT_HAS_SANITIZER_COMMON AND UBSAN_SUPPORTED_ARCH AND
OS_NAME MATCHES "Linux|FreeBSD|NetBSD|Android|Darwin")
OS_NAME MATCHES "Linux|FreeBSD|NetBSD|OpenBSD|Android|Darwin")
set(COMPILER_RT_HAS_UBSAN_MINIMAL TRUE)
else()
set(COMPILER_RT_HAS_UBSAN_MINIMAL FALSE)
@ -590,22 +605,29 @@ else()
endif()
if (COMPILER_RT_HAS_SANITIZER_COMMON AND SCUDO_SUPPORTED_ARCH AND
OS_NAME MATCHES "Linux|Android")
OS_NAME MATCHES "Linux|Android|Fuchsia")
set(COMPILER_RT_HAS_SCUDO TRUE)
else()
set(COMPILER_RT_HAS_SCUDO FALSE)
endif()
if (COMPILER_RT_HAS_SANITIZER_COMMON AND XRAY_SUPPORTED_ARCH AND
OS_NAME MATCHES "Darwin|Linux")
OS_NAME MATCHES "Darwin|Linux|FreeBSD|NetBSD|OpenBSD")
set(COMPILER_RT_HAS_XRAY TRUE)
else()
set(COMPILER_RT_HAS_XRAY FALSE)
endif()
if (COMPILER_RT_HAS_SANITIZER_COMMON AND FUZZER_SUPPORTED_ARCH AND
OS_NAME MATCHES "Android|Darwin|Linux|NetBSD")
OS_NAME MATCHES "Android|Darwin|Linux|NetBSD|FreeBSD|OpenBSD|Fuchsia")
set(COMPILER_RT_HAS_FUZZER TRUE)
else()
set(COMPILER_RT_HAS_FUZZER FALSE)
endif()
if (COMPILER_RT_HAS_SANITIZER_COMMON AND SHADOWCALLSTACK_SUPPORTED_ARCH AND
OS_NAME MATCHES "Linux|Android")
set(COMPILER_RT_HAS_SHADOWCALLSTACK TRUE)
else()
set(COMPILER_RT_HAS_SHADOWCALLSTACK FALSE)
endif()


@ -10,6 +10,7 @@ if (COMPILER_RT_BUILD_SANITIZERS)
sanitizer/linux_syscall_hooks.h
sanitizer/lsan_interface.h
sanitizer/msan_interface.h
sanitizer/netbsd_syscall_hooks.h
sanitizer/scudo_interface.h
sanitizer/tsan_interface.h
sanitizer/tsan_interface_atomic.h)


@ -65,6 +65,11 @@ extern "C" {
void __sanitizer_unaligned_store32(void *p, uint32_t x);
void __sanitizer_unaligned_store64(void *p, uint64_t x);
// Returns 1 on the first call, then returns 0 thereafter. Called by the tool
// to ensure only one report is printed when multiple errors occur
// simultaneously.
int __sanitizer_acquire_crash_state();
// Annotate the current state of a contiguous container, such as
// std::vector, std::string or similar.
// A contiguous container is a container that keeps all of its elements
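A minimal usage sketch (illustrative, not part of this import) for the __sanitizer_acquire_crash_state() entry point added above: a reporting helper gates its output on the call so that, when several errors race, only the first one prints. The helper name and message are hypothetical, and the symbol is only resolved when linking against a sanitizer runtime.

#include <cstdio>
#include <cstdlib>

extern "C" int __sanitizer_acquire_crash_state();  // declared in sanitizer/common_interface_defs.h

// Hypothetical reporter: the first caller acquires the crash state and
// prints; later (possibly concurrent) callers return without a second report.
static void ReportFatalErrorOnce(const char *what) {
  if (!__sanitizer_acquire_crash_state())
    return;
  std::fprintf(stderr, "fatal error: %s\n", what);
  std::abort();
}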


@ -104,6 +104,14 @@ extern "C" {
copy. Source and destination regions can overlap. */
void __msan_copy_shadow(const volatile void *dst, const volatile void *src,
size_t size);
/* Disables uninitialized memory checks in interceptors. */
void __msan_scoped_disable_interceptor_checks(void);
/* Re-enables uninitialized memory checks in interceptors after a previous
call to __msan_scoped_disable_interceptor_checks. */
void __msan_scoped_enable_interceptor_checks(void);
#ifdef __cplusplus
} // extern "C"
#endif

File diff suppressed because it is too large.


@ -26,7 +26,12 @@ extern "C" {
// the hard limit (HardLimit=1) or the soft limit (HardLimit=0). The limit
// can be removed by setting LimitMb to 0. This function's parameters should
// be fully trusted to avoid security mishaps.
void __scudo_set_rss_limit(unsigned long LimitMb, int HardLimit);
void __scudo_set_rss_limit(size_t LimitMb, int HardLimit);
// This function outputs various allocator statistics for both the Primary
// and Secondary allocators, including memory usage, number of allocations
// and deallocations.
void __scudo_print_stats(void);
#ifdef __cplusplus
} // extern "C"
#endif


@ -27,6 +27,7 @@ enum XRayEntryType {
TAIL = 2,
LOG_ARGS_ENTRY = 3,
CUSTOM_EVENT = 4,
TYPED_EVENT = 5,
};
/// Provide a function to invoke for when instrumentation points are hit. This
@ -68,12 +69,23 @@ extern int __xray_set_handler_arg1(void (*entry)(int32_t, XRayEntryType,
extern int __xray_remove_handler_arg1();
/// Provide a function to invoke when XRay encounters a custom event.
extern int __xray_set_customevent_handler(void (*entry)(void*, std::size_t));
extern int __xray_set_customevent_handler(void (*entry)(void *, std::size_t));
/// This removes whatever the currently provided custom event handler is.
/// Returns 1 on success, 0 on error.
extern int __xray_remove_customevent_handler();
/// Set a handler for xray typed event logging. The first parameter is a type
/// identifier, the second is a payload, and the third is the payload size.
extern int __xray_set_typedevent_handler(void (*entry)(uint16_t, const void *,
std::size_t));
/// Removes the currently set typed event handler.
/// Returns 1 on success, 0 on error.
extern int __xray_remove_typedevent_handler();
extern uint16_t __xray_register_event_type(const char *event_type);
enum XRayPatchingStatus {
NOT_INITIALIZED = 0,
SUCCESS = 1,
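An illustrative sketch (not part of this import) of the new typed-event entry points declared above: register an event type, install a typed-event handler, and remove it again. The handler body and event-type name are hypothetical; it assumes the xray/xray_interface.h header from this import and an -fxray-instrument build.

#include <xray/xray_interface.h>

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hypothetical handler: log the type id and payload size of each typed event.
static void OnTypedEvent(uint16_t type, const void *payload, std::size_t size) {
  std::fprintf(stderr, "typed event %u with %zu payload bytes\n",
               static_cast<unsigned>(type), size);
  (void)payload;
}

int main() {
  uint16_t my_event = __xray_register_event_type("my-app-event");
  if (!__xray_set_typedevent_handler(OnTypedEvent))
    return 1;  // treat a zero return as failure to install the handler
  // ... instrumented code emitting typed events would run here ...
  __xray_remove_typedevent_handler();
  (void)my_event;
  return 0;
}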


@ -21,27 +21,29 @@
///
/// The high-level usage pattern for these APIs look like the following:
///
/// // Before we try initializing the log implementation, we must set it as
/// // the log implementation. We provide the function pointers that define
/// // the various initialization, finalization, and other pluggable hooks
/// // that we need.
/// __xray_set_log_impl({...});
/// // We choose the mode which we'd like to install, and check whether this
/// // has succeeded. Each mode will have their own set of flags they will
/// // support, outside of the global XRay configuration options that are
/// // defined in the XRAY_OPTIONS environment variable.
/// auto select_status = __xray_log_select_mode("xray-fdr");
/// if (select_status != XRayLogRegisterStatus::XRAY_REGISTRATION_OK) {
/// // This failed, we should not proceed with attempting to initialise
/// // the currently selected mode.
/// return;
/// }
///
/// // Once that's done, we can now initialize the implementation. Each
/// // implementation has a chance to let users customize the implementation
/// // with a struct that their implementation supports. Roughly this might
/// // look like:
/// MyImplementationOptions opts;
/// opts.enable_feature = true;
/// ...
/// auto init_status = __xray_log_init(
/// BufferSize, MaxBuffers, &opts, sizeof opts);
/// if (init_status != XRayLogInitStatus::XRAY_LOG_INITIALIZED) {
/// // Once that's done, we can now attempt to configure the implementation.
/// // To do this, we provide the string flags configuration for the mode.
/// auto config_status = __xray_log_init_mode(
/// "xray-fdr", "verbosity=1 some_flag=1 another_flag=2");
/// if (config_status != XRayLogInitStatus::XRAY_LOG_INITIALIZED) {
/// // deal with the error here, if there is one.
/// }
///
/// // When the log implementation has had the chance to initialize, we can
/// // now patch the sleds.
/// // now patch the instrumentation points. Note that we could have patched
/// // the instrumentation points first, but there's no strict ordering to
/// // these operations.
/// auto patch_status = __xray_patch();
/// if (patch_status != XRayPatchingStatus::SUCCESS) {
/// // deal with the error here, if it is an error.
@ -56,12 +58,12 @@
///
/// // We can optionally wait before flushing the log to give other threads a
/// // chance to see that the implementation is already finalized. Also, at
/// // this point we can optionally unpatch the sleds to reduce overheads at
/// // runtime.
/// // this point we can optionally unpatch the instrumentation points to
/// // reduce overheads at runtime.
/// auto unpatch_status = __xray_unpatch();
/// if (unpatch_status != XRayPatchingStatus::SUCCESS) {
// // deal with the error here, if it is an error.
// }
/// // deal with the error here, if it is an error.
/// }
///
/// // If there are logs or data to be flushed somewhere, we can do so only
/// // after we've finalized the log. Some implementations may not actually
@ -72,6 +74,17 @@
/// // deal with the error here, if it is an error.
/// }
///
/// // Alternatively, we can go through the buffers ourselves without
/// // relying on the implementations' flushing semantics (if the
/// // implementation supports exporting this data directly).
/// auto MyBufferProcessor = +[](const char* mode, XRayBuffer buffer) {
/// // Check the "mode" to see if it's something we know how to handle...
/// // and/or do something with an XRayBuffer instance.
/// };
/// auto process_status = __xray_log_process_buffers(MyBufferProcessor);
/// if (process_status != XRayLogFlushStatus::XRAY_LOG_FLUSHED) {
/// // deal with the error here, if it is an error.
/// }
///
/// NOTE: Before calling __xray_patch() again, consider re-initializing the
/// implementation first. Some implementations might stay in an "off" state when
@ -182,9 +195,13 @@ struct XRayLogImpl {
XRayLogFlushStatus (*flush_log)();
};
/// DEPRECATED: Use the mode registration workflow instead with
/// __xray_log_register_mode(...) and __xray_log_select_mode(...). See the
/// documentation for those function.
///
/// This function installs a new logging implementation that XRay will use. In
/// case there are any nullptr members in Impl, XRay will *uninstall any
/// existing implementations*. It does NOT patch the instrumentation sleds.
/// existing implementations*. It does NOT patch the instrumentation points.
///
/// NOTE: This function does NOT attempt to finalize the currently installed
/// implementation. Use with caution.
@ -227,9 +244,14 @@ XRayLogRegisterStatus __xray_log_register_mode(const char *Mode,
/// does not update the currently installed implementation.
XRayLogRegisterStatus __xray_log_select_mode(const char *Mode);
/// Returns an identifier for the currently selected XRay mode chosen through
/// the __xray_log_select_mode(...) function call. Returns nullptr if there is
/// no currently installed mode.
const char *__xray_log_get_current_mode();
/// This function removes the currently installed implementation. It will also
/// uninstall any handlers that have been previously installed. It does NOT
/// unpatch the instrumentation sleds.
/// unpatch the instrumentation points.
///
/// NOTE: This function does NOT attempt to finalize the currently installed
/// implementation. Use with caution.
@ -244,11 +266,37 @@ XRayLogRegisterStatus __xray_log_select_mode(const char *Mode);
/// called while in any other states.
void __xray_remove_log_impl();
/// DEPRECATED: Use __xray_log_init_mode() instead, and provide all the options
/// in string form.
/// Invokes the installed implementation initialization routine. See
/// XRayLogInitStatus for what the return values mean.
XRayLogInitStatus __xray_log_init(size_t BufferSize, size_t MaxBuffers,
void *Args, size_t ArgsSize);
/// Invokes the installed initialization routine, which *must* support the
/// string based form.
///
/// NOTE: When this API is used, we still invoke the installed initialization
/// routine, but we will call it with the following convention to signal that we
/// are using the string form:
///
/// - BufferSize = 0
/// - MaxBuffers = 0
/// - ArgsSize = 0
/// - Args will be the pointer to the character buffer representing the
/// configuration.
///
/// FIXME: Updating the XRayLogImpl struct is an ABI breaking change. When we
/// are ready to make a breaking change, we should clean this up appropriately.
XRayLogInitStatus __xray_log_init_mode(const char *Mode, const char *Config);
/// Like __xray_log_init_mode(...) this version allows for providing
/// configurations that might have non-null-terminated strings. This will
/// operate similarly to __xray_log_init_mode, with the exception that
/// |ArgsSize| will be what |ConfigSize| is.
XRayLogInitStatus __xray_log_init_mode_bin(const char *Mode, const char *Config,
size_t ConfigSize);
/// Invokes the installed implementation finalization routine. See
/// XRayLogInitStatus for what the return values mean.
XRayLogInitStatus __xray_log_finalize();
@ -257,16 +305,68 @@ XRayLogInitStatus __xray_log_finalize();
/// XRayLogFlushStatus for what the return values mean.
XRayLogFlushStatus __xray_log_flushLog();
/// An XRayBuffer represents a section of memory which can be treated by log
/// processing functions as bytes stored in the logging implementation's
/// buffers.
struct XRayBuffer {
const void *Data;
size_t Size;
};
/// Registers an iterator function which takes an XRayBuffer argument, then
/// returns another XRayBuffer function representing the next buffer. When the
/// Iterator function returns an empty XRayBuffer (Data = nullptr, Size = 0),
/// this signifies the end of the buffers.
///
/// The first invocation of this Iterator function will always take an empty
/// XRayBuffer (Data = nullptr, Size = 0).
void __xray_log_set_buffer_iterator(XRayBuffer (*Iterator)(XRayBuffer));
/// Removes the currently registered buffer iterator function.
void __xray_log_remove_buffer_iterator();
/// Invokes the provided handler to process data maintained by the logging
/// handler. This API will be provided raw access to the data available in
/// memory from the logging implementation. The callback function must:
///
/// 1) Not modify the data, to avoid running into undefined behaviour.
///
/// 2) Either know the data layout, or treat the data as raw bytes for later
/// interpretation.
///
/// This API is best used in place of the `__xray_log_flushLog()` implementation
/// above to enable the caller to provide an alternative means of extracting the
/// data from the XRay implementation.
///
/// Implementations MUST then provide:
///
/// 1) A function that will return an XRayBuffer. Functions that return an
/// "empty" XRayBuffer signifies that there are no more buffers to be
/// processed. This function should be registered through the
/// `__xray_log_set_buffer_iterator(...)` function.
///
/// 2) Its own means of converting data it holds in memory into an XRayBuffer
/// structure.
///
/// See XRayLogFlushStatus for what the return values mean.
///
XRayLogFlushStatus __xray_log_process_buffers(void (*Processor)(const char *,
XRayBuffer));
} // extern "C"
namespace __xray {
/// DEPRECATED: Use __xray_log_init_mode(...) instead, and provide flag
/// configuration strings to set the options instead.
/// Options used by the LLVM XRay FDR logging implementation.
struct FDRLoggingOptions {
bool ReportErrors = false;
int Fd = -1;
};
/// DEPRECATED: Use __xray_log_init_mode(...) instead, and provide flag
/// configuration strings to set the options instead.
/// Options used by the LLVM XRay Basic (Naive) logging implementation.
struct BasicLoggingOptions {
int DurationFilterMicros = 0;
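An illustrative sketch (not part of this import) of the buffer-processing path described above, as an alternative to __xray_log_flushLog(): a processor callback receives each (mode, XRayBuffer) pair and treats the contents as opaque bytes. The output file name and helper are hypothetical; it assumes the xray/xray_log_interface.h header from this import.

#include <xray/xray_log_interface.h>

#include <cstdio>

// Hypothetical processor: append every buffer, as raw bytes, to a file.
// Per the contract above, the buffer contents must not be modified.
static void AppendBufferToFile(const char *mode, XRayBuffer buffer) {
  std::FILE *out = std::fopen("xray-buffers.bin", "ab");
  if (!out)
    return;
  std::fwrite(buffer.Data, 1, buffer.Size, out);
  std::fclose(out);
  std::fprintf(stderr, "wrote %zu bytes from mode '%s'\n", buffer.Size, mode);
}

void DumpXRayBuffers() {
  if (__xray_log_process_buffers(AppendBufferToFile) !=
      XRayLogFlushStatus::XRAY_LOG_FLUSHED)
    std::fprintf(stderr, "XRay buffer processing did not complete\n");
}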


@ -54,7 +54,7 @@ struct alignas(32) XRayFileHeader {
union {
char FreeForm[16];
// The current civiltime timestamp, as retrived from 'clock_gettime'. This
// The current civiltime timestamp, as retrieved from 'clock_gettime'. This
// allows readers of the file to determine when the file was created or
// written down.
struct timespec TS;
@ -95,8 +95,11 @@ struct alignas(32) XRayRecord {
// The thread ID for the currently running thread.
uint32_t TId = 0;
// The ID of process that is currently running
uint32_t PId = 0;
// Use some bytes in the end of the record for buffers.
char Buffer[4] = {};
char Buffer[8] = {};
} __attribute__((packed));
static_assert(sizeof(XRayRecord) == 32, "XRayRecord != 32 bytes");
@ -115,8 +118,8 @@ struct alignas(32) XRayArgPayload {
// The thread ID for the currently running thread.
uint32_t TId = 0;
// Add more padding.
uint8_t Padding2[4] = {};
// The ID of process that is currently running
uint32_t PId = 0;
// The argument payload.
uint64_t Arg = 0;


@ -1 +1,2 @@
BasedOnStyle: Google
AllowShortIfStatementsOnASingleLine: false


@ -23,6 +23,7 @@ set(ASAN_SOURCES
asan_posix.cc
asan_premap_shadow.cc
asan_report.cc
asan_rtems.cc
asan_rtl.cc
asan_shadow_setup.cc
asan_stack.cc
@ -37,6 +38,34 @@ set(ASAN_CXX_SOURCES
set(ASAN_PREINIT_SOURCES
asan_preinit.cc)
SET(ASAN_HEADERS
asan_activation.h
asan_activation_flags.inc
asan_allocator.h
asan_descriptions.h
asan_errors.h
asan_fake_stack.h
asan_flags.h
asan_flags.inc
asan_init_version.h
asan_interceptors.h
asan_interceptors_memintrinsics.h
asan_interface.inc
asan_interface_internal.h
asan_internal.h
asan_lock.h
asan_malloc_local.h
asan_mapping.h
asan_mapping_myriad.h
asan_poisoning.h
asan_premap_shadow.h
asan_report.h
asan_scariness_score.h
asan_stack.h
asan_stats.h
asan_suppressions.h
asan_thread.h)
include_directories(..)
set(ASAN_CFLAGS ${SANITIZER_COMMON_CFLAGS})
@ -46,20 +75,6 @@ append_rtti_flag(OFF ASAN_CFLAGS)
set(ASAN_DYNAMIC_LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS})
if(ANDROID)
# On Android, -z global does not do what it is documented to do.
# On Android, -z global moves the library ahead in the lookup order,
# placing it right after the LD_PRELOADs. This is used to compensate for the fact
# that Android linker does not look at the dependencies of the main executable
# that aren't dependencies of the current DSO when resolving symbols from said DSO.
# As a net result, this allows running ASan executables without LD_PRELOAD-ing the
# ASan runtime library.
# The above is applicable to L MR1 or newer.
if (COMPILER_RT_HAS_Z_GLOBAL)
list(APPEND ASAN_DYNAMIC_LINK_FLAGS -Wl,-z,global)
endif()
endif()
set(ASAN_DYNAMIC_DEFINITIONS
${ASAN_COMMON_DEFINITIONS} ASAN_DYNAMIC=1)
append_list_if(WIN32 INTERCEPTION_DYNAMIC_CRT ASAN_DYNAMIC_DEFINITIONS)
@ -83,21 +98,28 @@ add_compiler_rt_object_libraries(RTAsan_dynamic
OS ${SANITIZER_COMMON_SUPPORTED_OS}
ARCHS ${ASAN_SUPPORTED_ARCH}
SOURCES ${ASAN_SOURCES} ${ASAN_CXX_SOURCES}
ADDITIONAL_HEADERS ${ASAN_HEADERS}
CFLAGS ${ASAN_DYNAMIC_CFLAGS}
DEFS ${ASAN_DYNAMIC_DEFINITIONS})
if(NOT APPLE)
add_compiler_rt_object_libraries(RTAsan
ARCHS ${ASAN_SUPPORTED_ARCH}
SOURCES ${ASAN_SOURCES} CFLAGS ${ASAN_CFLAGS}
SOURCES ${ASAN_SOURCES}
ADDITIONAL_HEADERS ${ASAN_HEADERS}
CFLAGS ${ASAN_CFLAGS}
DEFS ${ASAN_COMMON_DEFINITIONS})
add_compiler_rt_object_libraries(RTAsan_cxx
ARCHS ${ASAN_SUPPORTED_ARCH}
SOURCES ${ASAN_CXX_SOURCES} CFLAGS ${ASAN_CFLAGS}
SOURCES ${ASAN_CXX_SOURCES}
ADDITIONAL_HEADERS ${ASAN_HEADERS}
CFLAGS ${ASAN_CFLAGS}
DEFS ${ASAN_COMMON_DEFINITIONS})
add_compiler_rt_object_libraries(RTAsan_preinit
ARCHS ${ASAN_SUPPORTED_ARCH}
SOURCES ${ASAN_PREINIT_SOURCES} CFLAGS ${ASAN_CFLAGS}
SOURCES ${ASAN_PREINIT_SOURCES}
ADDITIONAL_HEADERS ${ASAN_HEADERS}
CFLAGS ${ASAN_CFLAGS}
DEFS ${ASAN_COMMON_DEFINITIONS})
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cc "")
@ -125,6 +147,8 @@ if(APPLE)
RTInterception
RTSanitizerCommon
RTSanitizerCommonLibc
RTSanitizerCommonCoverage
RTSanitizerCommonSymbolizer
RTLSanCommon
RTUbsan
CFLAGS ${ASAN_DYNAMIC_CFLAGS}
@ -138,6 +162,8 @@ else()
RTInterception
RTSanitizerCommon
RTSanitizerCommonLibc
RTSanitizerCommonCoverage
RTSanitizerCommonSymbolizer
RTLSanCommon
RTUbsan)
@ -223,7 +249,7 @@ else()
DEFS ${ASAN_DYNAMIC_DEFINITIONS}
PARENT_TARGET asan)
if (UNIX AND NOT ${arch} STREQUAL "i386")
if (SANITIZER_USE_SYMBOLS AND NOT ${arch} STREQUAL "i386")
add_sanitizer_rt_symbols(clang_rt.asan_cxx
ARCHS ${arch})
add_dependencies(asan clang_rt.asan_cxx-${arch}-symbols)


@ -134,8 +134,9 @@ struct AsanChunk: ChunkBase {
};
struct QuarantineCallback {
explicit QuarantineCallback(AllocatorCache *cache)
: cache_(cache) {
QuarantineCallback(AllocatorCache *cache, BufferedStackTrace *stack)
: cache_(cache),
stack_(stack) {
}
void Recycle(AsanChunk *m) {
@ -168,7 +169,7 @@ struct QuarantineCallback {
void *res = get_allocator().Allocate(cache_, size, 1);
// TODO(alekseys): Consider making quarantine OOM-friendly.
if (UNLIKELY(!res))
return DieOnFailure::OnOOM();
ReportOutOfMemory(size, stack_);
return res;
}
@ -176,7 +177,9 @@ struct QuarantineCallback {
get_allocator().Deallocate(cache_, p);
}
AllocatorCache *cache_;
private:
AllocatorCache* const cache_;
BufferedStackTrace* const stack_;
};
typedef Quarantine<QuarantineCallback, AsanChunk> AsanQuarantine;
@ -397,8 +400,11 @@ struct Allocator {
AllocType alloc_type, bool can_fill) {
if (UNLIKELY(!asan_inited))
AsanInitFromRtl();
if (RssLimitExceeded())
return AsanAllocator::FailureHandler::OnOOM();
if (RssLimitExceeded()) {
if (AllocatorMayReturnNull())
return nullptr;
ReportRssLimitExceeded(stack);
}
Flags &fl = *flags();
CHECK(stack);
const uptr min_alignment = SHADOW_GRANULARITY;
@ -431,9 +437,13 @@ struct Allocator {
}
CHECK(IsAligned(needed_size, min_alignment));
if (size > kMaxAllowedMallocSize || needed_size > kMaxAllowedMallocSize) {
Report("WARNING: AddressSanitizer failed to allocate 0x%zx bytes\n",
(void*)size);
return AsanAllocator::FailureHandler::OnBadRequest();
if (AllocatorMayReturnNull()) {
Report("WARNING: AddressSanitizer failed to allocate 0x%zx bytes\n",
(void*)size);
return nullptr;
}
ReportAllocationSizeTooBig(size, needed_size, kMaxAllowedMallocSize,
stack);
}
AsanThread *t = GetCurrentThread();
@ -446,8 +456,12 @@ struct Allocator {
AllocatorCache *cache = &fallback_allocator_cache;
allocated = allocator.Allocate(cache, needed_size, 8);
}
if (!allocated)
return nullptr;
if (UNLIKELY(!allocated)) {
SetAllocatorOutOfMemory();
if (AllocatorMayReturnNull())
return nullptr;
ReportOutOfMemory(size, stack);
}
if (*(u8 *)MEM_TO_SHADOW((uptr)allocated) == 0 && CanPoisonMemory()) {
// Heap poisoning is enabled, but the allocator provides an unpoisoned
@ -583,13 +597,13 @@ struct Allocator {
if (t) {
AsanThreadLocalMallocStorage *ms = &t->malloc_storage();
AllocatorCache *ac = GetAllocatorCache(ms);
quarantine.Put(GetQuarantineCache(ms), QuarantineCallback(ac), m,
m->UsedSize());
quarantine.Put(GetQuarantineCache(ms), QuarantineCallback(ac, stack), m,
m->UsedSize());
} else {
SpinMutexLock l(&fallback_mutex);
AllocatorCache *ac = &fallback_allocator_cache;
quarantine.Put(&fallback_quarantine_cache, QuarantineCallback(ac), m,
m->UsedSize());
quarantine.Put(&fallback_quarantine_cache, QuarantineCallback(ac, stack),
m, m->UsedSize());
}
}
@ -660,8 +674,11 @@ struct Allocator {
}
void *Calloc(uptr nmemb, uptr size, BufferedStackTrace *stack) {
if (CheckForCallocOverflow(size, nmemb))
return AsanAllocator::FailureHandler::OnBadRequest();
if (UNLIKELY(CheckForCallocOverflow(size, nmemb))) {
if (AllocatorMayReturnNull())
return nullptr;
ReportCallocOverflow(nmemb, size, stack);
}
void *ptr = Allocate(nmemb * size, 8, stack, FROM_MALLOC, false);
// If the memory comes from the secondary allocator no need to clear it
// as it comes directly from mmap.
@ -677,9 +694,9 @@ struct Allocator {
ReportFreeNotMalloced((uptr)ptr, stack);
}
void CommitBack(AsanThreadLocalMallocStorage *ms) {
void CommitBack(AsanThreadLocalMallocStorage *ms, BufferedStackTrace *stack) {
AllocatorCache *ac = GetAllocatorCache(ms);
quarantine.Drain(GetQuarantineCache(ms), QuarantineCallback(ac));
quarantine.Drain(GetQuarantineCache(ms), QuarantineCallback(ac, stack));
allocator.SwallowCache(ac);
}
@ -739,17 +756,19 @@ struct Allocator {
return AsanChunkView(m1);
}
void Purge() {
void Purge(BufferedStackTrace *stack) {
AsanThread *t = GetCurrentThread();
if (t) {
AsanThreadLocalMallocStorage *ms = &t->malloc_storage();
quarantine.DrainAndRecycle(GetQuarantineCache(ms),
QuarantineCallback(GetAllocatorCache(ms)));
QuarantineCallback(GetAllocatorCache(ms),
stack));
}
{
SpinMutexLock l(&fallback_mutex);
quarantine.DrainAndRecycle(&fallback_quarantine_cache,
QuarantineCallback(&fallback_allocator_cache));
QuarantineCallback(&fallback_allocator_cache,
stack));
}
allocator.ForceReleaseToOS();
@ -836,7 +855,8 @@ AsanChunkView FindHeapChunkByAllocBeg(uptr addr) {
}
void AsanThreadLocalMallocStorage::CommitBack() {
instance.CommitBack(this);
GET_STACK_TRACE_MALLOC;
instance.CommitBack(this, &stack);
}
void PrintInternalAllocatorStats() {
@ -883,7 +903,9 @@ void *asan_pvalloc(uptr size, BufferedStackTrace *stack) {
uptr PageSize = GetPageSizeCached();
if (UNLIKELY(CheckForPvallocOverflow(size, PageSize))) {
errno = errno_ENOMEM;
return AsanAllocator::FailureHandler::OnBadRequest();
if (AllocatorMayReturnNull())
return nullptr;
ReportPvallocOverflow(size, stack);
}
// pvalloc(0) should allocate one page.
size = size ? RoundUpTo(size, PageSize) : PageSize;
@ -895,20 +917,35 @@ void *asan_memalign(uptr alignment, uptr size, BufferedStackTrace *stack,
AllocType alloc_type) {
if (UNLIKELY(!IsPowerOfTwo(alignment))) {
errno = errno_EINVAL;
return AsanAllocator::FailureHandler::OnBadRequest();
if (AllocatorMayReturnNull())
return nullptr;
ReportInvalidAllocationAlignment(alignment, stack);
}
return SetErrnoOnNull(
instance.Allocate(size, alignment, stack, alloc_type, true));
}
void *asan_aligned_alloc(uptr alignment, uptr size, BufferedStackTrace *stack) {
if (UNLIKELY(!CheckAlignedAllocAlignmentAndSize(alignment, size))) {
errno = errno_EINVAL;
if (AllocatorMayReturnNull())
return nullptr;
ReportInvalidAlignedAllocAlignment(size, alignment, stack);
}
return SetErrnoOnNull(
instance.Allocate(size, alignment, stack, FROM_MALLOC, true));
}
int asan_posix_memalign(void **memptr, uptr alignment, uptr size,
BufferedStackTrace *stack) {
if (UNLIKELY(!CheckPosixMemalignAlignment(alignment))) {
AsanAllocator::FailureHandler::OnBadRequest();
return errno_EINVAL;
if (AllocatorMayReturnNull())
return errno_EINVAL;
ReportInvalidPosixMemalignAlignment(alignment, stack);
}
void *ptr = instance.Allocate(size, alignment, stack, FROM_MALLOC, true);
if (UNLIKELY(!ptr))
// OOM error is already taken care of by Allocate.
return errno_ENOMEM;
CHECK(IsAligned((uptr)ptr, alignment));
*memptr = ptr;
@ -1054,7 +1091,8 @@ uptr __sanitizer_get_allocated_size(const void *p) {
}
void __sanitizer_purge_allocator() {
instance.Purge();
GET_STACK_TRACE_MALLOC;
instance.Purge(&stack);
}
#if !SANITIZER_SUPPORTS_WEAK_HOOKS


@ -125,11 +125,12 @@ const uptr kAllocatorSpace = ~(uptr)0;
const uptr kAllocatorSize = 0x40000000000ULL; // 4T.
typedef DefaultSizeClassMap SizeClassMap;
# elif defined(__powerpc64__)
const uptr kAllocatorSpace = 0xa0000000000ULL;
const uptr kAllocatorSpace = ~(uptr)0;
const uptr kAllocatorSize = 0x20000000000ULL; // 2T.
typedef DefaultSizeClassMap SizeClassMap;
# elif defined(__aarch64__) && SANITIZER_ANDROID
const uptr kAllocatorSpace = 0x3000000000ULL;
// Android needs to support 39, 42 and 48 bit VMA.
const uptr kAllocatorSpace = ~(uptr)0;
const uptr kAllocatorSize = 0x2000000000ULL; // 128G.
typedef VeryCompactSizeClassMap SizeClassMap;
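// Clarifying note (based on the documented convention of the sanitizer
// primary allocator, added here for readability): kAllocatorSpace == ~(uptr)0
// asks SizeClassAllocator64 to pick the region base dynamically via mmap
// instead of mapping it at a fixed address, which is what lets one build
// cover the 39-, 42- and 48-bit VMA layouts mentioned above.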
# elif defined(__aarch64__)
@ -207,6 +208,7 @@ void *asan_realloc(void *p, uptr size, BufferedStackTrace *stack);
void *asan_valloc(uptr size, BufferedStackTrace *stack);
void *asan_pvalloc(uptr size, BufferedStackTrace *stack);
void *asan_aligned_alloc(uptr alignment, uptr size, BufferedStackTrace *stack);
int asan_posix_memalign(void **memptr, uptr alignment, uptr size,
BufferedStackTrace *stack);
uptr asan_malloc_usable_size(const void *ptr, uptr pc, uptr bp);


@ -27,7 +27,8 @@ using namespace __asan;
static void FindInfoForStackVar(uptr addr, const char *frame_descr, uptr offset,
char *name, uptr name_size,
uptr &region_address, uptr &region_size) {
InternalMmapVector<StackVarDescr> vars(16);
InternalMmapVector<StackVarDescr> vars;
vars.reserve(16);
if (!ParseFrameDescription(frame_descr, &vars)) {
return;
}


@ -20,23 +20,25 @@
namespace __asan {
// Return " (thread_name) " or an empty string if the name is empty.
const char *ThreadNameWithParenthesis(AsanThreadContext *t, char buff[],
uptr buff_len) {
const char *name = t->name;
if (name[0] == '\0') return "";
buff[0] = 0;
internal_strncat(buff, " (", 3);
internal_strncat(buff, name, buff_len - 4);
internal_strncat(buff, ")", 2);
return buff;
AsanThreadIdAndName::AsanThreadIdAndName(AsanThreadContext *t) {
Init(t->tid, t->name);
}
const char *ThreadNameWithParenthesis(u32 tid, char buff[], uptr buff_len) {
if (tid == kInvalidTid) return "";
asanThreadRegistry().CheckLocked();
AsanThreadContext *t = GetThreadContextByTidLocked(tid);
return ThreadNameWithParenthesis(t, buff, buff_len);
AsanThreadIdAndName::AsanThreadIdAndName(u32 tid) {
if (tid == kInvalidTid) {
Init(tid, "");
} else {
asanThreadRegistry().CheckLocked();
AsanThreadContext *t = GetThreadContextByTidLocked(tid);
Init(tid, t->name);
}
}
void AsanThreadIdAndName::Init(u32 tid, const char *tname) {
int len = internal_snprintf(name, sizeof(name), "T%d", tid);
CHECK(((unsigned int)len) < sizeof(name));
if (tname[0] != '\0')
internal_snprintf(&name[len], sizeof(name) - len, " (%s)", tname);
}
void DescribeThread(AsanThreadContext *context) {
@ -47,18 +49,15 @@ void DescribeThread(AsanThreadContext *context) {
return;
}
context->announced = true;
char tname[128];
InternalScopedString str(1024);
str.append("Thread T%d%s", context->tid,
ThreadNameWithParenthesis(context->tid, tname, sizeof(tname)));
str.append("Thread %s", AsanThreadIdAndName(context).c_str());
if (context->parent_tid == kInvalidTid) {
str.append(" created by unknown thread\n");
Printf("%s", str.data());
return;
}
str.append(
" created by T%d%s here:\n", context->parent_tid,
ThreadNameWithParenthesis(context->parent_tid, tname, sizeof(tname)));
str.append(" created by %s here:\n",
AsanThreadIdAndName(context->parent_tid).c_str());
Printf("%s", str.data());
StackDepotGet(context->stack_id).Print();
// Recursively described parent thread if needed.
@ -358,10 +357,9 @@ bool GlobalAddressDescription::PointsInsideTheSameVariable(
void StackAddressDescription::Print() const {
Decorator d;
char tname[128];
Printf("%s", d.Location());
Printf("Address %p is located in stack of thread T%d%s", addr, tid,
ThreadNameWithParenthesis(tid, tname, sizeof(tname)));
Printf("Address %p is located in stack of thread %s", addr,
AsanThreadIdAndName(tid).c_str());
if (!frame_descr) {
Printf("%s\n", d.Default());
@ -380,7 +378,8 @@ void StackAddressDescription::Print() const {
StackTrace alloca_stack(&frame_pc, 1);
alloca_stack.Print();
InternalMmapVector<StackVarDescr> vars(16);
InternalMmapVector<StackVarDescr> vars;
vars.reserve(16);
if (!ParseFrameDescription(frame_descr, &vars)) {
Printf(
"AddressSanitizer can't parse the stack frame "
@ -402,7 +401,7 @@ void StackAddressDescription::Print() const {
}
Printf(
"HINT: this may be a false positive if your program uses "
"some custom stack unwind mechanism or swapcontext\n");
"some custom stack unwind mechanism, swapcontext or vfork\n");
if (SANITIZER_WINDOWS)
Printf(" (longjmp, SEH and C++ exceptions *are* supported)\n");
else
@ -418,26 +417,19 @@ void HeapAddressDescription::Print() const {
AsanThreadContext *alloc_thread = GetThreadContextByTidLocked(alloc_tid);
StackTrace alloc_stack = GetStackTraceFromId(alloc_stack_id);
char tname[128];
Decorator d;
AsanThreadContext *free_thread = nullptr;
if (free_tid != kInvalidTid) {
free_thread = GetThreadContextByTidLocked(free_tid);
Printf("%sfreed by thread T%d%s here:%s\n", d.Allocation(),
free_thread->tid,
ThreadNameWithParenthesis(free_thread, tname, sizeof(tname)),
d.Default());
Printf("%sfreed by thread %s here:%s\n", d.Allocation(),
AsanThreadIdAndName(free_thread).c_str(), d.Default());
StackTrace free_stack = GetStackTraceFromId(free_stack_id);
free_stack.Print();
Printf("%spreviously allocated by thread T%d%s here:%s\n", d.Allocation(),
alloc_thread->tid,
ThreadNameWithParenthesis(alloc_thread, tname, sizeof(tname)),
d.Default());
Printf("%spreviously allocated by thread %s here:%s\n", d.Allocation(),
AsanThreadIdAndName(alloc_thread).c_str(), d.Default());
} else {
Printf("%sallocated by thread T%d%s here:%s\n", d.Allocation(),
alloc_thread->tid,
ThreadNameWithParenthesis(alloc_thread, tname, sizeof(tname)),
d.Default());
Printf("%sallocated by thread %s here:%s\n", d.Allocation(),
AsanThreadIdAndName(alloc_thread).c_str(), d.Default());
}
alloc_stack.Print();
DescribeThread(GetCurrentThread());


@ -26,9 +26,20 @@ void DescribeThread(AsanThreadContext *context);
static inline void DescribeThread(AsanThread *t) {
if (t) DescribeThread(t->context());
}
const char *ThreadNameWithParenthesis(AsanThreadContext *t, char buff[],
uptr buff_len);
const char *ThreadNameWithParenthesis(u32 tid, char buff[], uptr buff_len);
class AsanThreadIdAndName {
public:
explicit AsanThreadIdAndName(AsanThreadContext *t);
explicit AsanThreadIdAndName(u32 tid);
// Contains "T%tid (%name)" or "T%tid" if the name is empty.
const char *c_str() const { return &name[0]; }
private:
void Init(u32 tid, const char *tname);
char name[128];
};
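// A minimal usage sketch for the helper above (illustrative only; the
// function name and message text are assumptions, not code from the import):
//
//   void ExampleDescribeThread(u32 tid) {
//     // Prints e.g. "allocated by thread T7 (worker)", or just "... T7" if
//     // the thread has no name, replacing the old ThreadNameWithParenthesis
//     // plus stack buffer pattern.
//     Printf("allocated by thread %s\n", AsanThreadIdAndName(tid).c_str());
//   }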
class Decorator : public __sanitizer::SanitizerCommonDecorator {
public:


@ -45,13 +45,11 @@ void ErrorDeadlySignal::Print() {
void ErrorDoubleFree::Print() {
Decorator d;
Printf("%s", d.Warning());
char tname[128];
Printf("%s", d.Error());
Report(
"ERROR: AddressSanitizer: attempting %s on %p in "
"thread T%d%s:\n",
scariness.GetDescription(), addr_description.addr, tid,
ThreadNameWithParenthesis(tid, tname, sizeof(tname)));
"ERROR: AddressSanitizer: attempting %s on %p in thread %s:\n",
scariness.GetDescription(), addr_description.addr,
AsanThreadIdAndName(tid).c_str());
Printf("%s", d.Default());
scariness.Print();
GET_STACK_TRACE_FATAL(second_free_stack->trace[0],
@ -63,13 +61,11 @@ void ErrorDoubleFree::Print() {
void ErrorNewDeleteTypeMismatch::Print() {
Decorator d;
Printf("%s", d.Warning());
char tname[128];
Printf("%s", d.Error());
Report(
"ERROR: AddressSanitizer: %s on %p in thread "
"T%d%s:\n",
scariness.GetDescription(), addr_description.addr, tid,
ThreadNameWithParenthesis(tid, tname, sizeof(tname)));
"ERROR: AddressSanitizer: %s on %p in thread %s:\n",
scariness.GetDescription(), addr_description.addr,
AsanThreadIdAndName(tid).c_str());
Printf("%s object passed to delete has wrong type:\n", d.Default());
if (delete_size != 0) {
Printf(
@ -106,13 +102,11 @@ void ErrorNewDeleteTypeMismatch::Print() {
void ErrorFreeNotMalloced::Print() {
Decorator d;
Printf("%s", d.Warning());
char tname[128];
Printf("%s", d.Error());
Report(
"ERROR: AddressSanitizer: attempting free on address "
"which was not malloc()-ed: %p in thread T%d%s\n",
addr_description.Address(), tid,
ThreadNameWithParenthesis(tid, tname, sizeof(tname)));
"which was not malloc()-ed: %p in thread %s\n",
addr_description.Address(), AsanThreadIdAndName(tid).c_str());
Printf("%s", d.Default());
CHECK_GT(free_stack->size, 0);
scariness.Print();
@ -129,7 +123,7 @@ void ErrorAllocTypeMismatch::Print() {
"operator delete []"};
CHECK_NE(alloc_type, dealloc_type);
Decorator d;
Printf("%s", d.Warning());
Printf("%s", d.Error());
Report("ERROR: AddressSanitizer: %s (%s vs %s) on %p\n",
scariness.GetDescription(),
alloc_names[alloc_type], dealloc_names[dealloc_type],
@ -148,7 +142,7 @@ void ErrorAllocTypeMismatch::Print() {
void ErrorMallocUsableSizeNotOwned::Print() {
Decorator d;
Printf("%s", d.Warning());
Printf("%s", d.Error());
Report(
"ERROR: AddressSanitizer: attempting to call malloc_usable_size() for "
"pointer which is not owned: %p\n",
@ -161,7 +155,7 @@ void ErrorMallocUsableSizeNotOwned::Print() {
void ErrorSanitizerGetAllocatedSizeNotOwned::Print() {
Decorator d;
Printf("%s", d.Warning());
Printf("%s", d.Error());
Report(
"ERROR: AddressSanitizer: attempting to call "
"__sanitizer_get_allocated_size() for pointer which is not owned: %p\n",
@ -172,11 +166,123 @@ void ErrorSanitizerGetAllocatedSizeNotOwned::Print() {
ReportErrorSummary(scariness.GetDescription(), stack);
}
void ErrorCallocOverflow::Print() {
Decorator d;
Printf("%s", d.Error());
Report(
"ERROR: AddressSanitizer: calloc parameters overflow: count * size "
"(%zd * %zd) cannot be represented in type size_t (thread %s)\n",
count, size, AsanThreadIdAndName(tid).c_str());
Printf("%s", d.Default());
stack->Print();
PrintHintAllocatorCannotReturnNull();
ReportErrorSummary(scariness.GetDescription(), stack);
}
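// An illustrative trigger for the report above (a sketch assuming LP64
// size_t; the helper name is hypothetical, not code from the import):
//
//   void *ExampleCallocOverflow() {
//     const uptr kCount = (uptr)1 << (sizeof(uptr) * 8 - 1);
//     // kCount * 4 does not fit in size_t, so CheckForCallocOverflow() fires
//     // and, unless allocator_may_return_null=1, this report aborts.
//     return calloc(kCount, 4);
//   }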
void ErrorPvallocOverflow::Print() {
Decorator d;
Printf("%s", d.Error());
Report(
"ERROR: AddressSanitizer: pvalloc parameters overflow: size 0x%zx "
"rounded up to system page size 0x%zx cannot be represented in type "
"size_t (thread %s)\n",
size, GetPageSizeCached(), AsanThreadIdAndName(tid).c_str());
Printf("%s", d.Default());
stack->Print();
PrintHintAllocatorCannotReturnNull();
ReportErrorSummary(scariness.GetDescription(), stack);
}
void ErrorInvalidAllocationAlignment::Print() {
Decorator d;
Printf("%s", d.Error());
Report(
"ERROR: AddressSanitizer: invalid allocation alignment: %zd, "
"alignment must be a power of two (thread %s)\n",
alignment, AsanThreadIdAndName(tid).c_str());
Printf("%s", d.Default());
stack->Print();
PrintHintAllocatorCannotReturnNull();
ReportErrorSummary(scariness.GetDescription(), stack);
}
void ErrorInvalidAlignedAllocAlignment::Print() {
Decorator d;
Printf("%s", d.Error());
#if SANITIZER_POSIX
Report("ERROR: AddressSanitizer: invalid alignment requested in "
"aligned_alloc: %zd, alignment must be a power of two and the "
"requested size 0x%zx must be a multiple of alignment "
"(thread %s)\n", alignment, size, AsanThreadIdAndName(tid).c_str());
#else
Report("ERROR: AddressSanitizer: invalid alignment requested in "
"aligned_alloc: %zd, the requested size 0x%zx must be a multiple of "
"alignment (thread %s)\n", alignment, size,
AsanThreadIdAndName(tid).c_str());
#endif
Printf("%s", d.Default());
stack->Print();
PrintHintAllocatorCannotReturnNull();
ReportErrorSummary(scariness.GetDescription(), stack);
}
void ErrorInvalidPosixMemalignAlignment::Print() {
Decorator d;
Printf("%s", d.Error());
Report(
"ERROR: AddressSanitizer: invalid alignment requested in posix_memalign: "
"%zd, alignment must be a power of two and a multiple of sizeof(void*) "
"== %zd (thread %s)\n",
alignment, sizeof(void*), AsanThreadIdAndName(tid).c_str()); // NOLINT
Printf("%s", d.Default());
stack->Print();
PrintHintAllocatorCannotReturnNull();
ReportErrorSummary(scariness.GetDescription(), stack);
}
void ErrorAllocationSizeTooBig::Print() {
Decorator d;
Printf("%s", d.Error());
Report(
"ERROR: AddressSanitizer: requested allocation size 0x%zx (0x%zx after "
"adjustments for alignment, red zones etc.) exceeds maximum supported "
"size of 0x%zx (thread %s)\n",
user_size, total_size, max_size, AsanThreadIdAndName(tid).c_str());
Printf("%s", d.Default());
stack->Print();
PrintHintAllocatorCannotReturnNull();
ReportErrorSummary(scariness.GetDescription(), stack);
}
void ErrorRssLimitExceeded::Print() {
Decorator d;
Printf("%s", d.Error());
Report(
"ERROR: AddressSanitizer: specified RSS limit exceeded, currently set to "
"soft_rss_limit_mb=%zd\n", common_flags()->soft_rss_limit_mb);
Printf("%s", d.Default());
stack->Print();
PrintHintAllocatorCannotReturnNull();
ReportErrorSummary(scariness.GetDescription(), stack);
}
void ErrorOutOfMemory::Print() {
Decorator d;
Printf("%s", d.Error());
Report(
"ERROR: AddressSanitizer: allocator is out of memory trying to allocate "
"0x%zx bytes\n", requested_size);
Printf("%s", d.Default());
stack->Print();
PrintHintAllocatorCannotReturnNull();
ReportErrorSummary(scariness.GetDescription(), stack);
}
void ErrorStringFunctionMemoryRangesOverlap::Print() {
Decorator d;
char bug_type[100];
internal_snprintf(bug_type, sizeof(bug_type), "%s-param-overlap", function);
Printf("%s", d.Warning());
Printf("%s", d.Error());
Report(
"ERROR: AddressSanitizer: %s: memory ranges [%p,%p) and [%p, %p) "
"overlap\n",
@ -193,7 +299,7 @@ void ErrorStringFunctionMemoryRangesOverlap::Print() {
void ErrorStringFunctionSizeOverflow::Print() {
Decorator d;
Printf("%s", d.Warning());
Printf("%s", d.Error());
Report("ERROR: AddressSanitizer: %s: (size=%zd)\n",
scariness.GetDescription(), size);
Printf("%s", d.Default());
@ -221,7 +327,7 @@ void ErrorBadParamsToAnnotateContiguousContainer::Print() {
void ErrorODRViolation::Print() {
Decorator d;
Printf("%s", d.Warning());
Printf("%s", d.Error());
Report("ERROR: AddressSanitizer: %s (%p):\n", scariness.GetDescription(),
global1.beg);
Printf("%s", d.Default());
@ -250,7 +356,7 @@ void ErrorODRViolation::Print() {
void ErrorInvalidPointerPair::Print() {
Decorator d;
Printf("%s", d.Warning());
Printf("%s", d.Error());
Report("ERROR: AddressSanitizer: %s: %p %p\n", scariness.GetDescription(),
addr1_description.Address(), addr2_description.Address());
Printf("%s", d.Default());
@ -414,6 +520,7 @@ static void PrintLegend(InternalScopedString *str) {
PrintShadowByte(str, " ASan internal: ", kAsanInternalHeapMagic);
PrintShadowByte(str, " Left alloca redzone: ", kAsanAllocaLeftMagic);
PrintShadowByte(str, " Right alloca redzone: ", kAsanAllocaRightMagic);
PrintShadowByte(str, " Shadow gap: ", kAsanShadowGap);
}
static void PrintShadowBytes(InternalScopedString *str, const char *before,
@ -453,17 +560,15 @@ static void PrintShadowMemoryForAddress(uptr addr) {
void ErrorGeneric::Print() {
Decorator d;
Printf("%s", d.Warning());
Printf("%s", d.Error());
uptr addr = addr_description.Address();
Report("ERROR: AddressSanitizer: %s on address %p at pc %p bp %p sp %p\n",
bug_descr, (void *)addr, pc, bp, sp);
Printf("%s", d.Default());
char tname[128];
Printf("%s%s of size %zu at %p thread T%d%s%s\n", d.Access(),
Printf("%s%s of size %zu at %p thread %s%s\n", d.Access(),
access_size ? (is_write ? "WRITE" : "READ") : "ACCESS", access_size,
(void *)addr, tid,
ThreadNameWithParenthesis(tid, tname, sizeof(tname)), d.Default());
(void *)addr, AsanThreadIdAndName(tid).c_str(), d.Default());
scariness.Print();
GET_STACK_TRACE_FATAL(pc, bp);


@ -20,20 +20,30 @@
namespace __asan {
// (*) VS2013 does not implement unrestricted unions, so we need a trivial
// default constructor explicitly defined for each particular error.
// None of the error classes own the stack traces mentioned in them.
struct ErrorBase {
ErrorBase() = default;
explicit ErrorBase(u32 tid_) : tid(tid_) {}
ScarinessScoreBase scariness;
u32 tid;
ErrorBase() = default; // (*)
explicit ErrorBase(u32 tid_) : tid(tid_) {}
ErrorBase(u32 tid_, int initial_score, const char *reason) : tid(tid_) {
scariness.Clear();
scariness.Scare(initial_score, reason);
}
};
struct ErrorDeadlySignal : ErrorBase {
SignalContext signal;
// VS2013 doesn't implement unrestricted unions, so we need a trivial default
// constructor
ErrorDeadlySignal() = default;
ErrorDeadlySignal() = default; // (*)
ErrorDeadlySignal(u32 tid, const SignalContext &sig)
: ErrorBase(tid), signal(sig) {
: ErrorBase(tid),
signal(sig) {
scariness.Clear();
if (signal.IsStackOverflow()) {
scariness.Scare(10, "stack-overflow");
@ -55,125 +65,206 @@ struct ErrorDeadlySignal : ErrorBase {
};
struct ErrorDoubleFree : ErrorBase {
// ErrorDoubleFree doesn't own the stack trace.
const BufferedStackTrace *second_free_stack;
HeapAddressDescription addr_description;
// VS2013 doesn't implement unrestricted unions, so we need a trivial default
// constructor
ErrorDoubleFree() = default;
ErrorDoubleFree() = default; // (*)
ErrorDoubleFree(u32 tid, BufferedStackTrace *stack, uptr addr)
: ErrorBase(tid), second_free_stack(stack) {
: ErrorBase(tid, 42, "double-free"),
second_free_stack(stack) {
CHECK_GT(second_free_stack->size, 0);
GetHeapAddressInformation(addr, 1, &addr_description);
scariness.Clear();
scariness.Scare(42, "double-free");
}
void Print();
};
struct ErrorNewDeleteTypeMismatch : ErrorBase {
// ErrorNewDeleteTypeMismatch doesn't own the stack trace.
const BufferedStackTrace *free_stack;
HeapAddressDescription addr_description;
uptr delete_size;
uptr delete_alignment;
// VS2013 doesn't implement unrestricted unions, so we need a trivial default
// constructor
ErrorNewDeleteTypeMismatch() = default;
ErrorNewDeleteTypeMismatch() = default; // (*)
ErrorNewDeleteTypeMismatch(u32 tid, BufferedStackTrace *stack, uptr addr,
uptr delete_size_, uptr delete_alignment_)
: ErrorBase(tid), free_stack(stack), delete_size(delete_size_),
: ErrorBase(tid, 10, "new-delete-type-mismatch"),
free_stack(stack),
delete_size(delete_size_),
delete_alignment(delete_alignment_) {
GetHeapAddressInformation(addr, 1, &addr_description);
scariness.Clear();
scariness.Scare(10, "new-delete-type-mismatch");
}
void Print();
};
struct ErrorFreeNotMalloced : ErrorBase {
// ErrorFreeNotMalloced doesn't own the stack trace.
const BufferedStackTrace *free_stack;
AddressDescription addr_description;
// VS2013 doesn't implement unrestricted unions, so we need a trivial default
// constructor
ErrorFreeNotMalloced() = default;
ErrorFreeNotMalloced() = default; // (*)
ErrorFreeNotMalloced(u32 tid, BufferedStackTrace *stack, uptr addr)
: ErrorBase(tid),
: ErrorBase(tid, 40, "bad-free"),
free_stack(stack),
addr_description(addr, /*shouldLockThreadRegistry=*/false) {
scariness.Clear();
scariness.Scare(40, "bad-free");
}
addr_description(addr, /*shouldLockThreadRegistry=*/false) {}
void Print();
};
struct ErrorAllocTypeMismatch : ErrorBase {
// ErrorAllocTypeMismatch doesn't own the stack trace.
const BufferedStackTrace *dealloc_stack;
HeapAddressDescription addr_description;
AllocType alloc_type, dealloc_type;
// VS2013 doesn't implement unrestricted unions, so we need a trivial default
// constructor
ErrorAllocTypeMismatch() = default;
ErrorAllocTypeMismatch() = default; // (*)
ErrorAllocTypeMismatch(u32 tid, BufferedStackTrace *stack, uptr addr,
AllocType alloc_type_, AllocType dealloc_type_)
: ErrorBase(tid),
: ErrorBase(tid, 10, "alloc-dealloc-mismatch"),
dealloc_stack(stack),
alloc_type(alloc_type_),
dealloc_type(dealloc_type_) {
GetHeapAddressInformation(addr, 1, &addr_description);
scariness.Clear();
scariness.Scare(10, "alloc-dealloc-mismatch");
};
void Print();
};
struct ErrorMallocUsableSizeNotOwned : ErrorBase {
// ErrorMallocUsableSizeNotOwned doesn't own the stack trace.
const BufferedStackTrace *stack;
AddressDescription addr_description;
// VS2013 doesn't implement unrestricted unions, so we need a trivial default
// constructor
ErrorMallocUsableSizeNotOwned() = default;
ErrorMallocUsableSizeNotOwned() = default; // (*)
ErrorMallocUsableSizeNotOwned(u32 tid, BufferedStackTrace *stack_, uptr addr)
: ErrorBase(tid),
: ErrorBase(tid, 10, "bad-malloc_usable_size"),
stack(stack_),
addr_description(addr, /*shouldLockThreadRegistry=*/false) {
scariness.Clear();
scariness.Scare(10, "bad-malloc_usable_size");
}
addr_description(addr, /*shouldLockThreadRegistry=*/false) {}
void Print();
};
struct ErrorSanitizerGetAllocatedSizeNotOwned : ErrorBase {
// ErrorSanitizerGetAllocatedSizeNotOwned doesn't own the stack trace.
const BufferedStackTrace *stack;
AddressDescription addr_description;
// VS2013 doesn't implement unrestricted unions, so we need a trivial default
// constructor
ErrorSanitizerGetAllocatedSizeNotOwned() = default;
ErrorSanitizerGetAllocatedSizeNotOwned() = default; // (*)
ErrorSanitizerGetAllocatedSizeNotOwned(u32 tid, BufferedStackTrace *stack_,
uptr addr)
: ErrorBase(tid),
: ErrorBase(tid, 10, "bad-__sanitizer_get_allocated_size"),
stack(stack_),
addr_description(addr, /*shouldLockThreadRegistry=*/false) {
scariness.Clear();
scariness.Scare(10, "bad-__sanitizer_get_allocated_size");
}
addr_description(addr, /*shouldLockThreadRegistry=*/false) {}
void Print();
};
struct ErrorCallocOverflow : ErrorBase {
const BufferedStackTrace *stack;
uptr count;
uptr size;
ErrorCallocOverflow() = default; // (*)
ErrorCallocOverflow(u32 tid, BufferedStackTrace *stack_, uptr count_,
uptr size_)
: ErrorBase(tid, 10, "calloc-overflow"),
stack(stack_),
count(count_),
size(size_) {}
void Print();
};
struct ErrorPvallocOverflow : ErrorBase {
const BufferedStackTrace *stack;
uptr size;
ErrorPvallocOverflow() = default; // (*)
ErrorPvallocOverflow(u32 tid, BufferedStackTrace *stack_, uptr size_)
: ErrorBase(tid, 10, "pvalloc-overflow"),
stack(stack_),
size(size_) {}
void Print();
};
struct ErrorInvalidAllocationAlignment : ErrorBase {
const BufferedStackTrace *stack;
uptr alignment;
ErrorInvalidAllocationAlignment() = default; // (*)
ErrorInvalidAllocationAlignment(u32 tid, BufferedStackTrace *stack_,
uptr alignment_)
: ErrorBase(tid, 10, "invalid-allocation-alignment"),
stack(stack_),
alignment(alignment_) {}
void Print();
};
struct ErrorInvalidAlignedAllocAlignment : ErrorBase {
const BufferedStackTrace *stack;
uptr size;
uptr alignment;
ErrorInvalidAlignedAllocAlignment() = default; // (*)
ErrorInvalidAlignedAllocAlignment(u32 tid, BufferedStackTrace *stack_,
uptr size_, uptr alignment_)
: ErrorBase(tid, 10, "invalid-aligned-alloc-alignment"),
stack(stack_),
size(size_),
alignment(alignment_) {}
void Print();
};
struct ErrorInvalidPosixMemalignAlignment : ErrorBase {
const BufferedStackTrace *stack;
uptr alignment;
ErrorInvalidPosixMemalignAlignment() = default; // (*)
ErrorInvalidPosixMemalignAlignment(u32 tid, BufferedStackTrace *stack_,
uptr alignment_)
: ErrorBase(tid, 10, "invalid-posix-memalign-alignment"),
stack(stack_),
alignment(alignment_) {}
void Print();
};
struct ErrorAllocationSizeTooBig : ErrorBase {
const BufferedStackTrace *stack;
uptr user_size;
uptr total_size;
uptr max_size;
ErrorAllocationSizeTooBig() = default; // (*)
ErrorAllocationSizeTooBig(u32 tid, BufferedStackTrace *stack_,
uptr user_size_, uptr total_size_, uptr max_size_)
: ErrorBase(tid, 10, "allocation-size-too-big"),
stack(stack_),
user_size(user_size_),
total_size(total_size_),
max_size(max_size_) {}
void Print();
};
struct ErrorRssLimitExceeded : ErrorBase {
const BufferedStackTrace *stack;
ErrorRssLimitExceeded() = default; // (*)
ErrorRssLimitExceeded(u32 tid, BufferedStackTrace *stack_)
: ErrorBase(tid, 10, "rss-limit-exceeded"),
stack(stack_) {}
void Print();
};
struct ErrorOutOfMemory : ErrorBase {
const BufferedStackTrace *stack;
uptr requested_size;
ErrorOutOfMemory() = default; // (*)
ErrorOutOfMemory(u32 tid, BufferedStackTrace *stack_, uptr requested_size_)
: ErrorBase(tid, 10, "out-of-memory"),
stack(stack_),
requested_size(requested_size_) {}
void Print();
};
struct ErrorStringFunctionMemoryRangesOverlap : ErrorBase {
// ErrorStringFunctionMemoryRangesOverlap doesn't own the stack trace.
const BufferedStackTrace *stack;
uptr length1, length2;
AddressDescription addr1_description;
AddressDescription addr2_description;
const char *function;
// VS2013 doesn't implement unrestricted unions, so we need a trivial default
// constructor
ErrorStringFunctionMemoryRangesOverlap() = default;
ErrorStringFunctionMemoryRangesOverlap() = default; // (*)
ErrorStringFunctionMemoryRangesOverlap(u32 tid, BufferedStackTrace *stack_,
uptr addr1, uptr length1_, uptr addr2,
uptr length2_, const char *function_)
@ -193,65 +284,51 @@ struct ErrorStringFunctionMemoryRangesOverlap : ErrorBase {
};
struct ErrorStringFunctionSizeOverflow : ErrorBase {
// ErrorStringFunctionSizeOverflow doesn't own the stack trace.
const BufferedStackTrace *stack;
AddressDescription addr_description;
uptr size;
// VS2013 doesn't implement unrestricted unions, so we need a trivial default
// constructor
ErrorStringFunctionSizeOverflow() = default;
ErrorStringFunctionSizeOverflow() = default; // (*)
ErrorStringFunctionSizeOverflow(u32 tid, BufferedStackTrace *stack_,
uptr addr, uptr size_)
: ErrorBase(tid),
: ErrorBase(tid, 10, "negative-size-param"),
stack(stack_),
addr_description(addr, /*shouldLockThreadRegistry=*/false),
size(size_) {
scariness.Clear();
scariness.Scare(10, "negative-size-param");
}
size(size_) {}
void Print();
};
struct ErrorBadParamsToAnnotateContiguousContainer : ErrorBase {
// ErrorBadParamsToAnnotateContiguousContainer doesn't own the stack trace.
const BufferedStackTrace *stack;
uptr beg, end, old_mid, new_mid;
// VS2013 doesn't implement unrestricted unions, so we need a trivial default
// constructor
ErrorBadParamsToAnnotateContiguousContainer() = default;
ErrorBadParamsToAnnotateContiguousContainer() = default; // (*)
// PS4: Do we want an AddressDescription for beg?
ErrorBadParamsToAnnotateContiguousContainer(u32 tid,
BufferedStackTrace *stack_,
uptr beg_, uptr end_,
uptr old_mid_, uptr new_mid_)
: ErrorBase(tid),
: ErrorBase(tid, 10, "bad-__sanitizer_annotate_contiguous_container"),
stack(stack_),
beg(beg_),
end(end_),
old_mid(old_mid_),
new_mid(new_mid_) {
scariness.Clear();
scariness.Scare(10, "bad-__sanitizer_annotate_contiguous_container");
}
new_mid(new_mid_) {}
void Print();
};
struct ErrorODRViolation : ErrorBase {
__asan_global global1, global2;
u32 stack_id1, stack_id2;
// VS2013 doesn't implement unrestricted unions, so we need a trivial default
// constructor
ErrorODRViolation() = default;
ErrorODRViolation() = default; // (*)
ErrorODRViolation(u32 tid, const __asan_global *g1, u32 stack_id1_,
const __asan_global *g2, u32 stack_id2_)
: ErrorBase(tid),
: ErrorBase(tid, 10, "odr-violation"),
global1(*g1),
global2(*g2),
stack_id1(stack_id1_),
stack_id2(stack_id2_) {
scariness.Clear();
scariness.Scare(10, "odr-violation");
}
stack_id2(stack_id2_) {}
void Print();
};
@ -259,20 +336,16 @@ struct ErrorInvalidPointerPair : ErrorBase {
uptr pc, bp, sp;
AddressDescription addr1_description;
AddressDescription addr2_description;
// VS2013 doesn't implement unrestricted unions, so we need a trivial default
// constructor
ErrorInvalidPointerPair() = default;
ErrorInvalidPointerPair() = default; // (*)
ErrorInvalidPointerPair(u32 tid, uptr pc_, uptr bp_, uptr sp_, uptr p1,
uptr p2)
: ErrorBase(tid),
: ErrorBase(tid, 10, "invalid-pointer-pair"),
pc(pc_),
bp(bp_),
sp(sp_),
addr1_description(p1, 1, /*shouldLockThreadRegistry=*/false),
addr2_description(p2, 1, /*shouldLockThreadRegistry=*/false) {
scariness.Clear();
scariness.Scare(10, "invalid-pointer-pair");
}
addr2_description(p2, 1, /*shouldLockThreadRegistry=*/false) {}
void Print();
};
@ -283,9 +356,8 @@ struct ErrorGeneric : ErrorBase {
const char *bug_descr;
bool is_write;
u8 shadow_val;
// VS2013 doesn't implement unrestricted unions, so we need a trivial default
// constructor
ErrorGeneric() = default;
ErrorGeneric() = default; // (*)
ErrorGeneric(u32 tid, uptr addr, uptr pc_, uptr bp_, uptr sp_, bool is_write_,
uptr access_size_);
void Print();
@ -300,6 +372,14 @@ struct ErrorGeneric : ErrorBase {
macro(AllocTypeMismatch) \
macro(MallocUsableSizeNotOwned) \
macro(SanitizerGetAllocatedSizeNotOwned) \
macro(CallocOverflow) \
macro(PvallocOverflow) \
macro(InvalidAllocationAlignment) \
macro(InvalidAlignedAllocAlignment) \
macro(InvalidPosixMemalignAlignment) \
macro(AllocationSizeTooBig) \
macro(RssLimitExceeded) \
macro(OutOfMemory) \
macro(StringFunctionMemoryRangesOverlap) \
macro(StringFunctionSizeOverflow) \
macro(BadParamsToAnnotateContiguousContainer) \
@ -334,6 +414,7 @@ struct ErrorDescription {
};
ErrorDescription() { internal_memset(this, 0, sizeof(*this)); }
explicit ErrorDescription(LinkerInitialized) {}
ASAN_FOR_EACH_ERROR_KIND(ASAN_ERROR_DESCRIPTION_CONSTRUCTOR)
bool IsValid() { return kind != kErrorKindInvalid; }


@ -33,10 +33,7 @@ static const char *MaybeCallAsanDefaultOptions() {
static const char *MaybeUseAsanDefaultOptionsCompileDefinition() {
#ifdef ASAN_DEFAULT_OPTIONS
// Stringize the macro value.
# define ASAN_STRINGIZE(x) #x
# define ASAN_STRINGIZE_OPTIONS(options) ASAN_STRINGIZE(options)
return ASAN_STRINGIZE_OPTIONS(ASAN_DEFAULT_OPTIONS);
return SANITIZER_STRINGIFY(ASAN_DEFAULT_OPTIONS);
#else
return "";
#endif
@ -163,6 +160,10 @@ void InitializeFlags() {
CHECK_LE(f->max_redzone, 2048);
CHECK(IsPowerOfTwo(f->redzone));
CHECK(IsPowerOfTwo(f->max_redzone));
if (SANITIZER_RTEMS) {
CHECK(!f->unmap_shadow_on_exit);
CHECK(!f->protect_shadow_gap);
}
// quarantine_size is deprecated but we still honor it.
// quarantine_size can not be used together with quarantine_size_mb.


@ -88,7 +88,8 @@ ASAN_FLAG(bool, check_malloc_usable_size, true,
"295.*.")
ASAN_FLAG(bool, unmap_shadow_on_exit, false,
"If set, explicitly unmaps the (huge) shadow at exit.")
ASAN_FLAG(bool, protect_shadow_gap, true, "If set, mprotect the shadow gap")
ASAN_FLAG(bool, protect_shadow_gap, !SANITIZER_RTEMS,
"If set, mprotect the shadow gap")
ASAN_FLAG(bool, print_stats, false,
"Print various statistics after printing an error message or if "
"atexit=1.")
@ -136,9 +137,9 @@ ASAN_FLAG(
"Android. ")
ASAN_FLAG(
int, detect_invalid_pointer_pairs, 0,
"If non-zero, try to detect operations like <, <=, >, >= and - on "
"invalid pointer pairs (e.g. when pointers belong to different objects). "
"The bigger the value the harder we try.")
"If >= 2, detect operations like <, <=, >, >= and - on invalid pointer "
"pairs (e.g. when pointers belong to different objects); "
"If == 1, detect invalid operations only when both pointers are non-null.")
ASAN_FLAG(
bool, detect_container_overflow, true,
"If true, honor the container overflow annotations. See "


@ -224,8 +224,9 @@ static void RegisterGlobal(const Global *g) {
list_of_all_globals = l;
if (g->has_dynamic_init) {
if (!dynamic_init_globals) {
dynamic_init_globals = new(allocator_for_globals)
VectorOfGlobals(kDynamicInitGlobalsInitialCapacity);
dynamic_init_globals =
new (allocator_for_globals) VectorOfGlobals; // NOLINT
dynamic_init_globals->reserve(kDynamicInitGlobalsInitialCapacity);
}
DynInitGlobal dyn_global = { *g, false };
dynamic_init_globals->push_back(dyn_global);
@ -358,9 +359,11 @@ void __asan_register_globals(__asan_global *globals, uptr n) {
GET_STACK_TRACE_MALLOC;
u32 stack_id = StackDepotPut(stack);
BlockingMutexLock lock(&mu_for_globals);
if (!global_registration_site_vector)
if (!global_registration_site_vector) {
global_registration_site_vector =
new(allocator_for_globals) GlobalRegistrationSiteVector(128);
new (allocator_for_globals) GlobalRegistrationSiteVector; // NOLINT
global_registration_site_vector->reserve(128);
}
GlobalRegistrationSite site = {stack_id, &globals[0], &globals[n - 1]};
global_registration_site_vector->push_back(site);
if (flags()->report_globals >= 2) {


@ -19,9 +19,9 @@ namespace __asan {
#pragma section(".ASAN$GA", read, write) // NOLINT
#pragma section(".ASAN$GZ", read, write) // NOLINT
extern "C" __declspec(allocate(".ASAN$GA"))
__asan_global __asan_globals_start = {};
ALIGNED(sizeof(__asan_global)) __asan_global __asan_globals_start = {};
extern "C" __declspec(allocate(".ASAN$GZ"))
__asan_global __asan_globals_end = {};
ALIGNED(sizeof(__asan_global)) __asan_global __asan_globals_end = {};
#pragma comment(linker, "/merge:.ASAN=.data")
static void call_on_globals(void (*hook)(__asan_global *, uptr)) {


@ -24,15 +24,20 @@
#include "lsan/lsan_common.h"
#include "sanitizer_common/sanitizer_libc.h"
// There is no general interception at all on Fuchsia.
// There is no general interception at all on Fuchsia and RTEMS.
// Only the functions in asan_interceptors_memintrinsics.cc are
// really defined to replace libc functions.
#if !SANITIZER_FUCHSIA
#if !SANITIZER_FUCHSIA && !SANITIZER_RTEMS
#if SANITIZER_POSIX
#include "sanitizer_common/sanitizer_posix.h"
#endif
#if ASAN_INTERCEPT__UNWIND_RAISEEXCEPTION || \
ASAN_INTERCEPT__SJLJ_UNWIND_RAISEEXCEPTION
#include <unwind.h>
#endif
#if defined(__i386) && SANITIZER_LINUX
#define ASAN_PTHREAD_CREATE_VERSION "GLIBC_2.1"
#elif defined(__mips__) && SANITIZER_LINUX
@ -178,6 +183,7 @@ DECLARE_REAL_AND_INTERCEPTOR(void, free, void *)
(void)(s); \
} while (false)
#include "sanitizer_common/sanitizer_common_syscalls.inc"
#include "sanitizer_common/sanitizer_syscalls_netbsd.inc"
struct ThreadStartParam {
atomic_uintptr_t t;
@ -269,7 +275,15 @@ INTERCEPTOR(int, swapcontext, struct ucontext_t *oucp,
uptr stack, ssize;
ReadContextStack(ucp, &stack, &ssize);
ClearShadowMemoryForContextStack(stack, ssize);
#if __has_attribute(__indirect_return__) && \
(defined(__x86_64__) || defined(__i386__))
int (*real_swapcontext)(struct ucontext_t *, struct ucontext_t *)
__attribute__((__indirect_return__))
= REAL(swapcontext);
int res = real_swapcontext(oucp, ucp);
#else
int res = REAL(swapcontext)(oucp, ucp);
#endif
// swapcontext technically does not return, but program may swap context to
// "oucp" later, that would look as if swapcontext() returned 0.
// We need to clear shadow for ucp once again, as it may be in arbitrary
@ -318,6 +332,32 @@ INTERCEPTOR(void, __cxa_throw, void *a, void *b, void *c) {
}
#endif
#if ASAN_INTERCEPT___CXA_RETHROW_PRIMARY_EXCEPTION
INTERCEPTOR(void, __cxa_rethrow_primary_exception, void *a) {
CHECK(REAL(__cxa_rethrow_primary_exception));
__asan_handle_no_return();
REAL(__cxa_rethrow_primary_exception)(a);
}
#endif
#if ASAN_INTERCEPT__UNWIND_RAISEEXCEPTION
INTERCEPTOR(_Unwind_Reason_Code, _Unwind_RaiseException,
_Unwind_Exception *object) {
CHECK(REAL(_Unwind_RaiseException));
__asan_handle_no_return();
return REAL(_Unwind_RaiseException)(object);
}
#endif
#if ASAN_INTERCEPT__SJLJ_UNWIND_RAISEEXCEPTION
INTERCEPTOR(_Unwind_Reason_Code, _Unwind_SjLj_RaiseException,
_Unwind_Exception *object) {
CHECK(REAL(_Unwind_SjLj_RaiseException));
__asan_handle_no_return();
return REAL(_Unwind_SjLj_RaiseException)(object);
}
#endif
#if ASAN_INTERCEPT_INDEX
# if ASAN_USE_ALIAS_ATTRIBUTE_FOR_INDEX
INTERCEPTOR(char*, index, const char *string, int c)
@ -540,14 +580,6 @@ INTERCEPTOR(int, __cxa_atexit, void (*func)(void *), void *arg,
}
#endif // ASAN_INTERCEPT___CXA_ATEXIT
#if ASAN_INTERCEPT_FORK
INTERCEPTOR(int, fork, void) {
ENSURE_ASAN_INITED();
int pid = REAL(fork)();
return pid;
}
#endif // ASAN_INTERCEPT_FORK
// ---------------------- InitializeAsanInterceptors ---------------- {{{1
namespace __asan {
void InitializeAsanInterceptors() {
@ -598,6 +630,17 @@ void InitializeAsanInterceptors() {
#if ASAN_INTERCEPT___CXA_THROW
ASAN_INTERCEPT_FUNC(__cxa_throw);
#endif
#if ASAN_INTERCEPT___CXA_RETHROW_PRIMARY_EXCEPTION
ASAN_INTERCEPT_FUNC(__cxa_rethrow_primary_exception);
#endif
// Indirectly intercept std::rethrow_exception.
#if ASAN_INTERCEPT__UNWIND_RAISEEXCEPTION
INTERCEPT_FUNCTION(_Unwind_RaiseException);
#endif
// Indirectly intercept std::rethrow_exception.
#if ASAN_INTERCEPT__UNWIND_SJLJ_RAISEEXCEPTION
INTERCEPT_FUNCTION(_Unwind_SjLj_RaiseException);
#endif
// Intercept threading-related functions
#if ASAN_INTERCEPT_PTHREAD_CREATE
@ -614,10 +657,6 @@ void InitializeAsanInterceptors() {
ASAN_INTERCEPT_FUNC(__cxa_atexit);
#endif
#if ASAN_INTERCEPT_FORK
ASAN_INTERCEPT_FUNC(fork);
#endif
InitializePlatformInterceptors();
VReport(1, "AddressSanitizer: libc interceptors initialized\n");


@ -34,10 +34,10 @@ void InitializePlatformInterceptors();
} // namespace __asan
// There is no general interception at all on Fuchsia.
// There is no general interception at all on Fuchsia and RTEMS.
// Only the functions in asan_interceptors_memintrinsics.h are
// really defined to replace libc functions.
#if !SANITIZER_FUCHSIA
#if !SANITIZER_FUCHSIA && !SANITIZER_RTEMS
// Use macro to describe if specific function should be
// intercepted on a given platform.
@ -46,13 +46,11 @@ void InitializePlatformInterceptors();
# define ASAN_INTERCEPT__LONGJMP 1
# define ASAN_INTERCEPT_INDEX 1
# define ASAN_INTERCEPT_PTHREAD_CREATE 1
# define ASAN_INTERCEPT_FORK 1
#else
# define ASAN_INTERCEPT_ATOLL_AND_STRTOLL 0
# define ASAN_INTERCEPT__LONGJMP 0
# define ASAN_INTERCEPT_INDEX 0
# define ASAN_INTERCEPT_PTHREAD_CREATE 0
# define ASAN_INTERCEPT_FORK 0
#endif
#if SANITIZER_FREEBSD || SANITIZER_LINUX || SANITIZER_NETBSD || \
@ -80,13 +78,20 @@ void InitializePlatformInterceptors();
# define ASAN_INTERCEPT___LONGJMP_CHK 0
#endif
// Android bug: https://code.google.com/p/android/issues/detail?id=61799
#if ASAN_HAS_EXCEPTIONS && !SANITIZER_WINDOWS && \
!(SANITIZER_ANDROID && defined(__i386)) && \
!SANITIZER_SOLARIS
#if ASAN_HAS_EXCEPTIONS && !SANITIZER_WINDOWS && !SANITIZER_SOLARIS && \
!SANITIZER_NETBSD
# define ASAN_INTERCEPT___CXA_THROW 1
# define ASAN_INTERCEPT___CXA_RETHROW_PRIMARY_EXCEPTION 1
# if defined(_GLIBCXX_SJLJ_EXCEPTIONS) || (SANITIZER_IOS && defined(__arm__))
# define ASAN_INTERCEPT__UNWIND_SJLJ_RAISEEXCEPTION 1
# else
# define ASAN_INTERCEPT__UNWIND_RAISEEXCEPTION 1
# endif
#else
# define ASAN_INTERCEPT___CXA_THROW 0
# define ASAN_INTERCEPT___CXA_RETHROW_PRIMARY_EXCEPTION 0
# define ASAN_INTERCEPT__UNWIND_RAISEEXCEPTION 0
# define ASAN_INTERCEPT__UNWIND_SJLJ_RAISEEXCEPTION 0
#endif
#if !SANITIZER_WINDOWS


@ -31,14 +31,14 @@ void *__asan_memmove(void *to, const void *from, uptr size) {
ASAN_MEMMOVE_IMPL(nullptr, to, from, size);
}
#if SANITIZER_FUCHSIA
#if SANITIZER_FUCHSIA || SANITIZER_RTEMS
// Fuchsia doesn't use sanitizer_common_interceptors.inc, but the only
// things there it wants are these three. Just define them as aliases
// here rather than repeating the contents.
// Fuchsia and RTEMS don't use sanitizer_common_interceptors.inc, but
// the only things there it wants are these three. Just define them
// as aliases here rather than repeating the contents.
decltype(memcpy) memcpy[[gnu::alias("__asan_memcpy")]];
decltype(memmove) memmove[[gnu::alias("__asan_memmove")]];
decltype(memset) memset[[gnu::alias("__asan_memset")]];
extern "C" decltype(__asan_memcpy) memcpy[[gnu::alias("__asan_memcpy")]];
extern "C" decltype(__asan_memmove) memmove[[gnu::alias("__asan_memmove")]];
extern "C" decltype(__asan_memset) memset[[gnu::alias("__asan_memset")]];
#endif // SANITIZER_FUCHSIA
#endif // SANITIZER_FUCHSIA || SANITIZER_RTEMS


@ -133,15 +133,22 @@ static inline bool RangesOverlap(const char *offset1, uptr length1,
const char *offset2, uptr length2) {
return !((offset1 + length1 <= offset2) || (offset2 + length2 <= offset1));
}
#define CHECK_RANGES_OVERLAP(name, _offset1, length1, _offset2, length2) do { \
const char *offset1 = (const char*)_offset1; \
const char *offset2 = (const char*)_offset2; \
if (RangesOverlap(offset1, length1, offset2, length2)) { \
GET_STACK_TRACE_FATAL_HERE; \
ReportStringFunctionMemoryRangesOverlap(name, offset1, length1, \
offset2, length2, &stack); \
} \
} while (0)
#define CHECK_RANGES_OVERLAP(name, _offset1, length1, _offset2, length2) \
do { \
const char *offset1 = (const char *)_offset1; \
const char *offset2 = (const char *)_offset2; \
if (RangesOverlap(offset1, length1, offset2, length2)) { \
GET_STACK_TRACE_FATAL_HERE; \
bool suppressed = IsInterceptorSuppressed(name); \
if (!suppressed && HaveStackTraceBasedSuppressions()) { \
suppressed = IsStackTraceSuppressed(&stack); \
} \
if (!suppressed) { \
ReportStringFunctionMemoryRangesOverlap(name, offset1, length1, \
offset2, length2, &stack); \
} \
} \
} while (0)
} // namespace __asan
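// Roughly how the string-function interceptors are expected to invoke the
// macro above (an abridged sketch, not a verbatim quote of the imported
// interceptors):
//
//   INTERCEPTOR(char *, strcpy, char *to, const char *from) {
//     ...
//     uptr from_size = REAL(strlen)(from) + 1;
//     CHECK_RANGES_OVERLAP("strcpy", to, from_size, from, from_size);
//     ...
//     return REAL(strcpy)(to, from);
//   }
//
// The newly added IsInterceptorSuppressed / IsStackTraceSuppressed checks let
// an interceptor_name or interceptor_via_fun suppression silence the overlap
// report instead of unconditionally reporting a fatal error.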


@ -36,7 +36,7 @@
// If set, values like allocator chunk size, as well as defaults for some flags
// will be changed towards less memory overhead.
#ifndef ASAN_LOW_MEMORY
# if SANITIZER_IOS || SANITIZER_ANDROID
# if SANITIZER_IOS || SANITIZER_ANDROID || SANITIZER_RTEMS
# define ASAN_LOW_MEMORY 1
# else
# define ASAN_LOW_MEMORY 0
@ -78,7 +78,7 @@ void InitializeShadowMemory();
// asan_malloc_linux.cc / asan_malloc_mac.cc
void ReplaceSystemMalloc();
// asan_linux.cc / asan_mac.cc / asan_win.cc
// asan_linux.cc / asan_mac.cc / asan_rtems.cc / asan_win.cc
uptr FindDynamicShadowStart();
void *AsanDoesNotSupportStaticLinkage();
void AsanCheckDynamicRTPrereqs();
@ -147,6 +147,9 @@ const int kAsanArrayCookieMagic = 0xac;
const int kAsanIntraObjectRedzone = 0xbb;
const int kAsanAllocaLeftMagic = 0xca;
const int kAsanAllocaRightMagic = 0xcb;
// Used to populate the shadow gap for systems without memory
// protection there (i.e. Myriad).
const int kAsanShadowGap = 0xcc;
static const uptr kCurrentStackFrameMagic = 0x41B58AB3;
static const uptr kRetiredStackFrameMagic = 0x45E0360E;


@ -32,6 +32,7 @@
#include <sys/types.h>
#include <dlfcn.h>
#include <fcntl.h>
#include <limits.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>
@ -214,7 +215,7 @@ void AsanCheckIncompatibleRT() {
// the functions in dynamic ASan runtime instead of the functions in
// system libraries, causing crashes later in ASan initialization.
MemoryMappingLayout proc_maps(/*cache_enabled*/true);
char filename[128];
char filename[PATH_MAX];
MemoryMappedSegment segment(filename, sizeof(filename));
while (proc_maps.Next(&segment)) {
if (IsDynamicRTName(segment.filename)) {


@ -62,16 +62,36 @@ uptr FindDynamicShadowStart() {
uptr space_size = kHighShadowEnd + left_padding;
uptr largest_gap_found = 0;
uptr shadow_start = FindAvailableMemoryRange(space_size, alignment,
granularity, &largest_gap_found);
uptr max_occupied_addr = 0;
VReport(2, "FindDynamicShadowStart, space_size = %p\n", space_size);
uptr shadow_start =
FindAvailableMemoryRange(space_size, alignment, granularity,
&largest_gap_found, &max_occupied_addr);
// If the shadow doesn't fit, restrict the address space to make it fit.
if (shadow_start == 0) {
VReport(
2,
"Shadow doesn't fit, largest_gap_found = %p, max_occupied_addr = %p\n",
largest_gap_found, max_occupied_addr);
uptr new_max_vm = RoundDownTo(largest_gap_found << SHADOW_SCALE, alignment);
if (new_max_vm < max_occupied_addr) {
Report("Unable to find a memory range for dynamic shadow.\n");
Report(
"space_size = %p, largest_gap_found = %p, max_occupied_addr = %p, "
"new_max_vm = %p\n",
space_size, largest_gap_found, max_occupied_addr, new_max_vm);
CHECK(0 && "cannot place shadow");
}
RestrictMemoryToMaxAddress(new_max_vm);
kHighMemEnd = new_max_vm - 1;
space_size = kHighShadowEnd + left_padding;
shadow_start =
FindAvailableMemoryRange(space_size, alignment, granularity, nullptr);
VReport(2, "FindDynamicShadowStart, space_size = %p\n", space_size);
shadow_start = FindAvailableMemoryRange(space_size, alignment, granularity,
nullptr, nullptr);
if (shadow_start == 0) {
Report("Unable to find a memory range after restricting VM.\n");
CHECK(0 && "cannot place shadow after restricting vm");
}
}
CHECK_NE((uptr)0, shadow_start);
CHECK(IsAligned(shadow_start, alignment));


@ -16,19 +16,23 @@
#include "sanitizer_common/sanitizer_platform.h"
#if SANITIZER_FREEBSD || SANITIZER_FUCHSIA || SANITIZER_LINUX || \
SANITIZER_NETBSD || SANITIZER_SOLARIS
SANITIZER_NETBSD || SANITIZER_RTEMS || SANITIZER_SOLARIS
#include "sanitizer_common/sanitizer_allocator_checks.h"
#include "sanitizer_common/sanitizer_errno.h"
#include "sanitizer_common/sanitizer_tls_get_addr.h"
#include "asan_allocator.h"
#include "asan_interceptors.h"
#include "asan_internal.h"
#include "asan_malloc_local.h"
#include "asan_stack.h"
// ---------------------- Replacement functions ---------------- {{{1
using namespace __asan; // NOLINT
static uptr allocated_for_dlsym;
static const uptr kDlsymAllocPoolSize = 1024;
static uptr last_dlsym_alloc_size_in_words;
static const uptr kDlsymAllocPoolSize = SANITIZER_RTEMS ? 4096 : 1024;
static uptr alloc_memory_for_dlsym[kDlsymAllocPoolSize];
static INLINE bool IsInDlsymAllocPool(const void *ptr) {
@ -39,21 +43,73 @@ static INLINE bool IsInDlsymAllocPool(const void *ptr) {
static void *AllocateFromLocalPool(uptr size_in_bytes) {
uptr size_in_words = RoundUpTo(size_in_bytes, kWordSize) / kWordSize;
void *mem = (void*)&alloc_memory_for_dlsym[allocated_for_dlsym];
last_dlsym_alloc_size_in_words = size_in_words;
allocated_for_dlsym += size_in_words;
CHECK_LT(allocated_for_dlsym, kDlsymAllocPoolSize);
return mem;
}
static void DeallocateFromLocalPool(const void *ptr) {
// Hack: since glibc 2.27 dlsym no longer uses stack-allocated memory to store
// error messages and instead uses malloc followed by free. To avoid pool
// exhaustion due to long object filenames, handle that special case here.
uptr prev_offset = allocated_for_dlsym - last_dlsym_alloc_size_in_words;
void *prev_mem = (void*)&alloc_memory_for_dlsym[prev_offset];
if (prev_mem == ptr) {
REAL(memset)(prev_mem, 0, last_dlsym_alloc_size_in_words * kWordSize);
allocated_for_dlsym = prev_offset;
last_dlsym_alloc_size_in_words = 0;
}
}
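// Sketch of the sequence the rollback above is meant to handle (an assumption
// about the glibc >= 2.27 behavior described in the comment):
//
//   p = malloc(n);   // during dlsym(): served by AllocateFromLocalPool()
//   ...              // dlsym() formats its error message into p
//   free(p);         // DeallocateFromLocalPool() rewinds allocated_for_dlsym
//
// Only the most recent pool allocation can be rolled back this way; older
// blocks simply stay in the static pool, which is acceptable during startup.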
static int PosixMemalignFromLocalPool(void **memptr, uptr alignment,
uptr size_in_bytes) {
if (UNLIKELY(!CheckPosixMemalignAlignment(alignment)))
return errno_EINVAL;
CHECK(alignment >= kWordSize);
uptr addr = (uptr)&alloc_memory_for_dlsym[allocated_for_dlsym];
uptr aligned_addr = RoundUpTo(addr, alignment);
uptr aligned_size = RoundUpTo(size_in_bytes, kWordSize);
uptr *end_mem = (uptr*)(aligned_addr + aligned_size);
uptr allocated = end_mem - alloc_memory_for_dlsym;
if (allocated >= kDlsymAllocPoolSize)
return errno_ENOMEM;
allocated_for_dlsym = allocated;
*memptr = (void*)aligned_addr;
return 0;
}
#if SANITIZER_RTEMS
void* MemalignFromLocalPool(uptr alignment, uptr size) {
void *ptr = nullptr;
alignment = Max(alignment, kWordSize);
PosixMemalignFromLocalPool(&ptr, alignment, size);
return ptr;
}
bool IsFromLocalPool(const void *ptr) {
return IsInDlsymAllocPool(ptr);
}
#endif
static INLINE bool MaybeInDlsym() {
// Fuchsia doesn't use dlsym-based interceptors.
return !SANITIZER_FUCHSIA && asan_init_is_running;
}
static INLINE bool UseLocalPool() {
return EarlyMalloc() || MaybeInDlsym();
}
static void *ReallocFromLocalPool(void *ptr, uptr size) {
const uptr offset = (uptr)ptr - (uptr)alloc_memory_for_dlsym;
const uptr copy_size = Min(size, kDlsymAllocPoolSize - offset);
void *new_ptr;
if (UNLIKELY(MaybeInDlsym())) {
if (UNLIKELY(UseLocalPool())) {
new_ptr = AllocateFromLocalPool(size);
} else {
ENSURE_ASAN_INITED();
@ -66,8 +122,10 @@ static void *ReallocFromLocalPool(void *ptr, uptr size) {
INTERCEPTOR(void, free, void *ptr) {
GET_STACK_TRACE_FREE;
if (UNLIKELY(IsInDlsymAllocPool(ptr)))
if (UNLIKELY(IsInDlsymAllocPool(ptr))) {
DeallocateFromLocalPool(ptr);
return;
}
asan_free(ptr, &stack, FROM_MALLOC);
}
@ -81,7 +139,7 @@ INTERCEPTOR(void, cfree, void *ptr) {
#endif // SANITIZER_INTERCEPT_CFREE
INTERCEPTOR(void*, malloc, uptr size) {
if (UNLIKELY(MaybeInDlsym()))
if (UNLIKELY(UseLocalPool()))
// Hack: dlsym calls malloc before REAL(malloc) is retrieved from dlsym.
return AllocateFromLocalPool(size);
ENSURE_ASAN_INITED();
@ -90,7 +148,7 @@ INTERCEPTOR(void*, malloc, uptr size) {
}
INTERCEPTOR(void*, calloc, uptr nmemb, uptr size) {
if (UNLIKELY(MaybeInDlsym()))
if (UNLIKELY(UseLocalPool()))
// Hack: dlsym calls calloc before REAL(calloc) is retrieved from dlsym.
return AllocateFromLocalPool(nmemb * size);
ENSURE_ASAN_INITED();
@ -101,7 +159,7 @@ INTERCEPTOR(void*, calloc, uptr nmemb, uptr size) {
INTERCEPTOR(void*, realloc, void *ptr, uptr size) {
if (UNLIKELY(IsInDlsymAllocPool(ptr)))
return ReallocFromLocalPool(ptr, size);
if (UNLIKELY(MaybeInDlsym()))
if (UNLIKELY(UseLocalPool()))
return AllocateFromLocalPool(size);
ENSURE_ASAN_INITED();
GET_STACK_TRACE_MALLOC;
@ -122,10 +180,12 @@ INTERCEPTOR(void*, __libc_memalign, uptr boundary, uptr size) {
}
#endif // SANITIZER_INTERCEPT_MEMALIGN
#if SANITIZER_INTERCEPT_ALIGNED_ALLOC
INTERCEPTOR(void*, aligned_alloc, uptr boundary, uptr size) {
GET_STACK_TRACE_MALLOC;
return asan_memalign(boundary, size, &stack, FROM_MALLOC);
return asan_aligned_alloc(boundary, size, &stack);
}
#endif // SANITIZER_INTERCEPT_ALIGNED_ALLOC
INTERCEPTOR(uptr, malloc_usable_size, void *ptr) {
GET_CURRENT_PC_BP_SP;
@ -154,8 +214,9 @@ INTERCEPTOR(int, mallopt, int cmd, int value) {
#endif // SANITIZER_INTERCEPT_MALLOPT_AND_MALLINFO
INTERCEPTOR(int, posix_memalign, void **memptr, uptr alignment, uptr size) {
if (UNLIKELY(UseLocalPool()))
return PosixMemalignFromLocalPool(memptr, alignment, size);
GET_STACK_TRACE_MALLOC;
// Printf("posix_memalign: %zx %zu\n", alignment, size);
return asan_posix_memalign(memptr, alignment, size, &stack);
}


@ -0,0 +1,44 @@
//===-- asan_malloc_local.h -------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of AddressSanitizer, an address sanity checker.
//
// Provide interfaces to check for and handle local pool memory allocation.
//===----------------------------------------------------------------------===//
#ifndef ASAN_MALLOC_LOCAL_H
#define ASAN_MALLOC_LOCAL_H
#include "sanitizer_common/sanitizer_platform.h"
#include "asan_internal.h"
// On RTEMS, we use the local pool to handle memory allocation when the ASan
// run-time is not up.
static INLINE bool EarlyMalloc() {
return SANITIZER_RTEMS && (!__asan::asan_inited ||
__asan::asan_init_is_running);
}
void* MemalignFromLocalPool(uptr alignment, uptr size);
#if SANITIZER_RTEMS
bool IsFromLocalPool(const void *ptr);
#define ALLOCATE_FROM_LOCAL_POOL UNLIKELY(EarlyMalloc())
#define IS_FROM_LOCAL_POOL(ptr) UNLIKELY(IsFromLocalPool(ptr))
#else // SANITIZER_RTEMS
#define ALLOCATE_FROM_LOCAL_POOL 0
#define IS_FROM_LOCAL_POOL(ptr) 0
#endif // SANITIZER_RTEMS
#endif // ASAN_MALLOC_LOCAL_H
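// A hedged sketch of how a malloc-style entry point is expected to consult
// these helpers (ExampleMalloc and its body are assumptions, not code from
// the import):
//
//   void *ExampleMalloc(uptr size, BufferedStackTrace *stack) {
//     if (ALLOCATE_FROM_LOCAL_POOL)      // RTEMS, before the runtime is up
//       return MemalignFromLocalPool(kWordSize, size);
//     return asan_malloc(size, stack);   // normal allocator path
//   }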


@ -38,6 +38,9 @@ using namespace __asan;
#define COMMON_MALLOC_CALLOC(count, size) \
GET_STACK_TRACE_MALLOC; \
void *p = asan_calloc(count, size, &stack);
#define COMMON_MALLOC_POSIX_MEMALIGN(memptr, alignment, size) \
GET_STACK_TRACE_MALLOC; \
int res = asan_posix_memalign(memptr, alignment, size, &stack);
#define COMMON_MALLOC_VALLOC(size) \
GET_STACK_TRACE_MALLOC; \
void *p = asan_memalign(GetPageSizeCached(), size, &stack, FROM_MALLOC);


@ -122,6 +122,13 @@
// || `[0x400000000000, 0x47ffffffffff]` || LowShadow ||
// || `[0x000000000000, 0x3fffffffffff]` || LowMem ||
//
// Shadow mapping on NetBSD/i386 with SHADOW_OFFSET == 0x40000000:
// || `[0x60000000, 0xfffff000]` || HighMem ||
// || `[0x4c000000, 0x5fffffff]` || HighShadow ||
// || `[0x48000000, 0x4bffffff]` || ShadowGap ||
// || `[0x40000000, 0x47ffffff]` || LowShadow ||
// || `[0x00000000, 0x3fffffff]` || LowMem ||
//
// Default Windows/i386 mapping:
// (the exact location of HighShadow/HighMem may vary depending
// on WoW64, /LARGEADDRESSAWARE, etc).
@ -130,11 +137,17 @@
// || `[0x36000000, 0x39ffffff]` || ShadowGap ||
// || `[0x30000000, 0x35ffffff]` || LowShadow ||
// || `[0x00000000, 0x2fffffff]` || LowMem ||
//
// Shadow mapping on Myriad2 (for shadow scale 5):
// || `[0x9ff80000, 0x9fffffff]` || ShadowGap ||
// || `[0x9f000000, 0x9ff7ffff]` || LowShadow ||
// || `[0x80000000, 0x9effffff]` || LowMem ||
// || `[0x00000000, 0x7fffffff]` || Ignored ||
#if defined(ASAN_SHADOW_SCALE)
static const u64 kDefaultShadowScale = ASAN_SHADOW_SCALE;
#else
static const u64 kDefaultShadowScale = 3;
static const u64 kDefaultShadowScale = SANITIZER_MYRIAD2 ? 5 : 3;
#endif
static const u64 kDefaultShadowSentinel = ~(uptr)0;
static const u64 kDefaultShadowOffset32 = 1ULL << 29; // 0x20000000
@ -152,9 +165,19 @@ static const u64 kPPC64_ShadowOffset64 = 1ULL << 44;
static const u64 kSystemZ_ShadowOffset64 = 1ULL << 52;
static const u64 kFreeBSD_ShadowOffset32 = 1ULL << 30; // 0x40000000
static const u64 kFreeBSD_ShadowOffset64 = 1ULL << 46; // 0x400000000000
static const u64 kNetBSD_ShadowOffset32 = 1ULL << 30; // 0x40000000
static const u64 kNetBSD_ShadowOffset64 = 1ULL << 46; // 0x400000000000
static const u64 kWindowsShadowOffset32 = 3ULL << 28; // 0x30000000
static const u64 kMyriadMemoryOffset32 = 0x80000000ULL;
static const u64 kMyriadMemorySize32 = 0x20000000ULL;
static const u64 kMyriadMemoryEnd32 =
kMyriadMemoryOffset32 + kMyriadMemorySize32 - 1;
static const u64 kMyriadShadowOffset32 =
(kMyriadMemoryOffset32 + kMyriadMemorySize32 -
(kMyriadMemorySize32 >> kDefaultShadowScale));
static const u64 kMyriadCacheBitMask32 = 0x40000000ULL;
#define SHADOW_SCALE kDefaultShadowScale
#if SANITIZER_FUCHSIA
@ -166,6 +189,8 @@ static const u64 kWindowsShadowOffset32 = 3ULL << 28; // 0x30000000
# define SHADOW_OFFSET kMIPS32_ShadowOffset32
# elif SANITIZER_FREEBSD
# define SHADOW_OFFSET kFreeBSD_ShadowOffset32
# elif SANITIZER_NETBSD
# define SHADOW_OFFSET kNetBSD_ShadowOffset32
# elif SANITIZER_WINDOWS
# define SHADOW_OFFSET kWindowsShadowOffset32
# elif SANITIZER_IOS
@ -174,6 +199,8 @@ static const u64 kWindowsShadowOffset32 = 3ULL << 28; // 0x30000000
# else
# define SHADOW_OFFSET kIosShadowOffset32
# endif
# elif SANITIZER_MYRIAD2
# define SHADOW_OFFSET kMyriadShadowOffset32
# else
# define SHADOW_OFFSET kDefaultShadowOffset32
# endif
@ -212,6 +239,39 @@ static const u64 kWindowsShadowOffset32 = 3ULL << 28; // 0x30000000
#endif
#define SHADOW_GRANULARITY (1ULL << SHADOW_SCALE)
#define DO_ASAN_MAPPING_PROFILE 0 // Set to 1 to profile the functions below.
#if DO_ASAN_MAPPING_PROFILE
# define PROFILE_ASAN_MAPPING() AsanMappingProfile[__LINE__]++;
#else
# define PROFILE_ASAN_MAPPING()
#endif
// If 1, all shadow boundaries are constants.
// Don't set to 1 other than for testing.
#define ASAN_FIXED_MAPPING 0
namespace __asan {
extern uptr AsanMappingProfile[];
#if ASAN_FIXED_MAPPING
// Fixed mapping for 64-bit Linux. Mostly used for performance comparison
// with non-fixed mapping. As of r175253 (Feb 2013) the performance
// difference between fixed and non-fixed mapping is below the noise level.
static uptr kHighMemEnd = 0x7fffffffffffULL;
static uptr kMidMemBeg = 0x3000000000ULL;
static uptr kMidMemEnd = 0x4fffffffffULL;
#else
extern uptr kHighMemEnd, kMidMemBeg, kMidMemEnd; // Initialized in __asan_init.
#endif
} // namespace __asan
#if SANITIZER_MYRIAD2
#include "asan_mapping_myriad.h"
#else
#define MEM_TO_SHADOW(mem) (((mem) >> SHADOW_SCALE) + (SHADOW_OFFSET))
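// For example, with SHADOW_SCALE == 3 and the default 32-bit offset
// 0x20000000, MEM_TO_SHADOW(0x40000000) == (0x40000000 >> 3) + 0x20000000
// == 0x28000000.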
#define kLowMemBeg 0
@ -243,36 +303,11 @@ static const u64 kWindowsShadowOffset32 = 3ULL << 28; // 0x30000000
#define kShadowGap3Beg (kMidMemBeg ? kMidMemEnd + 1 : 0)
#define kShadowGap3End (kMidMemBeg ? kHighShadowBeg - 1 : 0)
#define DO_ASAN_MAPPING_PROFILE 0 // Set to 1 to profile the functions below.
#if DO_ASAN_MAPPING_PROFILE
# define PROFILE_ASAN_MAPPING() AsanMappingProfile[__LINE__]++;
#else
# define PROFILE_ASAN_MAPPING()
#endif
// If 1, all shadow boundaries are constants.
// Don't set to 1 other than for testing.
#define ASAN_FIXED_MAPPING 0
namespace __asan {
extern uptr AsanMappingProfile[];
#if ASAN_FIXED_MAPPING
// Fixed mapping for 64-bit Linux. Mostly used for performance comparison
// with non-fixed mapping. As of r175253 (Feb 2013) the performance
// difference between fixed and non-fixed mapping is below the noise level.
static uptr kHighMemEnd = 0x7fffffffffffULL;
static uptr kMidMemBeg = 0x3000000000ULL;
static uptr kMidMemEnd = 0x4fffffffffULL;
#else
extern uptr kHighMemEnd, kMidMemBeg, kMidMemEnd; // Initialized in __asan_init.
#endif
static inline bool AddrIsInLowMem(uptr a) {
PROFILE_ASAN_MAPPING();
return a < kLowMemEnd;
return a <= kLowMemEnd;
}
static inline bool AddrIsInLowShadow(uptr a) {
@ -280,16 +315,26 @@ static inline bool AddrIsInLowShadow(uptr a) {
return a >= kLowShadowBeg && a <= kLowShadowEnd;
}
static inline bool AddrIsInHighMem(uptr a) {
PROFILE_ASAN_MAPPING();
return a >= kHighMemBeg && a <= kHighMemEnd;
}
static inline bool AddrIsInMidMem(uptr a) {
PROFILE_ASAN_MAPPING();
return kMidMemBeg && a >= kMidMemBeg && a <= kMidMemEnd;
}
static inline bool AddrIsInMidShadow(uptr a) {
PROFILE_ASAN_MAPPING();
return kMidMemBeg && a >= kMidShadowBeg && a <= kMidShadowEnd;
}
static inline bool AddrIsInHighMem(uptr a) {
PROFILE_ASAN_MAPPING();
return kHighMemBeg && a >= kHighMemBeg && a <= kHighMemEnd;
}
static inline bool AddrIsInHighShadow(uptr a) {
PROFILE_ASAN_MAPPING();
return kHighMemBeg && a >= kHighShadowBeg && a <= kHighShadowEnd;
}
static inline bool AddrIsInShadowGap(uptr a) {
PROFILE_ASAN_MAPPING();
if (kMidMemBeg) {
@ -305,6 +350,12 @@ static inline bool AddrIsInShadowGap(uptr a) {
return a >= kShadowGapBeg && a <= kShadowGapEnd;
}
} // namespace __asan
#endif // SANITIZER_MYRIAD2
namespace __asan {
static inline bool AddrIsInMem(uptr a) {
PROFILE_ASAN_MAPPING();
return AddrIsInLowMem(a) || AddrIsInMidMem(a) || AddrIsInHighMem(a) ||
@ -317,16 +368,6 @@ static inline uptr MemToShadow(uptr p) {
return MEM_TO_SHADOW(p);
}
static inline bool AddrIsInHighShadow(uptr a) {
PROFILE_ASAN_MAPPING();
return a >= kHighShadowBeg && a <= kHighMemEnd;
}
static inline bool AddrIsInMidShadow(uptr a) {
PROFILE_ASAN_MAPPING();
return kMidMemBeg && a >= kMidShadowBeg && a <= kMidMemEnd;
}
static inline bool AddrIsInShadow(uptr a) {
PROFILE_ASAN_MAPPING();
return AddrIsInLowShadow(a) || AddrIsInMidShadow(a) || AddrIsInHighShadow(a);
@ -339,6 +380,8 @@ static inline bool AddrIsAlignedByGranularity(uptr a) {
static inline bool AddressIsPoisoned(uptr a) {
PROFILE_ASAN_MAPPING();
if (SANITIZER_MYRIAD2 && !AddrIsInMem(a) && !AddrIsInShadow(a))
return false;
const uptr kAccessSize = 1;
u8 *shadow_address = (u8*)MEM_TO_SHADOW(a);
s8 shadow_value = *shadow_address;

View File

@ -0,0 +1,86 @@
//===-- asan_mapping_myriad.h -----------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of AddressSanitizer, an address sanity checker.
//
// Myriad-specific definitions for ASan memory mapping.
//===----------------------------------------------------------------------===//
#ifndef ASAN_MAPPING_MYRIAD_H
#define ASAN_MAPPING_MYRIAD_H
#define RAW_ADDR(mem) ((mem) & ~kMyriadCacheBitMask32)
#define MEM_TO_SHADOW(mem) \
(((RAW_ADDR(mem) - kLowMemBeg) >> SHADOW_SCALE) + (SHADOW_OFFSET))
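// For example, two addresses that differ only in the cache bit, such as
// 0x80001000 and 0xc0001000, map to the same shadow byte:
// ((0x80001000 - 0x80000000) >> 5) + 0x9f000000 == 0x9f000080.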
#define kLowMemBeg kMyriadMemoryOffset32
#define kLowMemEnd (SHADOW_OFFSET - 1)
#define kLowShadowBeg SHADOW_OFFSET
#define kLowShadowEnd MEM_TO_SHADOW(kLowMemEnd)
#define kHighMemBeg 0
#define kHighShadowBeg 0
#define kHighShadowEnd 0
#define kMidShadowBeg 0
#define kMidShadowEnd 0
#define kShadowGapBeg (kLowShadowEnd + 1)
#define kShadowGapEnd kMyriadMemoryEnd32
#define kShadowGap2Beg 0
#define kShadowGap2End 0
#define kShadowGap3Beg 0
#define kShadowGap3End 0
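// With the constants above this reproduces the Myriad2 layout documented in
// asan_mapping.h: LowMem [0x80000000, 0x9effffff], LowShadow
// [0x9f000000, 0x9ff7ffff], ShadowGap [0x9ff80000, 0x9fffffff].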
namespace __asan {
static inline bool AddrIsInLowMem(uptr a) {
PROFILE_ASAN_MAPPING();
a = RAW_ADDR(a);
return a >= kLowMemBeg && a <= kLowMemEnd;
}
static inline bool AddrIsInLowShadow(uptr a) {
PROFILE_ASAN_MAPPING();
a = RAW_ADDR(a);
return a >= kLowShadowBeg && a <= kLowShadowEnd;
}
static inline bool AddrIsInMidMem(uptr a) {
PROFILE_ASAN_MAPPING();
return false;
}
static inline bool AddrIsInMidShadow(uptr a) {
PROFILE_ASAN_MAPPING();
return false;
}
static inline bool AddrIsInHighMem(uptr a) {
PROFILE_ASAN_MAPPING();
return false;
}
static inline bool AddrIsInHighShadow(uptr a) {
PROFILE_ASAN_MAPPING();
return false;
}
static inline bool AddrIsInShadowGap(uptr a) {
PROFILE_ASAN_MAPPING();
a = RAW_ADDR(a);
return a >= kShadowGapBeg && a <= kShadowGapEnd;
}
} // namespace __asan
#endif // ASAN_MAPPING_MYRIAD_H

View File

@ -31,9 +31,9 @@ struct AllocationSite {
class HeapProfile {
public:
HeapProfile() : allocations_(1024) {}
HeapProfile() { allocations_.reserve(1024); }
void ProcessChunk(const AsanChunkView& cv) {
void ProcessChunk(const AsanChunkView &cv) {
if (cv.IsAllocated()) {
total_allocated_user_size_ += cv.UsedSize();
total_allocated_count_++;
@ -49,10 +49,10 @@ class HeapProfile {
}
void Print(uptr top_percent, uptr max_number_of_contexts) {
InternalSort(&allocations_, allocations_.size(),
[](const AllocationSite &a, const AllocationSite &b) {
return a.total_size > b.total_size;
});
Sort(allocations_.data(), allocations_.size(),
[](const AllocationSite &a, const AllocationSite &b) {
return a.total_size > b.total_size;
});
CHECK(total_allocated_user_size_);
uptr total_shown = 0;
Printf("Live Heap Allocations: %zd bytes in %zd chunks; quarantined: "

View File

@ -14,6 +14,8 @@
#include "asan_allocator.h"
#include "asan_internal.h"
#include "asan_malloc_local.h"
#include "asan_report.h"
#include "asan_stack.h"
#include "interception/interception.h"
@ -67,16 +69,28 @@ struct nothrow_t {};
enum class align_val_t: size_t {};
} // namespace std
// TODO(alekseys): throw std::bad_alloc instead of dying on OOM.
// TODO(alekseyshl): throw std::bad_alloc instead of dying on OOM.
// For local pool allocation, align to SHADOW_GRANULARITY to match asan
// allocator behavior.
#define OPERATOR_NEW_BODY(type, nothrow) \
if (ALLOCATE_FROM_LOCAL_POOL) {\
void *res = MemalignFromLocalPool(SHADOW_GRANULARITY, size);\
if (!nothrow) CHECK(res);\
return res;\
}\
GET_STACK_TRACE_MALLOC;\
void *res = asan_memalign(0, size, &stack, type);\
if (!nothrow && UNLIKELY(!res)) DieOnFailure::OnOOM();\
if (!nothrow && UNLIKELY(!res)) ReportOutOfMemory(size, &stack);\
return res;
#define OPERATOR_NEW_BODY_ALIGN(type, nothrow) \
if (ALLOCATE_FROM_LOCAL_POOL) {\
void *res = MemalignFromLocalPool((uptr)align, size);\
if (!nothrow) CHECK(res);\
return res;\
}\
GET_STACK_TRACE_MALLOC;\
void *res = asan_memalign((uptr)align, size, &stack, type);\
if (!nothrow && UNLIKELY(!res)) DieOnFailure::OnOOM();\
if (!nothrow && UNLIKELY(!res)) ReportOutOfMemory(size, &stack);\
return res;
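// In both forms the nothrow variants return nullptr on allocation failure
// instead of reporting an out-of-memory error.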
// On OS X it's not enough to just provide our own 'operator new' and
@ -128,18 +142,22 @@ INTERCEPTOR(void *, _ZnamRKSt9nothrow_t, size_t size, std::nothrow_t const&) {
#endif // !SANITIZER_MAC
#define OPERATOR_DELETE_BODY(type) \
if (IS_FROM_LOCAL_POOL(ptr)) return;\
GET_STACK_TRACE_FREE;\
asan_delete(ptr, 0, 0, &stack, type);
#define OPERATOR_DELETE_BODY_SIZE(type) \
if (IS_FROM_LOCAL_POOL(ptr)) return;\
GET_STACK_TRACE_FREE;\
asan_delete(ptr, size, 0, &stack, type);
#define OPERATOR_DELETE_BODY_ALIGN(type) \
if (IS_FROM_LOCAL_POOL(ptr)) return;\
GET_STACK_TRACE_FREE;\
asan_delete(ptr, 0, static_cast<uptr>(align), &stack, type);
#define OPERATOR_DELETE_BODY_SIZE_ALIGN(type) \
if (IS_FROM_LOCAL_POOL(ptr)) return;\
GET_STACK_TRACE_FREE;\
asan_delete(ptr, size, static_cast<uptr>(align), &stack, type);

View File

@ -32,7 +32,7 @@ bool CanPoisonMemory() {
}
void PoisonShadow(uptr addr, uptr size, u8 value) {
if (!CanPoisonMemory()) return;
if (value && !CanPoisonMemory()) return;
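// Clearing shadow (value == 0) is always allowed, even when poisoning is
// otherwise disabled.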
CHECK(AddrIsAlignedByGranularity(addr));
CHECK(AddrIsInMem(addr));
CHECK(AddrIsAlignedByGranularity(addr + size));
@ -182,8 +182,15 @@ int __asan_address_is_poisoned(void const volatile *addr) {
uptr __asan_region_is_poisoned(uptr beg, uptr size) {
if (!size) return 0;
uptr end = beg + size;
if (!AddrIsInMem(beg)) return beg;
if (!AddrIsInMem(end)) return end;
if (SANITIZER_MYRIAD2) {
// On Myriad, addresses outside the DRAM range need to be treated as
// unpoisoned.
if (!AddrIsInMem(beg) && !AddrIsInShadow(beg)) return 0;
if (!AddrIsInMem(end) && !AddrIsInShadow(end)) return 0;
} else {
if (!AddrIsInMem(beg)) return beg;
if (!AddrIsInMem(end)) return end;
}
CHECK_LT(beg, end);
uptr aligned_b = RoundUpTo(beg, SHADOW_GRANULARITY);
uptr aligned_e = RoundDownTo(end, SHADOW_GRANULARITY);
@ -452,4 +459,3 @@ bool WordIsPoisoned(uptr addr) {
return (__asan_region_is_poisoned(addr, sizeof(uptr)) != 0);
}
}

View File

@ -38,7 +38,7 @@ void PoisonShadowPartialRightRedzone(uptr addr,
// performance-critical code with care.
ALWAYS_INLINE void FastPoisonShadow(uptr aligned_beg, uptr aligned_size,
u8 value) {
DCHECK(CanPoisonMemory());
DCHECK(!value || CanPoisonMemory());
uptr shadow_beg = MEM_TO_SHADOW(aligned_beg);
uptr shadow_end = MEM_TO_SHADOW(
aligned_beg + aligned_size - SHADOW_GRANULARITY) + 1;
@ -51,6 +51,9 @@ ALWAYS_INLINE void FastPoisonShadow(uptr aligned_beg, uptr aligned_size,
// changed at all. It doesn't currently have an efficient means
// to zero a bunch of pages, but maybe we should add one.
SANITIZER_FUCHSIA == 1 ||
// RTEMS doesn't have pages, let alone a fast way to zero
// them, so default to memset.
SANITIZER_RTEMS == 1 ||
shadow_end - shadow_beg < common_flags()->clear_shadow_mmap_threshold) {
REAL(memset)((void*)shadow_beg, value, shadow_end - shadow_beg);
} else {

View File

@ -84,7 +84,7 @@ static void PrintZoneForPointer(uptr ptr, uptr zone_ptr,
bool ParseFrameDescription(const char *frame_descr,
InternalMmapVector<StackVarDescr> *vars) {
CHECK(frame_descr);
char *p;
const char *p;
// This string is created by the compiler and has the following form:
// "n alloc_1 alloc_2 ... alloc_n"
// where alloc_i looks like "offset size len ObjectName"
@ -134,6 +134,10 @@ class ScopedInErrorReport {
}
~ScopedInErrorReport() {
if (halt_on_error_ && !__sanitizer_acquire_crash_state()) {
asanThreadRegistry().Unlock();
return;
}
ASAN_ON_ERROR();
if (current_error_.IsValid()) current_error_.Print();
@ -152,7 +156,7 @@ class ScopedInErrorReport {
// Copy the message buffer so that we could start logging without holding a
// lock that gets acquired during printing.
InternalScopedBuffer<char> buffer_copy(kErrorMessageBufferSize);
InternalMmapVector<char> buffer_copy(kErrorMessageBufferSize);
{
BlockingMutexLock l(&error_message_buf_mutex);
internal_memcpy(buffer_copy.data(),
@ -202,7 +206,7 @@ class ScopedInErrorReport {
bool halt_on_error_;
};
ErrorDescription ScopedInErrorReport::current_error_;
ErrorDescription ScopedInErrorReport::current_error_(LINKER_INITIALIZED);
void ReportDeadlySignal(const SignalContext &sig) {
ScopedInErrorReport in_report(/*fatal*/ true);
@ -254,6 +258,62 @@ void ReportSanitizerGetAllocatedSizeNotOwned(uptr addr,
in_report.ReportError(error);
}
void ReportCallocOverflow(uptr count, uptr size, BufferedStackTrace *stack) {
ScopedInErrorReport in_report(/*fatal*/ true);
ErrorCallocOverflow error(GetCurrentTidOrInvalid(), stack, count, size);
in_report.ReportError(error);
}
void ReportPvallocOverflow(uptr size, BufferedStackTrace *stack) {
ScopedInErrorReport in_report(/*fatal*/ true);
ErrorPvallocOverflow error(GetCurrentTidOrInvalid(), stack, size);
in_report.ReportError(error);
}
void ReportInvalidAllocationAlignment(uptr alignment,
BufferedStackTrace *stack) {
ScopedInErrorReport in_report(/*fatal*/ true);
ErrorInvalidAllocationAlignment error(GetCurrentTidOrInvalid(), stack,
alignment);
in_report.ReportError(error);
}
void ReportInvalidAlignedAllocAlignment(uptr size, uptr alignment,
BufferedStackTrace *stack) {
ScopedInErrorReport in_report(/*fatal*/ true);
ErrorInvalidAlignedAllocAlignment error(GetCurrentTidOrInvalid(), stack,
size, alignment);
in_report.ReportError(error);
}
void ReportInvalidPosixMemalignAlignment(uptr alignment,
BufferedStackTrace *stack) {
ScopedInErrorReport in_report(/*fatal*/ true);
ErrorInvalidPosixMemalignAlignment error(GetCurrentTidOrInvalid(), stack,
alignment);
in_report.ReportError(error);
}
void ReportAllocationSizeTooBig(uptr user_size, uptr total_size, uptr max_size,
BufferedStackTrace *stack) {
ScopedInErrorReport in_report(/*fatal*/ true);
ErrorAllocationSizeTooBig error(GetCurrentTidOrInvalid(), stack, user_size,
total_size, max_size);
in_report.ReportError(error);
}
void ReportRssLimitExceeded(BufferedStackTrace *stack) {
ScopedInErrorReport in_report(/*fatal*/ true);
ErrorRssLimitExceeded error(GetCurrentTidOrInvalid(), stack);
in_report.ReportError(error);
}
void ReportOutOfMemory(uptr requested_size, BufferedStackTrace *stack) {
ScopedInErrorReport in_report(/*fatal*/ true);
ErrorOutOfMemory error(GetCurrentTidOrInvalid(), stack, requested_size);
in_report.ReportError(error);
}
void ReportStringFunctionMemoryRangesOverlap(const char *function,
const char *offset1, uptr length1,
const char *offset2, uptr length2,
@ -343,7 +403,11 @@ static bool IsInvalidPointerPair(uptr a1, uptr a2) {
}
static INLINE void CheckForInvalidPointerPair(void *p1, void *p2) {
if (!flags()->detect_invalid_pointer_pairs) return;
switch (flags()->detect_invalid_pointer_pairs) {
case 0 : return;
case 1 : if (p1 == nullptr || p2 == nullptr) return; break;
}
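// 0 disables the check entirely, 1 skips pairs where either pointer is null,
// and any other value checks every pair.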
uptr a1 = reinterpret_cast<uptr>(p1);
uptr a2 = reinterpret_cast<uptr>(p2);

View File

@ -58,6 +58,18 @@ void ReportAllocTypeMismatch(uptr addr, BufferedStackTrace *free_stack,
void ReportMallocUsableSizeNotOwned(uptr addr, BufferedStackTrace *stack);
void ReportSanitizerGetAllocatedSizeNotOwned(uptr addr,
BufferedStackTrace *stack);
void ReportCallocOverflow(uptr count, uptr size, BufferedStackTrace *stack);
void ReportPvallocOverflow(uptr size, BufferedStackTrace *stack);
void ReportInvalidAllocationAlignment(uptr alignment,
BufferedStackTrace *stack);
void ReportInvalidAlignedAllocAlignment(uptr size, uptr alignment,
BufferedStackTrace *stack);
void ReportInvalidPosixMemalignAlignment(uptr alignment,
BufferedStackTrace *stack);
void ReportAllocationSizeTooBig(uptr user_size, uptr total_size, uptr max_size,
BufferedStackTrace *stack);
void ReportRssLimitExceeded(BufferedStackTrace *stack);
void ReportOutOfMemory(uptr requested_size, BufferedStackTrace *stack);
void ReportStringFunctionMemoryRangesOverlap(const char *function,
const char *offset1, uptr length1,
const char *offset2, uptr length2,

lib/asan/asan_rtems.cc Normal file
View File

@ -0,0 +1,253 @@
//===-- asan_rtems.cc -----------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of AddressSanitizer, an address sanity checker.
//
// RTEMS-specific details.
//===----------------------------------------------------------------------===//
#include "sanitizer_common/sanitizer_rtems.h"
#if SANITIZER_RTEMS
#include "asan_internal.h"
#include "asan_interceptors.h"
#include "asan_mapping.h"
#include "asan_poisoning.h"
#include "asan_report.h"
#include "asan_stack.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_libc.h"
#include <pthread.h>
#include <stdlib.h>
namespace __asan {
static void ResetShadowMemory() {
uptr shadow_start = SHADOW_OFFSET;
uptr shadow_end = MEM_TO_SHADOW(kMyriadMemoryEnd32);
uptr gap_start = MEM_TO_SHADOW(shadow_start);
uptr gap_end = MEM_TO_SHADOW(shadow_end);
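// Unpoison the shadow for all of DRAM, then mark the shadow of the shadow
// region itself (the gap) with kAsanShadowGap.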
REAL(memset)((void *)shadow_start, 0, shadow_end - shadow_start);
REAL(memset)((void *)gap_start, kAsanShadowGap, gap_end - gap_start);
}
void InitializeShadowMemory() {
kHighMemEnd = 0;
kMidMemBeg = 0;
kMidMemEnd = 0;
ResetShadowMemory();
}
void AsanApplyToGlobals(globals_op_fptr op, const void *needle) {
UNIMPLEMENTED();
}
void AsanCheckDynamicRTPrereqs() {}
void AsanCheckIncompatibleRT() {}
void InitializeAsanInterceptors() {}
void InitializePlatformInterceptors() {}
void InitializePlatformExceptionHandlers() {}
// RTEMS only supports static linking; it suffices to return with no
// error.
void *AsanDoesNotSupportStaticLinkage() { return nullptr; }
void AsanOnDeadlySignal(int signo, void *siginfo, void *context) {
UNIMPLEMENTED();
}
void EarlyInit() {
// Provide early initialization of shadow memory so that
// instrumented code running before full initialization will not
// report spurious errors.
ResetShadowMemory();
}
// We can use a plain thread_local variable for TSD.
static thread_local void *per_thread;
void *AsanTSDGet() { return per_thread; }
void AsanTSDSet(void *tsd) { per_thread = tsd; }
// There's no initialization needed, and the passed-in destructor
// will never be called. Instead, our own thread destruction hook
// (below) will call AsanThread::TSDDtor directly.
void AsanTSDInit(void (*destructor)(void *tsd)) {
DCHECK(destructor == &PlatformTSDDtor);
}
void PlatformTSDDtor(void *tsd) { UNREACHABLE(__func__); }
//
// Thread registration. We provide an API similar to the Fuchsia port.
//
struct AsanThread::InitOptions {
uptr stack_bottom, stack_size, tls_bottom, tls_size;
};
// Shared setup between thread creation and startup for the initial thread.
static AsanThread *CreateAsanThread(StackTrace *stack, u32 parent_tid,
uptr user_id, bool detached,
uptr stack_bottom, uptr stack_size,
uptr tls_bottom, uptr tls_size) {
// In lieu of AsanThread::Create.
AsanThread *thread = (AsanThread *)MmapOrDie(sizeof(AsanThread), __func__);
AsanThreadContext::CreateThreadContextArgs args = {thread, stack};
asanThreadRegistry().CreateThread(user_id, detached, parent_tid, &args);
// On other systems, AsanThread::Init() is called from the new
// thread itself. But on RTEMS we already know the stack address
// range beforehand, so we can do most of the setup right now.
const AsanThread::InitOptions options = {stack_bottom, stack_size,
tls_bottom, tls_size};
thread->Init(&options);
return thread;
}
// This gets the same arguments passed to Init by CreateAsanThread, above.
// We're in the creator thread before the new thread is actually started, but
// its stack and TLS address range are already known.
void AsanThread::SetThreadStackAndTls(const AsanThread::InitOptions *options) {
DCHECK_NE(GetCurrentThread(), this);
DCHECK_NE(GetCurrentThread(), nullptr);
CHECK_NE(options->stack_bottom, 0);
CHECK_NE(options->stack_size, 0);
stack_bottom_ = options->stack_bottom;
stack_top_ = options->stack_bottom + options->stack_size;
tls_begin_ = options->tls_bottom;
tls_end_ = options->tls_bottom + options->tls_size;
}
// Called by __asan::AsanInitInternal (asan_rtl.c). Unlike other ports, the
// main thread on RTEMS does not require special treatment; its AsanThread is
// already created by the provided hooks. This function simply looks up and
// returns the created thread.
AsanThread *CreateMainThread() {
return GetThreadContextByTidLocked(0)->thread;
}
// This is called before each thread creation is attempted. So, in
// its first call, the calling thread is the initial and sole thread.
static void *BeforeThreadCreateHook(uptr user_id, bool detached,
uptr stack_bottom, uptr stack_size,
uptr tls_bottom, uptr tls_size) {
EnsureMainThreadIDIsCorrect();
// Strict init-order checking is thread-hostile.
if (flags()->strict_init_order) StopInitOrderChecking();
GET_STACK_TRACE_THREAD;
u32 parent_tid = GetCurrentTidOrInvalid();
return CreateAsanThread(&stack, parent_tid, user_id, detached,
stack_bottom, stack_size, tls_bottom, tls_size);
}
// This is called after creating a new thread (in the creating thread),
// with the pointer returned by BeforeThreadCreateHook (above).
static void ThreadCreateHook(void *hook, bool aborted) {
AsanThread *thread = static_cast<AsanThread *>(hook);
if (!aborted) {
// The thread was created successfully.
// ThreadStartHook is already running in the new thread.
} else {
// The thread wasn't created after all.
// Clean up everything we set up in BeforeThreadCreateHook.
asanThreadRegistry().FinishThread(thread->tid());
UnmapOrDie(thread, sizeof(AsanThread));
}
}
// This is called (1) in the newly-created thread before it runs anything else,
// with the pointer returned by BeforeThreadCreateHook (above). (2) before a
// thread restart.
static void ThreadStartHook(void *hook, uptr os_id) {
if (!hook)
return;
AsanThread *thread = static_cast<AsanThread *>(hook);
SetCurrentThread(thread);
ThreadStatus status =
asanThreadRegistry().GetThreadLocked(thread->tid())->status;
DCHECK(status == ThreadStatusCreated || status == ThreadStatusRunning);
// Determine whether we are starting or restarting the thread.
if (status == ThreadStatusCreated)
// In lieu of AsanThread::ThreadStart.
asanThreadRegistry().StartThread(thread->tid(), os_id,
/*workerthread*/ false, nullptr);
else {
// In a thread restart, a thread may resume execution at an
// arbitrary function entry point, with its stack and TLS state
// reset. We unpoison the stack in that case.
PoisonShadow(thread->stack_bottom(), thread->stack_size(), 0);
}
}
// Each thread runs this just before it exits,
// with the pointer returned by BeforeThreadCreateHook (above).
// All per-thread destructors have already been called.
static void ThreadExitHook(void *hook, uptr os_id) {
AsanThread *thread = static_cast<AsanThread *>(hook);
if (thread)
AsanThread::TSDDtor(thread->context());
}
static void HandleExit() {
// Disable ASan by setting it to uninitialized. Also reset the
// shadow memory to avoid reporting errors after the run-time has
// been destroyed.
if (asan_inited) {
asan_inited = false;
ResetShadowMemory();
}
}
} // namespace __asan
// These are declared (in extern "C") by <some_path/sanitizer.h>.
// The system runtime will call our definitions directly.
extern "C" {
void __sanitizer_early_init() {
__asan::EarlyInit();
}
void *__sanitizer_before_thread_create_hook(uptr thread, bool detached,
const char *name,
void *stack_base, size_t stack_size,
void *tls_base, size_t tls_size) {
return __asan::BeforeThreadCreateHook(
thread, detached,
reinterpret_cast<uptr>(stack_base), stack_size,
reinterpret_cast<uptr>(tls_base), tls_size);
}
void __sanitizer_thread_create_hook(void *handle, uptr thread, int status) {
__asan::ThreadCreateHook(handle, status != 0);
}
void __sanitizer_thread_start_hook(void *handle, uptr self) {
__asan::ThreadStartHook(handle, self);
}
void __sanitizer_thread_exit_hook(void *handle, uptr self) {
__asan::ThreadExitHook(handle, self);
}
void __sanitizer_exit() {
__asan::HandleExit();
}
} // "C"
#endif // SANITIZER_RTEMS

View File

@ -56,7 +56,8 @@ static void AsanDie() {
UnmapOrDie((void*)kLowShadowBeg, kMidMemBeg - kLowShadowBeg);
UnmapOrDie((void*)kMidMemEnd, kHighShadowEnd - kMidMemEnd);
} else {
UnmapOrDie((void*)kLowShadowBeg, kHighShadowEnd - kLowShadowBeg);
if (kHighShadowEnd)
UnmapOrDie((void*)kLowShadowBeg, kHighShadowEnd - kLowShadowBeg);
}
}
}
@ -65,8 +66,14 @@ static void AsanCheckFailed(const char *file, int line, const char *cond,
u64 v1, u64 v2) {
Report("AddressSanitizer CHECK failed: %s:%d \"%s\" (0x%zx, 0x%zx)\n", file,
line, cond, (uptr)v1, (uptr)v2);
// FIXME: check for infinite recursion without a thread-local counter here.
PRINT_CURRENT_STACK_CHECK();
// Print a stack trace the first time we come here. Otherwise, we probably
// failed a CHECK during symbolization.
static atomic_uint32_t num_calls;
if (atomic_fetch_add(&num_calls, 1, memory_order_relaxed) == 0) {
PRINT_CURRENT_STACK_CHECK();
}
Die();
}
@ -140,6 +147,8 @@ ASAN_REPORT_ERROR_N(load, false)
ASAN_REPORT_ERROR_N(store, true)
#define ASAN_MEMORY_ACCESS_CALLBACK_BODY(type, is_write, size, exp_arg, fatal) \
if (SANITIZER_MYRIAD2 && !AddrIsInMem(addr) && !AddrIsInShadow(addr)) \
return; \
uptr sp = MEM_TO_SHADOW(addr); \
uptr s = size <= SHADOW_GRANULARITY ? *reinterpret_cast<u8 *>(sp) \
: *reinterpret_cast<u16 *>(sp); \
@ -306,6 +315,7 @@ static void asan_atexit() {
}
static void InitializeHighMemEnd() {
#if !SANITIZER_MYRIAD2
#if !ASAN_FIXED_MAPPING
kHighMemEnd = GetMaxUserVirtualAddress();
// Increase kHighMemEnd to make sure it's properly
@ -313,13 +323,16 @@ static void InitializeHighMemEnd() {
kHighMemEnd |= SHADOW_GRANULARITY * GetMmapGranularity() - 1;
#endif // !ASAN_FIXED_MAPPING
CHECK_EQ((kHighMemBeg % GetMmapGranularity()), 0);
#endif // !SANITIZER_MYRIAD2
}
void PrintAddressSpaceLayout() {
Printf("|| `[%p, %p]` || HighMem ||\n",
(void*)kHighMemBeg, (void*)kHighMemEnd);
Printf("|| `[%p, %p]` || HighShadow ||\n",
(void*)kHighShadowBeg, (void*)kHighShadowEnd);
if (kHighMemBeg) {
Printf("|| `[%p, %p]` || HighMem ||\n",
(void*)kHighMemBeg, (void*)kHighMemEnd);
Printf("|| `[%p, %p]` || HighShadow ||\n",
(void*)kHighShadowBeg, (void*)kHighShadowEnd);
}
if (kMidMemBeg) {
Printf("|| `[%p, %p]` || ShadowGap3 ||\n",
(void*)kShadowGap3Beg, (void*)kShadowGap3End);
@ -338,11 +351,14 @@ void PrintAddressSpaceLayout() {
Printf("|| `[%p, %p]` || LowMem ||\n",
(void*)kLowMemBeg, (void*)kLowMemEnd);
}
Printf("MemToShadow(shadow): %p %p %p %p",
Printf("MemToShadow(shadow): %p %p",
(void*)MEM_TO_SHADOW(kLowShadowBeg),
(void*)MEM_TO_SHADOW(kLowShadowEnd),
(void*)MEM_TO_SHADOW(kHighShadowBeg),
(void*)MEM_TO_SHADOW(kHighShadowEnd));
(void*)MEM_TO_SHADOW(kLowShadowEnd));
if (kHighMemBeg) {
Printf(" %p %p",
(void*)MEM_TO_SHADOW(kHighShadowBeg),
(void*)MEM_TO_SHADOW(kHighShadowEnd));
}
if (kMidMemBeg) {
Printf(" %p %p",
(void*)MEM_TO_SHADOW(kMidShadowBeg),
@ -374,6 +390,7 @@ static void AsanInitInternal() {
asan_init_is_running = true;
CacheBinaryName();
CheckASLR();
// Initialize flags. This must be done early, because most of the
// initialization steps look at flags().
@ -526,6 +543,9 @@ void NOINLINE __asan_handle_no_return() {
if (curr_thread) {
top = curr_thread->stack_top();
bottom = ((uptr)&local_stack - PageSize) & ~(PageSize - 1);
} else if (SANITIZER_RTEMS) {
// Give up on RTEMS.
return;
} else {
CHECK(!SANITIZER_FUCHSIA);
// If we haven't seen this thread, try asking the OS for stack bounds.

View File

@ -14,8 +14,9 @@
#include "sanitizer_common/sanitizer_platform.h"
// asan_fuchsia.cc has its own InitializeShadowMemory implementation.
#if !SANITIZER_FUCHSIA
// asan_fuchsia.cc and asan_rtems.cc have their own
// InitializeShadowMemory implementation.
#if !SANITIZER_FUCHSIA && !SANITIZER_RTEMS
#include "asan_internal.h"
#include "asan_mapping.h"
@ -30,8 +31,7 @@ void ReserveShadowMemoryRange(uptr beg, uptr end, const char *name) {
CHECK_EQ(((end + 1) % GetMmapGranularity()), 0);
uptr size = end - beg + 1;
DecreaseTotalMmap(size); // Don't count the shadow against mmap_limit_mb.
void *res = MmapFixedNoReserve(beg, size, name);
if (res != (void *)beg) {
if (!MmapFixedNoReserve(beg, size, name)) {
Report(
"ReserveShadowMemoryRange failed while trying to map 0x%zx bytes. "
"Perhaps you're using ulimit -v\n",
@ -162,4 +162,4 @@ void InitializeShadowMemory() {
} // namespace __asan
#endif // !SANITIZER_FUCHSIA
#endif // !SANITIZER_FUCHSIA && !SANITIZER_RTEMS

View File

@ -221,22 +221,25 @@ FakeStack *AsanThread::AsyncSignalSafeLazyInitFakeStack() {
void AsanThread::Init(const InitOptions *options) {
next_stack_top_ = next_stack_bottom_ = 0;
atomic_store(&stack_switching_, false, memory_order_release);
fake_stack_ = nullptr; // Will be initialized lazily if needed.
CHECK_EQ(this->stack_size(), 0U);
SetThreadStackAndTls(options);
CHECK_GT(this->stack_size(), 0U);
CHECK(AddrIsInMem(stack_bottom_));
CHECK(AddrIsInMem(stack_top_ - 1));
ClearShadowForThreadStackAndTLS();
fake_stack_ = nullptr;
if (__asan_option_detect_stack_use_after_return)
AsyncSignalSafeLazyInitFakeStack();
int local = 0;
VReport(1, "T%d: stack [%p,%p) size 0x%zx; local=%p\n", tid(),
(void *)stack_bottom_, (void *)stack_top_, stack_top_ - stack_bottom_,
&local);
}
// Fuchsia doesn't use ThreadStart.
// asan_fuchsia.c defines CreateMainThread and SetThreadStackAndTls.
#if !SANITIZER_FUCHSIA
// Fuchsia and RTEMS don't use ThreadStart.
// asan_fuchsia.c/asan_rtems.c define CreateMainThread and
// SetThreadStackAndTls.
#if !SANITIZER_FUCHSIA && !SANITIZER_RTEMS
thread_return_t AsanThread::ThreadStart(
tid_t os_id, atomic_uintptr_t *signal_thread_is_registered) {
@ -296,12 +299,17 @@ void AsanThread::SetThreadStackAndTls(const InitOptions *options) {
CHECK(AddrIsInStack((uptr)&local));
}
#endif // !SANITIZER_FUCHSIA
#endif // !SANITIZER_FUCHSIA && !SANITIZER_RTEMS
void AsanThread::ClearShadowForThreadStackAndTLS() {
PoisonShadow(stack_bottom_, stack_top_ - stack_bottom_, 0);
if (tls_begin_ != tls_end_)
PoisonShadow(tls_begin_, tls_end_ - tls_begin_, 0);
if (tls_begin_ != tls_end_) {
uptr tls_begin_aligned = RoundDownTo(tls_begin_, SHADOW_GRANULARITY);
uptr tls_end_aligned = RoundUpTo(tls_end_, SHADOW_GRANULARITY);
FastPoisonShadowPartialRightRedzone(tls_begin_aligned,
tls_end_ - tls_begin_aligned,
tls_end_aligned - tls_end_, 0);
}
}
bool AsanThread::GetStackFrameAccessByAddr(uptr addr,
@ -386,6 +394,9 @@ static bool ThreadStackContainsAddress(ThreadContextBase *tctx_base,
}
AsanThread *GetCurrentThread() {
if (SANITIZER_RTEMS && !asan_inited)
return nullptr;
AsanThreadContext *context =
reinterpret_cast<AsanThreadContext *>(AsanTSDGet());
if (!context) {
@ -477,6 +488,11 @@ void UnlockThreadRegistry() {
__asan::asanThreadRegistry().Unlock();
}
ThreadRegistry *GetThreadRegistryLocked() {
__asan::asanThreadRegistry().CheckLocked();
return &__asan::asanThreadRegistry();
}
void EnsureMainThreadIDIsCorrect() {
__asan::EnsureMainThreadIDIsCorrect();
}

View File

@ -222,8 +222,8 @@ uptr FindDynamicShadowStart() {
uptr alignment = 8 * granularity;
uptr left_padding = granularity;
uptr space_size = kHighShadowEnd + left_padding;
uptr shadow_start =
FindAvailableMemoryRange(space_size, alignment, granularity, nullptr);
uptr shadow_start = FindAvailableMemoryRange(space_size, alignment,
granularity, nullptr, nullptr);
CHECK_NE((uptr)0, shadow_start);
CHECK(IsAligned(shadow_start, alignment));
return shadow_start;
@ -265,11 +265,6 @@ ShadowExceptionHandler(PEXCEPTION_POINTERS exception_pointers) {
// Determine the address of the page that is being accessed.
uptr page = RoundDownTo(addr, page_size);
// Query the existing page.
MEMORY_BASIC_INFORMATION mem_info = {};
if (::VirtualQuery((LPVOID)page, &mem_info, sizeof(mem_info)) == 0)
return EXCEPTION_CONTINUE_SEARCH;
// Commit the page.
uptr result =
(uptr)::VirtualAlloc((LPVOID)page, page_size, MEM_COMMIT, PAGE_READWRITE);

View File

@ -99,7 +99,7 @@ INTERCEPTOR(int, _except_handler4, void *a, void *b, void *c, void *d) {
}
#endif
// Window specific functions not included in asan_interface.inc.
// Windows specific functions not included in asan_interface.inc.
INTERCEPT_WRAP_W_V(__asan_should_detect_stack_use_after_return)
INTERCEPT_WRAP_W_V(__asan_get_shadow_memory_dynamic_address)
INTERCEPT_WRAP_W_W(__asan_unhandled_exception_filter)

View File

@ -309,7 +309,7 @@ if [[ -n "$ASAN_RT64" ]]; then
cp "$ASAN_RT_PATH/$ASAN_RT64" "$TMPDIR/"
fi
ASAN_OPTIONS=start_deactivated=1,malloc_context_size=0
ASAN_OPTIONS=start_deactivated=1
# The name of a symlink to libclang_rt.asan-$ARCH-android.so used in LD_PRELOAD.
# The idea is to have the same name in lib and lib64 to keep it from falling
@ -336,6 +336,13 @@ exec $_to \$@
EOF
}
# On Android-L not allowing user segv handler breaks some applications.
# Since ~May 2017 this is the default setting; included for compatibility with
# older library versions.
if [[ PRE_L -eq 0 ]]; then
ASAN_OPTIONS="$ASAN_OPTIONS,allow_user_segv_handler=1"
fi
if [[ x$extra_options != x ]] ; then
ASAN_OPTIONS="$ASAN_OPTIONS,$extra_options"
fi

View File

@ -237,6 +237,9 @@ if(COMPILER_RT_CAN_EXECUTE_TESTS AND NOT ANDROID)
if(APPLE)
darwin_filter_host_archs(ASAN_SUPPORTED_ARCH ASAN_TEST_ARCH)
endif()
if(OS_NAME MATCHES "SunOS")
list(REMOVE_ITEM ASAN_TEST_ARCH x86_64)
endif()
foreach(arch ${ASAN_TEST_ARCH})
@ -248,6 +251,8 @@ if(COMPILER_RT_CAN_EXECUTE_TESTS AND NOT ANDROID)
$<TARGET_OBJECTS:RTInterception.osx>
$<TARGET_OBJECTS:RTSanitizerCommon.osx>
$<TARGET_OBJECTS:RTSanitizerCommonLibc.osx>
$<TARGET_OBJECTS:RTSanitizerCommonCoverage.osx>
$<TARGET_OBJECTS:RTSanitizerCommonSymbolizer.osx>
$<TARGET_OBJECTS:RTLSanCommon.osx>
$<TARGET_OBJECTS:RTUbsan.osx>)
else()
@ -257,6 +262,8 @@ if(COMPILER_RT_CAN_EXECUTE_TESTS AND NOT ANDROID)
$<TARGET_OBJECTS:RTInterception.${arch}>
$<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
$<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>
$<TARGET_OBJECTS:RTSanitizerCommonCoverage.${arch}>
$<TARGET_OBJECTS:RTSanitizerCommonSymbolizer.${arch}>
$<TARGET_OBJECTS:RTLSanCommon.${arch}>
$<TARGET_OBJECTS:RTUbsan.${arch}>
$<TARGET_OBJECTS:RTUbsan_cxx.${arch}>)
@ -280,6 +287,8 @@ if(ANDROID)
$<TARGET_OBJECTS:RTInterception.${arch}>
$<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
$<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>
$<TARGET_OBJECTS:RTSanitizerCommonCoverage.${arch}>
$<TARGET_OBJECTS:RTSanitizerCommonSymbolizer.${arch}>
$<TARGET_OBJECTS:RTUbsan.${arch}>
$<TARGET_OBJECTS:RTUbsan_cxx.${arch}>
${COMPILER_RT_GTEST_SOURCE}

View File

@ -25,6 +25,11 @@
#endif
#endif
#if defined(__sun__) && defined(__svr4__)
using std::_setjmp;
using std::_longjmp;
#endif
NOINLINE void *malloc_fff(size_t size) {
void *res = malloc/**/(size); break_optimization(0); return res;}
NOINLINE void *malloc_eee(size_t size) {

View File

@ -173,8 +173,8 @@ set(GENERIC_TF_SOURCES
trunctfsf2.c)
option(COMPILER_RT_EXCLUDE_ATOMIC_BUILTIN
"Skip the atomic builtin (this may be needed if system headers are unavailable)"
Off)
"Skip the atomic builtin (these should normally be provided by a shared library)"
On)
if(NOT FUCHSIA AND NOT COMPILER_RT_BAREMETAL_BUILD)
set(GENERIC_SOURCES
@ -406,6 +406,7 @@ if(MINGW)
arm/aeabi_ldivmod.S
arm/aeabi_uidivmod.S
arm/aeabi_uldivmod.S
arm/chkstk.S
divmoddi4.c
divmodsi4.c
divdi3.c
@ -459,6 +460,41 @@ set(armv6m_SOURCES ${thumb1_SOURCES})
set(armv7m_SOURCES ${arm_SOURCES})
set(armv7em_SOURCES ${arm_SOURCES})
# hexagon arch
set(hexagon_SOURCES ${GENERIC_SOURCES} ${GENERIC_TF_SOURCES})
set(hexagon_SOURCES
hexagon/common_entry_exit_abi1.S
hexagon/common_entry_exit_abi2.S
hexagon/common_entry_exit_legacy.S
hexagon/dfaddsub.S
hexagon/dfdiv.S
hexagon/dffma.S
hexagon/dfminmax.S
hexagon/dfmul.S
hexagon/dfsqrt.S
hexagon/divdi3.S
hexagon/divsi3.S
hexagon/fabs_opt.S
hexagon/fastmath2_dlib_asm.S
hexagon/fastmath2_ldlib_asm.S
hexagon/fastmath_dlib_asm.S
hexagon/fma_opt.S
hexagon/fmax_opt.S
hexagon/fmin_opt.S
hexagon/memcpy_forward_vp4cp4n2.S
hexagon/memcpy_likely_aligned.S
hexagon/moddi3.S
hexagon/modsi3.S
hexagon/sfdiv_opt.S
hexagon/sfsqrt_opt.S
hexagon/udivdi3.S
hexagon/udivmoddi4.S
hexagon/udivmodsi4.S
hexagon/udivsi3.S
hexagon/umoddi3.S
hexagon/umodsi3.S)
set(mips_SOURCES ${GENERIC_SOURCES})
set(mipsel_SOURCES ${mips_SOURCES})
set(mips64_SOURCES ${GENERIC_TF_SOURCES}
@ -480,6 +516,12 @@ set(powerpc64_SOURCES
${GENERIC_SOURCES})
set(powerpc64le_SOURCES ${powerpc64_SOURCES})
set(riscv_SOURCES ${GENERIC_SOURCES} ${GENERIC_TF_SOURCES})
set(riscv32_SOURCES
riscv/mulsi3.S
${riscv_SOURCES})
set(riscv64_SOURCES ${riscv_SOURCES})
set(wasm32_SOURCES
${GENERIC_TF_SOURCES}
${GENERIC_SOURCES})
@ -542,6 +584,12 @@ else ()
list(APPEND BUILTIN_CFLAGS -fomit-frame-pointer -DCOMPILER_RT_ARMHF_TARGET)
endif()
# For RISCV32, we must force enable int128 for compiling long
# double routines.
if("${arch}" STREQUAL "riscv32")
list(APPEND BUILTIN_CFLAGS -fforce-enable-int128)
endif()
add_compiler_rt_runtime(clang_rt.builtins
STATIC
ARCHS ${arch}

lib/builtins/arm/chkstk.S Normal file
View File

@ -0,0 +1,34 @@
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
#include "../assembly.h"
// __chkstk routine
// This routine is Windows-specific.
// http://msdn.microsoft.com/en-us/library/ms648426.aspx
// This clobbers the register r12, and the condition codes, and uses r5 and r6
// as temporaries by backing them up and restoring them afterwards.
// Does not modify any memory or the stack pointer.
// movw r4, #256 // Number of bytes of stack, in units of 4 bytes
// bl __chkstk
// sub.w sp, sp, r4
#define PAGE_SIZE 4096
.p2align 2
DEFINE_COMPILERRT_FUNCTION(__chkstk)
lsl r4, r4, #2
mov r12, sp
push {r5, r6}
mov r5, r4
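// Probe the stack one page at a time: walk down from the current stack
// pointer, touching each page, until the requested byte count in r5 is
// exhausted.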
1:
sub r12, r12, #PAGE_SIZE
subs r5, r5, #PAGE_SIZE
ldr r6, [r12]
bgt 1b
pop {r5, r6}
bx lr
END_COMPILERRT_FUNCTION(__chkstk)

View File

@ -33,6 +33,11 @@ uintptr_t GetCurrentProcess(void);
#include <machine/sysarch.h>
#endif
#if defined(__OpenBSD__) && defined(__mips__)
#include <sys/types.h>
#include <machine/sysarch.h>
#endif
#if defined(__linux__) && defined(__mips__)
#include <sys/cachectl.h>
#include <sys/syscall.h>
@ -96,6 +101,8 @@ void __clear_cache(void *start, void *end) {
* Intel processors have a unified instruction and data cache
* so there is nothing to do
*/
#elif defined(_WIN32) && (defined(__arm__) || defined(__aarch64__))
FlushInstructionCache(GetCurrentProcess(), start, end - start);
#elif defined(__arm__) && !defined(__APPLE__)
#if defined(__FreeBSD__) || defined(__NetBSD__)
struct arm_sync_icache_args arg;
@ -123,8 +130,6 @@ void __clear_cache(void *start, void *end) {
: "r"(syscall_nr), "r"(start_reg), "r"(end_reg),
"r"(flags));
assert(start_reg == 0 && "Cache flush syscall failed.");
#elif defined(_WIN32)
FlushInstructionCache(GetCurrentProcess(), start, end - start);
#else
compilerrt_abort();
#endif
@ -142,6 +147,8 @@ void __clear_cache(void *start, void *end) {
#else
syscall(__NR_cacheflush, start, (end_int - start_int), BCACHE);
#endif
#elif defined(__mips__) && defined(__OpenBSD__)
cacheflush(start, (uintptr_t)end - (uintptr_t)start, BCACHE);
#elif defined(__aarch64__) && !defined(__APPLE__)
uint64_t xstart = (uint64_t)(uintptr_t) start;
uint64_t xend = (uint64_t)(uintptr_t) end;
@ -156,12 +163,14 @@ void __clear_cache(void *start, void *end) {
* uintptr_t in case this runs in an ILP32 environment.
*/
const size_t dcache_line_size = 4 << ((ctr_el0 >> 16) & 15);
for (addr = xstart; addr < xend; addr += dcache_line_size)
for (addr = xstart & ~(dcache_line_size - 1); addr < xend;
addr += dcache_line_size)
__asm __volatile("dc cvau, %0" :: "r"(addr));
__asm __volatile("dsb ish");
const size_t icache_line_size = 4 << ((ctr_el0 >> 0) & 15);
for (addr = xstart; addr < xend; addr += icache_line_size)
for (addr = xstart & ~(icache_line_size - 1); addr < xend;
addr += icache_line_size)
__asm __volatile("ic ivau, %0" :: "r"(addr));
__asm __volatile("isb sy");
#elif defined (__powerpc64__)

View File

@ -16,6 +16,12 @@
/* Returns: the number of leading 0-bits */
#if !defined(__clang__) && (defined(__sparc64__) || defined(__mips64) || defined(__riscv__))
/* gcc resolves __builtin_clz -> __clzdi2 leading to infinite recursion */
#define __builtin_clz(a) __clzsi2(a)
extern si_int __clzsi2(si_int);
#endif
/* Precondition: a != 0 */
COMPILER_RT_ABI si_int

View File

@ -416,9 +416,9 @@ static void getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model,
*Subtype = AMDFAM15H_BDVER3;
break; // "bdver3"; 30h-3Fh: Steamroller
}
if (Model >= 0x10 && Model <= 0x1f) {
if ((Model >= 0x10 && Model <= 0x1f) || Model == 0x02) {
*Subtype = AMDFAM15H_BDVER2;
break; // "bdver2"; 10h-1Fh: Piledriver
break; // "bdver2"; 02h, 10h-1Fh: Piledriver
}
if (Model <= 0x0f) {
*Subtype = AMDFAM15H_BDVER1;

View File

@ -16,6 +16,12 @@
/* Returns: the number of trailing 0-bits */
#if !defined(__clang__) && (defined(__sparc64__) || defined(__mips64) || defined(__riscv__))
/* gcc resolves __builtin_ctz -> __ctzdi2 leading to infinite recursion */
#define __builtin_ctz(a) __ctzsi2(a)
extern si_int __ctzsi2(si_int);
#endif
/* Precondition: a != 0 */
COMPILER_RT_ABI si_int

View File

@ -14,7 +14,22 @@
#include "int_lib.h"
#include "int_util.h"
#ifdef __BIONIC__
/* There are 4 pthread key cleanup rounds on Bionic. Delay emutls deallocation
to round 2. We need to delay deallocation because:
- Android versions older than M lack __cxa_thread_atexit_impl, so apps
use a pthread key destructor to call C++ destructors.
- Apps might use __thread/thread_local variables in pthread destructors.
We can't wait until the final two rounds, because jemalloc needs two rounds
after the final malloc/free call to free its thread-specific data (see
https://reviews.llvm.org/D46978#1107507). */
#define EMUTLS_SKIP_DESTRUCTOR_ROUNDS 1
#else
#define EMUTLS_SKIP_DESTRUCTOR_ROUNDS 0
#endif
typedef struct emutls_address_array {
uintptr_t skip_destructor_rounds;
uintptr_t size; /* number of elements in the 'data' array */
void* data[];
} emutls_address_array;
@ -65,9 +80,30 @@ static __inline void emutls_memalign_free(void *base) {
#endif
}
static __inline void emutls_setspecific(emutls_address_array *value) {
pthread_setspecific(emutls_pthread_key, (void*) value);
}
static __inline emutls_address_array* emutls_getspecific() {
return (emutls_address_array*) pthread_getspecific(emutls_pthread_key);
}
static void emutls_key_destructor(void* ptr) {
emutls_shutdown((emutls_address_array*)ptr);
free(ptr);
emutls_address_array *array = (emutls_address_array*)ptr;
if (array->skip_destructor_rounds > 0) {
/* emutls is deallocated using a pthread key destructor. These
* destructors are called in several rounds to accommodate destructor
* functions that (re)initialize key values with pthread_setspecific.
* Delay the emutls deallocation to accommodate other end-of-thread
* cleanup tasks like calling thread_local destructors (e.g. the
* __cxa_thread_atexit fallback in libc++abi).
*/
array->skip_destructor_rounds--;
emutls_setspecific(array);
} else {
emutls_shutdown(array);
free(ptr);
}
}
static __inline void emutls_init(void) {
@ -88,15 +124,7 @@ static __inline void emutls_unlock() {
pthread_mutex_unlock(&emutls_mutex);
}
static __inline void emutls_setspecific(emutls_address_array *value) {
pthread_setspecific(emutls_pthread_key, (void*) value);
}
static __inline emutls_address_array* emutls_getspecific() {
return (emutls_address_array*) pthread_getspecific(emutls_pthread_key);
}
#else
#else /* _WIN32 */
#include <windows.h>
#include <malloc.h>
@ -222,11 +250,11 @@ static __inline void __atomic_store_n(void *ptr, uintptr_t val, unsigned type) {
InterlockedExchangePointer((void *volatile *)ptr, (void *)val);
}
#endif
#endif /* __ATOMIC_RELEASE */
#pragma warning (pop)
#endif
#endif /* _WIN32 */
static size_t emutls_num_object = 0; /* number of allocated TLS objects */
@ -314,11 +342,12 @@ static __inline void emutls_check_array_set_size(emutls_address_array *array,
* which must be no smaller than the given index.
*/
static __inline uintptr_t emutls_new_data_array_size(uintptr_t index) {
/* Need to allocate emutls_address_array with one extra slot
* to store the data array size.
/* Need to allocate emutls_address_array with extra slots
* to store the header.
* Round up the emutls_address_array size to multiple of 16.
*/
return ((index + 1 + 15) & ~((uintptr_t)15)) - 1;
uintptr_t header_words = sizeof(emutls_address_array) / sizeof(void *);
return ((index + header_words + 15) & ~((uintptr_t)15)) - header_words;
}
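/* For example, on a typical 64-bit target the two header words
   (skip_destructor_rounds and size) give header_words == 2, so an index of 1
   yields ((1 + 2 + 15) & ~15) - 2 == 14 data slots, 16 words in total. */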
/* Returns the size in bytes required for an emutls_address_array with
@ -337,8 +366,10 @@ emutls_get_address_array(uintptr_t index) {
if (array == NULL) {
uintptr_t new_size = emutls_new_data_array_size(index);
array = (emutls_address_array*) malloc(emutls_asize(new_size));
if (array)
if (array) {
memset(array->data, 0, new_size * sizeof(void*));
array->skip_destructor_rounds = EMUTLS_SKIP_DESTRUCTOR_ROUNDS;
}
emutls_check_array_set_size(array, new_size);
} else if (index > array->size) {
uintptr_t orig_size = array->size;

View File

@ -0,0 +1,103 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/* Functions that implement common sequences in function prologues and epilogues
used to save code size */
.macro FUNCTION_BEGIN name
.text
.globl \name
.type \name, @function
.falign
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
.macro FALLTHROUGH_TAIL_CALL name0 name1
.size \name0, . - \name0
.globl \name1
.type \name1, @function
.falign
\name1:
.endm
/* Save r25:24 at fp+#-8 and r27:26 at fp+#-16. */
/* The compiler knows that the __save_* functions clobber LR. No other
registers should be used without informing the compiler. */
/* Since we can only issue one store per packet, we don't hurt performance by
simply jumping to the right point in this sequence of stores. */
FUNCTION_BEGIN __save_r24_through_r27
memd(fp+#-16) = r27:26
FALLTHROUGH_TAIL_CALL __save_r24_through_r27 __save_r24_through_r25
{
memd(fp+#-8) = r25:24
jumpr lr
}
FUNCTION_END __save_r24_through_r25
/* For each of the *_before_tailcall functions, jumpr lr is executed in parallel
with deallocframe. That way, the return gets the old value of lr, which is
where these functions need to return, and at the same time, lr gets the value
it needs going into the tail call. */
FUNCTION_BEGIN __restore_r24_through_r27_and_deallocframe_before_tailcall
r27:26 = memd(fp+#-16)
FALLTHROUGH_TAIL_CALL __restore_r24_through_r27_and_deallocframe_before_tailcall __restore_r24_through_r25_and_deallocframe_before_tailcall
{
r25:24 = memd(fp+#-8)
deallocframe
jumpr lr
}
FUNCTION_END __restore_r24_through_r25_and_deallocframe_before_tailcall
/* Here we use the extra load bandwidth to restore LR early, allowing the return
to occur in parallel with the deallocframe. */
FUNCTION_BEGIN __restore_r24_through_r27_and_deallocframe
{
lr = memw(fp+#4)
r27:26 = memd(fp+#-16)
}
{
r25:24 = memd(fp+#-8)
deallocframe
jumpr lr
}
FUNCTION_END __restore_r24_through_r27_and_deallocframe
/* Here the load bandwidth is maximized. */
FUNCTION_BEGIN __restore_r24_through_r25_and_deallocframe
{
r25:24 = memd(fp+#-8)
deallocframe
}
jumpr lr
FUNCTION_END __restore_r24_through_r25_and_deallocframe

View File

@ -0,0 +1,268 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/* Functions that implement common sequences in function prologues and epilogues
used to save code size */
.macro FUNCTION_BEGIN name
.p2align 2
.section .text.\name,"ax",@progbits
.globl \name
.type \name, @function
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
.macro FALLTHROUGH_TAIL_CALL name0 name1
.p2align 2
.size \name0, . - \name0
.globl \name1
.type \name1, @function
\name1:
.endm
/* Save r17:16 at fp+#-8, r19:18 at fp+#-16, r21:20 at fp+#-24, r23:22 at
fp+#-32, r25:24 at fp+#-40, and r27:26 at fp+#-48.
The compiler knows that the __save_* functions clobber LR. No other
registers should be used without informing the compiler. */
FUNCTION_BEGIN __save_r16_through_r27
{
memd(fp+#-48) = r27:26
memd(fp+#-40) = r25:24
}
{
memd(fp+#-32) = r23:22
memd(fp+#-24) = r21:20
}
{
memd(fp+#-16) = r19:18
memd(fp+#-8) = r17:16
jumpr lr
}
FUNCTION_END __save_r16_through_r27
FUNCTION_BEGIN __save_r16_through_r25
{
memd(fp+#-40) = r25:24
memd(fp+#-32) = r23:22
}
{
memd(fp+#-24) = r21:20
memd(fp+#-16) = r19:18
}
{
memd(fp+#-8) = r17:16
jumpr lr
}
FUNCTION_END __save_r16_through_r25
FUNCTION_BEGIN __save_r16_through_r23
{
memd(fp+#-32) = r23:22
memd(fp+#-24) = r21:20
}
{
memd(fp+#-16) = r19:18
memd(fp+#-8) = r17:16
jumpr lr
}
FUNCTION_END __save_r16_through_r23
FUNCTION_BEGIN __save_r16_through_r21
{
memd(fp+#-24) = r21:20
memd(fp+#-16) = r19:18
}
{
memd(fp+#-8) = r17:16
jumpr lr
}
FUNCTION_END __save_r16_through_r21
FUNCTION_BEGIN __save_r16_through_r19
{
memd(fp+#-16) = r19:18
memd(fp+#-8) = r17:16
jumpr lr
}
FUNCTION_END __save_r16_through_r19
FUNCTION_BEGIN __save_r16_through_r17
{
memd(fp+#-8) = r17:16
jumpr lr
}
FUNCTION_END __save_r16_through_r17
/* For each of the *_before_tailcall functions, jumpr lr is executed in parallel
with deallocframe. That way, the return gets the old value of lr, which is
where these functions need to return, and at the same time, lr gets the value
it needs going into the tail call. */
FUNCTION_BEGIN __restore_r16_through_r27_and_deallocframe_before_tailcall
r27:26 = memd(fp+#-48)
{
r25:24 = memd(fp+#-40)
r23:22 = memd(fp+#-32)
}
{
r21:20 = memd(fp+#-24)
r19:18 = memd(fp+#-16)
}
{
r17:16 = memd(fp+#-8)
deallocframe
jumpr lr
}
FUNCTION_END __restore_r16_through_r27_and_deallocframe_before_tailcall
FUNCTION_BEGIN __restore_r16_through_r25_and_deallocframe_before_tailcall
{
r25:24 = memd(fp+#-40)
r23:22 = memd(fp+#-32)
}
{
r21:20 = memd(fp+#-24)
r19:18 = memd(fp+#-16)
}
{
r17:16 = memd(fp+#-8)
deallocframe
jumpr lr
}
FUNCTION_END __restore_r16_through_r25_and_deallocframe_before_tailcall
FUNCTION_BEGIN __restore_r16_through_r23_and_deallocframe_before_tailcall
{
r23:22 = memd(fp+#-32)
r21:20 = memd(fp+#-24)
}
r19:18 = memd(fp+#-16)
{
r17:16 = memd(fp+#-8)
deallocframe
jumpr lr
}
FUNCTION_END __restore_r16_through_r23_and_deallocframe_before_tailcall
FUNCTION_BEGIN __restore_r16_through_r21_and_deallocframe_before_tailcall
{
r21:20 = memd(fp+#-24)
r19:18 = memd(fp+#-16)
}
{
r17:16 = memd(fp+#-8)
deallocframe
jumpr lr
}
FUNCTION_END __restore_r16_through_r21_and_deallocframe_before_tailcall
FUNCTION_BEGIN __restore_r16_through_r19_and_deallocframe_before_tailcall
r19:18 = memd(fp+#-16)
{
r17:16 = memd(fp+#-8)
deallocframe
jumpr lr
}
FUNCTION_END __restore_r16_through_r19_and_deallocframe_before_tailcall
FUNCTION_BEGIN __restore_r16_through_r17_and_deallocframe_before_tailcall
{
r17:16 = memd(fp+#-8)
deallocframe
jumpr lr
}
FUNCTION_END __restore_r16_through_r17_and_deallocframe_before_tailcall
FUNCTION_BEGIN __restore_r16_through_r27_and_deallocframe
r27:26 = memd(fp+#-48)
{
r25:24 = memd(fp+#-40)
r23:22 = memd(fp+#-32)
}
{
r21:20 = memd(fp+#-24)
r19:18 = memd(fp+#-16)
}
{
r17:16 = memd(fp+#-8)
dealloc_return
}
FUNCTION_END __restore_r16_through_r27_and_deallocframe
FUNCTION_BEGIN __restore_r16_through_r25_and_deallocframe
{
r25:24 = memd(fp+#-40)
r23:22 = memd(fp+#-32)
}
{
r21:20 = memd(fp+#-24)
r19:18 = memd(fp+#-16)
}
{
r17:16 = memd(fp+#-8)
dealloc_return
}
FUNCTION_END __restore_r16_through_r25_and_deallocframe
FUNCTION_BEGIN __restore_r16_through_r23_and_deallocframe
{
r23:22 = memd(fp+#-32)
}
{
r21:20 = memd(fp+#-24)
r19:18 = memd(fp+#-16)
}
{
r17:16 = memd(fp+#-8)
dealloc_return
}
FUNCTION_END __restore_r16_through_r23_and_deallocframe
FUNCTION_BEGIN __restore_r16_through_r21_and_deallocframe
{
r21:20 = memd(fp+#-24)
r19:18 = memd(fp+#-16)
}
{
r17:16 = memd(fp+#-8)
dealloc_return
}
FUNCTION_END __restore_r16_through_r21_and_deallocframe
FUNCTION_BEGIN __restore_r16_through_r19_and_deallocframe
{
r19:18 = memd(fp+#-16)
r17:16 = memd(fp+#-8)
}
{
dealloc_return
}
FUNCTION_END __restore_r16_through_r19_and_deallocframe
FUNCTION_BEGIN __restore_r16_through_r17_and_deallocframe
{
r17:16 = memd(fp+#-8)
dealloc_return
}
FUNCTION_END __restore_r16_through_r17_and_deallocframe
FUNCTION_BEGIN __deallocframe
dealloc_return
FUNCTION_END __deallocframe

View File

@ -0,0 +1,157 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/* Functions that implement common sequences in function prologues and epilogues
used to save code size */
.macro FUNCTION_BEGIN name
.text
.globl \name
.type \name, @function
.falign
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
.macro FALLTHROUGH_TAIL_CALL name0 name1
.size \name0, . - \name0
.globl \name1
.type \name1, @function
.falign
\name1:
.endm
/* Save r27:26 at fp+#-8, r25:24 at fp+#-16, r23:22 at fp+#-24, r21:20 at
fp+#-32, r19:18 at fp+#-40, and r17:16 at fp+#-48. */
/* The compiler knows that the __save_* functions clobber LR. No other
registers should be used without informing the compiler. */
/* Since we can only issue one store per packet, we don't hurt performance by
simply jumping to the right point in this sequence of stores. */
FUNCTION_BEGIN __save_r27_through_r16
memd(fp+#-48) = r17:16
FALLTHROUGH_TAIL_CALL __save_r27_through_r16 __save_r27_through_r18
memd(fp+#-40) = r19:18
FALLTHROUGH_TAIL_CALL __save_r27_through_r18 __save_r27_through_r20
memd(fp+#-32) = r21:20
FALLTHROUGH_TAIL_CALL __save_r27_through_r20 __save_r27_through_r22
memd(fp+#-24) = r23:22
FALLTHROUGH_TAIL_CALL __save_r27_through_r22 __save_r27_through_r24
memd(fp+#-16) = r25:24
{
memd(fp+#-8) = r27:26
jumpr lr
}
FUNCTION_END __save_r27_through_r24
/* For each of the *_before_sibcall functions, jumpr lr is executed in parallel
with deallocframe. That way, the return gets the old value of lr, which is
where these functions need to return, and at the same time, lr gets the value
it needs going into the sibcall. */
FUNCTION_BEGIN __restore_r27_through_r20_and_deallocframe_before_sibcall
{
r21:20 = memd(fp+#-32)
r23:22 = memd(fp+#-24)
}
FALLTHROUGH_TAIL_CALL __restore_r27_through_r20_and_deallocframe_before_sibcall __restore_r27_through_r24_and_deallocframe_before_sibcall
{
r25:24 = memd(fp+#-16)
jump __restore_r27_through_r26_and_deallocframe_before_sibcall
}
FUNCTION_END __restore_r27_through_r24_and_deallocframe_before_sibcall
FUNCTION_BEGIN __restore_r27_through_r16_and_deallocframe_before_sibcall
r17:16 = memd(fp+#-48)
FALLTHROUGH_TAIL_CALL __restore_r27_through_r16_and_deallocframe_before_sibcall __restore_r27_through_r18_and_deallocframe_before_sibcall
{
r19:18 = memd(fp+#-40)
r21:20 = memd(fp+#-32)
}
FALLTHROUGH_TAIL_CALL __restore_r27_through_r18_and_deallocframe_before_sibcall __restore_r27_through_r22_and_deallocframe_before_sibcall
{
r23:22 = memd(fp+#-24)
r25:24 = memd(fp+#-16)
}
FALLTHROUGH_TAIL_CALL __restore_r27_through_r22_and_deallocframe_before_sibcall __restore_r27_through_r26_and_deallocframe_before_sibcall
{
r27:26 = memd(fp+#-8)
deallocframe
jumpr lr
}
FUNCTION_END __restore_r27_through_r26_and_deallocframe_before_sibcall
/* Here we use the extra load bandwidth to restore LR early, allowing the return
to occur in parallel with the deallocframe. */
FUNCTION_BEGIN __restore_r27_through_r16_and_deallocframe
{
r17:16 = memd(fp+#-48)
r19:18 = memd(fp+#-40)
}
FALLTHROUGH_TAIL_CALL __restore_r27_through_r16_and_deallocframe __restore_r27_through_r20_and_deallocframe
{
r21:20 = memd(fp+#-32)
r23:22 = memd(fp+#-24)
}
FALLTHROUGH_TAIL_CALL __restore_r27_through_r20_and_deallocframe __restore_r27_through_r24_and_deallocframe
{
lr = memw(fp+#4)
r25:24 = memd(fp+#-16)
}
{
r27:26 = memd(fp+#-8)
deallocframe
jumpr lr
}
FUNCTION_END __restore_r27_through_r24_and_deallocframe
/* Here the load bandwidth is maximized for all three functions. */
FUNCTION_BEGIN __restore_r27_through_r18_and_deallocframe
{
r19:18 = memd(fp+#-40)
r21:20 = memd(fp+#-32)
}
FALLTHROUGH_TAIL_CALL __restore_r27_through_r18_and_deallocframe __restore_r27_through_r22_and_deallocframe
{
r23:22 = memd(fp+#-24)
r25:24 = memd(fp+#-16)
}
FALLTHROUGH_TAIL_CALL __restore_r27_through_r22_and_deallocframe __restore_r27_through_r26_and_deallocframe
{
r27:26 = memd(fp+#-8)
deallocframe
}
jumpr lr
FUNCTION_END __restore_r27_through_r26_and_deallocframe

View File

@ -0,0 +1,398 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/* Double Precision Add/Subtract */
#define A r1:0
#define AH r1
#define AL r0
#define B r3:2
#define BH r3
#define BL r2
#define EXPA r4
#define EXPB r5
#define EXPB_A r5:4
#define ZTMP r7:6
#define ZTMPH r7
#define ZTMPL r6
#define ATMP r13:12
#define ATMPH r13
#define ATMPL r12
#define BTMP r9:8
#define BTMPH r9
#define BTMPL r8
#define ATMP2 r11:10
#define ATMP2H r11
#define ATMP2L r10
#define EXPDIFF r15
#define EXTRACTOFF r14
#define EXTRACTAMT r15:14
#define TMP r28
#define MANTBITS 52
#define HI_MANTBITS 20
#define EXPBITS 11
#define BIAS 1024
#define MANTISSA_TO_INT_BIAS 52
#define SR_BIT_INEXACT 5
#ifndef SR_ROUND_OFF
#define SR_ROUND_OFF 22
#endif
#define NORMAL p3
#define BIGB p2
#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
#define END(TAG) .size TAG,.-TAG
.text
.global __hexagon_adddf3
.global __hexagon_subdf3
.type __hexagon_adddf3, @function
.type __hexagon_subdf3, @function
Q6_ALIAS(adddf3)
FAST_ALIAS(adddf3)
FAST2_ALIAS(adddf3)
Q6_ALIAS(subdf3)
FAST_ALIAS(subdf3)
FAST2_ALIAS(subdf3)
.p2align 5
__hexagon_adddf3:
{
EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
ATMP = combine(##0x20000000,#0)
}
{
NORMAL = dfclass(A,#2)
NORMAL = dfclass(B,#2)
BTMP = ATMP
BIGB = cmp.gtu(EXPB,EXPA) // Is B substantially greater than A?
}
{
if (!NORMAL) jump .Ladd_abnormal // If abnormal, go to special code
if (BIGB) A = B // if B >> A, swap A and B
if (BIGB) B = A // If B >> A, swap A and B
if (BIGB) EXPB_A = combine(EXPA,EXPB) // swap exponents
}
{
ATMP = insert(A,#MANTBITS,#EXPBITS-2) // Q1.62
BTMP = insert(B,#MANTBITS,#EXPBITS-2) // Q1.62
EXPDIFF = sub(EXPA,EXPB)
ZTMP = combine(#62,#1)
}
#undef BIGB
#undef NORMAL
#define B_POS p3
#define A_POS p2
#define NO_STICKIES p1
.Ladd_continue:
{
EXPDIFF = min(EXPDIFF,ZTMPH) // If exponent difference >= ~60,
// will collapse to sticky bit
ATMP2 = neg(ATMP)
A_POS = cmp.gt(AH,#-1)
EXTRACTOFF = #0
}
{
if (!A_POS) ATMP = ATMP2
ATMP2 = extractu(BTMP,EXTRACTAMT)
BTMP = ASR(BTMP,EXPDIFF)
#undef EXTRACTAMT
#undef EXPDIFF
#undef EXTRACTOFF
#define ZERO r15:14
ZERO = #0
}
{
NO_STICKIES = cmp.eq(ATMP2,ZERO)
if (!NO_STICKIES.new) BTMPL = or(BTMPL,ZTMPL)
EXPB = add(EXPA,#-BIAS-60)
B_POS = cmp.gt(BH,#-1)
}
{
ATMP = add(ATMP,BTMP) // ADD!!!
ATMP2 = sub(ATMP,BTMP) // Negate and ADD --> SUB!!!
ZTMP = combine(#54,##2045)
}
{
p0 = cmp.gtu(EXPA,ZTMPH) // must be pretty high in case of large cancellation
p0 = !cmp.gtu(EXPA,ZTMPL)
if (!p0.new) jump:nt .Ladd_ovf_unf
if (!B_POS) ATMP = ATMP2 // if B neg, pick difference
}
{
A = convert_d2df(ATMP) // Convert to Double Precision, taking care of flags, etc. So nice!
p0 = cmp.eq(ATMPH,#0)
p0 = cmp.eq(ATMPL,#0)
if (p0.new) jump:nt .Ladd_zero // or maybe conversion handles zero case correctly?
}
{
AH += asl(EXPB,#HI_MANTBITS)
jumpr r31
}
.falign
__hexagon_subdf3:
{
BH = togglebit(BH,#31)
jump __qdsp_adddf3
}
.falign
.Ladd_zero:
// True zero, full cancellation
// +0 unless round towards negative infinity
{
TMP = USR
A = #0
BH = #1
}
{
TMP = extractu(TMP,#2,#22)
BH = asl(BH,#31)
}
{
p0 = cmp.eq(TMP,#2)
if (p0.new) AH = xor(AH,BH)
jumpr r31
}
.falign
.Ladd_ovf_unf:
// Overflow or Denormal is possible
// Good news: Underflow flag is not possible!
/*
* ATMP has 2's complement value
*
* EXPA has A's exponent, EXPB has EXPA-BIAS-60
*
* Convert, extract exponent, add adjustment.
* If > 2046, overflow
* If <= 0, denormal
*
* Note that we've not done our zero check yet, so do that too
*
*/
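/* Roughly, in C terms (an illustrative sketch only; the function and variable names
   are placeholders, not part of the library):

       // Classify the adjusted exponent the same way the code below does.
       // 'biased_exp' is the exponent field of the converted sum and 'adjust' is
       // the pending correction (EXPA - BIAS - 60 above).
       static int classify_result_exponent(int biased_exp, int adjust) {
           int e = biased_exp + adjust;
           if (e > 2046) return  1;   // overflow: max finite or infinity
           if (e <= 0)   return -1;   // denormal: shift the mantissa right by 1 - e
           return 0;                  // in range: fold the adjustment into the exponent
       }
*/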
{
A = convert_d2df(ATMP)
p0 = cmp.eq(ATMPH,#0)
p0 = cmp.eq(ATMPL,#0)
if (p0.new) jump:nt .Ladd_zero
}
{
TMP = extractu(AH,#EXPBITS,#HI_MANTBITS)
AH += asl(EXPB,#HI_MANTBITS)
}
{
EXPB = add(EXPB,TMP)
B = combine(##0x00100000,#0)
}
{
p0 = cmp.gt(EXPB,##BIAS+BIAS-2)
if (p0.new) jump:nt .Ladd_ovf
}
{
p0 = cmp.gt(EXPB,#0)
if (p0.new) jumpr:t r31
TMP = sub(#1,EXPB)
}
{
B = insert(A,#MANTBITS,#0)
A = ATMP
}
{
B = lsr(B,TMP)
}
{
A = insert(B,#63,#0)
jumpr r31
}
.falign
.Ladd_ovf:
// We get either max finite value or infinity. Either way, overflow+inexact
{
A = ATMP // 2's complement value
TMP = USR
ATMP = combine(##0x7fefffff,#-1) // positive max finite
}
{
EXPB = extractu(TMP,#2,#SR_ROUND_OFF) // rounding bits
TMP = or(TMP,#0x28) // inexact + overflow
BTMP = combine(##0x7ff00000,#0) // positive infinity
}
{
USR = TMP
EXPB ^= lsr(AH,#31) // Does sign match rounding?
TMP = EXPB // unmodified rounding mode
}
{
p0 = !cmp.eq(TMP,#1) // If not round-to-zero and
p0 = !cmp.eq(EXPB,#2) // Not rounding the other way,
if (p0.new) ATMP = BTMP // we should get infinity
}
{
A = insert(ATMP,#63,#0) // insert inf/maxfinite, leave sign
}
{
p0 = dfcmp.eq(A,A)
jumpr r31
}
.Ladd_abnormal:
{
ATMP = extractu(A,#63,#0) // strip off sign
BTMP = extractu(B,#63,#0) // strip off sign
}
{
p3 = cmp.gtu(ATMP,BTMP)
if (!p3.new) A = B // sort values
if (!p3.new) B = A // sort values
}
{
// Any NaN --> NaN, possibly raise invalid if sNaN
p0 = dfclass(A,#0x0f) // A not NaN?
if (!p0.new) jump:nt .Linvalid_nan_add
if (!p3) ATMP = BTMP
if (!p3) BTMP = ATMP
}
{
// Infinity + non-infinity number is infinity
// Infinity + infinity --> inf or nan
p1 = dfclass(A,#0x08) // A is infinity
if (p1.new) jump:nt .Linf_add
}
{
p2 = dfclass(B,#0x01) // B is zero
if (p2.new) jump:nt .LB_zero // so return A or special 0+0
ATMP = #0
}
// We are left with adding one or more subnormals
{
p0 = dfclass(A,#4)
if (p0.new) jump:nt .Ladd_two_subnormal
ATMP = combine(##0x20000000,#0)
}
{
EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
EXPB = #1
// BTMP already ABS(B)
BTMP = asl(BTMP,#EXPBITS-2)
}
#undef ZERO
#define EXTRACTOFF r14
#define EXPDIFF r15
{
ATMP = insert(A,#MANTBITS,#EXPBITS-2)
EXPDIFF = sub(EXPA,EXPB)
ZTMP = combine(#62,#1)
jump .Ladd_continue
}
.Ladd_two_subnormal:
{
ATMP = extractu(A,#63,#0)
BTMP = extractu(B,#63,#0)
}
{
ATMP = neg(ATMP)
BTMP = neg(BTMP)
p0 = cmp.gt(AH,#-1)
p1 = cmp.gt(BH,#-1)
}
{
if (p0) ATMP = A
if (p1) BTMP = B
}
{
ATMP = add(ATMP,BTMP)
}
{
BTMP = neg(ATMP)
p0 = cmp.gt(ATMPH,#-1)
B = #0
}
{
if (!p0) A = BTMP
if (p0) A = ATMP
BH = ##0x80000000
}
{
if (!p0) AH = or(AH,BH)
p0 = dfcmp.eq(A,B)
if (p0.new) jump:nt .Lzero_plus_zero
}
{
jumpr r31
}
.Linvalid_nan_add:
{
TMP = convert_df2sf(A) // will generate invalid if sNaN
p0 = dfclass(B,#0x0f) // if B is not NaN
if (p0.new) B = A // make it whatever A is
}
{
BL = convert_df2sf(B) // will generate invalid if sNaN
A = #-1
jumpr r31
}
.falign
.LB_zero:
{
p0 = dfcmp.eq(ATMP,A) // is A also zero?
if (!p0.new) jumpr:t r31 // If not, just return A
}
// 0 + 0 is special
// if equal integral values, they have the same sign, which is fine for all rounding
// modes.
// If unequal in sign, we get +0 for all rounding modes except round down
.Lzero_plus_zero:
{
p0 = cmp.eq(A,B)
if (p0.new) jumpr:t r31
}
{
TMP = USR
}
{
TMP = extractu(TMP,#2,#SR_ROUND_OFF)
A = #0
}
{
p0 = cmp.eq(TMP,#2)
if (p0.new) AH = ##0x80000000
jumpr r31
}
.Linf_add:
// adding infinities is only OK if they are equal
{
p0 = !cmp.eq(AH,BH) // Do they have different signs
p0 = dfclass(B,#8) // And is B also infinite?
if (!p0.new) jumpr:t r31 // If not, just a normal inf
}
{
BL = ##0x7f800001 // sNAN
}
{
A = convert_sf2df(BL) // trigger invalid, set NaN
jumpr r31
}
END(__hexagon_adddf3)

View File

@ -0,0 +1,492 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/* Double Precision Divide */
#define A r1:0
#define AH r1
#define AL r0
#define B r3:2
#define BH r3
#define BL r2
#define Q r5:4
#define QH r5
#define QL r4
#define PROD r7:6
#define PRODHI r7
#define PRODLO r6
#define SFONE r8
#define SFDEN r9
#define SFERROR r10
#define SFRECIP r11
#define EXPBA r13:12
#define EXPB r13
#define EXPA r12
#define REMSUB2 r15:14
#define SIGN r28
#define Q_POSITIVE p3
#define NORMAL p2
#define NO_OVF_UNF p1
#define P_TMP p0
#define RECIPEST_SHIFT 3
#define QADJ 61
#define DFCLASS_NORMAL 0x02
#define DFCLASS_NUMBER 0x0F
#define DFCLASS_INFINITE 0x08
#define DFCLASS_ZERO 0x01
#define DFCLASS_NONZERO (DFCLASS_NUMBER ^ DFCLASS_ZERO)
#define DFCLASS_NONINFINITE (DFCLASS_NUMBER ^ DFCLASS_INFINITE)
#define DF_MANTBITS 52
#define DF_EXPBITS 11
#define SF_MANTBITS 23
#define SF_EXPBITS 8
#define DF_BIAS 0x3ff
#define SR_ROUND_OFF 22
#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
#define END(TAG) .size TAG,.-TAG
.text
.global __hexagon_divdf3
.type __hexagon_divdf3,@function
Q6_ALIAS(divdf3)
FAST_ALIAS(divdf3)
FAST2_ALIAS(divdf3)
.p2align 5
__hexagon_divdf3:
{
NORMAL = dfclass(A,#DFCLASS_NORMAL)
NORMAL = dfclass(B,#DFCLASS_NORMAL)
EXPBA = combine(BH,AH)
SIGN = xor(AH,BH)
}
#undef A
#undef AH
#undef AL
#undef B
#undef BH
#undef BL
#define REM r1:0
#define REMHI r1
#define REMLO r0
#define DENOM r3:2
#define DENOMHI r3
#define DENOMLO r2
{
if (!NORMAL) jump .Ldiv_abnormal
PROD = extractu(DENOM,#SF_MANTBITS,#DF_MANTBITS-SF_MANTBITS)
SFONE = ##0x3f800001
}
{
SFDEN = or(SFONE,PRODLO)
EXPB = extractu(EXPB,#DF_EXPBITS,#DF_MANTBITS-32)
EXPA = extractu(EXPA,#DF_EXPBITS,#DF_MANTBITS-32)
Q_POSITIVE = cmp.gt(SIGN,#-1)
}
#undef SIGN
#define ONE r28
.Ldenorm_continue:
{
SFRECIP,P_TMP = sfrecipa(SFONE,SFDEN)
SFERROR = and(SFONE,#-2)
ONE = #1
EXPA = sub(EXPA,EXPB)
}
#undef EXPB
#define RECIPEST r13
{
SFERROR -= sfmpy(SFRECIP,SFDEN):lib
REMHI = insert(ONE,#DF_EXPBITS+1,#DF_MANTBITS-32)
RECIPEST = ##0x00800000 << RECIPEST_SHIFT
}
{
SFRECIP += sfmpy(SFRECIP,SFERROR):lib
DENOMHI = insert(ONE,#DF_EXPBITS+1,#DF_MANTBITS-32)
SFERROR = and(SFONE,#-2)
}
{
SFERROR -= sfmpy(SFRECIP,SFDEN):lib
QH = #-DF_BIAS+1
QL = #DF_BIAS-1
}
{
SFRECIP += sfmpy(SFRECIP,SFERROR):lib
NO_OVF_UNF = cmp.gt(EXPA,QH)
NO_OVF_UNF = !cmp.gt(EXPA,QL)
}
{
RECIPEST = insert(SFRECIP,#SF_MANTBITS,#RECIPEST_SHIFT)
Q = #0
EXPA = add(EXPA,#-QADJ)
}
#undef SFERROR
#undef SFRECIP
#define TMP r10
#define TMP1 r11
{
RECIPEST = add(RECIPEST,#((-3) << RECIPEST_SHIFT))
}
#define DIV_ITER1B(QSHIFTINSN,QSHIFT,REMSHIFT,EXTRA) \
{ \
PROD = mpyu(RECIPEST,REMHI); \
REM = asl(REM,# ## ( REMSHIFT )); \
}; \
{ \
PRODLO = # ## 0; \
REM -= mpyu(PRODHI,DENOMLO); \
REMSUB2 = mpyu(PRODHI,DENOMHI); \
}; \
{ \
Q += QSHIFTINSN(PROD, # ## ( QSHIFT )); \
REM -= asl(REMSUB2, # ## 32); \
EXTRA \
}
DIV_ITER1B(ASL,14,15,)
DIV_ITER1B(ASR,1,15,)
DIV_ITER1B(ASR,16,15,)
DIV_ITER1B(ASR,31,15,PROD=# ( 0 );)
#undef REMSUB2
#define TMPPAIR r15:14
#define TMPPAIRHI r15
#define TMPPAIRLO r14
#undef RECIPEST
#define EXPB r13
{
// compare or sub with carry
TMPPAIR = sub(REM,DENOM)
P_TMP = cmp.gtu(DENOM,REM)
// set up amt to add to q
if (!P_TMP.new) PRODLO = #2
}
{
Q = add(Q,PROD)
if (!P_TMP) REM = TMPPAIR
TMPPAIR = #0
}
{
P_TMP = cmp.eq(REM,TMPPAIR)
if (!P_TMP.new) QL = or(QL,ONE)
}
{
PROD = neg(Q)
}
{
if (!Q_POSITIVE) Q = PROD
}
#undef REM
#undef REMHI
#undef REMLO
#undef DENOM
#undef DENOMLO
#undef DENOMHI
#define A r1:0
#define AH r1
#define AL r0
#define B r3:2
#define BH r3
#define BL r2
{
A = convert_d2df(Q)
if (!NO_OVF_UNF) jump .Ldiv_ovf_unf
}
{
AH += asl(EXPA,#DF_MANTBITS-32)
jumpr r31
}
.Ldiv_ovf_unf:
{
AH += asl(EXPA,#DF_MANTBITS-32)
EXPB = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32)
}
{
PROD = abs(Q)
EXPA = add(EXPA,EXPB)
}
{
P_TMP = cmp.gt(EXPA,##DF_BIAS+DF_BIAS) // overflow
if (P_TMP.new) jump:nt .Ldiv_ovf
}
{
P_TMP = cmp.gt(EXPA,#0)
if (P_TMP.new) jump:nt .Lpossible_unf // round up to normal possible...
}
/* Underflow */
/* We know what the infinite range exponent should be (EXPA) */
/* Q is 2's complement, PROD is abs(Q) */
/* Normalize Q, shift right, add a high bit, convert, change exponent */
#define FUDGE1 7 // how much to shift right
#define FUDGE2 4 // how many guard/round to keep at lsbs
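/* The idea of the denormal path below, sketched in C (illustrative only; the helper
   name is a placeholder and the bit positions are schematic):

       // Shift the normalized quotient right by (FUDGE1 - exp), capped at 63,
       // folding every shifted-out bit into a sticky LSB so that the later
       // convert instruction still rounds correctly.
       static unsigned long long denorm_shift(unsigned long long q, int exp) {
           int shift = 7 - exp;                              // FUDGE1 - exp
           if (shift > 63) shift = 63;
           unsigned long long lost = q & ((1ULL << shift) - 1);
           return (q >> shift) | (lost != 0);                // sticky bit
       }
*/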
{
EXPB = add(clb(PROD),#-1) // doesn't need to be added in since
EXPA = sub(#FUDGE1,EXPA) // we extract post-converted exponent
TMP = USR
TMP1 = #63
}
{
EXPB = min(EXPA,TMP1)
TMP1 = or(TMP,#0x030)
PROD = asl(PROD,EXPB)
EXPA = #0
}
{
TMPPAIR = extractu(PROD,EXPBA) // bits that will get shifted out
PROD = lsr(PROD,EXPB) // shift out bits
B = #1
}
{
P_TMP = cmp.gtu(B,TMPPAIR)
if (!P_TMP.new) PRODLO = or(BL,PRODLO)
PRODHI = setbit(PRODHI,#DF_MANTBITS-32+FUDGE2)
}
{
Q = neg(PROD)
P_TMP = bitsclr(PRODLO,#(1<<FUDGE2)-1)
if (!P_TMP.new) TMP = TMP1
}
{
USR = TMP
if (Q_POSITIVE) Q = PROD
TMP = #-DF_BIAS-(DF_MANTBITS+FUDGE2)
}
{
A = convert_d2df(Q)
}
{
AH += asl(TMP,#DF_MANTBITS-32)
jumpr r31
}
.Lpossible_unf:
/* If upper parts of Q were all F's, but abs(A) == 0x00100000_00000000, we rounded up to min_normal */
/* The answer is correct, but we need to raise Underflow */
{
B = extractu(A,#63,#0)
TMPPAIR = combine(##0x00100000,#0) // min normal
TMP = #0x7FFF
}
{
P_TMP = dfcmp.eq(TMPPAIR,B) // Is everything zero in the rounded value...
P_TMP = bitsset(PRODHI,TMP) // but a bunch of bits set in the unrounded abs(quotient)?
}
#if (__HEXAGON_ARCH__ == 60)
TMP = USR // If not, just return
if (!P_TMP) jumpr r31 // Else, we want to set Unf+Inexact
// Note that inexact is already set...
#else
{
if (!P_TMP) jumpr r31 // If not, just return
TMP = USR // Else, we want to set Unf+Inexact
} // Note that inexact is already set...
#endif
{
TMP = or(TMP,#0x30)
}
{
USR = TMP
}
{
p0 = dfcmp.eq(A,A)
jumpr r31
}
.Ldiv_ovf:
/*
* Raise Overflow, and choose the correct overflow value (saturated normal or infinity)
*/
{
TMP = USR
B = combine(##0x7fefffff,#-1)
AH = mux(Q_POSITIVE,#0,#-1)
}
{
PROD = combine(##0x7ff00000,#0)
QH = extractu(TMP,#2,#SR_ROUND_OFF)
TMP = or(TMP,#0x28)
}
{
USR = TMP
QH ^= lsr(AH,#31)
QL = QH
}
{
p0 = !cmp.eq(QL,#1) // if not round-to-zero
p0 = !cmp.eq(QH,#2) // and not rounding the other way
if (p0.new) B = PROD // go to inf
p0 = dfcmp.eq(B,B) // get exceptions
}
{
A = insert(B,#63,#0)
jumpr r31
}
#undef ONE
#define SIGN r28
#undef NORMAL
#undef NO_OVF_UNF
#define P_INF p1
#define P_ZERO p2
.Ldiv_abnormal:
{
P_TMP = dfclass(A,#DFCLASS_NUMBER)
P_TMP = dfclass(B,#DFCLASS_NUMBER)
Q_POSITIVE = cmp.gt(SIGN,#-1)
}
{
P_INF = dfclass(A,#DFCLASS_INFINITE)
P_INF = dfclass(B,#DFCLASS_INFINITE)
}
{
P_ZERO = dfclass(A,#DFCLASS_ZERO)
P_ZERO = dfclass(B,#DFCLASS_ZERO)
}
{
if (!P_TMP) jump .Ldiv_nan
if (P_INF) jump .Ldiv_invalid
}
{
if (P_ZERO) jump .Ldiv_invalid
}
{
P_ZERO = dfclass(A,#DFCLASS_NONZERO) // nonzero
P_ZERO = dfclass(B,#DFCLASS_NONINFINITE) // non-infinite
}
{
P_INF = dfclass(A,#DFCLASS_NONINFINITE) // non-infinite
P_INF = dfclass(B,#DFCLASS_NONZERO) // nonzero
}
{
if (!P_ZERO) jump .Ldiv_zero_result
if (!P_INF) jump .Ldiv_inf_result
}
/* Now we've narrowed it down to (de)normal / (de)normal */
/* Set up A/EXPA B/EXPB and go back */
#undef P_ZERO
#undef P_INF
#define P_TMP2 p1
{
P_TMP = dfclass(A,#DFCLASS_NORMAL)
P_TMP2 = dfclass(B,#DFCLASS_NORMAL)
TMP = ##0x00100000
}
{
EXPBA = combine(BH,AH)
AH = insert(TMP,#DF_EXPBITS+1,#DF_MANTBITS-32) // clear out hidden bit, sign bit
BH = insert(TMP,#DF_EXPBITS+1,#DF_MANTBITS-32) // clear out hidden bit, sign bit
}
{
if (P_TMP) AH = or(AH,TMP) // if normal, add back in hidden bit
if (P_TMP2) BH = or(BH,TMP) // if normal, add back in hidden bit
}
{
QH = add(clb(A),#-DF_EXPBITS)
QL = add(clb(B),#-DF_EXPBITS)
TMP = #1
}
{
EXPA = extractu(EXPA,#DF_EXPBITS,#DF_MANTBITS-32)
EXPB = extractu(EXPB,#DF_EXPBITS,#DF_MANTBITS-32)
}
{
A = asl(A,QH)
B = asl(B,QL)
if (!P_TMP) EXPA = sub(TMP,QH)
if (!P_TMP2) EXPB = sub(TMP,QL)
} // recreate values needed by the resumed code
{
PROD = extractu(B,#SF_MANTBITS,#DF_MANTBITS-SF_MANTBITS)
}
{
SFDEN = or(SFONE,PRODLO)
jump .Ldenorm_continue
}
.Ldiv_zero_result:
{
AH = xor(AH,BH)
B = #0
}
{
A = insert(B,#63,#0)
jumpr r31
}
.Ldiv_inf_result:
{
p2 = dfclass(B,#DFCLASS_ZERO)
p2 = dfclass(A,#DFCLASS_NONINFINITE)
}
{
TMP = USR
if (!p2) jump 1f
AH = xor(AH,BH)
}
{
TMP = or(TMP,#0x04) // DBZ
}
{
USR = TMP
}
1:
{
B = combine(##0x7ff00000,#0)
p0 = dfcmp.uo(B,B) // take possible exception
}
{
A = insert(B,#63,#0)
jumpr r31
}
.Ldiv_nan:
{
p0 = dfclass(A,#0x10)
p1 = dfclass(B,#0x10)
if (!p0.new) A = B
if (!p1.new) B = A
}
{
QH = convert_df2sf(A) // get possible invalid exceptions
QL = convert_df2sf(B)
}
{
A = #-1
jumpr r31
}
.Ldiv_invalid:
{
TMP = ##0x7f800001
}
{
A = convert_sf2df(TMP) // get invalid, get DF qNaN
jumpr r31
}
END(__hexagon_divdf3)

View File

@ -0,0 +1,705 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
#define END(TAG) .size TAG,.-TAG
/* Double Precision Fused Multiply-Add */
#define A r1:0
#define AH r1
#define AL r0
#define B r3:2
#define BH r3
#define BL r2
#define C r5:4
#define CH r5
#define CL r4
#define BTMP r15:14
#define BTMPH r15
#define BTMPL r14
#define ATMP r13:12
#define ATMPH r13
#define ATMPL r12
#define CTMP r11:10
#define CTMPH r11
#define CTMPL r10
#define PP_LL r9:8
#define PP_LL_H r9
#define PP_LL_L r8
#define PP_ODD r7:6
#define PP_ODD_H r7
#define PP_ODD_L r6
#define PP_HH r17:16
#define PP_HH_H r17
#define PP_HH_L r16
#define EXPA r18
#define EXPB r19
#define EXPBA r19:18
#define TMP r28
#define P_TMP p0
#define PROD_NEG p3
#define EXACT p2
#define SWAP p1
#define MANTBITS 52
#define HI_MANTBITS 20
#define EXPBITS 11
#define BIAS 1023
#define STACKSPACE 32
#define ADJUST 4
#define FUDGE 7
#define FUDGE2 3
#ifndef SR_ROUND_OFF
#define SR_ROUND_OFF 22
#endif
/*
* First, classify for normal values, and abort if abnormal
*
* Next, unpack mantissa into 0x1000_0000_0000_0000 + mant<<8
*
* Since we know that the 2 MSBs of the H registers are zero, we should never get a
* carry out of the partial products that involve the H registers
*
* Try to buy X slots, at the expense of latency if needed
*
* We will have PP_HH with the upper bits of the product, PP_LL with the lower
* PP_HH can have a maximum of 0x03FF_FFFF_FFFF_FFFF or thereabouts
* PP_HH can have a minimum of 0x0100_0000_0000_0000
*
* 0x0100_0000_0000_0000 has EXP of EXPA+EXPB-BIAS
*
* We need to align CTMP.
* If CTMP >> PP, convert PP to 64 bit with sticky, align CTMP, and follow normal add
* If CTMP << PP align CTMP and add 128 bits. Then compute sticky
* If CTMP ~= PP, align CTMP and add 128 bits. May have massive cancellation.
*
* Convert partial product and CTMP to 2's complement prior to addition
*
* After we add, we need to normalize into upper 64 bits, then compute sticky.
*
*
*/
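/* For reference, the 128-bit product of the two unpacked mantissas is the usual
   schoolbook multiply built from four 32x32 partial products; a minimal C sketch
   (illustrative only; the helper name is a placeholder, and unlike the code below
   it keeps all carries rather than relying on the known-zero top bits):

       #include <stdint.h>
       typedef struct { uint64_t hi, lo; } u128;   // corresponds to PP_HH:PP_LL

       static u128 mul_64x64(uint64_t a, uint64_t b) {
           uint64_t al = (uint32_t)a, ah = a >> 32;
           uint64_t bl = (uint32_t)b, bh = b >> 32;
           uint64_t ll = al * bl, lh = al * bh, hl = ah * bl, hh = ah * bh;
           uint64_t mid = (ll >> 32) + (uint32_t)lh + (uint32_t)hl;  // cannot overflow
           u128 r;
           r.lo = (mid << 32) | (uint32_t)ll;
           r.hi = hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
           return r;
       }

   The code below then aligns CTMP against this product with a sticky bit, adds,
   renormalizes into the upper 64 bits, and converts with the current rounding mode. */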
.text
.global __hexagon_fmadf4
.type __hexagon_fmadf4,@function
.global __hexagon_fmadf5
.type __hexagon_fmadf5,@function
.global fma
.type fma,@function
Q6_ALIAS(fmadf5)
.p2align 5
__hexagon_fmadf4:
__hexagon_fmadf5:
fma:
{
P_TMP = dfclass(A,#2)
P_TMP = dfclass(B,#2)
ATMP = #0
BTMP = #0
}
{
ATMP = insert(A,#MANTBITS,#EXPBITS-3)
BTMP = insert(B,#MANTBITS,#EXPBITS-3)
PP_ODD_H = ##0x10000000
allocframe(#STACKSPACE)
}
{
PP_LL = mpyu(ATMPL,BTMPL)
if (!P_TMP) jump .Lfma_abnormal_ab
ATMPH = or(ATMPH,PP_ODD_H)
BTMPH = or(BTMPH,PP_ODD_H)
}
{
P_TMP = dfclass(C,#2)
if (!P_TMP.new) jump:nt .Lfma_abnormal_c
CTMP = combine(PP_ODD_H,#0)
PP_ODD = combine(#0,PP_LL_H)
}
.Lfma_abnormal_c_restart:
{
PP_ODD += mpyu(BTMPL,ATMPH)
CTMP = insert(C,#MANTBITS,#EXPBITS-3)
memd(r29+#0) = PP_HH
memd(r29+#8) = EXPBA
}
{
PP_ODD += mpyu(ATMPL,BTMPH)
EXPBA = neg(CTMP)
P_TMP = cmp.gt(CH,#-1)
TMP = xor(AH,BH)
}
{
EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
PP_HH = combine(#0,PP_ODD_H)
if (!P_TMP) CTMP = EXPBA
}
{
PP_HH += mpyu(ATMPH,BTMPH)
PP_LL = combine(PP_ODD_L,PP_LL_L)
#undef PP_ODD
#undef PP_ODD_H
#undef PP_ODD_L
#undef ATMP
#undef ATMPL
#undef ATMPH
#undef BTMP
#undef BTMPL
#undef BTMPH
#define RIGHTLEFTSHIFT r13:12
#define RIGHTSHIFT r13
#define LEFTSHIFT r12
EXPA = add(EXPA,EXPB)
#undef EXPB
#undef EXPBA
#define EXPC r19
#define EXPCA r19:18
EXPC = extractu(CH,#EXPBITS,#HI_MANTBITS)
}
/* PP_HH:PP_LL now has product */
/* CTMP is negated */
/* EXPA,B,C are extracted */
/*
* We need to negate PP
* Since we will be adding with carry later, if we need to negate,
* just invert all bits now, which we can do conditionally and in parallel
*/
#define PP_HH_TMP r15:14
#define PP_LL_TMP r7:6
{
EXPA = add(EXPA,#-BIAS+(ADJUST))
PROD_NEG = !cmp.gt(TMP,#-1)
PP_LL_TMP = #0
PP_HH_TMP = #0
}
{
PP_LL_TMP = sub(PP_LL_TMP,PP_LL,PROD_NEG):carry
P_TMP = !cmp.gt(TMP,#-1)
SWAP = cmp.gt(EXPC,EXPA) // If C >> PP
if (SWAP.new) EXPCA = combine(EXPA,EXPC)
}
{
PP_HH_TMP = sub(PP_HH_TMP,PP_HH,PROD_NEG):carry
if (P_TMP) PP_LL = PP_LL_TMP
#undef PP_LL_TMP
#define CTMP2 r7:6
#define CTMP2H r7
#define CTMP2L r6
CTMP2 = #0
EXPC = sub(EXPA,EXPC)
}
{
if (P_TMP) PP_HH = PP_HH_TMP
P_TMP = cmp.gt(EXPC,#63)
if (SWAP) PP_LL = CTMP2
if (SWAP) CTMP2 = PP_LL
}
#undef PP_HH_TMP
//#define ONE r15:14
//#define S_ONE r14
#define ZERO r15:14
#define S_ZERO r15
#undef PROD_NEG
#define P_CARRY p3
{
if (SWAP) PP_HH = CTMP // Swap C and PP
if (SWAP) CTMP = PP_HH
if (P_TMP) EXPC = add(EXPC,#-64)
TMP = #63
}
{
// If diff > 63, pre-shift-right by 64...
if (P_TMP) CTMP2 = CTMP
TMP = asr(CTMPH,#31)
RIGHTSHIFT = min(EXPC,TMP)
LEFTSHIFT = #0
}
#undef C
#undef CH
#undef CL
#define STICKIES r5:4
#define STICKIESH r5
#define STICKIESL r4
{
if (P_TMP) CTMP = combine(TMP,TMP) // sign extension of pre-shift-right-64
STICKIES = extract(CTMP2,RIGHTLEFTSHIFT)
CTMP2 = lsr(CTMP2,RIGHTSHIFT)
LEFTSHIFT = sub(#64,RIGHTSHIFT)
}
{
ZERO = #0
TMP = #-2
CTMP2 |= lsl(CTMP,LEFTSHIFT)
CTMP = asr(CTMP,RIGHTSHIFT)
}
{
P_CARRY = cmp.gtu(STICKIES,ZERO) // If we have sticky bits from C shift
if (P_CARRY.new) CTMP2L = and(CTMP2L,TMP) // make sure adding 1 == OR
#undef ZERO
#define ONE r15:14
#define S_ONE r14
ONE = #1
STICKIES = #0
}
{
PP_LL = add(CTMP2,PP_LL,P_CARRY):carry // use the carry to add the sticky
}
{
PP_HH = add(CTMP,PP_HH,P_CARRY):carry
TMP = #62
}
/*
* PP_HH:PP_LL now holds the sum
* We may need to normalize left, up to ??? bits.
*
* I think that if we have massive cancellation, the range we normalize by
* is still limited
*/
{
LEFTSHIFT = add(clb(PP_HH),#-2)
if (!cmp.eq(LEFTSHIFT.new,TMP)) jump:t 1f // all sign bits?
}
/* We had all sign bits, shift left by 62. */
{
CTMP = extractu(PP_LL,#62,#2)
PP_LL = asl(PP_LL,#62)
EXPA = add(EXPA,#-62) // And adjust exponent of result
}
{
PP_HH = insert(CTMP,#62,#0) // Then shift 63
}
{
LEFTSHIFT = add(clb(PP_HH),#-2)
}
.falign
1:
{
CTMP = asl(PP_HH,LEFTSHIFT)
STICKIES |= asl(PP_LL,LEFTSHIFT)
RIGHTSHIFT = sub(#64,LEFTSHIFT)
EXPA = sub(EXPA,LEFTSHIFT)
}
{
CTMP |= lsr(PP_LL,RIGHTSHIFT)
EXACT = cmp.gtu(ONE,STICKIES)
TMP = #BIAS+BIAS-2
}
{
if (!EXACT) CTMPL = or(CTMPL,S_ONE)
// If EXPA is overflow/underflow, jump to ovf_unf
P_TMP = !cmp.gt(EXPA,TMP)
P_TMP = cmp.gt(EXPA,#1)
if (!P_TMP.new) jump:nt .Lfma_ovf_unf
}
{
// XXX: FIXME: should PP_HH for check of zero be CTMP?
P_TMP = cmp.gtu(ONE,CTMP) // is result true zero?
A = convert_d2df(CTMP)
EXPA = add(EXPA,#-BIAS-60)
PP_HH = memd(r29+#0)
}
{
AH += asl(EXPA,#HI_MANTBITS)
EXPCA = memd(r29+#8)
if (!P_TMP) dealloc_return // not zero, return
}
.Ladd_yields_zero:
/* We had full cancellation. Return +/- zero (-0 when round-down) */
{
TMP = USR
A = #0
}
{
TMP = extractu(TMP,#2,#SR_ROUND_OFF)
PP_HH = memd(r29+#0)
EXPCA = memd(r29+#8)
}
{
p0 = cmp.eq(TMP,#2)
if (p0.new) AH = ##0x80000000
dealloc_return
}
#undef RIGHTLEFTSHIFT
#undef RIGHTSHIFT
#undef LEFTSHIFT
#undef CTMP2
#undef CTMP2H
#undef CTMP2L
.Lfma_ovf_unf:
{
p0 = cmp.gtu(ONE,CTMP)
if (p0.new) jump:nt .Ladd_yields_zero
}
{
A = convert_d2df(CTMP)
EXPA = add(EXPA,#-BIAS-60)
TMP = EXPA
}
#define NEW_EXPB r7
#define NEW_EXPA r6
{
AH += asl(EXPA,#HI_MANTBITS)
NEW_EXPB = extractu(AH,#EXPBITS,#HI_MANTBITS)
}
{
NEW_EXPA = add(EXPA,NEW_EXPB)
PP_HH = memd(r29+#0)
EXPCA = memd(r29+#8)
#undef PP_HH
#undef PP_HH_H
#undef PP_HH_L
#undef EXPCA
#undef EXPC
#undef EXPA
#undef PP_LL
#undef PP_LL_H
#undef PP_LL_L
#define EXPA r6
#define EXPB r7
#define EXPBA r7:6
#define ATMP r9:8
#define ATMPH r9
#define ATMPL r8
#undef NEW_EXPB
#undef NEW_EXPA
ATMP = abs(CTMP)
}
{
p0 = cmp.gt(EXPA,##BIAS+BIAS)
if (p0.new) jump:nt .Lfma_ovf
}
{
p0 = cmp.gt(EXPA,#0)
if (p0.new) jump:nt .Lpossible_unf
}
{
// TMP has original EXPA.
// ATMP is corresponding value
// Normalize ATMP and shift right to correct location
EXPB = add(clb(ATMP),#-2) // Amount to left shift to normalize
EXPA = sub(#1+5,TMP) // Amount to right shift to denormalize
p3 = cmp.gt(CTMPH,#-1)
}
/* Underflow */
/* We know that the infinite range exponent should be EXPA */
/* CTMP is 2's complement, ATMP is abs(CTMP) */
{
EXPA = add(EXPA,EXPB) // how much to shift back right
ATMP = asl(ATMP,EXPB) // shift left
AH = USR
TMP = #63
}
{
EXPB = min(EXPA,TMP)
EXPA = #0
AL = #0x0030
}
{
B = extractu(ATMP,EXPBA)
ATMP = asr(ATMP,EXPB)
}
{
p0 = cmp.gtu(ONE,B)
if (!p0.new) ATMPL = or(ATMPL,S_ONE)
ATMPH = setbit(ATMPH,#HI_MANTBITS+FUDGE2)
}
{
CTMP = neg(ATMP)
p1 = bitsclr(ATMPL,#(1<<FUDGE2)-1)
if (!p1.new) AH = or(AH,AL)
B = #0
}
{
if (p3) CTMP = ATMP
USR = AH
TMP = #-BIAS-(MANTBITS+FUDGE2)
}
{
A = convert_d2df(CTMP)
}
{
AH += asl(TMP,#HI_MANTBITS)
dealloc_return
}
.Lpossible_unf:
{
TMP = ##0x7fefffff
ATMP = abs(CTMP)
}
{
p0 = cmp.eq(AL,#0)
p0 = bitsclr(AH,TMP)
if (!p0.new) dealloc_return:t
TMP = #0x7fff
}
{
p0 = bitsset(ATMPH,TMP)
BH = USR
BL = #0x0030
}
{
if (p0) BH = or(BH,BL)
}
{
USR = BH
}
{
p0 = dfcmp.eq(A,A)
dealloc_return
}
.Lfma_ovf:
{
TMP = USR
CTMP = combine(##0x7fefffff,#-1)
A = CTMP
}
{
ATMP = combine(##0x7ff00000,#0)
BH = extractu(TMP,#2,#SR_ROUND_OFF)
TMP = or(TMP,#0x28)
}
{
USR = TMP
BH ^= lsr(AH,#31)
BL = BH
}
{
p0 = !cmp.eq(BL,#1)
p0 = !cmp.eq(BH,#2)
}
{
p0 = dfcmp.eq(ATMP,ATMP)
if (p0.new) CTMP = ATMP
}
{
A = insert(CTMP,#63,#0)
dealloc_return
}
#undef CTMP
#undef CTMPH
#undef CTMPL
#define BTMP r11:10
#define BTMPH r11
#define BTMPL r10
#undef STICKIES
#undef STICKIESH
#undef STICKIESL
#define C r5:4
#define CH r5
#define CL r4
.Lfma_abnormal_ab:
{
ATMP = extractu(A,#63,#0)
BTMP = extractu(B,#63,#0)
deallocframe
}
{
p3 = cmp.gtu(ATMP,BTMP)
if (!p3.new) A = B // sort values
if (!p3.new) B = A
}
{
p0 = dfclass(A,#0x0f) // A NaN?
if (!p0.new) jump:nt .Lnan
if (!p3) ATMP = BTMP
if (!p3) BTMP = ATMP
}
{
p1 = dfclass(A,#0x08) // A is infinity
p1 = dfclass(B,#0x0e) // B is nonzero
}
{
p0 = dfclass(A,#0x08) // a is inf
p0 = dfclass(B,#0x01) // b is zero
}
{
if (p1) jump .Lab_inf
p2 = dfclass(B,#0x01)
}
{
if (p0) jump .Linvalid
if (p2) jump .Lab_true_zero
TMP = ##0x7c000000
}
// We are left with a normal or subnormal times a subnormal, A > B
// If A and B are both very small, we will go to a single sticky bit; replace
// A and B lower 63 bits with 0x0010_0000_0000_0000, which yields equivalent results.
// If A and B might multiply to something bigger, decrease A exp and increase B exp
// and start over
{
p0 = bitsclr(AH,TMP)
if (p0.new) jump:nt .Lfma_ab_tiny
}
{
TMP = add(clb(BTMP),#-EXPBITS)
}
{
BTMP = asl(BTMP,TMP)
}
{
B = insert(BTMP,#63,#0)
AH -= asl(TMP,#HI_MANTBITS)
}
jump fma
.Lfma_ab_tiny:
ATMP = combine(##0x00100000,#0)
{
A = insert(ATMP,#63,#0)
B = insert(ATMP,#63,#0)
}
jump fma
.Lab_inf:
{
B = lsr(B,#63)
p0 = dfclass(C,#0x10)
}
{
A ^= asl(B,#63)
if (p0) jump .Lnan
}
{
p1 = dfclass(C,#0x08)
if (p1.new) jump:nt .Lfma_inf_plus_inf
}
/* A*B is +/- inf, C is finite. Return A */
{
jumpr r31
}
.falign
.Lfma_inf_plus_inf:
{ // adding infinities of different signs is invalid
p0 = dfcmp.eq(A,C)
if (!p0.new) jump:nt .Linvalid
}
{
jumpr r31
}
.Lnan:
{
p0 = dfclass(B,#0x10)
p1 = dfclass(C,#0x10)
if (!p0.new) B = A
if (!p1.new) C = A
}
{ // find sNaNs
BH = convert_df2sf(B)
BL = convert_df2sf(C)
}
{
BH = convert_df2sf(A)
A = #-1
jumpr r31
}
.Linvalid:
{
TMP = ##0x7f800001 // sp snan
}
{
A = convert_sf2df(TMP)
jumpr r31
}
.Lab_true_zero:
// B is zero, A is finite number
{
p0 = dfclass(C,#0x10)
if (p0.new) jump:nt .Lnan
if (p0.new) A = C
}
{
p0 = dfcmp.eq(B,C) // is C also zero?
AH = lsr(AH,#31) // get sign
}
{
BH ^= asl(AH,#31) // form correctly signed zero in B
if (!p0) A = C // If C is not zero, return C
if (!p0) jumpr r31
}
/* B has correctly signed zero, C is also zero */
.Lzero_plus_zero:
{
p0 = cmp.eq(B,C) // yes, scalar equals. +0++0 or -0+-0
if (p0.new) jumpr:t r31
A = B
}
{
TMP = USR
}
{
TMP = extractu(TMP,#2,#SR_ROUND_OFF)
A = #0
}
{
p0 = cmp.eq(TMP,#2)
if (p0.new) AH = ##0x80000000
jumpr r31
}
#undef BTMP
#undef BTMPH
#undef BTMPL
#define CTMP r11:10
.falign
.Lfma_abnormal_c:
/* We know that AB is normal * normal */
/* C is not normal: zero, subnormal, inf, or NaN. */
{
p0 = dfclass(C,#0x10) // is C NaN?
if (p0.new) jump:nt .Lnan
if (p0.new) A = C // move NaN to A
deallocframe
}
{
p0 = dfclass(C,#0x08) // is C inf?
if (p0.new) A = C // return C
if (p0.new) jumpr:nt r31
}
// zero or subnormal
// If we have a zero, and we know AB is normal*normal, we can just call normal multiply
{
p0 = dfclass(C,#0x01) // is C zero?
if (p0.new) jump:nt __hexagon_muldf3
TMP = #1
}
// Left with: subnormal
// Adjust C and jump back to restart
{
allocframe(#STACKSPACE) // oops, deallocated above, re-allocate frame
CTMP = #0
CH = insert(TMP,#EXPBITS,#HI_MANTBITS)
jump .Lfma_abnormal_c_restart
}
END(fma)

View File

@ -0,0 +1,79 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#define A r1:0
#define B r3:2
#define ATMP r5:4
#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
#define END(TAG) .size TAG,.-TAG
/*
* Min and Max return A if B is NaN, or B if A is NaN
* Otherwise, they return the smaller or bigger value
*
* If values are equal, we want to favor -0.0 for min and +0.0 for max.
*/
/*
* Compares always return false for NaN
* if (isnan(A)) A = B; if (A > B) A = B: at most one of these assignments will trigger.
*/
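/* A minimal C sketch of the same selection rules (illustrative only; it uses the
   standard isnan() in place of dfclass, and 'ref_fmin' is a placeholder name):

       #include <math.h>
       #include <stdint.h>
       #include <string.h>

       static double ref_fmin(double a, double b) {
           if (isnan(a)) a = b;            // NaN input: take the other operand
           if (a > b)    a = b;            // gt is false if either operand is NaN
           if (a == b) {                   // equal, e.g. +0.0 vs -0.0:
               uint64_t ua, ub;            // OR the bit patterns so -0.0 wins
               memcpy(&ua, &a, sizeof a);
               memcpy(&ub, &b, sizeof b);
               ua |= ub;
               memcpy(&a, &ua, sizeof a);
           }
           return a;                       // fmax is the same with the compare
       }                                   // reversed and AND instead of OR
*/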
.text
.global __hexagon_mindf3
.global __hexagon_maxdf3
.global fmin
.type fmin,@function
.global fmax
.type fmax,@function
.type __hexagon_mindf3,@function
.type __hexagon_maxdf3,@function
Q6_ALIAS(mindf3)
Q6_ALIAS(maxdf3)
.p2align 5
__hexagon_mindf3:
fmin:
{
p0 = dfclass(A,#0x10) // Is A a NaN?
p1 = dfcmp.gt(A,B) // Is A > B? Then take B instead
ATMP = A
}
{
if (p0) A = B // if A is NaN use B
if (p1) A = B // gt is always false if either is NaN
p2 = dfcmp.eq(A,B) // if A == B
if (!p2.new) jumpr:t r31
}
/* A == B, return A|B to select -0.0 over 0.0 */
{
A = or(ATMP,B)
jumpr r31
}
END(__hexagon_mindf3)
.falign
__hexagon_maxdf3:
fmax:
{
p0 = dfclass(A,#0x10)
p1 = dfcmp.gt(B,A)
ATMP = A
}
{
if (p0) A = B
if (p1) A = B
p2 = dfcmp.eq(A,B)
if (!p2.new) jumpr:t r31
}
/* A == B, return A&B to select 0.0 over -0.0 */
{
A = and(ATMP,B)
jumpr r31
}
END(__hexagon_maxdf3)

View File

@ -0,0 +1,418 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/* Double Precision Multiply */
#define A r1:0
#define AH r1
#define AL r0
#define B r3:2
#define BH r3
#define BL r2
#define BTMP r5:4
#define BTMPH r5
#define BTMPL r4
#define PP_ODD r7:6
#define PP_ODD_H r7
#define PP_ODD_L r6
#define ONE r9:8
#define S_ONE r8
#define S_ZERO r9
#define PP_HH r11:10
#define PP_HH_H r11
#define PP_HH_L r10
#define ATMP r13:12
#define ATMPH r13
#define ATMPL r12
#define PP_LL r15:14
#define PP_LL_H r15
#define PP_LL_L r14
#define TMP r28
#define MANTBITS 52
#define HI_MANTBITS 20
#define EXPBITS 11
#define BIAS 1024
#define MANTISSA_TO_INT_BIAS 52
/* Constant to adjust the normalization amount in the error-handling code */
/* Amount to right shift the partial product to get to a denorm */
#define FUDGE 5
#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
#define END(TAG) .size TAG,.-TAG
#define SR_ROUND_OFF 22
.text
.global __hexagon_muldf3
.type __hexagon_muldf3,@function
Q6_ALIAS(muldf3)
FAST_ALIAS(muldf3)
FAST2_ALIAS(muldf3)
.p2align 5
__hexagon_muldf3:
{
p0 = dfclass(A,#2)
p0 = dfclass(B,#2)
ATMP = combine(##0x40000000,#0)
}
{
ATMP = insert(A,#MANTBITS,#EXPBITS-1)
BTMP = asl(B,#EXPBITS-1)
TMP = #-BIAS
ONE = #1
}
{
PP_ODD = mpyu(BTMPL,ATMPH)
BTMP = insert(ONE,#2,#62)
}
/* since we know that the MSB of the H registers is zero, we should never carry */
/* H <= 2^31-1. L <= 2^32-1. Therefore, HL <= 2^63-2^32-2^31+1 */
/* Adding 2 HLs, we get 2^64-3*2^32+2 maximum. */
/* Therefore, we can add 3 2^32-1 values safely without carry. We only need one. */
{
PP_LL = mpyu(ATMPL,BTMPL)
PP_ODD += mpyu(ATMPL,BTMPH)
}
{
PP_ODD += lsr(PP_LL,#32)
PP_HH = mpyu(ATMPH,BTMPH)
BTMP = combine(##BIAS+BIAS-4,#0)
}
{
PP_HH += lsr(PP_ODD,#32)
if (!p0) jump .Lmul_abnormal
p1 = cmp.eq(PP_LL_L,#0) // 64 lsb's 0?
p1 = cmp.eq(PP_ODD_L,#0) // 64 lsb's 0?
}
/*
* PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
* PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
*/
#undef PP_ODD
#undef PP_ODD_H
#undef PP_ODD_L
#define EXP10 r7:6
#define EXP1 r7
#define EXP0 r6
{
if (!p1) PP_HH_L = or(PP_HH_L,S_ONE)
EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS)
EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS)
}
{
PP_LL = neg(PP_HH)
EXP0 += add(TMP,EXP1)
TMP = xor(AH,BH)
}
{
if (!p2.new) PP_HH = PP_LL
p2 = cmp.gt(TMP,#-1)
p0 = !cmp.gt(EXP0,BTMPH)
p0 = cmp.gt(EXP0,BTMPL)
if (!p0.new) jump:nt .Lmul_ovf_unf
}
{
A = convert_d2df(PP_HH)
EXP0 = add(EXP0,#-BIAS-58)
}
{
AH += asl(EXP0,#HI_MANTBITS)
jumpr r31
}
.falign
.Lpossible_unf:
/* We end up with a positive exponent */
/* But we may have rounded up to an exponent of 1. */
/* If the exponent is 1, if we rounded up to it
* we need to also raise underflow
* Fortunately, this is pretty easy to detect, we must have +/- 0x0010_0000_0000_0000
* And the PP should also have more than one bit set
*/
/* Note: ATMP should have abs(PP_HH) */
/* Note: BTMPL should have 0x7FEFFFFF */
{
p0 = cmp.eq(AL,#0)
p0 = bitsclr(AH,BTMPL)
if (!p0.new) jumpr:t r31
BTMPH = #0x7fff
}
{
p0 = bitsset(ATMPH,BTMPH)
BTMPL = USR
BTMPH = #0x030
}
{
if (p0) BTMPL = or(BTMPL,BTMPH)
}
{
USR = BTMPL
}
{
p0 = dfcmp.eq(A,A)
jumpr r31
}
.falign
.Lmul_ovf_unf:
{
A = convert_d2df(PP_HH)
ATMP = abs(PP_HH) // take absolute value
EXP1 = add(EXP0,#-BIAS-58)
}
{
AH += asl(EXP1,#HI_MANTBITS)
EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS)
BTMPL = ##0x7FEFFFFF
}
{
EXP1 += add(EXP0,##-BIAS-58)
//BTMPH = add(clb(ATMP),#-2)
BTMPH = #0
}
{
p0 = cmp.gt(EXP1,##BIAS+BIAS-2) // overflow
if (p0.new) jump:nt .Lmul_ovf
}
{
p0 = cmp.gt(EXP1,#0)
if (p0.new) jump:nt .Lpossible_unf
BTMPH = sub(EXP0,BTMPH)
TMP = #63 // max amount to shift
}
/* Underflow */
/*
* PP_HH has the partial product with sticky LSB.
* PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
* PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
* The exponent of PP_HH is in EXP1, which is non-positive (0 or negative)
* That's the exponent that happens after the normalization
*
* EXP0 has the exponent that, when added to the normalized value, is out of range.
*
* Strategy:
*
* * Shift down bits, with sticky bit, such that the bits are aligned according
* to the LZ count and appropriate exponent, but not all the way to mantissa
* field, keep around the last few bits.
* * Put a 1 near the MSB
* * Check the LSBs for inexact; if inexact also set underflow
* * Convert [u]d2df -- will correctly round according to rounding mode
* * Replace exponent field with zero
*
*
*/
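/* Sketched in C (illustrative only; 'prod' stands for the PP_HH value above,
   'rshift' is already capped at 63 by the caller, and the helper name is a
   placeholder that does not exist in the library):

       #include <stdint.h>
       // Shift right with a sticky bit, plant a marker bit near the MSB so the
       // convert instruction rounds at the right place, and record the flags.
       static uint64_t squash_to_denormal(uint64_t prod, int rshift, int *usr) {
           uint64_t lost = prod & ((1ULL << rshift) - 1);
           prod >>= rshift;
           if (lost) prod |= 1;                  // sticky bit
           prod |= 1ULL << (32 + 20 + 3);        // bit HI_MANTBITS+3 of the high word
           if (prod & 0x7)                       // low guard bits nonzero?
               *usr |= 0x30;                     // inexact + underflow
           return prod;                          // caller converts, then zeroes the
       }                                         // exponent field
*/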
{
BTMPL = #0 // offset for extract
BTMPH = sub(#FUDGE,BTMPH) // amount to right shift
}
{
p3 = cmp.gt(PP_HH_H,#-1) // is it positive?
BTMPH = min(BTMPH,TMP) // Don't shift more than 63
PP_HH = ATMP
}
{
TMP = USR
PP_LL = extractu(PP_HH,BTMP)
}
{
PP_HH = asr(PP_HH,BTMPH)
BTMPL = #0x0030 // underflow flag
AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS)
}
{
p0 = cmp.gtu(ONE,PP_LL) // Did we extract all zeros?
if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE) // add sticky bit
PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3) // Add back in a bit so we can use convert instruction
}
{
PP_LL = neg(PP_HH)
p1 = bitsclr(PP_HH_L,#0x7) // Are the LSB's clear?
if (!p1.new) TMP = or(BTMPL,TMP) // If not, Inexact+Underflow
}
{
if (!p3) PP_HH = PP_LL
USR = TMP
}
{
A = convert_d2df(PP_HH) // Do rounding
p0 = dfcmp.eq(A,A) // realize exception
}
{
AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1) // Insert correct exponent
jumpr r31
}
.falign
.Lmul_ovf:
// We get either max finite value or infinity. Either way, overflow+inexact
{
TMP = USR
ATMP = combine(##0x7fefffff,#-1) // positive max finite
A = PP_HH
}
{
PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF) // rounding bits
TMP = or(TMP,#0x28) // inexact + overflow
BTMP = combine(##0x7ff00000,#0) // positive infinity
}
{
USR = TMP
PP_LL_L ^= lsr(AH,#31) // Does sign match rounding?
TMP = PP_LL_L // unmodified rounding mode
}
{
p0 = !cmp.eq(TMP,#1) // If not round-to-zero and
p0 = !cmp.eq(PP_LL_L,#2) // Not rounding the other way,
if (p0.new) ATMP = BTMP // we should get infinity
p0 = dfcmp.eq(A,A) // Realize FP exception if enabled
}
{
A = insert(ATMP,#63,#0) // insert inf/maxfinite, leave sign
jumpr r31
}
.Lmul_abnormal:
{
ATMP = extractu(A,#63,#0) // strip off sign
BTMP = extractu(B,#63,#0) // strip off sign
}
{
p3 = cmp.gtu(ATMP,BTMP)
if (!p3.new) A = B // sort values
if (!p3.new) B = A // sort values
}
{
// Any NaN --> NaN, possibly raise invalid if sNaN
p0 = dfclass(A,#0x0f) // A not NaN?
if (!p0.new) jump:nt .Linvalid_nan
if (!p3) ATMP = BTMP
if (!p3) BTMP = ATMP
}
{
// Infinity * nonzero number is infinity
p1 = dfclass(A,#0x08) // A is infinity
p1 = dfclass(B,#0x0e) // B is nonzero
}
{
// Infinity * zero --> NaN, raise invalid
// Other zeros return zero
p0 = dfclass(A,#0x08) // A is infinity
p0 = dfclass(B,#0x01) // B is zero
}
{
if (p1) jump .Ltrue_inf
p2 = dfclass(B,#0x01)
}
{
if (p0) jump .Linvalid_zeroinf
if (p2) jump .Ltrue_zero // so return zero
TMP = ##0x7c000000
}
// We are left with a normal or subnormal times a subnormal. A > B
// If A and B are both very small (exp(a) < BIAS-MANTBITS),
// we go to a single sticky bit, which we can round easily.
// If A and B might multiply to something bigger, decrease A exponent and increase
// B exponent and try again
{
p0 = bitsclr(AH,TMP)
if (p0.new) jump:nt .Lmul_tiny
}
{
TMP = cl0(BTMP)
}
{
TMP = add(TMP,#-EXPBITS)
}
{
BTMP = asl(BTMP,TMP)
}
{
B = insert(BTMP,#63,#0)
AH -= asl(TMP,#HI_MANTBITS)
}
jump __hexagon_muldf3
.Lmul_tiny:
{
TMP = USR
A = xor(A,B) // get sign bit
}
{
TMP = or(TMP,#0x30) // Inexact + Underflow
A = insert(ONE,#63,#0) // put in rounded up value
BTMPH = extractu(TMP,#2,#SR_ROUND_OFF) // get rounding mode
}
{
USR = TMP
p0 = cmp.gt(BTMPH,#1) // Round towards pos/neg inf?
if (!p0.new) AL = #0 // If not, zero
BTMPH ^= lsr(AH,#31) // rounding my way --> set LSB
}
{
p0 = cmp.eq(BTMPH,#3) // if rounding towards right inf
if (!p0.new) AL = #0 // don't go to zero
jumpr r31
}
.Linvalid_zeroinf:
{
TMP = USR
}
{
A = #-1
TMP = or(TMP,#2)
}
{
USR = TMP
}
{
p0 = dfcmp.uo(A,A) // force exception if enabled
jumpr r31
}
.Linvalid_nan:
{
p0 = dfclass(B,#0x0f) // if B is not NaN
TMP = convert_df2sf(A) // will generate invalid if sNaN
if (p0.new) B = A // make it whatever A is
}
{
BL = convert_df2sf(B) // will generate invalid if sNaN
A = #-1
jumpr r31
}
.falign
.Ltrue_zero:
{
A = B
B = A
}
.Ltrue_inf:
{
BH = extract(BH,#1,#31)
}
{
AH ^= asl(BH,#31)
jumpr r31
}
END(__hexagon_muldf3)
#undef ATMP
#undef ATMPL
#undef ATMPH
#undef BTMP
#undef BTMPL
#undef BTMPH

View File

@ -0,0 +1,406 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/* Double Precision Square Root */
#define EXP r28
#define A r1:0
#define AH r1
#define AL r0
#define SFSH r3:2
#define SF_S r3
#define SF_H r2
#define SFHALF_SONE r5:4
#define S_ONE r4
#define SFHALF r5
#define SF_D r6
#define SF_E r7
#define RECIPEST r8
#define SFRAD r9
#define FRACRAD r11:10
#define FRACRADH r11
#define FRACRADL r10
#define ROOT r13:12
#define ROOTHI r13
#define ROOTLO r12
#define PROD r15:14
#define PRODHI r15
#define PRODLO r14
#define P_TMP p0
#define P_EXP1 p1
#define NORMAL p2
#define SF_EXPBITS 8
#define SF_MANTBITS 23
#define DF_EXPBITS 11
#define DF_MANTBITS 52
#define DF_BIAS 0x3ff
#define DFCLASS_ZERO 0x01
#define DFCLASS_NORMAL 0x02
#define DFCLASS_DENORMAL 0x02
#define DFCLASS_INFINITE 0x08
#define DFCLASS_NAN 0x10
#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG; .type __qdsp_##TAG,@function
#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG; .type __hexagon_fast_##TAG,@function
#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG; .type __hexagon_fast2_##TAG,@function
#define END(TAG) .size TAG,.-TAG
.text
.global __hexagon_sqrtdf2
.type __hexagon_sqrtdf2,@function
.global __hexagon_sqrt
.type __hexagon_sqrt,@function
Q6_ALIAS(sqrtdf2)
Q6_ALIAS(sqrt)
FAST_ALIAS(sqrtdf2)
FAST_ALIAS(sqrt)
FAST2_ALIAS(sqrtdf2)
FAST2_ALIAS(sqrt)
.type sqrt,@function
.p2align 5
__hexagon_sqrtdf2:
__hexagon_sqrt:
{
PROD = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS)
EXP = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32)
SFHALF_SONE = combine(##0x3f000004,#1)
}
{
NORMAL = dfclass(A,#DFCLASS_NORMAL) // Is it normal
NORMAL = cmp.gt(AH,#-1) // and positive?
if (!NORMAL.new) jump:nt .Lsqrt_abnormal
SFRAD = or(SFHALF,PRODLO)
}
#undef NORMAL
.Ldenormal_restart:
{
FRACRAD = A
SF_E,P_TMP = sfinvsqrta(SFRAD)
SFHALF = and(SFHALF,#-16)
SFSH = #0
}
#undef A
#undef AH
#undef AL
#define ERROR r1:0
#define ERRORHI r1
#define ERRORLO r0
// SF_E : reciprocal square root
// SF_H : half rsqrt
// SF_S : square root
// SF_D : error term
// SFHALF: 0.5
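// Sketched in C, the single-precision refinement below is the usual Newton-Raphson
// step on the reciprocal square root (illustrative only; 'rsqrt_est' stands in for
// the hardware sfinvsqrta estimate):
//
//     float e = rsqrt_est(x);          // initial ~1/sqrt(x) estimate
//     float s = e * x;                 // s0: root estimate
//     float h = 0.5f * e;              // h0: half of the reciprocal root
//     float d = 0.5f - s * h;          // d0: error term
//     s = s + s * d;                   // s1: refined root
//     h = h + h * d;                   // h1: refined half-reciprocal
//     d = 0.5f - s * h;                // d1: new error term
//     h = h + h * d;                   // h2: only h is refined again, since the
//                                      //     fixed-point phase below needs only h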
{
SF_S += sfmpy(SF_E,SFRAD):lib // s0: root
SF_H += sfmpy(SF_E,SFHALF):lib // h0: 0.5*y0. Could also decrement exponent...
SF_D = SFHALF
#undef SFRAD
#define SHIFTAMT r9
SHIFTAMT = and(EXP,#1)
}
{
SF_D -= sfmpy(SF_S,SF_H):lib // d0: 0.5-H*S = 0.5-0.5*~1
FRACRADH = insert(S_ONE,#DF_EXPBITS+1,#DF_MANTBITS-32) // replace upper bits with hidden
P_EXP1 = cmp.gtu(SHIFTAMT,#0)
}
{
SF_S += sfmpy(SF_S,SF_D):lib // s1: refine sqrt
SF_H += sfmpy(SF_H,SF_D):lib // h1: refine half-recip
SF_D = SFHALF
SHIFTAMT = mux(P_EXP1,#8,#9)
}
{
SF_D -= sfmpy(SF_S,SF_H):lib // d1: error term
FRACRAD = asl(FRACRAD,SHIFTAMT) // Move fracrad bits to right place
SHIFTAMT = mux(P_EXP1,#3,#2)
}
{
SF_H += sfmpy(SF_H,SF_D):lib // d2: rsqrt
// cool trick: half of 1/sqrt(x) has same mantissa as 1/sqrt(x).
PROD = asl(FRACRAD,SHIFTAMT) // fracrad<<(2+exp1)
}
{
SF_H = and(SF_H,##0x007fffff)
}
{
SF_H = add(SF_H,##0x00800000 - 3)
SHIFTAMT = mux(P_EXP1,#7,#8)
}
{
RECIPEST = asl(SF_H,SHIFTAMT)
SHIFTAMT = mux(P_EXP1,#15-(1+1),#15-(1+0))
}
{
ROOT = mpyu(RECIPEST,PRODHI) // root = mpyu_full(recipest,hi(fracrad<<(2+exp1)))
}
#undef SFSH // r3:2
#undef SF_H // r2
#undef SF_S // r3
#undef S_ONE // r4
#undef SFHALF // r5
#undef SFHALF_SONE // r5:4
#undef SF_D // r6
#undef SF_E // r7
#define HL r3:2
#define LL r5:4
#define HH r7:6
#undef P_EXP1
#define P_CARRY0 p1
#define P_CARRY1 p2
#define P_CARRY2 p3
/* Iteration 0 */
/* Maybe we can save a cycle by starting with ERROR=asl(fracrad), then as we multiply */
/* We can shift and subtract instead of shift and add? */
{
ERROR = asl(FRACRAD,#15)
PROD = mpyu(ROOTHI,ROOTHI)
P_CARRY0 = cmp.eq(r0,r0)
}
{
ERROR -= asl(PROD,#15)
PROD = mpyu(ROOTHI,ROOTLO)
P_CARRY1 = cmp.eq(r0,r0)
}
{
ERROR -= lsr(PROD,#16)
P_CARRY2 = cmp.eq(r0,r0)
}
{
ERROR = mpyu(ERRORHI,RECIPEST)
}
{
ROOT += lsr(ERROR,SHIFTAMT)
SHIFTAMT = add(SHIFTAMT,#16)
ERROR = asl(FRACRAD,#31) // for next iter
}
/* Iteration 1 */
{
PROD = mpyu(ROOTHI,ROOTHI)
ERROR -= mpyu(ROOTHI,ROOTLO) // amount is 31, no shift needed
}
{
ERROR -= asl(PROD,#31)
PROD = mpyu(ROOTLO,ROOTLO)
}
{
ERROR -= lsr(PROD,#33)
}
{
ERROR = mpyu(ERRORHI,RECIPEST)
}
{
ROOT += lsr(ERROR,SHIFTAMT)
SHIFTAMT = add(SHIFTAMT,#16)
ERROR = asl(FRACRAD,#47) // for next iter
}
/* Iteration 2 */
{
PROD = mpyu(ROOTHI,ROOTHI)
}
{
ERROR -= asl(PROD,#47)
PROD = mpyu(ROOTHI,ROOTLO)
}
{
ERROR -= asl(PROD,#16) // bidir shr 31-47
PROD = mpyu(ROOTLO,ROOTLO)
}
{
ERROR -= lsr(PROD,#17) // 64-47
}
{
ERROR = mpyu(ERRORHI,RECIPEST)
}
{
ROOT += lsr(ERROR,SHIFTAMT)
}
#undef ERROR
#undef PROD
#undef PRODHI
#undef PRODLO
#define REM_HI r15:14
#define REM_HI_HI r15
#define REM_LO r1:0
#undef RECIPEST
#undef SHIFTAMT
#define TWOROOT_LO r9:8
/* Adjust Root */
{
HL = mpyu(ROOTHI,ROOTLO)
LL = mpyu(ROOTLO,ROOTLO)
REM_HI = #0
REM_LO = #0
}
{
HL += lsr(LL,#33)
LL += asl(HL,#33)
P_CARRY0 = cmp.eq(r0,r0)
}
{
HH = mpyu(ROOTHI,ROOTHI)
REM_LO = sub(REM_LO,LL,P_CARRY0):carry
TWOROOT_LO = #1
}
{
HH += lsr(HL,#31)
TWOROOT_LO += asl(ROOT,#1)
}
#undef HL
#undef LL
#define REM_HI_TMP r3:2
#define REM_HI_TMP_HI r3
#define REM_LO_TMP r5:4
{
REM_HI = sub(FRACRAD,HH,P_CARRY0):carry
REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY1):carry
#undef FRACRAD
#undef HH
#define ZERO r11:10
#define ONE r7:6
ONE = #1
ZERO = #0
}
{
REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY1):carry
ONE = add(ROOT,ONE)
EXP = add(EXP,#-DF_BIAS) // subtract bias --> signed exp
}
{
// If carry set, no borrow: result was still positive
if (P_CARRY1) ROOT = ONE
if (P_CARRY1) REM_LO = REM_LO_TMP
if (P_CARRY1) REM_HI = REM_HI_TMP
}
{
REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY2):carry
ONE = #1
EXP = asr(EXP,#1) // divide signed exp by 2
}
{
REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY2):carry
ONE = add(ROOT,ONE)
}
{
if (P_CARRY2) ROOT = ONE
if (P_CARRY2) REM_LO = REM_LO_TMP
// since tworoot <= 2^32, remhi must be zero
#undef REM_HI_TMP
#undef REM_HI_TMP_HI
#define S_ONE r2
#define ADJ r3
S_ONE = #1
}
{
P_TMP = cmp.eq(REM_LO,ZERO) // is the low part zero
if (!P_TMP.new) ROOTLO = or(ROOTLO,S_ONE) // if so, it's exact... hopefully
ADJ = cl0(ROOT)
EXP = add(EXP,#-63)
}
#undef REM_LO
#define RET r1:0
#define RETHI r1
{
RET = convert_ud2df(ROOT) // set up mantissa, maybe set inexact flag
EXP = add(EXP,ADJ) // add back bias
}
{
RETHI += asl(EXP,#DF_MANTBITS-32) // add exponent adjust
jumpr r31
}
#undef REM_LO_TMP
#undef REM_HI_TMP
#undef REM_HI_TMP_HI
#undef REM_LO
#undef REM_HI
#undef TWOROOT_LO
#undef RET
#define A r1:0
#define AH r1
#define AL r0
#undef S_ONE
#define TMP r3:2
#define TMPHI r3
#define TMPLO r2
#undef P_CARRY0
#define P_NEG p1
#define SFHALF r5
#define SFRAD r9
.Lsqrt_abnormal:
{
P_TMP = dfclass(A,#DFCLASS_ZERO) // zero?
if (P_TMP.new) jumpr:t r31
}
{
P_TMP = dfclass(A,#DFCLASS_NAN)
if (P_TMP.new) jump:nt .Lsqrt_nan
}
{
P_TMP = cmp.gt(AH,#-1)
if (!P_TMP.new) jump:nt .Lsqrt_invalid_neg
if (!P_TMP.new) EXP = ##0x7F800001 // sNaN
}
{
P_TMP = dfclass(A,#DFCLASS_INFINITE)
if (P_TMP.new) jumpr:nt r31
}
// If we got here, we're denormal
// prepare to restart
{
A = extractu(A,#DF_MANTBITS,#0) // Extract mantissa
}
{
EXP = add(clb(A),#-DF_EXPBITS) // how much to normalize?
}
{
A = asl(A,EXP) // Shift mantissa
EXP = sub(#1,EXP) // Form exponent
}
{
AH = insert(EXP,#1,#DF_MANTBITS-32) // insert lsb of exponent
}
{
TMP = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) // get sf value (mant+exp1)
SFHALF = ##0x3f000004 // form half constant
}
{
SFRAD = or(SFHALF,TMPLO) // form sf value
SFHALF = and(SFHALF,#-16)
jump .Ldenormal_restart // restart
}
.Lsqrt_nan:
{
EXP = convert_df2sf(A) // if sNaN, get invalid
A = #-1 // qNaN
jumpr r31
}
.Lsqrt_invalid_neg:
{
A = convert_sf2df(EXP) // Invalid,NaNval
jumpr r31
}
END(__hexagon_sqrt)
END(__hexagon_sqrtdf2)

View File

@ -0,0 +1,85 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
.macro FUNCTION_BEGIN name
.text
.p2align 5
.globl \name
.type \name, @function
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
FUNCTION_BEGIN __hexagon_divdi3
{
p2 = tstbit(r1,#31)
p3 = tstbit(r3,#31)
}
{
r1:0 = abs(r1:0)
r3:2 = abs(r3:2)
}
{
r6 = cl0(r1:0) // count leading 0's of dividend (numerator)
r7 = cl0(r3:2) // count leading 0's of divisor (denominator)
r5:4 = r3:2 // divisor moved into working registers
r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder
}
{
p3 = xor(p2,p3)
r10 = sub(r7,r6) // left shift count for bit & divisor
r1:0 = #0 // initialize quotient to 0
r15:14 = #1 // initialize bit to 1
}
{
r11 = add(r10,#1) // loop count is 1 more than shift count
r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb
r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor
}
{
p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend
loop0(1f,r11) // register loop
}
{
if (p0) jump .hexagon_divdi3_return // if divisor > dividend, we're done, so return
}
.falign
1:
{
p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder
}
{
r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder
r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8)
}
{
r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8)
r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6)
}
{
r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration
r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration
}:endloop0
.hexagon_divdi3_return:
{
r3:2 = neg(r1:0)
}
{
r1:0 = vmux(p3,r3:2,r1:0)
jumpr r31
}
FUNCTION_END __hexagon_divdi3
.globl __qdsp_divdi3
.set __qdsp_divdi3, __hexagon_divdi3
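/* For reference, a C version of the same restoring shift-and-subtract division
   (illustrative only; the function name is a placeholder, __builtin_clzll stands
   in for the cl0 instruction, and division by zero is undefined as usual):

       static long long ref_divdi3(long long a, long long b) {
           int negate = (a < 0) ^ (b < 0);
           unsigned long long ua = a < 0 ? -(unsigned long long)a : (unsigned long long)a;
           unsigned long long ub = b < 0 ? -(unsigned long long)b : (unsigned long long)b;
           unsigned long long q = 0, bit = 1;
           if (ub <= ua) {
               int shift = __builtin_clzll(ub) - __builtin_clzll(ua);
               ub  <<= shift;                    // align divisor MSB with dividend MSB
               bit <<= shift;
               for (int i = 0; i <= shift; i++) {   // one more iteration than the shift
                   if (ub <= ua) { ua -= ub; q |= bit; }
                   ub  >>= 1;                    // move divisor and quotient bit right
                   bit >>= 1;
               }
           }
           return negate ? -(long long)q : (long long)q;
       }
*/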

View File

@ -0,0 +1,84 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
.macro FUNCTION_BEGIN name
.text
.p2align 5
.globl \name
.type \name, @function
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
FUNCTION_BEGIN __hexagon_divsi3
{
p0 = cmp.ge(r0,#0)
p1 = cmp.ge(r1,#0)
r1 = abs(r0)
r2 = abs(r1)
}
{
r3 = cl0(r1)
r4 = cl0(r2)
r5 = sub(r1,r2)
p2 = cmp.gtu(r2,r1)
}
#if (__HEXAGON_ARCH__ == 60)
{
r0 = #0
p1 = xor(p0,p1)
p0 = cmp.gtu(r2,r5)
}
if (p2) jumpr r31
#else
{
r0 = #0
p1 = xor(p0,p1)
p0 = cmp.gtu(r2,r5)
if (p2) jumpr r31
}
#endif
{
r0 = mux(p1,#-1,#1)
if (p0) jumpr r31
r4 = sub(r4,r3)
r3 = #1
}
{
r0 = #0
r3:2 = vlslw(r3:2,r4)
loop0(1f,r4)
}
.falign
1:
{
p0 = cmp.gtu(r2,r1)
if (!p0.new) r1 = sub(r1,r2)
if (!p0.new) r0 = add(r0,r3)
r3:2 = vlsrw(r3:2,#1)
}:endloop0
{
p0 = cmp.gtu(r2,r1)
if (!p0.new) r0 = add(r0,r3)
if (!p1) jumpr r31
}
{
r0 = neg(r0)
jumpr r31
}
FUNCTION_END __hexagon_divsi3
.globl __qdsp_divsi3
.set __qdsp_divsi3, __hexagon_divsi3

View File

@ -0,0 +1,37 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
.macro FUNCTION_BEGIN name
.text
.p2align 5
.globl \name
.type \name, @function
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
FUNCTION_BEGIN fabs
{
r1 = clrbit(r1, #31)
jumpr r31
}
FUNCTION_END fabs
FUNCTION_BEGIN fabsf
{
r0 = clrbit(r0, #31)
jumpr r31
}
FUNCTION_END fabsf
.globl fabsl
.set fabsl, fabs

View File

@ -0,0 +1,491 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/* ==================================================================== */
/* FUNCTIONS Optimized double floating point operators */
/* ==================================================================== */
/* c = dadd_asm(a, b) */
/* ==================================================================== *
fast2_QDOUBLE fast2_dadd(fast2_QDOUBLE a,fast2_QDOUBLE b) {
fast2_QDOUBLE c;
lint manta = a & MANTMASK;
int expa = Q6_R_sxth_R(a) ;
lint mantb = b & MANTMASK;
int expb = Q6_R_sxth_R(b) ;
int exp, expdiff, j, k, hi, lo, cn;
lint mant;
expdiff = (int) Q6_P_vabsdiffh_PP(a, b);
expdiff = Q6_R_sxth_R(expdiff) ;
if (expdiff > 63) { expdiff = 62;}
if (expa > expb) {
exp = expa + 1;
expa = 1;
expb = expdiff + 1;
} else {
exp = expb + 1;
expb = 1;
expa = expdiff + 1;
}
mant = (manta>>expa) + (mantb>>expb);
hi = (int) (mant>>32);
lo = (int) (mant);
k = Q6_R_normamt_R(hi);
if(hi == 0 || hi == -1) k = 31+Q6_R_normamt_R(lo);
mant = (mant << k);
cn = (mant == 0x8000000000000000LL);
exp = exp - k + cn;
if (mant == 0 || mant == -1) exp = 0x8001;
c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
return(c);
}
* ==================================================================== */
.text
.global fast2_dadd_asm
.type fast2_dadd_asm, @function
fast2_dadd_asm:
#define manta R0
#define mantexpa R1:0
#define lmanta R1:0
#define mantb R2
#define mantexpb R3:2
#define lmantb R3:2
#define expa R4
#define expb R5
#define mantexpd R7:6
#define expd R6
#define exp R8
#define c63 R9
#define lmant R1:0
#define manth R1
#define mantl R0
#define minmin R11:10 // exactly 0x0000000000008001LL
#define minminl R10
#define k R4
#define ce P0
.falign
{
mantexpd = VABSDIFFH(mantexpa, mantexpb) //represented as 0x08001LL
c63 = #62
expa = SXTH(manta)
expb = SXTH(mantb)
} {
expd = SXTH(expd)
ce = CMP.GT(expa, expb);
if ( ce.new) exp = add(expa, #1)
if (!ce.new) exp = add(expb, #1)
} {
if ( ce) expa = #1
if (!ce) expb = #1
manta.L = #0
expd = MIN(expd, c63)
} {
if (!ce) expa = add(expd, #1)
if ( ce) expb = add(expd, #1)
mantb.L = #0
minmin = #0
} {
lmanta = ASR(lmanta, expa)
lmantb = ASR(lmantb, expb)
} {
lmant = add(lmanta, lmantb)
minminl.L = #0x8001
} {
k = clb(lmant)
c63 = #58
} {
k = add(k, #-1)
p0 = cmp.gt(k, c63)
} {
mantexpa = ASL(lmant, k)
exp = SUB(exp, k)
if(p0) jump .Ldenorma
} {
manta = insert(exp, #16, #0)
jumpr r31
}
.Ldenorma:
{
mantexpa = minmin
jumpr r31
}
/* =================================================================== *
fast2_QDOUBLE fast2_dsub(fast2_QDOUBLE a,fast2_QDOUBLE b) {
fast2_QDOUBLE c;
lint manta = a & MANTMASK;
int expa = Q6_R_sxth_R(a) ;
lint mantb = b & MANTMASK;
int expb = Q6_R_sxth_R(b) ;
int exp, expdiff, j, k;
lint mant;
expdiff = (int) Q6_P_vabsdiffh_PP(a, b);
expdiff = Q6_R_sxth_R(expdiff) ;
if (expdiff > 63) { expdiff = 62;}
if (expa > expb) {
exp = expa + 1;
expa = 1;
expb = expdiff + 1;
} else {
exp = expb + 1;
expb = 1;
expa = expdiff + 1;
}
mant = (manta>>expa) - (mantb>>expb);
k = Q6_R_clb_P(mant)-1;
mant = (mant << k);
exp = exp - k;
if (mant == 0 || mant == -1) exp = 0x8001;
c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
return(c);
}
* ==================================================================== */
.text
.global fast2_dsub_asm
.type fast2_dsub_asm, @function
fast2_dsub_asm:
#define manta R0
#define mantexpa R1:0
#define lmanta R1:0
#define mantb R2
#define mantexpb R3:2
#define lmantb R3:2
#define expa R4
#define expb R5
#define mantexpd R7:6
#define expd R6
#define exp R8
#define c63 R9
#define lmant R1:0
#define manth R1
#define mantl R0
#define minmin R11:10 // exactly 0x0000000000008001LL
#define minminl R10
#define k R4
#define ce P0
.falign
{
mantexpd = VABSDIFFH(mantexpa, mantexpb) //represented as 0x08001LL
c63 = #62
expa = SXTH(manta)
expb = SXTH(mantb)
} {
expd = SXTH(expd)
ce = CMP.GT(expa, expb);
if ( ce.new) exp = add(expa, #1)
if (!ce.new) exp = add(expb, #1)
} {
if ( ce) expa = #1
if (!ce) expb = #1
manta.L = #0
expd = MIN(expd, c63)
} {
if (!ce) expa = add(expd, #1)
if ( ce) expb = add(expd, #1)
mantb.L = #0
minmin = #0
} {
lmanta = ASR(lmanta, expa)
lmantb = ASR(lmantb, expb)
} {
lmant = sub(lmanta, lmantb)
minminl.L = #0x8001
} {
k = clb(lmant)
c63 = #58
} {
k = add(k, #-1)
p0 = cmp.gt(k, c63)
} {
mantexpa = ASL(lmant, k)
exp = SUB(exp, k)
if(p0) jump .Ldenorm
} {
manta = insert(exp, #16, #0)
jumpr r31
}
.Ldenorm:
{
mantexpa = minmin
jumpr r31
}
/* ==================================================================== *
fast2_QDOUBLE fast2_dmpy(fast2_QDOUBLE a,fast2_QDOUBLE b) {
fast2_QDOUBLE c;
lint manta = a & MANTMASK;
int expa = Q6_R_sxth_R(a) ;
lint mantb = b & MANTMASK;
int expb = Q6_R_sxth_R(b) ;
int exp, k;
lint mant;
int hia, hib, hi, lo;
unsigned int loa, lob;
hia = (int)(a >> 32);
loa = Q6_R_extractu_RII((int)manta, 31, 1);
hib = (int)(b >> 32);
lob = Q6_R_extractu_RII((int)mantb, 31, 1);
mant = Q6_P_mpy_RR(hia, lob);
mant = Q6_P_mpyacc_RR(mant,hib, loa);
mant = (mant >> 30) + (Q6_P_mpy_RR(hia, hib)<<1);
hi = (int) (mant>>32);
k = Q6_R_normamt_R(hi);
mant = mant << k;
exp = expa + expb - k;
if (mant == 0 || mant == -1) exp = 0x8001;
c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
return(c);
}
* ==================================================================== */
.text
.global fast2_dmpy_asm
.type fast2_dmpy_asm, @function
fast2_dmpy_asm:
#define mantal R0
#define mantah R1
#define mantexpa R1:0
#define mantbl R2
#define mantbh R3
#define mantexpb R3:2
#define expa R4
#define expb R5
#define c8001 R12
#define mantexpd R7:6
#define mantdh R7
#define exp R8
#define lmantc R11:10
#define kb R9
#define guard R11
#define mantal_ R12
#define mantbl_ R13
#define min R15:14
#define minh R15
.falign
{
mantbl_= lsr(mantbl, #16)
expb = sxth(mantbl)
expa = sxth(mantal)
mantal_= lsr(mantal, #16)
}
{
lmantc = mpy(mantah, mantbh)
mantexpd = mpy(mantah, mantbl_)
mantal.L = #0x0
min = #0
}
{
lmantc = add(lmantc, lmantc)
mantexpd+= mpy(mantbh, mantal_)
mantbl.L = #0x0
minh.H = #0x8000
}
{
mantexpd = asr(mantexpd, #15)
c8001.L = #0x8001
p1 = cmp.eq(mantexpa, mantexpb)
}
{
mantexpd = add(mantexpd, lmantc)
exp = add(expa, expb)
p2 = cmp.eq(mantexpa, min)
}
{
kb = clb(mantexpd)
mantexpb = abs(mantexpd)
guard = #58
}
{
p1 = and(p1, p2)
exp = sub(exp, kb)
kb = add(kb, #-1)
p0 = cmp.gt(kb, guard)
}
{
exp = add(exp, #1)
mantexpa = asl(mantexpd, kb)
if(p1) jump .Lsat //rarely happens
}
{
mantal = insert(exp,#16, #0)
if(!p0) jumpr r31
}
{
mantal = insert(c8001,#16, #0)
jumpr r31
}
.Lsat:
{
mantexpa = #-1
}
{
mantexpa = lsr(mantexpa, #1)
}
{
mantal = insert(exp,#16, #0)
jumpr r31
}
/* ==================================================================== *
int fast2_qd2f(fast2_QDOUBLE a) {
int exp;
long long int manta;
int ic, rnd, mantb;
manta = a>>32;
exp = Q6_R_sxth_R(a) ;
ic = 0x80000000 & manta;
manta = Q6_R_abs_R_sat(manta);
rnd = 0x40;
mantb = (manta + rnd)>>7;
exp = (exp + 126);
if((manta & 0xff) == rnd) rnd = 0x00;
if((manta & 0x7fffffc0) == 0x7fffffc0) {
manta = 0x0; exp++;
} else {
manta= mantb & 0x007fffff;
}
exp = (exp << 23) & 0x7fffffc0;
ic = Q6_R_addacc_RR(ic, exp, manta);
return (ic);
}
* ==================================================================== */
.text
.global fast2_qd2f_asm
.type fast2_qd2f_asm, @function
fast2_qd2f_asm:
#define mantah R1
#define mantal R0
#define cff R0
#define mant R3
#define expo R4
#define rnd R5
#define mask R6
#define c07f R7
#define c80 R0
#define mantb R2
#define ic R0
.falign
{
mant = abs(mantah):sat
expo = sxth(mantal)
rnd = #0x40
mask.L = #0xffc0
}
{
cff = extractu(mant, #8, #0)
p2 = cmp.gt(expo, #126)
p3 = cmp.ge(expo, #-126)
mask.H = #0x7fff
}
{
p1 = cmp.eq(cff,#0x40)
if(p1.new) rnd = #0
expo = add(expo, #126)
if(!p3) jump .Lmin
}
{
p0 = bitsset(mant, mask)
c80.L = #0x0000
mantb = add(mant, rnd)
c07f = lsr(mask, #8)
}
{
if(p0) expo = add(expo, #1)
if(p0) mant = #0
mantb = lsr(mantb, #7)
c80.H = #0x8000
}
{
ic = and(c80, mantah)
mask &= asl(expo, #23)
if(!p0) mant = and(mantb, c07f)
if(p2) jump .Lmax
}
{
ic += add(mask, mant)
jumpr r31
}
.Lmax:
{
ic.L = #0xffff;
}
{
ic.H = #0x7f7f;
jumpr r31
}
.Lmin:
{
ic = #0x0
jumpr r31
}
/* ==================================================================== *
fast2_QDOUBLE fast2_f2qd(int ia) {
lint exp;
lint mant;
fast2_QDOUBLE c;
mant = ((ia << 7) | 0x40000000)&0x7fffff80 ;
if (ia & 0x80000000) mant = -mant;
exp = ((ia >> 23) & 0xFFLL) - 126;
c = (mant<<32) | Q6_R_zxth_R(exp);
return(c);
}
* ==================================================================== */
.text
.global fast2_f2qd_asm
.type fast2_f2qd_asm, @function
fast2_f2qd_asm:
#define ia R0
#define mag R3
#define mantr R1
#define expr R0
#define zero R2
#define maxneg R5:4
#define maxnegl R4
.falign
{
mantr = asl(ia, #7)
p0 = tstbit(ia, #31)
maxneg = #0
mag = add(ia,ia)
}
{
mantr = setbit(mantr, #30)
expr= extractu(ia,#8,#23)
maxnegl.L = #0x8001
p1 = cmp.eq(mag, #0)
}
{
mantr= extractu(mantr, #31, #0)
expr= add(expr, #-126)
zero = #0
if(p1) jump .Lminqd
}
{
expr = zxth(expr)
if(p0) mantr= sub(zero, mantr)
jumpr r31
}
.Lminqd:
{
R1:0 = maxneg
jumpr r31
}

@ -0,0 +1,345 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/* ==================================================================== *
fast2_QLDOUBLE fast2_ldadd(fast2_QLDOUBLE a,fast2_QLDOUBLE b) {
fast2_QLDOUBLE c;
lint manta = a & MANTMASK;
int expa = Q6_R_sxth_R(a) ;
lint mantb = b & MANTMASK;
int expb = Q6_R_sxth_R(b) ;
int exp, expdiff, j, k, hi, lo, cn;
lint mant;
expdiff = (int) Q6_P_vabsdiffh_PP(a, b);
expdiff = Q6_R_sxth_R(expdiff) ;
if (expdiff > 63) { expdiff = 62;}
if (expa > expb) {
exp = expa + 1;
expa = 1;
expb = expdiff + 1;
} else {
exp = expb + 1;
expb = 1;
expa = expdiff + 1;
}
mant = (manta>>expa) + (mantb>>expb);
hi = (int) (mant>>32);
lo = (int) (mant);
k = Q6_R_normamt_R(hi);
if(hi == 0 || hi == -1) k = 31+Q6_R_normamt_R(lo);
mant = (mant << k);
cn = (mant == 0x8000000000000000LL);
exp = exp - k + cn;
if (mant == 0 || mant == -1) exp = 0x8001;
c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
return(c);
}
* ==================================================================== */
.text
.global fast2_ldadd_asm
.type fast2_ldadd_asm, @function
fast2_ldadd_asm:
#define manta R1:0
#define lmanta R1:0
#define mantb R3:2
#define lmantb R3:2
#define expa R4
#define expb R5
#define expd R6
#define exp R8
#define c63 R9
#define lmant R1:0
#define k R4
#define ce P0
#define zero R3:2
.falign
{
expa = memw(r29+#8)
expb = memw(r29+#24)
r7 = r0
}
{
expd = sub(expa, expb):sat
ce = CMP.GT(expa, expb);
if ( ce.new) exp = add(expa, #1)
if (!ce.new) exp = add(expb, #1)
} {
expd = abs(expd):sat
if ( ce) expa = #1
if (!ce) expb = #1
c63 = #62
} {
expd = MIN(expd, c63)
manta = memd(r29+#0)
mantb = memd(r29+#16)
} {
if (!ce) expa = add(expd, #1)
if ( ce) expb = add(expd, #1)
} {
lmanta = ASR(lmanta, expa)
lmantb = ASR(lmantb, expb)
} {
lmant = add(lmanta, lmantb)
zero = #0
} {
k = clb(lmant)
c63.L =#0x0001
} {
exp -= add(k, #-1) //exp = exp - (k-1)
k = add(k, #-1)
p0 = cmp.gt(k, #58)
c63.H =#0x8000
} {
if(!p0)memw(r7+#8) = exp
lmant = ASL(lmant, k)
if(p0) jump .Ldenorma
} {
memd(r7+#0) = lmant
jumpr r31
}
.Ldenorma:
memd(r7+#0) = zero
{
memw(r7+#8) = c63
jumpr r31
}
/* =================================================================== *
fast2_QLDOUBLE fast2_ldsub(fast2_QLDOUBLE a,fast2_QLDOUBLE b) {
fast2_QLDOUBLE c;
lint manta = a & MANTMASK;
int expa = Q6_R_sxth_R(a) ;
lint mantb = b & MANTMASK;
int expb = Q6_R_sxth_R(b) ;
int exp, expdiff, j, k;
lint mant;
expdiff = (int) Q6_P_vabsdiffh_PP(a, b);
expdiff = Q6_R_sxth_R(expdiff) ;
if (expdiff > 63) { expdiff = 62;}
if (expa > expb) {
exp = expa + 1;
expa = 1;
expb = expdiff + 1;
} else {
exp = expb + 1;
expb = 1;
expa = expdiff + 1;
}
mant = (manta>>expa) - (mantb>>expb);
k = Q6_R_clb_P(mant)-1;
mant = (mant << k);
exp = exp - k;
if (mant == 0 || mant == -1) exp = 0x8001;
c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
return(c);
}
* ==================================================================== */
.text
.global fast2_ldsub_asm
.type fast2_ldsub_asm, @function
fast2_ldsub_asm:
#define manta R1:0
#define lmanta R1:0
#define mantb R3:2
#define lmantb R3:2
#define expa R4
#define expb R5
#define expd R6
#define exp R8
#define c63 R9
#define lmant R1:0
#define k R4
#define ce P0
#define zero R3:2
.falign
{
expa = memw(r29+#8)
expb = memw(r29+#24)
r7 = r0
}
{
expd = sub(expa, expb):sat
ce = CMP.GT(expa, expb);
if ( ce.new) exp = add(expa, #1)
if (!ce.new) exp = add(expb, #1)
} {
expd = abs(expd):sat
if ( ce) expa = #1
if (!ce) expb = #1
c63 = #62
} {
expd = min(expd, c63)
manta = memd(r29+#0)
mantb = memd(r29+#16)
} {
if (!ce) expa = add(expd, #1)
if ( ce) expb = add(expd, #1)
} {
lmanta = ASR(lmanta, expa)
lmantb = ASR(lmantb, expb)
} {
lmant = sub(lmanta, lmantb)
zero = #0
} {
k = clb(lmant)
c63.L =#0x0001
} {
exp -= add(k, #-1) //exp = exp - (k-1)
k = add(k, #-1)
p0 = cmp.gt(k, #58)
c63.H =#0x8000
} {
if(!p0)memw(r7+#8) = exp
lmant = asl(lmant, k)
if(p0) jump .Ldenorma_s
} {
memd(r7+#0) = lmant
jumpr r31
}
.Ldenorma_s:
memd(r7+#0) = zero
{
memw(r7+#8) = c63
jumpr r31
}
/* ==================================================================== *
fast2_QLDOUBLE fast2_ldmpy(fast2_QLDOUBLE a,fast2_QLDOUBLE b) {
fast2_QLDOUBLE c;
lint manta = a & MANTMASK;
int expa = Q6_R_sxth_R(a) ;
lint mantb = b & MANTMASK;
int expb = Q6_R_sxth_R(b) ;
int exp, k;
lint mant;
int hia, hib, hi, lo;
unsigned int loa, lob;
hia = (int)(a >> 32);
loa = Q6_R_extractu_RII((int)manta, 31, 1);
hib = (int)(b >> 32);
lob = Q6_R_extractu_RII((int)mantb, 31, 1);
mant = Q6_P_mpy_RR(hia, lob);
mant = Q6_P_mpyacc_RR(mant,hib, loa);
mant = (mant >> 30) + (Q6_P_mpy_RR(hia, hib)<<1);
hi = (int) (mant>>32);
k = Q6_R_normamt_R(hi);
mant = mant << k;
exp = expa + expb - k;
if (mant == 0 || mant == -1) exp = 0x8001;
c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
return(c);
}
* ==================================================================== */
.text
.global fast2_ldmpy_asm
.type fast2_ldmpy_asm, @function
fast2_ldmpy_asm:
#define mantxl_ R9
#define mantxl R14
#define mantxh R15
#define mantx R15:14
#define mantbl R2
#define mantbl_ R8
#define mantbh R3
#define mantb R3:2
#define expa R4
#define expb R5
#define c8001 R8
#define mantd R7:6
#define lmantc R11:10
#define kp R9
#define min R13:12
#define minh R13
#define max R13:12
#define maxh R13
#define ret R0
.falign
{
mantx = memd(r29+#0)
mantb = memd(r29+#16)
min = #0
}
{
mantbl_= extractu(mantbl, #31, #1)
mantxl_= extractu(mantxl, #31, #1)
minh.H = #0x8000
}
{
lmantc = mpy(mantxh, mantbh)
mantd = mpy(mantxh, mantbl_)
expa = memw(r29+#8)
expb = memw(r29+#24)
}
{
lmantc = add(lmantc, lmantc)
mantd += mpy(mantbh, mantxl_)
}
{
mantd = asr(mantd, #30)
c8001.L = #0x0001
p1 = cmp.eq(mantx, mantb)
}
{
mantd = add(mantd, lmantc)
expa= add(expa, expb)
p2 = cmp.eq(mantb, min)
}
{
kp = clb(mantd)
c8001.H = #0x8000
p1 = and(p1, p2)
}
{
expa-= add(kp, #-1)
kp = add(kp, #-1)
if(p1) jump .Lsat
}
{
mantd = asl(mantd, kp)
memw(ret+#8) = expa
p0 = cmp.gt(kp, #58)
if(p0.new) jump:NT .Ldenorm //rarely happens
}
{
memd(ret+#0) = mantd
jumpr r31
}
.Lsat:
{
max = #0
expa+= add(kp, #1)
}
{
maxh.H = #0x4000
memw(ret+#8) = expa
}
{
memd(ret+#0) = max
jumpr r31
}
.Ldenorm:
{
memw(ret+#8) = c8001
mantx = #0
}
{
memd(ret+#0) = mantx
jumpr r31
}

@ -0,0 +1,400 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/* ==================================================================== */
/* FUNCTIONS Optimized double floating point operators */
/* ==================================================================== */
/* c = dadd_asm(a, b) */
/* ====================================================================
QDOUBLE dadd(QDOUBLE a,QDOUBLE b) {
QDOUBLE c;
lint manta = a & MANTMASK;
int expa = HEXAGON_R_sxth_R(a) ;
lint mantb = b & MANTMASK;
int expb = HEXAGON_R_sxth_R(b) ;
int exp, expdiff, j, k, hi, lo, cn;
lint mant;
expdiff = (int) HEXAGON_P_vabsdiffh_PP(a, b);
expdiff = HEXAGON_R_sxth_R(expdiff) ;
if (expdiff > 63) { expdiff = 62;}
if (expa > expb) {
exp = expa + 1;
expa = 1;
expb = expdiff + 1;
} else {
exp = expb + 1;
expb = 1;
expa = expdiff + 1;
}
mant = (manta>>expa) + (mantb>>expb);
hi = (int) (mant>>32);
lo = (int) (mant);
k = HEXAGON_R_normamt_R(hi);
if(hi == 0 || hi == -1) k = 31+HEXAGON_R_normamt_R(lo);
mant = (mant << k);
cn = (mant == 0x8000000000000000LL);
exp = exp - k + cn;
if (mant == 0 || mant == -1) exp = 0x8001;
c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
return(c);
}
* ==================================================================== */
.text
.global dadd_asm
.type dadd_asm, @function
dadd_asm:
#define manta R0
#define mantexpa R1:0
#define lmanta R1:0
#define mantb R2
#define mantexpb R3:2
#define lmantb R3:2
#define expa R4
#define expb R5
#define mantexpd R7:6
#define expd R6
#define exp R8
#define c63 R9
#define lmant R1:0
#define manth R1
#define mantl R0
#define zero R7:6
#define zerol R6
#define minus R3:2
#define minusl R2
#define maxneg R9
#define minmin R11:10 // exactly 0x8000000000000000LL
#define minminh R11
#define k R4
#define kl R5
#define ce P0
.falign
{
mantexpd = VABSDIFFH(mantexpa, mantexpb) //represented as 0x08001LL
c63 = #62
expa = SXTH(manta)
expb = SXTH(mantb)
} {
expd = SXTH(expd)
ce = CMP.GT(expa, expb);
if ( ce.new) exp = add(expa, #1)
if (!ce.new) exp = add(expb, #1)
} {
if ( ce) expa = #1
if (!ce) expb = #1
manta.L = #0
expd = MIN(expd, c63)
} {
if (!ce) expa = add(expd, #1)
if ( ce) expb = add(expd, #1)
mantb.L = #0
zero = #0
} {
lmanta = ASR(lmanta, expa)
lmantb = ASR(lmantb, expb)
minmin = #0
} {
lmant = add(lmanta, lmantb)
minus = #-1
minminh.H = #0x8000
} {
k = NORMAMT(manth)
kl = NORMAMT(mantl)
p0 = cmp.eq(manth, zerol)
p1 = cmp.eq(manth, minusl)
} {
p0 = OR(p0, p1)
if(p0.new) k = add(kl, #31)
maxneg.H = #0
} {
mantexpa = ASL(lmant, k)
exp = SUB(exp, k)
maxneg.L = #0x8001
} {
p0 = cmp.eq(mantexpa, zero)
p1 = cmp.eq(mantexpa, minus)
manta.L = #0
exp = ZXTH(exp)
} {
p2 = cmp.eq(mantexpa, minmin) //is result 0x80....0
if(p2.new) exp = add(exp, #1)
}
#if (__HEXAGON_ARCH__ == 60)
{
p0 = OR(p0, p1)
if( p0.new) manta = OR(manta,maxneg)
if(!p0.new) manta = OR(manta,exp)
}
jumpr r31
#else
{
p0 = OR(p0, p1)
if( p0.new) manta = OR(manta,maxneg)
if(!p0.new) manta = OR(manta,exp)
jumpr r31
}
#endif
/* =================================================================== *
QDOUBLE dsub(QDOUBLE a,QDOUBLE b) {
QDOUBLE c;
lint manta = a & MANTMASK;
int expa = HEXAGON_R_sxth_R(a) ;
lint mantb = b & MANTMASK;
int expb = HEXAGON_R_sxth_R(b) ;
int exp, expdiff, j, k, hi, lo, cn;
lint mant;
expdiff = (int) HEXAGON_P_vabsdiffh_PP(a, b);
expdiff = HEXAGON_R_sxth_R(expdiff) ;
if (expdiff > 63) { expdiff = 62;}
if (expa > expb) {
exp = expa + 1;
expa = 1;
expb = expdiff + 1;
} else {
exp = expb + 1;
expb = 1;
expa = expdiff + 1;
}
mant = (manta>>expa) - (mantb>>expb);
hi = (int) (mant>>32);
lo = (int) (mant);
k = HEXAGON_R_normamt_R(hi);
if(hi == 0 || hi == -1) k = 31+HEXAGON_R_normamt_R(lo);
mant = (mant << k);
cn = (mant == 0x8000000000000000LL);
exp = exp - k + cn;
if (mant == 0 || mant == -1) exp = 0x8001;
c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
return(c);
}
* ==================================================================== */
.text
.global dsub_asm
.type dsub_asm, @function
dsub_asm:
#define manta R0
#define mantexpa R1:0
#define lmanta R1:0
#define mantb R2
#define mantexpb R3:2
#define lmantb R3:2
#define expa R4
#define expb R5
#define mantexpd R7:6
#define expd R6
#define exp R8
#define c63 R9
#define lmant R1:0
#define manth R1
#define mantl R0
#define zero R7:6
#define zerol R6
#define minus R3:2
#define minusl R2
#define maxneg R9
#define minmin R11:10 // exactly 0x8000000000000000LL
#define minminh R11
#define k R4
#define kl R5
#define ce P0
.falign
{
mantexpd = VABSDIFFH(mantexpa, mantexpb) //represented as 0x08001LL
c63 = #62
expa = SXTH(manta)
expb = SXTH(mantb)
} {
expd = SXTH(expd)
ce = CMP.GT(expa, expb);
if ( ce.new) exp = add(expa, #1)
if (!ce.new) exp = add(expb, #1)
} {
if ( ce) expa = #1
if (!ce) expb = #1
manta.L = #0
expd = MIN(expd, c63)
} {
if (!ce) expa = add(expd, #1)
if ( ce) expb = add(expd, #1)
mantb.L = #0
zero = #0
} {
lmanta = ASR(lmanta, expa)
lmantb = ASR(lmantb, expb)
minmin = #0
} {
lmant = sub(lmanta, lmantb)
minus = #-1
minminh.H = #0x8000
} {
k = NORMAMT(manth)
kl = NORMAMT(mantl)
p0 = cmp.eq(manth, zerol)
p1 = cmp.eq(manth, minusl)
} {
p0 = OR(p0, p1)
if(p0.new) k = add(kl, #31)
maxneg.H = #0
} {
mantexpa = ASL(lmant, k)
exp = SUB(exp, k)
maxneg.L = #0x8001
} {
p0 = cmp.eq(mantexpa, zero)
p1 = cmp.eq(mantexpa, minus)
manta.L = #0
exp = ZXTH(exp)
} {
p2 = cmp.eq(mantexpa, minmin) //is result 0x80....0
if(p2.new) exp = add(exp, #1)
}
#if (__HEXAGON_ARCH__ == 60)
{
p0 = OR(p0, p1)
if( p0.new) manta = OR(manta,maxneg)
if(!p0.new) manta = OR(manta,exp)
}
jumpr r31
#else
{
p0 = OR(p0, p1)
if( p0.new) manta = OR(manta,maxneg)
if(!p0.new) manta = OR(manta,exp)
jumpr r31
}
#endif
/* ==================================================================== *
QDOUBLE dmpy(QDOUBLE a,QDOUBLE b) {
QDOUBLE c;
lint manta = a & MANTMASK;
int expa = HEXAGON_R_sxth_R(a) ;
lint mantb = b & MANTMASK;
int expb = HEXAGON_R_sxth_R(b) ;
int exp, k;
lint mant;
int hia, hib, hi, lo;
unsigned int loa, lob;
hia = (int)(a >> 32);
loa = HEXAGON_R_extractu_RII((int)manta, 31, 1);
hib = (int)(b >> 32);
lob = HEXAGON_R_extractu_RII((int)mantb, 31, 1);
mant = HEXAGON_P_mpy_RR(hia, lob);
mant = HEXAGON_P_mpyacc_RR(mant,hib, loa);
mant = (mant >> 30) + (HEXAGON_P_mpy_RR(hia, hib)<<1);
hi = (int) (mant>>32);
lo = (int) (mant);
k = HEXAGON_R_normamt_R(hi);
if(hi == 0 || hi == -1) k = 31+HEXAGON_R_normamt_R(lo);
mant = mant << k;
exp = expa + expb - k;
if (mant == 0 || mant == -1) exp = 0x8001;
c = (mant & MANTMASK) | (((lint) exp) & EXP_MASK);
return(c);
}
* ==================================================================== */
.text
.global dmpy_asm
.type dmpy_asm, @function
dmpy_asm:
#define mantal R0
#define mantah R1
#define mantexpa R1:0
#define mantbl R2
#define mantbh R3
#define mantexpb R3:2
#define expa R4
#define expb R5
#define mantexpd R7:6
#define exp R8
#define lmantc R11:10
#define mantch R11
#define mantcl R10
#define zero0 R7:6
#define zero0l R6
#define minus1 R3:2
#define minus1l R2
#define maxneg R9
#define k R4
#define kl R5
.falign
{
mantbl = lsr(mantbl, #16)
mantal = lsr(mantal, #16)
expa = sxth(mantal)
expb = sxth(mantbl)
}
{
lmantc = mpy(mantah, mantbh)
mantexpd = mpy(mantah, mantbl)
}
{
lmantc = add(lmantc, lmantc) //<<1
mantexpd+= mpy(mantbh, mantal)
}
{
lmantc += asr(mantexpd, #15)
exp = add(expa, expb)
zero0 = #0
minus1 = #-1
}
{
k = normamt(mantch)
kl = normamt(mantcl)
p0 = cmp.eq(mantch, zero0l)
p1 = cmp.eq(mantch, minus1l)
}
{
p0 = or(p0, p1)
if(p0.new) k = add(kl, #31)
maxneg.H = #0
}
{
mantexpa = asl(lmantc, k)
exp = sub(exp, k)
maxneg.L = #0x8001
}
{
p0 = cmp.eq(mantexpa, zero0)
p1 = cmp.eq(mantexpa, minus1)
mantal.L = #0
exp = zxth(exp)
}
#if (__HEXAGON_ARCH__ == 60)
{
p0 = or(p0, p1)
if( p0.new) mantal = or(mantal,maxneg)
if(!p0.new) mantal = or(mantal,exp)
}
jumpr r31
#else
{
p0 = or(p0, p1)
if( p0.new) mantal = or(mantal,maxneg)
if(!p0.new) mantal = or(mantal,exp)
jumpr r31
}
#endif

@ -0,0 +1,31 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
.macro FUNCTION_BEGIN name
.text
.p2align 5
.globl \name
.type \name, @function
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
FUNCTION_BEGIN fmaf
r2 += sfmpy(r0, r1)
{
r0 = r2
jumpr r31
}
FUNCTION_END fmaf
.globl fmal
.set fmal, fma

@ -0,0 +1,30 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
.macro FUNCTION_BEGIN name
.text
.p2align 5
.globl \name
.type \name, @function
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
FUNCTION_BEGIN fmaxf
{
r0 = sfmax(r0, r1)
jumpr r31
}
FUNCTION_END fmaxf
.globl fmaxl
.set fmaxl, fmax

@ -0,0 +1,30 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
.macro FUNCTION_BEGIN name
.text
.p2align 5
.globl \name
.type \name, @function
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
FUNCTION_BEGIN fminf
{
r0 = sfmin(r0, r1)
jumpr r31
}
FUNCTION_END fminf
.globl fminl
.set fminl, fmin

@ -0,0 +1,125 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// An optimized version of a memcpy which is equivalent to the following loop:
//
// volatile unsigned *dest;
// unsigned *src;
//
// for (i = 0; i < num_words; ++i)
// *dest++ = *src++;
//
// The corresponding C prototype for this function would be
// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
// const unsigned *src,
// unsigned num_words);
//
// *** Both dest and src must be aligned to 32-bit boundaries. ***
// The code does not perform any runtime checks for this, and will fail
// in bad ways if this requirement is not met.
//
// The "forward" in the name refers to the fact that the function copies
// the words going forward in memory. It is incorrect to use this function
// for cases where the original code copied words in any other order.
//
// *** This function is only for use by the compiler. ***
// The only intended use is for the LLVM compiler to generate calls to
// this function when a mem-copy loop like the one above is detected.
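//
// Editor's sketch (not part of the original source): a plain C model of how
// the code below structures the copy so each 4 KB page of the source can be
// prefetched before it is read. The function name is a placeholder, the
// l2fetch prefetch hints are omitted, and only the prolog/pages/epilog split
// is shown. Assumes <stdint.h>.
//
//   static void copy_forward_model(volatile unsigned *dest,
//                                  const unsigned *src, unsigned num_words) {
//     // Prolog: copy up to the next 4 KB boundary of src (0 words if aligned).
//     unsigned prolog = (unsigned)(-(uintptr_t)src & 4095u) / 4u;
//     if (prolog > num_words)
//       prolog = num_words;
//     for (unsigned i = 0; i < prolog; ++i)
//       *dest++ = *src++;
//     num_words -= prolog;
//     // Main: whole pages of 1024 words, each prefetched, then copied.
//     for (unsigned p = 0; p < num_words / 1024; ++p)
//       for (unsigned i = 0; i < 1024; ++i)
//         *dest++ = *src++;
//     // Epilog: the remaining words of the last, partial page.
//     for (unsigned i = 0; i < (num_words & 1023); ++i)
//       *dest++ = *src++;
//   }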
.text
// Inputs:
// r0: dest
// r1: src
// r2: num_words
.globl hexagon_memcpy_forward_vp4cp4n2
.balign 32
.type hexagon_memcpy_forward_vp4cp4n2,@function
hexagon_memcpy_forward_vp4cp4n2:
// Compute r3 to be the number of words remaining in the current page.
// At the same time, compute r4 to be the number of 32-byte blocks
// remaining in the page (for prefetch).
{
r3 = sub(##4096, r1)
r5 = lsr(r2, #3)
}
{
// The byte count before the end of the page is in the 12 lowest bits of r3;
// extracting bits [11:2] below converts it to a word count.
// (If the address in r1 was already page-aligned, the bits are 0.)
r3 = extractu(r3, #10, #2)
r4 = extractu(r3, #7, #5)
}
{
r3 = minu(r2, r3)
r4 = minu(r5, r4)
}
{
r4 = or(r4, ##2105344) // 2105344 = 0x202000
p0 = cmp.eq(r3, #0)
if (p0.new) jump:nt .Lskipprolog
}
l2fetch(r1, r4)
{
loop0(.Lprolog, r3)
r2 = sub(r2, r3) // r2 = number of words left after the prolog.
}
.falign
.Lprolog:
{
r4 = memw(r1++#4)
memw(r0++#4) = r4.new
} :endloop0
.Lskipprolog:
{
// Let r3 = number of whole pages left (page = 1024 words).
r3 = lsr(r2, #10)
if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain
}
{
loop1(.Lout, r3)
r2 = extractu(r2, #10, #0) // r2 = r2 & 1023
r3 = ##2105472 // r3 = 0x202080 (prefetch info)
}
// Iterate over pages.
.falign
.Lout:
// Prefetch each individual page.
l2fetch(r1, r3)
loop0(.Lpage, #512)
.falign
.Lpage:
r5:4 = memd(r1++#8)
{
memw(r0++#8) = r4
memw(r0+#4) = r5
} :endloop0:endloop1
.Lskipmain:
{
r3 = ##2105344 // r3 = 0x202000 (prefetch info)
r4 = lsr(r2, #3) // r4 = number of 32-byte blocks remaining.
p0 = cmp.eq(r2, #0)
if (p0.new) jumpr:nt r31
}
{
r3 = or(r3, r4)
loop0(.Lepilog, r2)
}
l2fetch(r1, r3)
.falign
.Lepilog:
{
r4 = memw(r1++#4)
memw(r0++#4) = r4.new
} :endloop0
jumpr r31
.size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2

@ -0,0 +1,64 @@
//===------------------------- memcopy routines ---------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
.macro FUNCTION_BEGIN name
.text
.p2align 5
.globl \name
.type \name, @function
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
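// Editor's sketch (not from the source): a C model of the routine below. The
// name encodes the contract -- both pointers are expected to be 8-byte
// aligned, the length is at least 32 bytes and a multiple of 8 -- and the
// code falls back to plain memcpy when the alignment check fails. The model
// name is illustrative only. Assumes <stdint.h> and <string.h>.
//
//   static void memcpy_aligned_model(void *dst, const void *src, size_t n) {
//     if ((((uintptr_t)dst | (uintptr_t)src) & 7) != 0) {
//       memcpy(dst, src, n);                 // unaligned: let memcpy handle it
//       return;
//     }
//     uint64_t *d = (uint64_t *)dst;
//     const uint64_t *s = (const uint64_t *)src;
//     for (size_t i = 0; i < n / 8; ++i)     // 8 bytes per iteration
//       d[i] = s[i];
//   }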
FUNCTION_BEGIN __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes
{
p0 = bitsclr(r1,#7)
p0 = bitsclr(r0,#7)
if (p0.new) r5:4 = memd(r1)
r3 = #-3
}
{
if (!p0) jump .Lmemcpy_call
if (p0) memd(r0++#8) = r5:4
if (p0) r5:4 = memd(r1+#8)
r3 += lsr(r2,#3)
}
{
memd(r0++#8) = r5:4
r5:4 = memd(r1+#16)
r1 = add(r1,#24)
loop0(1f,r3)
}
.falign
1:
{
memd(r0++#8) = r5:4
r5:4 = memd(r1++#8)
}:endloop0
{
memd(r0) = r5:4
r0 -= add(r2,#-8)
jumpr r31
}
FUNCTION_END __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes
.Lmemcpy_call:
#ifdef __PIC__
jump memcpy@PLT
#else
jump memcpy
#endif
.globl __qdsp_memcpy_likely_aligned_min32bytes_mult8bytes
.set __qdsp_memcpy_likely_aligned_min32bytes_mult8bytes, \
__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes

@ -0,0 +1,83 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
.macro FUNCTION_BEGIN name
.text
.p2align 5
.globl \name
.type \name, @function
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
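// Editor's sketch (not part of the original file): a C model of the signed
// 64-bit remainder computed below. Take absolute values, run the same
// shift-and-subtract (restoring) loop as the unsigned routines, then give the
// remainder the sign of the dividend, matching C's % semantics. The helper
// name is illustrative and __builtin_clzll is the GCC/Clang intrinsic.
//
//   long long moddi3_model(long long a, long long b) {
//     int neg = a < 0;                       // result takes the dividend's sign
//     unsigned long long n = a < 0 ? -(unsigned long long)a : (unsigned long long)a;
//     unsigned long long d = b < 0 ? -(unsigned long long)b : (unsigned long long)b;
//     if (d <= n) {
//       int shift = __builtin_clzll(d) - __builtin_clzll(n);
//       d <<= shift;                         // align divisor with dividend
//       for (int i = 0; i <= shift; ++i) {   // shift+1 restoring steps
//         if (d <= n)
//           n -= d;                          // subtract the shifted divisor
//         d >>= 1;
//       }
//     }
//     return neg ? -(long long)n : (long long)n;
//   }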
FUNCTION_BEGIN __hexagon_moddi3
{
p3 = tstbit(r1,#31)
}
{
r1:0 = abs(r1:0)
r3:2 = abs(r3:2)
}
{
r6 = cl0(r1:0) // count leading 0's of dividend (numerator)
r7 = cl0(r3:2) // count leading 0's of divisor (denominator)
r5:4 = r3:2 // divisor moved into working registers
r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder
}
{
r10 = sub(r7,r6) // left shift count for bit & divisor
r1:0 = #0 // initialize quotient to 0
r15:14 = #1 // initialize bit to 1
}
{
r11 = add(r10,#1) // loop count is 1 more than shift count
r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb
r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor
}
{
p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend
loop0(1f,r11) // register loop
}
{
if (p0) jump .hexagon_moddi3_return // if divisor > dividend, we're done, so return
}
.falign
1:
{
p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder
}
{
r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder
r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8)
}
{
r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8)
r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6)
}
{
r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration
r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration
}:endloop0
.hexagon_moddi3_return:
{
r1:0 = neg(r3:2)
}
{
r1:0 = vmux(p3,r1:0,r3:2)
jumpr r31
}
FUNCTION_END __hexagon_moddi3
.globl __qdsp_moddi3
.set __qdsp_moddi3, __hexagon_moddi3

@ -0,0 +1,66 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
.macro FUNCTION_BEGIN name
.text
.p2align 5
.globl \name
.type \name, @function
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
FUNCTION_BEGIN __hexagon_modsi3
{
p2 = cmp.ge(r0,#0)
r2 = abs(r0)
r1 = abs(r1)
}
{
r3 = cl0(r2)
r4 = cl0(r1)
p0 = cmp.gtu(r1,r2)
}
{
r3 = sub(r4,r3)
if (p0) jumpr r31
}
{
p1 = cmp.eq(r3,#0)
loop0(1f,r3)
r0 = r2
r2 = lsl(r1,r3)
}
.falign
1:
{
p0 = cmp.gtu(r2,r0)
if (!p0.new) r0 = sub(r0,r2)
r2 = lsr(r2,#1)
if (p1) r1 = #0
}:endloop0
{
p0 = cmp.gtu(r2,r0)
if (!p0.new) r0 = sub(r0,r1)
if (p2) jumpr r31
}
{
r0 = neg(r0)
jumpr r31
}
FUNCTION_END __hexagon_modsi3
.globl __qdsp_modsi3
.set __qdsp_modsi3, __hexagon_modsi3

@ -0,0 +1,66 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
.macro FUNCTION_BEGIN name
.text
.p2align 5
.globl \name
.type \name, @function
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
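// Editor's note (not from the source): the routine below starts from the
// hardware reciprocal estimate produced by sfrecipa and sharpens it with
// Newton-Raphson steps before forming and correcting the quotient. The C
// sketch shows that shape only; it uses a placeholder starting estimate and
// ignores the sffixup/:scale steps that handle zeros, infinities and
// denormals, so it is illustrative rather than a drop-in model.
//
//   float divsf3_model(float num, float den) {
//     float x = 1.0f / den;            // stand-in for the sfrecipa estimate
//     float e = 1.0f - den * x;        // error of the reciprocal estimate
//     x = x + x * e;                   // one Newton step: x ~ 1/den
//     float q = num * x;               // first quotient estimate
//     float r = num - q * den;         // residual of that estimate
//     return q + r * x;                // correct the quotient with the residual
//   }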
FUNCTION_BEGIN __hexagon_divsf3
{
r2,p0 = sfrecipa(r0,r1)
r4 = sffixupd(r0,r1)
r3 = ##0x3f800000 // 1.0
}
{
r5 = sffixupn(r0,r1)
r3 -= sfmpy(r4,r2):lib // 1 - den*recip gives the error of the estimate
r6 = ##0x80000000
r7 = r3
}
{
r2 += sfmpy(r3,r2):lib
r3 = r7
r6 = r5
r0 = and(r6,r5)
}
{
r3 -= sfmpy(r4,r2):lib
r0 += sfmpy(r5,r2):lib
}
{
r2 += sfmpy(r3,r2):lib
r6 -= sfmpy(r0,r4):lib
}
{
r0 += sfmpy(r6,r2):lib
}
{
r5 -= sfmpy(r0,r4):lib
}
{
r0 += sfmpy(r5,r2,p0):scale
jumpr r31
}
FUNCTION_END __hexagon_divsf3
Q6_ALIAS(divsf3)
FAST_ALIAS(divsf3)
FAST2_ALIAS(divsf3)

@ -0,0 +1,82 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
.macro FUNCTION_BEGIN name
.text
.p2align 5
.globl \name
.type \name, @function
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
#define RIN r0
#define S r0
#define H r1
#define D r2
#define E r3
#define HALF r4
#define R r5
#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
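// Editor's note (not from the source): a rough C model of the refinement
// below. sfinvsqrta supplies the initial 1/sqrt estimate; the sketch uses a
// placeholder estimate instead and leaves out the sfclass/:scale handling of
// zero, negative and denormal inputs.
//
//   float sqrtf_model(float r) {
//     float e = 1.0f / __builtin_sqrtf(r); // stand-in for the sfinvsqrta estimate
//     float s = e * r;                     // s ~ sqrt(r)
//     float h = 0.5f * e;                  // h ~ 0.5/sqrt(r)
//     float d = 0.5f - s * h;              // error term
//     s += s * d;                          // refine the root estimate
//     h += h * d;                          // refine the half-reciprocal estimate
//     d = r - s * s;                       // residual against the input
//     return s + h * d;                    // final corrected result
//   }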
FUNCTION_BEGIN __hexagon_sqrtf
{
E,p0 = sfinvsqrta(RIN)
R = sffixupr(RIN)
HALF = ##0x3f000000 // 0.5
r1:0 = combine(#0,#0) // clear S/H
}
{
S += sfmpy(E,R):lib // S0
H += sfmpy(E,HALF):lib // H0
D = HALF
E = R
}
{
D -= sfmpy(S,H):lib // d0
p1 = sfclass(R,#1) // is zero?
//E -= sfmpy(S,S):lib // e0
}
{
S += sfmpy(S,D):lib // S1
H += sfmpy(H,D):lib // H1
D = HALF
E = R
}
{
D -= sfmpy(S,H):lib // d0
E -= sfmpy(S,S):lib // e0
}
{
S += sfmpy(H,E):lib // S2
H += sfmpy(H,D):lib // H2
D = HALF
E = R
}
{
//D -= sfmpy(S,H):lib // d2
E -= sfmpy(S,S):lib // e2
if (p1) r0 = or(r0,R) // sqrt(-0.0) = -0.0
}
{
S += sfmpy(H,E,p0):scale // S3
jumpr r31
}
FUNCTION_END __hexagon_sqrtf
Q6_ALIAS(sqrtf)
FAST_ALIAS(sqrtf)
FAST2_ALIAS(sqrtf)

@ -0,0 +1,71 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
.macro FUNCTION_BEGIN name
.text
.p2align 5
.globl \name
.type \name, @function
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
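// Editor's sketch (not part of the original file): a C model of the
// shift-and-subtract (restoring) division implemented below. The helper name
// is illustrative and __builtin_clzll is the GCC/Clang intrinsic.
//
//   unsigned long long udivdi3_model(unsigned long long num,
//                                    unsigned long long den) {
//     if (den > num)
//       return 0;                              // divisor > dividend: quotient 0
//     int shift = __builtin_clzll(den) - __builtin_clzll(num);
//     unsigned long long bit = 1ULL << shift;  // current quotient bit
//     unsigned long long quo = 0;
//     den <<= shift;                           // align divisor msb with dividend msb
//     for (int i = 0; i <= shift; ++i) {       // shift+1 iterations
//       if (den <= num) {
//         num -= den;                          // subtract the shifted divisor
//         quo |= bit;                          // record this quotient bit
//       }
//       den >>= 1;
//       bit >>= 1;
//     }
//     return quo;
//   }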
FUNCTION_BEGIN __hexagon_udivdi3
{
r6 = cl0(r1:0) // count leading 0's of dividend (numerator)
r7 = cl0(r3:2) // count leading 0's of divisor (denominator)
r5:4 = r3:2 // divisor moved into working registers
r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder
}
{
r10 = sub(r7,r6) // left shift count for bit & divisor
r1:0 = #0 // initialize quotient to 0
r15:14 = #1 // initialize bit to 1
}
{
r11 = add(r10,#1) // loop count is 1 more than shift count
r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb
r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor
}
{
p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend
loop0(1f,r11) // register loop
}
{
if (p0) jumpr r31 // if divisor > dividend, we're done, so return
}
.falign
1:
{
p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder
}
{
r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder
r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8)
}
{
r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8)
r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6)
}
{
r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration
r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration
}:endloop0
{
jumpr r31 // return
}
FUNCTION_END __hexagon_udivdi3
.globl __qdsp_udivdi3
.set __qdsp_udivdi3, __hexagon_udivdi3

@ -0,0 +1,71 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
.macro FUNCTION_BEGIN name
.text
.p2align 5
.globl \name
.type \name, @function
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
FUNCTION_BEGIN __hexagon_udivmoddi4
{
r6 = cl0(r1:0) // count leading 0's of dividend (numerator)
r7 = cl0(r3:2) // count leading 0's of divisor (denominator)
r5:4 = r3:2 // divisor moved into working registers
r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder
}
{
r10 = sub(r7,r6) // left shift count for bit & divisor
r1:0 = #0 // initialize quotient to 0
r15:14 = #1 // initialize bit to 1
}
{
r11 = add(r10,#1) // loop count is 1 more than shift count
r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb
r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor
}
{
p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend
loop0(1f,r11) // register loop
}
{
if (p0) jumpr r31 // if divisor > dividend, we're done, so return
}
.falign
1:
{
p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder
}
{
r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder
r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8)
}
{
r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8)
r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6)
}
{
r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration
r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration
}:endloop0
{
jumpr r31 // return
}
FUNCTION_END __hexagon_udivmoddi4
.globl __qdsp_udivmoddi4
.set __qdsp_udivmoddi4, __hexagon_udivmoddi4

@ -0,0 +1,60 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
.macro FUNCTION_BEGIN name
.text
.p2align 5
.globl \name
.type \name, @function
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
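// Editor's sketch (not from the source): a C model of the combined 32-bit
// divide/modulo below. The assembly returns the quotient in r0 and leaves the
// remainder in r1; the model uses an output parameter instead. The helper
// name is illustrative; __builtin_clz is the GCC/Clang intrinsic.
//
//   unsigned udivmodsi4_model(unsigned num, unsigned den, unsigned *rem) {
//     unsigned quo = 0;
//     if (den <= num) {
//       int shift = __builtin_clz(den) - __builtin_clz(num);
//       unsigned bit = 1u << shift;           // current quotient bit
//       den <<= shift;                        // align divisor with dividend
//       for (int i = 0; i <= shift; ++i) {    // shift+1 restoring steps
//         if (den <= num) {
//           num -= den;
//           quo |= bit;
//         }
//         den >>= 1;
//         bit >>= 1;
//       }
//     }
//     *rem = num;                             // whatever is left is the remainder
//     return quo;
//   }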
FUNCTION_BEGIN __hexagon_udivmodsi4
{
r2 = cl0(r0)
r3 = cl0(r1)
r5:4 = combine(#1,#0)
p0 = cmp.gtu(r1,r0)
}
{
r6 = sub(r3,r2)
r4 = r1
r1:0 = combine(r0,r4)
if (p0) jumpr r31
}
{
r3:2 = vlslw(r5:4,r6)
loop0(1f,r6)
p0 = cmp.eq(r6,#0)
if (p0.new) r4 = #0
}
.falign
1:
{
p0 = cmp.gtu(r2,r1)
if (!p0.new) r1 = sub(r1,r2)
if (!p0.new) r0 = add(r0,r3)
r3:2 = vlsrw(r3:2,#1)
}:endloop0
{
p0 = cmp.gtu(r2,r1)
if (!p0.new) r1 = sub(r1,r4)
if (!p0.new) r0 = add(r0,r3)
jumpr r31
}
FUNCTION_END __hexagon_udivmodsi4
.globl __qdsp_udivmodsi4
.set __qdsp_udivmodsi4, __hexagon_udivmodsi4

@ -0,0 +1,56 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
.macro FUNCTION_BEGIN name
.text
.p2align 5
.globl \name
.type \name, @function
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
FUNCTION_BEGIN __hexagon_udivsi3
{
r2 = cl0(r0)
r3 = cl0(r1)
r5:4 = combine(#1,#0)
p0 = cmp.gtu(r1,r0)
}
{
r6 = sub(r3,r2)
r4 = r1
r1:0 = combine(r0,r4)
if (p0) jumpr r31
}
{
r3:2 = vlslw(r5:4,r6)
loop0(1f,r6)
}
.falign
1:
{
p0 = cmp.gtu(r2,r1)
if (!p0.new) r1 = sub(r1,r2)
if (!p0.new) r0 = add(r0,r3)
r3:2 = vlsrw(r3:2,#1)
}:endloop0
{
p0 = cmp.gtu(r2,r1)
if (!p0.new) r0 = add(r0,r3)
jumpr r31
}
FUNCTION_END __hexagon_udivsi3
.globl __qdsp_udivsi3
.set __qdsp_udivsi3, __hexagon_udivsi3

@ -0,0 +1,74 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
.macro FUNCTION_BEGIN name
.text
.p2align 5
.globl \name
.type \name, @function
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
FUNCTION_BEGIN __hexagon_umoddi3
{
r6 = cl0(r1:0) // count leading 0's of dividend (numerator)
r7 = cl0(r3:2) // count leading 0's of divisor (denominator)
r5:4 = r3:2 // divisor moved into working registers
r3:2 = r1:0 // dividend is the initial remainder, r3:2 contains remainder
}
{
r10 = sub(r7,r6) // left shift count for bit & divisor
r1:0 = #0 // initialize quotient to 0
r15:14 = #1 // initialize bit to 1
}
{
r11 = add(r10,#1) // loop count is 1 more than shift count
r13:12 = lsl(r5:4,r10) // shift divisor msb into same bit position as dividend msb
r15:14 = lsl(r15:14,r10) // shift the bit left by same amount as divisor
}
{
p0 = cmp.gtu(r5:4,r3:2) // check if divisor > dividend
loop0(1f,r11) // register loop
}
{
if (p0) jump .hexagon_umoddi3_return // if divisor > dividend, we're done, so return
}
.falign
1:
{
p0 = cmp.gtu(r13:12,r3:2) // set predicate reg if shifted divisor > current remainder
}
{
r7:6 = sub(r3:2, r13:12) // subtract shifted divisor from current remainder
r9:8 = add(r1:0, r15:14) // save current quotient to temp (r9:8)
}
{
r1:0 = vmux(p0, r1:0, r9:8) // choose either current quotient or new quotient (r9:8)
r3:2 = vmux(p0, r3:2, r7:6) // choose either current remainder or new remainder (r7:6)
}
{
r15:14 = lsr(r15:14, #1) // shift bit right by 1 for next iteration
r13:12 = lsr(r13:12, #1) // shift "shifted divisor" right by 1 for next iteration
}:endloop0
.hexagon_umoddi3_return:
{
r1:0 = r3:2
jumpr r31
}
FUNCTION_END __hexagon_umoddi3
.globl __qdsp_umoddi3
.set __qdsp_umoddi3, __hexagon_umoddi3

@ -0,0 +1,55 @@
//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
.macro FUNCTION_BEGIN name
.text
.p2align 5
.globl \name
.type \name, @function
\name:
.endm
.macro FUNCTION_END name
.size \name, . - \name
.endm
FUNCTION_BEGIN __hexagon_umodsi3
{
r2 = cl0(r0)
r3 = cl0(r1)
p0 = cmp.gtu(r1,r0)
}
{
r2 = sub(r3,r2)
if (p0) jumpr r31
}
{
loop0(1f,r2)
p1 = cmp.eq(r2,#0)
r2 = lsl(r1,r2)
}
.falign
1:
{
p0 = cmp.gtu(r2,r0)
if (!p0.new) r0 = sub(r0,r2)
r2 = lsr(r2,#1)
if (p1) r1 = #0
}:endloop0
{
p0 = cmp.gtu(r2,r0)
if (!p0.new) r0 = sub(r0,r1)
jumpr r31
}
FUNCTION_END __hexagon_umodsi3
.globl __qdsp_umodsi3
.set __qdsp_umodsi3, __hexagon_umodsi3

@ -60,7 +60,7 @@ typedef union
}s;
} udwords;
#if (defined(__LP64__) || defined(__wasm__) || defined(__mips64))
#if (defined(__LP64__) || defined(__wasm__) || defined(__mips64)) || defined(__riscv)
#define CRT_HAS_128BIT
#endif

@ -16,8 +16,8 @@
#ifdef __APPLE__
#include <CoreFoundation/CoreFoundation.h>
#include <dispatch/dispatch.h>
#include <TargetConditionals.h>
#include <dispatch/dispatch.h>
#include <dlfcn.h>
#include <stdint.h>
#include <stdio.h>
@ -28,6 +28,26 @@
static int32_t GlobalMajor, GlobalMinor, GlobalSubminor;
static dispatch_once_t DispatchOnceCounter;
typedef CFDataRef (*CFDataCreateWithBytesNoCopyFuncTy)(CFAllocatorRef,
const UInt8 *, CFIndex,
CFAllocatorRef);
typedef CFPropertyListRef (*CFPropertyListCreateWithDataFuncTy)(
CFAllocatorRef, CFDataRef, CFOptionFlags, CFPropertyListFormat *,
CFErrorRef *);
typedef CFPropertyListRef (*CFPropertyListCreateFromXMLDataFuncTy)(
CFAllocatorRef, CFDataRef, CFOptionFlags, CFStringRef *);
typedef CFStringRef (*CFStringCreateWithCStringNoCopyFuncTy)(CFAllocatorRef,
const char *,
CFStringEncoding,
CFAllocatorRef);
typedef const void *(*CFDictionaryGetValueFuncTy)(CFDictionaryRef,
const void *);
typedef CFTypeID (*CFGetTypeIDFuncTy)(CFTypeRef);
typedef CFTypeID (*CFStringGetTypeIDFuncTy)(void);
typedef Boolean (*CFStringGetCStringFuncTy)(CFStringRef, char *, CFIndex,
CFStringEncoding);
typedef void (*CFReleaseFuncTy)(CFTypeRef);
/* Find and parse the SystemVersion.plist file. */
static void parseSystemVersionPList(void *Unused) {
(void)Unused;
@ -37,50 +57,49 @@ static void parseSystemVersionPList(void *Unused) {
return;
const CFAllocatorRef kCFAllocatorNull =
*(const CFAllocatorRef *)NullAllocator;
typeof(CFDataCreateWithBytesNoCopy) *CFDataCreateWithBytesNoCopyFunc =
(typeof(CFDataCreateWithBytesNoCopy) *)dlsym(
RTLD_DEFAULT, "CFDataCreateWithBytesNoCopy");
CFDataCreateWithBytesNoCopyFuncTy CFDataCreateWithBytesNoCopyFunc =
(CFDataCreateWithBytesNoCopyFuncTy)dlsym(RTLD_DEFAULT,
"CFDataCreateWithBytesNoCopy");
if (!CFDataCreateWithBytesNoCopyFunc)
return;
typeof(CFPropertyListCreateWithData) *CFPropertyListCreateWithDataFunc =
(typeof(CFPropertyListCreateWithData) *)dlsym(
CFPropertyListCreateWithDataFuncTy CFPropertyListCreateWithDataFunc =
(CFPropertyListCreateWithDataFuncTy)dlsym(
RTLD_DEFAULT, "CFPropertyListCreateWithData");
/* CFPropertyListCreateWithData was introduced only in macOS 10.6+, so it
* will be NULL on earlier OS versions. */
/* CFPropertyListCreateWithData was introduced only in macOS 10.6+, so it
* will be NULL on earlier OS versions. */
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
typeof(CFPropertyListCreateFromXMLData) *CFPropertyListCreateFromXMLDataFunc =
(typeof(CFPropertyListCreateFromXMLData) *)dlsym(
CFPropertyListCreateFromXMLDataFuncTy CFPropertyListCreateFromXMLDataFunc =
(CFPropertyListCreateFromXMLDataFuncTy)dlsym(
RTLD_DEFAULT, "CFPropertyListCreateFromXMLData");
#pragma clang diagnostic pop
/* CFPropertyListCreateFromXMLDataFunc is deprecated in macOS 10.10, so it
* might be NULL in future OS versions. */
if (!CFPropertyListCreateWithDataFunc && !CFPropertyListCreateFromXMLDataFunc)
return;
typeof(CFStringCreateWithCStringNoCopy) *CFStringCreateWithCStringNoCopyFunc =
(typeof(CFStringCreateWithCStringNoCopy) *)dlsym(
CFStringCreateWithCStringNoCopyFuncTy CFStringCreateWithCStringNoCopyFunc =
(CFStringCreateWithCStringNoCopyFuncTy)dlsym(
RTLD_DEFAULT, "CFStringCreateWithCStringNoCopy");
if (!CFStringCreateWithCStringNoCopyFunc)
return;
typeof(CFDictionaryGetValue) *CFDictionaryGetValueFunc =
(typeof(CFDictionaryGetValue) *)dlsym(RTLD_DEFAULT,
"CFDictionaryGetValue");
CFDictionaryGetValueFuncTy CFDictionaryGetValueFunc =
(CFDictionaryGetValueFuncTy)dlsym(RTLD_DEFAULT, "CFDictionaryGetValue");
if (!CFDictionaryGetValueFunc)
return;
typeof(CFGetTypeID) *CFGetTypeIDFunc =
(typeof(CFGetTypeID) *)dlsym(RTLD_DEFAULT, "CFGetTypeID");
CFGetTypeIDFuncTy CFGetTypeIDFunc =
(CFGetTypeIDFuncTy)dlsym(RTLD_DEFAULT, "CFGetTypeID");
if (!CFGetTypeIDFunc)
return;
typeof(CFStringGetTypeID) *CFStringGetTypeIDFunc =
(typeof(CFStringGetTypeID) *)dlsym(RTLD_DEFAULT, "CFStringGetTypeID");
CFStringGetTypeIDFuncTy CFStringGetTypeIDFunc =
(CFStringGetTypeIDFuncTy)dlsym(RTLD_DEFAULT, "CFStringGetTypeID");
if (!CFStringGetTypeIDFunc)
return;
typeof(CFStringGetCString) *CFStringGetCStringFunc =
(typeof(CFStringGetCString) *)dlsym(RTLD_DEFAULT, "CFStringGetCString");
CFStringGetCStringFuncTy CFStringGetCStringFunc =
(CFStringGetCStringFuncTy)dlsym(RTLD_DEFAULT, "CFStringGetCString");
if (!CFStringGetCStringFunc)
return;
typeof(CFRelease) *CFReleaseFunc =
(typeof(CFRelease) *)dlsym(RTLD_DEFAULT, "CFRelease");
CFReleaseFuncTy CFReleaseFunc =
(CFReleaseFuncTy)dlsym(RTLD_DEFAULT, "CFRelease");
if (!CFReleaseFunc)
return;
@ -163,10 +182,14 @@ int32_t __isOSVersionAtLeast(int32_t Major, int32_t Minor, int32_t Subminor) {
/* Populate the global version variables, if they haven't already. */
dispatch_once_f(&DispatchOnceCounter, NULL, parseSystemVersionPList);
if (Major < GlobalMajor) return 1;
if (Major > GlobalMajor) return 0;
if (Minor < GlobalMinor) return 1;
if (Minor > GlobalMinor) return 0;
if (Major < GlobalMajor)
return 1;
if (Major > GlobalMajor)
return 0;
if (Minor < GlobalMinor)
return 1;
if (Minor > GlobalMinor)
return 0;
return Subminor <= GlobalSubminor;
}

@ -0,0 +1,28 @@
//===--- mulsi3.S - Integer multiplication routines ---===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
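// Editor's sketch (not part of the source): a C model of the shift-and-add
// loop implemented below for RV32 cores without the M extension -- add the
// multiplicand whenever the low bit of the multiplier is set, then shift.
// The function name is illustrative only.
//
//   unsigned int mulsi3_model(unsigned int a, unsigned int b) {
//     unsigned int r = 0;
//     while (b != 0) {
//       if (b & 1)
//         r += a;          // accumulate when the current multiplier bit is set
//       b >>= 1;           // consume one multiplier bit
//       a <<= 1;           // weight the multiplicand for the next bit
//     }
//     return r;
//   }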
#if !defined(__riscv_mul) && __riscv_xlen == 32
.text
.align 2
.globl __mulsi3
.type __mulsi3, @function
__mulsi3:
mv a2, a0
mv a0, zero
.L1:
andi a3, a1, 1
beqz a3, .L2
add a0, a0, a2
.L2:
srli a1, a1, 1
slli a2, a2, 1
bnez a1, .L1
ret
#endif

@ -30,6 +30,8 @@ if(OS_NAME MATCHES "Linux")
OBJECT_LIBS RTInterception
RTSanitizerCommon
RTSanitizerCommonLibc
RTSanitizerCommonCoverage
RTSanitizerCommonSymbolizer
RTUbsan
CFLAGS ${CFI_CFLAGS} ${CFI_DIAG_CFLAGS}
PARENT_TARGET cfi)

@ -132,7 +132,11 @@ void ShadowBuilder::Start() {
void ShadowBuilder::AddUnchecked(uptr begin, uptr end) {
uint16_t *shadow_begin = MemToShadow(begin, shadow_);
uint16_t *shadow_end = MemToShadow(end - 1, shadow_) + 1;
memset(shadow_begin, kUncheckedShadow,
// memset takes a byte, so our unchecked shadow value requires both bytes to
// be the same. Make sure we're ok during compilation.
static_assert((kUncheckedShadow & 0xff) == ((kUncheckedShadow >> 8) & 0xff),
"Both bytes of the 16-bit value must be the same!");
memset(shadow_begin, kUncheckedShadow & 0xff,
(shadow_end - shadow_begin) * sizeof(*shadow_begin));
}
@ -379,6 +383,8 @@ __cfi_slowpath_diag(u64 CallSiteTypeId, void *Ptr, void *DiagData) {
}
#endif
static void EnsureInterceptorsInitialized();
// Setup shadow for dlopen()ed libraries.
// The actual shadow setup happens after dlopen() returns, which means that
// a library can not be a target of any CFI checks while its constructors are
@ -388,6 +394,7 @@ __cfi_slowpath_diag(u64 CallSiteTypeId, void *Ptr, void *DiagData) {
// We could insert a high-priority constructor into the library, but that would
// not help with the uninstrumented libraries.
INTERCEPTOR(void*, dlopen, const char *filename, int flag) {
EnsureInterceptorsInitialized();
EnterLoader();
void *handle = REAL(dlopen)(filename, flag);
ExitLoader();
@ -395,12 +402,27 @@ INTERCEPTOR(void*, dlopen, const char *filename, int flag) {
}
INTERCEPTOR(int, dlclose, void *handle) {
EnsureInterceptorsInitialized();
EnterLoader();
int res = REAL(dlclose)(handle);
ExitLoader();
return res;
}
static BlockingMutex interceptor_init_lock(LINKER_INITIALIZED);
static bool interceptors_inited = false;
static void EnsureInterceptorsInitialized() {
BlockingMutexLock lock(&interceptor_init_lock);
if (interceptors_inited)
return;
INTERCEPT_FUNCTION(dlopen);
INTERCEPT_FUNCTION(dlclose);
interceptors_inited = true;
}
extern "C" SANITIZER_INTERFACE_ATTRIBUTE
#if !SANITIZER_CAN_USE_PREINIT_ARRAY
// On ELF platforms, the constructor is invoked using .preinit_array (see below)
@ -411,9 +433,6 @@ void __cfi_init() {
InitializeFlags();
InitShadow();
INTERCEPT_FUNCTION(dlopen);
INTERCEPT_FUNCTION(dlclose);
#ifdef CFI_ENABLE_DIAG
__ubsan::InitAsPlugin();
#endif
