Import llvm 3.7.0 release (r246257).

Dimitry Andric 2015-09-06 18:34:38 +00:00
parent ee8648bdac
commit 69156b4c20
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/vendor/llvm/dist/; revision=287510
svn path=/vendor/llvm/llvm-release_370-r246257/; revision=287511; tag=vendor/llvm/llvm-release_370-r246257
182 changed files with 3775 additions and 1316 deletions

View File

@ -61,7 +61,7 @@ set(CMAKE_MODULE_PATH
set(LLVM_VERSION_MAJOR 3)
set(LLVM_VERSION_MINOR 7)
set(LLVM_VERSION_PATCH 0)
set(LLVM_VERSION_SUFFIX svn)
set(LLVM_VERSION_SUFFIX "")
if (NOT PACKAGE_VERSION)
set(PACKAGE_VERSION
@ -518,7 +518,7 @@ if (APPLE)
else(UNIX)
if(NOT DEFINED CMAKE_INSTALL_RPATH)
set(CMAKE_INSTALL_RPATH "\$ORIGIN/../lib${LLVM_LIBDIR_SUFFIX}")
if (${CMAKE_SYSTEM_NAME} MATCHES FreeBSD)
if(${CMAKE_SYSTEM_NAME} MATCHES "(FreeBSD|DragonFly)")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-z,origin")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,origin")
endif()
@ -544,12 +544,12 @@ if(LLVM_USE_HOST_TOOLS)
include(CrossCompile)
endif(LLVM_USE_HOST_TOOLS)
if( ${CMAKE_SYSTEM_NAME} MATCHES FreeBSD )
if(${CMAKE_SYSTEM_NAME} MATCHES "(FreeBSD|DragonFly)")
# On FreeBSD, /usr/local/* is not used by default. In order to build LLVM
# with libxml2, iconv.h, etc., we must add /usr/local paths.
include_directories("/usr/local/include")
link_directories("/usr/local/lib")
endif( ${CMAKE_SYSTEM_NAME} MATCHES FreeBSD )
endif(${CMAKE_SYSTEM_NAME} MATCHES "(FreeBSD|DragonFly)")
if( ${CMAKE_SYSTEM_NAME} MATCHES SunOS )
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -include llvm/Support/Solaris.h")

View File

@ -465,3 +465,47 @@ N: Bob Wilson
E: bob.wilson@acm.org
D: Advanced SIMD (NEON) support in the ARM backend.
N: Alexey Bataev
E: a.bataev@hotmail.com
D: Clang OpenMP implementation
N: Andrey Bokhanko
E: andreybokhanko@gmail.com
D: Clang OpenMP implementation
N: Carlo Bertolli
E: cbertol@us.ibm.com
D: Clang OpenMP implementation
N: Eric Stotzer
E: estotzer@ti.com
D: Clang OpenMP implementation
N: Kelvin Li
E: kkwli0@gmail.com
D: Clang OpenMP implementation
N: Samuel Antao
E: sfantao@us.ibm.com
D: Clang OpenMP implementation
N: Sergey Ostanevich
E: sergos.gnu@gmail.com
D: Clang OpenMP implementation
N: Alexandre Eichenberger
E: alexe@us.ibm.com
D: Clang OpenMP implementation
N: Guansong Zhang
E: guansong.zhang@amd.com
D: Clang OpenMP implementation
N: Sunita Chandrasekaran
E: sunisg123@gmail.com
D: Clang OpenMP implementation
N: Michael Wong
E: fraggamuffin@gmail.com
D: Clang OpenMP implementation

View File

@ -58,7 +58,7 @@ LLVM_OBJ_ROOT := $(call realpath, @abs_top_builddir@)
PROJ_SRC_ROOT := $(LLVM_SRC_ROOT)
PROJ_SRC_DIR := $(LLVM_SRC_ROOT)$(patsubst $(PROJ_OBJ_ROOT)%,%,$(PROJ_OBJ_DIR))
# See: http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20150323/268067.html
# See: http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20150323/268067.html
ifeq ($(LLVM_SRC_ROOT), $(LLVM_OBJ_ROOT))
$(error In-source builds are not allowed. Please configure from a separate build directory!)
endif

View File

@ -32,12 +32,12 @@ dnl===-----------------------------------------------------------------------===
dnl Initialize autoconf and define the package name, version number and
dnl address for reporting bugs.
AC_INIT([LLVM],[3.7.0svn],[http://llvm.org/bugs/])
AC_INIT([LLVM],[3.7.0],[http://llvm.org/bugs/])
LLVM_VERSION_MAJOR=3
LLVM_VERSION_MINOR=7
LLVM_VERSION_PATCH=0
LLVM_VERSION_SUFFIX=svn
LLVM_VERSION_SUFFIX=
AC_DEFINE_UNQUOTED([LLVM_VERSION_MAJOR], $LLVM_VERSION_MAJOR, [Major version of the LLVM API])
AC_DEFINE_UNQUOTED([LLVM_VERSION_MINOR], $LLVM_VERSION_MINOR, [Minor version of the LLVM API])

View File

@ -131,7 +131,7 @@ endif()
# Pass -Wl,-z,defs. This makes sure all symbols are defined. Otherwise a DSO
# build might work on ELF but fail on MachO/COFF.
if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin" OR WIN32 OR
if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin" OR WIN32 OR CYGWIN OR
${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") AND
NOT LLVM_USE_SANITIZER)
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,defs")

configure (vendored)
View File

@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.60 for LLVM 3.7.0svn.
# Generated by GNU Autoconf 2.60 for LLVM 3.7.0.
#
# Report bugs to <http://llvm.org/bugs/>.
#
@ -561,8 +561,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
# Identity of this package.
PACKAGE_NAME='LLVM'
PACKAGE_TARNAME='llvm'
PACKAGE_VERSION='3.7.0svn'
PACKAGE_STRING='LLVM 3.7.0svn'
PACKAGE_VERSION='3.7.0'
PACKAGE_STRING='LLVM 3.7.0'
PACKAGE_BUGREPORT='http://llvm.org/bugs/'
ac_unique_file="lib/IR/Module.cpp"
@ -1333,7 +1333,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures LLVM 3.7.0svn to adapt to many kinds of systems.
\`configure' configures LLVM 3.7.0 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@ -1399,7 +1399,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of LLVM 3.7.0svn:";;
short | recursive ) echo "Configuration of LLVM 3.7.0:";;
esac
cat <<\_ACEOF
@ -1583,7 +1583,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
LLVM configure 3.7.0svn
LLVM configure 3.7.0
generated by GNU Autoconf 2.60
Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@ -1599,7 +1599,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by LLVM $as_me 3.7.0svn, which was
It was created by LLVM $as_me 3.7.0, which was
generated by GNU Autoconf 2.60. Invocation command line was
$ $0 $@
@ -1956,7 +1956,7 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
LLVM_VERSION_MAJOR=3
LLVM_VERSION_MINOR=7
LLVM_VERSION_PATCH=0
LLVM_VERSION_SUFFIX=svn
LLVM_VERSION_SUFFIX=
cat >>confdefs.h <<_ACEOF
@ -18610,7 +18610,7 @@ exec 6>&1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by LLVM $as_me 3.7.0svn, which was
This file was extended by LLVM $as_me 3.7.0, which was
generated by GNU Autoconf 2.60. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@ -18663,7 +18663,7 @@ Report bugs to <bug-autoconf@gnu.org>."
_ACEOF
cat >>$CONFIG_STATUS <<_ACEOF
ac_cs_version="\\
LLVM config.status 3.7.0svn
LLVM config.status 3.7.0
configured by $0, generated by GNU Autoconf 2.60,
with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"

View File

@ -173,7 +173,7 @@ Notes for code generation
also expected to generate an i8 store as an i8 store, and not an instruction
which writes to surrounding bytes. (If you are writing a backend for an
architecture which cannot satisfy these restrictions and cares about
concurrency, please send an email to llvmdev.)
concurrency, please send an email to llvm-dev.)
Unordered
---------

View File

@ -387,6 +387,10 @@ LLVM-specific variables
``-DLLVM_ENABLE_DOXYGEN_QT_HELP=ON``; otherwise this has no
effect.
**LLVM_DOXYGEN_SVG**:BOOL
Uses .svg files instead of .png files for graphs in the Doxygen output.
Defaults to OFF.
**LLVM_ENABLE_SPHINX**:BOOL
If enabled CMake will search for the ``sphinx-build`` executable and will make
the ``SPHINX_OUTPUT_HTML`` and ``SPHINX_OUTPUT_MAN`` CMake options available.

View File

@ -56,6 +56,14 @@ if (LLVM_ENABLE_DOXYGEN)
set(llvm_doxygen_qhp_cust_filter_attrs "")
endif()
option(LLVM_DOXYGEN_SVG
"Use svg instead of png files for doxygen graphs." OFF)
if (LLVM_DOXYGEN_SVG)
set(DOT_IMAGE_FORMAT "svg")
else()
set(DOT_IMAGE_FORMAT "png")
endif()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/doxygen.cfg.in
${CMAKE_CURRENT_BINARY_DIR}/doxygen.cfg @ONLY)
@ -73,6 +81,7 @@ if (LLVM_ENABLE_DOXYGEN)
set(llvm_doxygen_qhelpgenerator_path)
set(llvm_doxygen_qhp_cust_filter_name)
set(llvm_doxygen_qhp_cust_filter_attrs)
set(DOT_IMAGE_FORMAT)
add_custom_target(doxygen-llvm
COMMAND ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/doxygen.cfg

View File

@ -1814,6 +1814,7 @@ Here is the table:
:raw-html:`<th>SystemZ</th>`
:raw-html:`<th>X86</th>`
:raw-html:`<th>XCore</th>`
:raw-html:`<th>eBPF</th>`
:raw-html:`</tr>`
:raw-html:`<tr>`
@ -1828,6 +1829,7 @@ Here is the table:
:raw-html:`<td class="yes"></td> <!-- SystemZ -->`
:raw-html:`<td class="yes"></td> <!-- X86 -->`
:raw-html:`<td class="yes"></td> <!-- XCore -->`
:raw-html:`<td class="yes"></td> <!-- eBPF -->`
:raw-html:`</tr>`
:raw-html:`<tr>`
@ -1842,6 +1844,7 @@ Here is the table:
:raw-html:`<td class="yes"></td> <!-- SystemZ -->`
:raw-html:`<td class="yes"></td> <!-- X86 -->`
:raw-html:`<td class="no"></td> <!-- XCore -->`
:raw-html:`<td class="no"></td> <!-- eBPF -->`
:raw-html:`</tr>`
:raw-html:`<tr>`
@ -1856,6 +1859,7 @@ Here is the table:
:raw-html:`<td class="no"></td> <!-- Sparc -->`
:raw-html:`<td class="yes"></td> <!-- X86 -->`
:raw-html:`<td class="yes"></td> <!-- XCore -->`
:raw-html:`<td class="yes"></td> <!-- eBPF -->`
:raw-html:`</tr>`
:raw-html:`<tr>`
@ -1870,6 +1874,7 @@ Here is the table:
:raw-html:`<td class="yes"></td> <!-- SystemZ -->`
:raw-html:`<td class="yes"></td> <!-- X86 -->`
:raw-html:`<td class="yes"></td> <!-- XCore -->`
:raw-html:`<td class="no"></td> <!-- eBPF -->`
:raw-html:`</tr>`
:raw-html:`<tr>`
@ -1884,6 +1889,7 @@ Here is the table:
:raw-html:`<td class="yes"></td> <!-- SystemZ -->`
:raw-html:`<td class="yes"></td> <!-- X86 -->`
:raw-html:`<td class="no"></td> <!-- XCore -->`
:raw-html:`<td class="yes"></td> <!-- eBPF -->`
:raw-html:`</tr>`
:raw-html:`<tr>`
@ -1898,6 +1904,7 @@ Here is the table:
:raw-html:`<td class="yes"></td> <!-- SystemZ -->`
:raw-html:`<td class="yes"></td> <!-- X86 -->`
:raw-html:`<td class="no"></td> <!-- XCore -->`
:raw-html:`<td class="yes"></td> <!-- eBPF -->`
:raw-html:`</tr>`
:raw-html:`<tr>`
@ -1912,6 +1919,7 @@ Here is the table:
:raw-html:`<td class="no"></td> <!-- SystemZ -->`
:raw-html:`<td class="yes"></td> <!-- X86 -->`
:raw-html:`<td class="no"></td> <!-- XCore -->`
:raw-html:`<td class="no"></td> <!-- eBPF -->`
:raw-html:`</tr>`
:raw-html:`<tr>`
@ -1926,6 +1934,7 @@ Here is the table:
:raw-html:`<td class="no"></td> <!-- SystemZ -->`
:raw-html:`<td class="partial"><a href="#feat_segstacks_x86">*</a></td> <!-- X86 -->`
:raw-html:`<td class="no"></td> <!-- XCore -->`
:raw-html:`<td class="no"></td> <!-- eBPF -->`
:raw-html:`</tr>`
:raw-html:`</table>`
@ -2448,3 +2457,191 @@ Code Generator Options:
:raw-html:`</tr>`
:raw-html:`</table>`
The extended Berkeley Packet Filter (eBPF) backend
--------------------------------------------------
Extended BPF (or eBPF) is similar to the original ("classic") BPF (cBPF) used
to filter network packets. The
`bpf() system call <http://man7.org/linux/man-pages/man2/bpf.2.html>`_
performs a range of operations related to eBPF. For both cBPF and eBPF
programs, the Linux kernel statically analyzes the programs before loading
them, in order to ensure that they cannot harm the running system. eBPF is
a 64-bit RISC instruction set designed for one-to-one mapping to 64-bit CPUs.
Opcodes are 8-bit encoded, and 87 instructions are defined. There are 10
registers, grouped by function as outlined below.
::
R0 return value from in-kernel functions; exit value for eBPF program
R1 - R5 function call arguments to in-kernel functions
R6 - R9 callee-saved registers preserved by in-kernel functions
R10 stack frame pointer (read only)
Instruction encoding (arithmetic and jump)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
eBPF reuses most of the opcode encoding from classic BPF to simplify the
conversion of classic BPF programs to eBPF. For arithmetic and jump
instructions the 8-bit 'code' field is divided into three parts:
::
+----------------+--------+--------------------+
| 4 bits | 1 bit | 3 bits |
| operation code | source | instruction class |
+----------------+--------+--------------------+
(MSB) (LSB)
The three LSB bits store the instruction class, which is one of:
::
BPF_LD 0x0
BPF_LDX 0x1
BPF_ST 0x2
BPF_STX 0x3
BPF_ALU 0x4
BPF_JMP 0x5
(unused) 0x6
BPF_ALU64 0x7
When BPF_CLASS(code) is BPF_ALU, BPF_ALU64, or BPF_JMP,
the 4th bit encodes the source operand:
::
BPF_K 0x0 use 32-bit immediate as source operand
BPF_X 0x1 use src_reg register as source operand
and the four MSB bits store the operation code:
::
BPF_ADD 0x0 add
BPF_SUB 0x1 subtract
BPF_MUL 0x2 multiply
BPF_DIV 0x3 divide
BPF_OR 0x4 bitwise logical OR
BPF_AND 0x5 bitwise logical AND
BPF_LSH 0x6 left shift
BPF_RSH 0x7 right shift (zero extended)
BPF_NEG 0x8 arithmetic negation
BPF_MOD 0x9 modulo
BPF_XOR 0xa bitwise logical XOR
BPF_MOV 0xb move register to register
BPF_ARSH 0xc right shift (sign extended)
BPF_END 0xd endianness conversion
If BPF_CLASS(code) is BPF_JMP, BPF_OP(code) is one of:
::
BPF_JA 0x0 unconditional jump
BPF_JEQ 0x1 jump ==
BPF_JGT 0x2 jump >
BPF_JGE 0x3 jump >=
BPF_JSET 0x4 jump if (DST & SRC)
BPF_JNE 0x5 jump !=
BPF_JSGT 0x6 jump signed >
BPF_JSGE 0x7 jump signed >=
BPF_CALL 0x8 function call
BPF_EXIT 0x9 function return
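To make the bit layout above concrete, here is a minimal sketch (an
illustration only, not part of the original LLVM documentation) that unpacks
an arithmetic/jump opcode into its three fields:

::

    // Sketch: split the 8-bit 'code' field of an ALU/ALU64/JMP instruction
    // into operation (4 MSB), source bit, and instruction class (3 LSB).
    #include <cstdio>

    int main() {
      unsigned code = 0xbf;              // op 0xb (BPF_MOV), source bit set,
                                         // class 0x7 (BPF_ALU64)
      unsigned cls = code & 0x07;        // 3 LSB: instruction class
      unsigned src = (code >> 3) & 0x1;  // 1 bit: source operand selector
      unsigned op  = (code >> 4) & 0xf;  // 4 MSB: operation code
      std::printf("op=0x%x src=%u class=0x%x\n", op, src, cls);
      return 0;
    }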
Instruction encoding (load, store)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
For load and store instructions the 8-bit 'code' field is divided as:
::
+--------+--------+-------------------+
| 3 bits | 2 bits | 3 bits |
| mode | size | instruction class |
+--------+--------+-------------------+
(MSB) (LSB)
The size modifier is one of:
::
BPF_W 0x0 word
BPF_H 0x1 half word
BPF_B 0x2 byte
BPF_DW 0x3 double word
The mode modifier is one of:
::
BPF_IMM 0x0 immediate
BPF_ABS 0x1 used to access packet data
BPF_IND 0x2 used to access packet data
BPF_MEM 0x3 memory
(reserved) 0x4
(reserved) 0x5
BPF_XADD 0x6 exclusive add
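Composing a load/store opcode from these fields works the same way; a short
sketch (illustration only):

::

    // Sketch: assemble a load/store opcode from mode (3 MSB), size (2 bits),
    // and instruction class (3 LSB).
    unsigned makeLoadStoreOpcode(unsigned mode, unsigned size, unsigned cls) {
      return (mode << 5) | (size << 3) | cls;
    }
    // makeLoadStoreOpcode(0x3 /*BPF_MEM*/, 0x0 /*BPF_W*/, 0x1 /*BPF_LDX*/)
    // yields 0x61, i.e. BPF_LDX | BPF_MEM | BPF_W: load a word from memory.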
Packet data access (BPF_ABS, BPF_IND)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Two non-generic instructions, (BPF_ABS | <size> | BPF_LD) and
(BPF_IND | <size> | BPF_LD), are used to access packet data.
Register R6 is an implicit input that must contain a pointer to sk_buff.
Register R0 is an implicit output which contains the data fetched
from the packet. Registers R1-R5 are scratch registers and must not
be used to store data across BPF_ABS | BPF_LD or BPF_IND | BPF_LD
instructions. These instructions also have an implicit program exit
condition: when an eBPF program tries to access data beyond the packet
boundary, the interpreter aborts execution of the program.
BPF_IND | BPF_W | BPF_LD is equivalent to:
R0 = ntohl(\*(u32 \*) (((struct sk_buff \*) R6)->data + src_reg + imm32))
eBPF maps
^^^^^^^^^
eBPF maps are provided for sharing data between kernel and user-space.
Currently implemented types are hash and array, with potential extension to
support bloom filters, radix trees, etc. A map is defined by its type,
maximum number of elements, key size and value size in bytes. The eBPF
syscall supports create, update, find, and delete operations on maps.
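As an illustration of those map parameters, the following sketch creates a
hash map through bpf(2) (Linux UAPI headers assumed; error handling omitted):

::

    // Sketch: create an eBPF hash map via the bpf(2) syscall. A map is
    // defined by its type, key size, value size, and maximum element count.
    #include <linux/bpf.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <cstring>

    int createHashMap() {
      union bpf_attr attr;
      std::memset(&attr, 0, sizeof(attr));
      attr.map_type    = BPF_MAP_TYPE_HASH;
      attr.key_size    = sizeof(int);
      attr.value_size  = sizeof(long long);
      attr.max_entries = 256;
      return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
    }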
Function calls
^^^^^^^^^^^^^^
Function call arguments are passed using up to five registers (R1 - R5).
The return value is passed in a dedicated register (R0). Four additional
registers (R6 - R9) are callee-saved, and the values in these registers
are preserved within kernel functions. R0 - R5 are scratch registers within
kernel functions, and eBPF programs must therefore store/restore values in
these registers if needed across function calls. The stack can be accessed
using the read-only frame pointer R10. eBPF registers map 1:1 to hardware
registers on x86_64 and other 64-bit architectures. For example, x86_64
in-kernel JIT maps them as
::
R0 - rax
R1 - rdi
R2 - rsi
R3 - rdx
R4 - rcx
R5 - r8
R6 - rbx
R7 - r13
R8 - r14
R9 - r15
R10 - rbp
since the x86_64 ABI mandates rdi, rsi, rdx, rcx, r8, r9 for argument passing,
and rbx, r12 - r15 are callee-saved.
Program start
^^^^^^^^^^^^^
An eBPF program receives a single argument and contains
a single eBPF main routine; the program does not contain eBPF functions.
Function calls are limited to a predefined set of kernel functions. The size
of a program is limited to 4K instructions: this ensures fast termination and
a limited number of kernel function calls. Prior to running an eBPF program,
a verifier performs static analysis to prevent loops in the code and
to ensure valid register usage and operand types.
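Loading a program for verification and execution likewise goes through
bpf(2); a hedged sketch along the lines of the kernel samples (field names
from the Linux UAPI headers, error handling omitted):

::

    // Sketch: submit eBPF bytecode to the kernel verifier via bpf(2).
    // 'insns' would come from an eBPF assembler or the LLVM BPF backend.
    #include <linux/bpf.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <cstring>

    int loadProgram(const struct bpf_insn *insns, unsigned count) {
      union bpf_attr attr;
      std::memset(&attr, 0, sizeof(attr));
      attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
      attr.insns     = (unsigned long)insns;        // pointer passed as u64
      attr.insn_cnt  = count;
      attr.license   = (unsigned long)"GPL";
      return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
    }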

View File

@ -28,7 +28,7 @@ Note that some code bases (e.g. ``libc++``) have really good reasons to deviate
from the coding standards. In the case of ``libc++``, this is because the
naming and other conventions are dictated by the C++ standard. If you think
there is a specific good reason to deviate from the standards here, please bring
it up on the LLVMdev mailing list.
it up on the LLVM-dev mailing list.
There are some conventions that are not uniformly followed in the code base
(e.g. the naming convention). This is because they are relatively new, and a

View File

@ -30,7 +30,7 @@ This policy is also designed to accomplish the following objectives:
This policy is aimed at frequent contributors to LLVM. People interested in
contributing one-off patches can do so in an informal way by sending them to the
`llvm-commits mailing list
<http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits>`_ and engaging another
<http://lists.llvm.org/mailman/listinfo/llvm-commits>`_ and engaging another
developer to see it through the process.
Developer Policies
@ -47,23 +47,23 @@ Stay Informed
-------------
Developers should stay informed by reading at least the "dev" mailing list for
the projects you are interested in, such as `llvmdev
<http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ for LLVM, `cfe-dev
<http://lists.cs.uiuc.edu/mailman/listinfo/cfe-dev>`_ for Clang, or `lldb-dev
<http://lists.cs.uiuc.edu/mailman/listinfo/lldb-dev>`_ for LLDB. If you are
the projects you are interested in, such as `llvm-dev
<http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ for LLVM, `cfe-dev
<http://lists.llvm.org/mailman/listinfo/cfe-dev>`_ for Clang, or `lldb-dev
<http://lists.llvm.org/mailman/listinfo/lldb-dev>`_ for LLDB. If you are
doing anything more than just casual work on LLVM, it is suggested that you also
subscribe to the "commits" mailing list for the subproject you're interested in,
such as `llvm-commits
<http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits>`_, `cfe-commits
<http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits>`_, or `lldb-commits
<http://lists.cs.uiuc.edu/mailman/listinfo/lldb-commits>`_. Reading the
<http://lists.llvm.org/mailman/listinfo/llvm-commits>`_, `cfe-commits
<http://lists.llvm.org/mailman/listinfo/cfe-commits>`_, or `lldb-commits
<http://lists.llvm.org/mailman/listinfo/lldb-commits>`_. Reading the
"commits" list and paying attention to changes being made by others is a good
way to see what other people are interested in and watching the flow of the
project as a whole.
We recommend that active developers register an email account with `LLVM
Bugzilla <http://llvm.org/bugs/>`_ and preferably subscribe to the `llvm-bugs
<http://lists.cs.uiuc.edu/mailman/listinfo/llvmbugs>`_ email list to keep track
<http://lists.llvm.org/mailman/listinfo/llvm-bugs>`_ email list to keep track
of bugs and enhancements occurring in LLVM. We really appreciate people who are
proactive at catching incoming bugs in their components and dealing with them
promptly.
@ -365,7 +365,7 @@ If you have recently been granted commit access, these policies apply:
#. You are granted *commit-after-approval* to all parts of LLVM. To get
approval, submit a `patch`_ to `llvm-commits
<http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits>`_. When approved,
<http://lists.llvm.org/mailman/listinfo/llvm-commits>`_. When approved,
you may commit it yourself.
#. You are allowed to commit patches without approval which you think are
@ -394,8 +394,8 @@ Making a Major Change
---------------------
When a developer begins a major new project with the aim of contributing it back
to LLVM, they should inform the community with an email to the `llvmdev
<http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ email list, to the extent
to LLVM, they should inform the community with an email to the `llvm-dev
<http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ email list, to the extent
possible. The reason for this is to:
#. keep the community informed about future changes to LLVM,
@ -608,7 +608,7 @@ LICENSE.txt files specifically indicate that they contain GPL code.
We have no plans to change the license of LLVM. If you have questions or
comments about the license, please contact the `LLVM Developer's Mailing
List <mailto:llvmdev@cs.uiuc.edu>`_.
List <mailto:llvm-dev@lists.llvm.org>`_.
Patents
-------

View File

@ -15,7 +15,7 @@ When you come to this realization, stop and think. Do you really need to extend
LLVM? Is it a new fundamental capability that LLVM does not support at its
current incarnation or can it be synthesized from already pre-existing LLVM
elements? If you are not sure, ask on the `LLVM-dev
<http://mail.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ list. The reason is that
<http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ list. The reason is that
extending LLVM will get involved as you need to update all the different passes
that you intend to use with your extension, and there are ``many`` LLVM analyses
and transformations, so it may be quite a bit of work.

View File

@ -174,10 +174,10 @@ Adding to this document
If you run across a case that you feel deserves to be covered here, please send
a patch to `llvm-commits
<http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits>`_ for review.
<http://lists.llvm.org/mailman/listinfo/llvm-commits>`_ for review.
If you have questions on these items, please direct them to `llvmdev
<http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_. The more relevant
If you have questions on these items, please direct them to `llvm-dev
<http://lists.llvm.org/mailman/listinfo/llvm-dev>`_. The more relevant
context you are able to give to your question, the more likely it is to be
answered.

View File

@ -714,9 +714,9 @@ used by people developing LLVM.
| | the configure script. The default list is defined |
| | as ``LLVM_ALL_TARGETS``, and can be set to include |
| | out-of-tree targets. The default value includes: |
| | ``AArch64, ARM, CppBackend, Hexagon, |
| | Mips, MSP430, NVPTX, PowerPC, AMDGPU, Sparc, |
| | SystemZ, X86, XCore``. |
| | ``AArch64, AMDGPU, ARM, BPF, CppBackend, Hexagon, |
| | Mips, MSP430, NVPTX, PowerPC, Sparc, SystemZ, |
| | X86, XCore``. |
+-------------------------+----------------------------------------------------+
| LLVM_ENABLE_DOXYGEN | Build doxygen-based documentation from the source |
| | code. This is disabled by default because it is |

View File

@ -6493,7 +6493,7 @@ Example:
%ptr = alloca i32 ; yields i32*:ptr
store i32 3, i32* %ptr ; yields void
%val = load i32* %ptr ; yields i32:val = i32 3
%val = load i32, i32* %ptr ; yields i32:val = i32 3
.. _i_fence:

View File

@ -31,6 +31,7 @@ $(PROJ_OBJ_DIR)/doxygen.cfg: doxygen.cfg.in
-e 's/@llvm_doxygen_qhp_cust_filter_name@//g' \
-e 's/@llvm_doxygen_qhp_namespace@//g' \
-e 's/@searchengine_url@//g' \
-e 's/@DOT_IMAGE_FORMAT@/png/g' \
> $@
endif

View File

@ -150,7 +150,7 @@ Status
Please let us know whether you like it and what could be improved! We're still
working on setting up a bug tracker, but you can email klimek-at-google-dot-com
and chandlerc-at-gmail-dot-com and CC the llvmdev mailing list with questions
and chandlerc-at-gmail-dot-com and CC the llvm-dev mailing list with questions
until then. We also could use help implementing improvements. This sadly is
really painful and hard because the Phabricator codebase is in PHP and not as
testable as you might like. However, we've put exactly what we're deploying up

View File

@ -254,4 +254,4 @@ Further Help
If you have any questions or need any help creating an LLVM project, the LLVM
team would be more than happy to help. You can always post your questions to
the `LLVM Developers Mailing List
<http://lists.cs.uiuc.edu/pipermail/llvmdev/>`_.
<http://lists.llvm.org/pipermail/llvm-dev/>`_.

View File

@ -5,12 +5,6 @@ LLVM 3.7 Release Notes
.. contents::
:local:
.. warning::
These are in-progress notes for the upcoming LLVM 3.7 release. You may
prefer the `LLVM 3.6 Release Notes <http://llvm.org/releases/3.6.0/docs
/ReleaseNotes.html>`_.
Introduction
============
@ -23,7 +17,7 @@ from the `LLVM releases web site <http://llvm.org/releases/>`_.
For more information about LLVM, including information about the latest
release, please check out the `main LLVM web site <http://llvm.org/>`_. If you
have questions or comments, the `LLVM Developer's Mailing List
<http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ is a good place to send
<http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ is a good place to send
them.
Note that if you are reading this file from a Subversion checkout or the main
@ -48,46 +42,346 @@ Non-comprehensive list of changes in this release
collection of tips for frontend authors on how to generate IR which LLVM is
able to effectively optimize.
* The DataLayout is no longer optional. All the IR level optimizations expects
* The ``DataLayout`` is no longer optional. All the IR level optimizations expect
it to be present and the API has been changed to use a reference instead of
a pointer to make it explicit. The Module owns the datalayout and it has to
match the one attached to the TargetMachine for generating code.
* ... next change ...
In 3.6, a pass was inserted in the pipeline to make the ``DataLayout`` accessible:
``MyPassManager->add(new DataLayoutPass(MyTargetMachine->getDataLayout()));``
In 3.7, you don't need a pass, you set the ``DataLayout`` on the ``Module``:
``MyModule->setDataLayout(MyTargetMachine->createDataLayout());``
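For instance, a front end's setup code might now look like this minimal
sketch (the TargetMachine is assumed to have been created elsewhere):

::

    #include "llvm/IR/Module.h"
    #include "llvm/Target/TargetMachine.h"

    // Sketch of the 3.7 pattern: the Module owns the DataLayout, which must
    // match the TargetMachine used for code generation.
    void attachDataLayout(llvm::Module &M, llvm::TargetMachine &TM) {
      M.setDataLayout(TM.createDataLayout());
    }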
.. NOTE
If you would like to document a larger change, then you can add a
subsection about it right here. You can copy the following boilerplate
and un-indent it (the indentation causes it to be inside this comment).
The LLVM C API ``LLVMGetTargetMachineData`` is deprecated to reflect the fact
that it won't be available anymore from ``TargetMachine`` in 3.8.
Special New Feature
-------------------
* Comdats are now orthogonal to the linkage. LLVM will not create
comdats for weak linkage globals and the frontends are responsible
for explicitly adding them.
Makes programs 10x faster by doing Special New Thing.
* On ELF we now support multiple sections with the same name and
comdat. This allows for smaller object files since multiple
sections can have a simple name (`.text`, `.rodata`, etc).
Changes to the ARM Backend
--------------------------
* LLVM now lazily loads metadata in some cases. Creating archives
with IR files with debug info is now 25X faster.
During this release ...
* llvm-ar can create archives in the BSD format used by OS X.
* LLVM received a backend for the extended Berkeley Packet Filter
instruction set that can be dynamically loaded into the Linux kernel via the
`bpf(2) <http://man7.org/linux/man-pages/man2/bpf.2.html>`_ syscall.
Support for BPF has been present in the kernel for some time, but starting
from 3.18 it has been extended with such features as: 64-bit registers, 8
additional registers, conditional backwards jumps, a call
instruction, shift instructions, maps (hash table, array, etc.), 1-8 byte
load/store from stack, and more.
Up until now, users of BPF had to write bytecode by hand, or use
custom generators. This release adds a proper LLVM backend target for the BPF
bytecode architecture.
The BPF target is now available by default, and options exist in both Clang
(-target bpf) and llc (-march=bpf) to pick eBPF as a backend.
* Switch-case lowering was rewritten to avoid generating unbalanced search trees
(`PR22262 <http://llvm.org/pr22262>`_) and to exploit profile information
when available. Some lowering strategies are now disabled when optimizations
are turned off, to save compile time.
* The debug info IR class hierarchy now inherits from ``Metadata`` and has its
own bitcode records and assembly syntax
(`documented in LangRef <LangRef.html#specialized-metadata-nodes>`_). The debug
info verifier has been merged with the main verifier.
* LLVM IR and APIs are in a period of transition to aid in the removal of
pointer types (the end goal being that pointers are typeless/opaque - void*,
if you will). Some APIs and IR constructs have been modified to take
explicit types that are currently checked to match the target type of their
pre-existing pointer type operands. Further changes are still needed, but the
more you can avoid using ``PointerType::getPointeeType``, the easier the
migration will be.
* Argument-less ``TargetMachine::getSubtarget`` and
``TargetMachine::getSubtargetImpl`` have been removed from the tree. Updating
out of tree ports is as simple as implementing a non-virtual version in the
target, but implementing full ``Function`` based ``TargetSubtargetInfo``
support is recommended.
* This is expected to be the last major release of LLVM that supports being
run on Windows XP and Windows Vista. For the next major release the minimum
Windows version requirement will be Windows 7.
Changes to the MIPS Target
--------------------------
During this release ...
During this release the MIPS target has:
* Added support for MIPS32R3, MIPS32R5, MIPS64R3, MIPS64R5, and microMIPS32.
* Added support for dynamic stack realignment. This is of particular importance
to MSA on 32-bit subtargets since vectors always exceed the stack alignment on
the O32 ABI.
* Added support for compiler-rt including:
* Support for the Address and Undefined Behaviour Sanitizers for all MIPS
subtargets.
* Support for the Data Flow and Memory Sanitizers for 64-bit subtargets.
* Support for the Profiler for all MIPS subtargets.
* Added support for libcxx and libcxxabi.
* Improved inline assembly support such that memory constraints may now make use
of the appropriate address offsets available to the instructions. Also, added
support for the ``ZC`` constraint.
* Added support for 128-bit integers on 64-bit subtargets and 16-bit floating
point conversions on all subtargets.
* Added support for read-only ``.eh_frame`` sections by storing type information
indirectly.
* Added support for MCJIT on all 64-bit subtargets as well as MIPS32R6.
* Added support for fast instruction selection on MIPS32 and MIPS32R2 with PIC.
* Various bug fixes. Including the following notable fixes:
* Fixed 'jumpy' debug line info around calls where calculation of the address
of the function would inappropriately change the line number.
* Fixed missing ``__mips_isa_rev`` macro on the MIPS32R6 and MIPS64R6
subtargets.
* Fixed representation of NaN when targeting systems using traditional
encodings. Traditionally, MIPS has used NaN encodings that were compatible
with IEEE754-1985 but would later be found incompatible with IEEE754-2008.
* Fixed multiple segfaults and assertions in the disassembler when
disassembling instructions that have memory operands.
* Fixed multiple cases of suboptimal code generation involving $zero.
* Fixed code generation of 128-bit shifts on 64-bit subtargets.
* Prevented the delay slot filler from filling call delay slots with
instructions that modify or use $ra.
* Fixed some remaining N32/N64 calling convention bugs when using small
structures on big-endian subtargets.
* Fixed missing sign-extensions that are required by the N32/N64 calling
convention when generating calls to library functions with 32-bit
parameters.
* Corrected the ``int64_t`` typedef to be ``long`` for N64.
* ``-mno-odd-spreg`` is now honoured for vector insertion/extraction
operations when using -mmsa.
* Fixed vector insertion and extraction for MSA on 64-bit subtargets.
* Corrected the representation of member function pointers. This makes them
usable on microMIPS subtargets.
Changes to the PowerPC Target
-----------------------------
During this release ...
There are numerous improvements to the PowerPC target in this release:
* LLVM now supports the ISA 2.07B (POWER8) instruction set, including
direct moves between general registers and vector registers, and
built-in support for hardware transactional memory (HTM). Some missing
instructions from ISA 2.06 (POWER7) were also added.
Changes to the OCaml bindings
* Code generation for the local-dynamic and global-dynamic thread-local
storage models has been improved.
* Loops may be restructured to leverage pre-increment loads and stores.
* QPX - The vector instruction set used by the IBM Blue Gene/Q supercomputers
is now supported.
* Loads from the TOC area are now correctly treated as invariant.
* PowerPC now has support for i128 and v1i128 types. The types differ
in how they are passed in registers for the ELFv2 ABI.
* Disassembly will now print shorter mnemonic aliases when available.
* Optional register name prefixes for VSX and QPX registers are now
supported in the assembly parser.
* The back end now contains a pass to remove unnecessary vector swaps
from POWER8 little-endian code generation. Additional improvements
are planned for release 3.8.
* The undefined-behavior sanitizer (UBSan) is now supported for PowerPC.
* Many new vector programming APIs have been added to altivec.h.
Additional ones are planned for release 3.8.
* PowerPC now supports __builtin_call_with_static_chain.
* PowerPC now supports the revised -mrecip option that permits finer
control over reciprocal estimates.
* Many bugs have been identified and fixed.
Changes to the SystemZ Target
-----------------------------
During this release ...
* LLVM no longer attempts to automatically detect the current host CPU when
invoked natively.
* Support for all thread-local storage models. (Previous releases would support
only the local-exec TLS model.)
* The POPCNT instruction is now used on z196 and above.
* The RISBGN instruction is now used on zEC12 and above.
* Support for the transactional-execution facility on zEC12 and above.
* Support for the z13 processor and its vector facility.
Changes to the JIT APIs
-----------------------
* Added a new C++ JIT API called On Request Compilation, or ORC.
ORC is a new JIT API inspired by MCJIT but designed to be more testable and
easier to extend with new features. A key new feature already in tree is lazy,
function-at-a-time compilation for X86. Also included is a reimplementation of
MCJIT's API and behavior (OrcMCJITReplacement). MCJIT itself remains in tree,
and continues to be the default JIT ExecutionEngine, though new users are
encouraged to try ORC out for their projects. (A good place to start is the
new ORC tutorials under llvm/examples/kaleidoscope/orc).
Sub-project Status Update
=========================
In addition to the core LLVM 3.7 distribution of production-quality compiler
infrastructure, the LLVM project includes sub-projects that use the LLVM core
and share the same distribution license. This section provides updates on these
sub-projects.
Polly - The Polyhedral Loop Optimizer in LLVM
---------------------------------------------
`Polly <http://polly.llvm.org>`_ is a polyhedral loop optimization
infrastructure that provides data-locality optimizations to LLVM-based
compilers. When compiled as part of clang or loaded as a module into clang,
it can perform loop optimizations such as tiling, loop fusion or outer-loop
vectorization. As a generic loop optimization infrastructure it allows
developers to get a per-loop-iteration model of a loop nest on which detailed
analysis and transformations can be performed.
Changes since the last release:
* isl imported into Polly distribution
`isl <http://repo.or.cz/w/isl.git>`_, the math library Polly uses, has been
imported into the source code repository of Polly and is now distributed as part
of Polly. As this was the last external library dependency of Polly, Polly can
now be compiled right after checking out the Polly source code without the need
for any additional libraries to be pre-installed.
* Small integer optimization of isl
The MIT-licensed imath backend used in `isl <http://repo.or.cz/w/isl.git>`_ for
arbitrary-width integer computations has been optimized to use native integer
operations for the common case where the operands of a computation fit into 32
bits, and to only fall back to large arbitrary-precision integers for the
remaining cases. This optimization has greatly improved the compile-time
performance of Polly, both due to faster native operations and due to a
reduction in malloc traffic and pointer indirections. As a result, computations
that use arbitrary-precision integers heavily have been sped up by almost 6x.
Overall, the compile time of Polly on the Polybench test kernels in the LNT
suite has been reduced by 20% on average, with reductions between 9% and 43%.
* Schedule Trees
Polly now internally uses so-called "schedule trees" to model the loop
structure it optimizes. Schedule trees are an easy-to-understand tree structure
that describes a loop nest using integer constraint sets to keep track of
execution constraints. They allow the developer to use per-tree-node operations
to modify the loop tree. Programmatic analyses that work on the schedule tree
(e.g., dependence analysis) also show a visible speedup, as they can exploit
the tree structure of the schedule and need to fall back to ILP-based
optimization problems less often. Section 6 of `Polyhedral AST generation is
more than scanning polyhedra
<http://www.grosser.es/#pub-polyhedral-AST-generation>`_ gives a detailed
explanation of these schedule trees.
* Scalar and PHI node modeling - Polly as an analysis
Polly now requires almost no preprocessing to analyse LLVM-IR, which makes it
easier to use Polly as a pure analysis pass e.g. to provide more precise
dependence information to non-polyhedral transformation passes. Originally,
Polly required the input LLVM-IR to be preprocessed such that all scalar and
PHI-node dependences are translated to in-memory operations. Since this release,
Polly has full support for scalar and PHI node dependences and requires no
scalar-to-memory translation for such kinds of dependences.
* Modeling of modulo and non-affine conditions
Polly now supports modulo operations such as A[t%2][i][j], which appear
often in stencil computations, and also allows data-dependent conditional
branches as they result e.g. from ternary conditions such as A[i] > 255 ? 255 :
A[i].
* Delinearization
Polly now supports the analysis of manually linearized multi-dimensional arrays
as they result from macros such as
``#define 2DARRAY(A,i,j) (A.data[(i) * A.size + (j)])``. Similar constructs appear
in old C code written before C99, in C++ code such as boost::ublas, in LLVM IR
exported from Julia, in Matlab-generated code, and many others. Our work titled
`Optimistic Delinearization of Parametrically Sized Arrays
<http://www.grosser.es/#pub-optimistic-delinerization>`_ gives details.
* Compile time improvements
Pratik Bahtu worked on compile-time performance tuning of Polly. His work,
together with the support for schedule trees and the small integer optimization
in isl, notably reduced the compile time.
* Increased compute timeouts
As Polly's compile time has been notably improved, we were able to increase
the compile-time safeguards in Polly. As a result, the default configuration
of Polly can now analyze larger loop nests without running into compile-time
restrictions.
* Export Debug Locations via JSCoP file
Polly's JSCoP import/export format gained support for debug locations that show
the user the source-code location of detected scops.
* Improved Windows support
The compilation of Polly on Windows using CMake has been improved and several
Visual Studio build issues have been addressed.
* Many bug fixes
libunwind
---------
The unwind implementation which used to reside in `libc++abi` has been moved into
a separate repository. This implementation can still be used for `libc++abi` by
specifying `-DLIBCXXABI_USE_LLVM_UNWINDER=YES` and
`-DLIBCXXABI_LIBUNWIND_PATH=<path to libunwind source>` when configuring
`libc++abi`, which defaults to `true` when building on ARM.
The new repository can also be built standalone if just `libunwind` is desired.
External Open Source Projects Using LLVM 3.7
============================================
@ -96,7 +390,74 @@ An exciting aspect of LLVM is that it is used as an enabling technology for
a lot of other language and tools projects. This section lists some of the
projects that have already been updated to work with LLVM 3.7.
* A project
LDC - the LLVM-based D compiler
-------------------------------
`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It
pragmatically combines efficiency, control, and modeling power, with safety and
programmer productivity. D supports powerful concepts like Compile-Time Function
Execution (CTFE) and Template Meta-Programming, provides an innovative approach
to concurrency and offers many classical paradigms.
`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler
combined with LLVM as backend to produce efficient native code. LDC targets
x86/x86_64 systems like Linux, OS X, FreeBSD and Windows and also Linux on
PowerPC (32/64 bit). Ports to other architectures like ARM, AArch64 and MIPS64
are underway.
Portable Computing Language (pocl)
----------------------------------
In addition to producing an easily portable open source OpenCL
implementation, another major goal of `pocl <http://portablecl.org/>`_
is improving performance portability of OpenCL programs with
compiler optimizations, reducing the need for target-dependent manual
optimizations. An important part of pocl is a set of LLVM passes used to
statically parallelize multiple work-items with the kernel compiler, even in
the presence of work-group barriers.
TTA-based Co-design Environment (TCE)
-------------------------------------
`TCE <http://tce.cs.tut.fi/>`_ is a toolset for designing customized
exposed datapath processors based on the Transport triggered
architecture (TTA).
The toolset provides a complete co-design flow from C/C++
programs down to synthesizable VHDL/Verilog and parallel program binaries.
Processor customization points include the register files, function units,
supported operations, and the interconnection network.
TCE uses Clang and LLVM for C/C++/OpenCL C language support, target independent
optimizations and also for parts of code generation. It generates
new LLVM-based code generators "on the fly" for the designed processors and
loads them into the compiler backend as runtime libraries to avoid
per-target recompilation of larger parts of the compiler chain.
BPF Compiler Collection (BCC)
-----------------------------
`BCC <https://github.com/iovisor/bcc>`_ is a Python + C framework for tracing and
networking that uses the Clang rewriter + a second pass of Clang + the BPF backend to
generate eBPF and push it into the kernel.
LLVMSharp & ClangSharp
----------------------
`LLVMSharp <http://www.llvmsharp.org>`_ and
`ClangSharp <http://www.clangsharp.org>`_ are type-safe C# bindings for
Microsoft.NET and Mono that Platform Invoke into the native libraries.
ClangSharp is self-hosted and is used to generate LLVMSharp using the
LLVM-C API.
`LLVMSharp Kaleidoscope Tutorials <http://www.llvmsharp.org/Kaleidoscope/>`_
are instructive examples of writing a compiler in C#, with certain improvements
like using the visitor pattern to generate LLVM IR.
`ClangSharp PInvoke Generator <http://www.clangsharp.org/PInvoke/>`_ is the
self-hosting mechanism for LLVM/ClangSharp and is demonstrative of using
LibClang to generate Platform Invoke (PInvoke) signatures for C APIs.
Additional Information
@ -111,4 +472,3 @@ going into the ``llvm/docs/`` directory in the LLVM tree.
If you have any questions or comments about LLVM, please feel free to contact
us via the `mailing lists <http://llvm.org/docs/#maillist>`_.

View File

@ -565,7 +565,7 @@ The existing IR Verifier pass has been extended to check most of the
local restrictions on the intrinsics mentioned in their respective
documentation. The current implementation in LLVM does not check the
key relocation invariant, but there is ongoing work on developing such
a verifier. Please ask on llvmdev if you're interested in
a verifier. Please ask on llvm-dev if you're interested in
experimenting with the current version.
.. _statepoint-utilities:
@ -696,7 +696,7 @@ If you are scheduling the RewriteStatepointsForGC pass late in the pass order,
you should probably schedule this pass immediately before it. The exception
would be if you need to preserve abstract frame information (e.g. for
deoptimization or introspection) at safepoints. In that case, ask on the
llvmdev mailing list for suggestions.
llvm-dev mailing list for suggestions.
Bugs and Enhancements
@ -707,8 +707,8 @@ tracked by performing a `bugzilla search
<http://llvm.org/bugs/buglist.cgi?cmdtype=runnamed&namedcmd=Statepoint%20Bugs&list_id=64342>`_
for [Statepoint] in the summary field. When filing new bugs, please
use this tag so that interested parties see the newly filed bug. As
with most LLVM features, design discussions take place on `llvmdev
<http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_, and patches
with most LLVM features, design discussions take place on `llvm-dev
<http://lists.llvm.org/mailman/listinfo/llvm-dev>`_, and patches
should be sent to `llvm-commits
<http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits>`_ for review.
<http://lists.llvm.org/mailman/listinfo/llvm-commits>`_ for review.

View File

@ -7,7 +7,7 @@ TableGen Language Introduction
.. warning::
This document is extremely rough. If you find something lacking, please
fix it, file a documentation bug, or ask about it on llvmdev.
fix it, file a documentation bug, or ask about it on llvm-dev.
Introduction
============

View File

@ -7,7 +7,7 @@ TableGen Language Reference
.. warning::
This document is extremely rough. If you find something lacking, please
fix it, file a documentation bug, or ask about it on llvmdev.
fix it, file a documentation bug, or ask about it on llvm-dev.
Introduction
============

View File

@ -11,6 +11,7 @@
# serve to show the default.
import sys, os
from datetime import date
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
@ -40,7 +41,7 @@
# General information about the project.
project = u'LLVM'
copyright = u'2003-2014, LLVM Project'
copyright = u'2003-%d, LLVM Project' % date.today().year
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the

View File

@ -2205,7 +2205,7 @@ DIRECTORY_GRAPH = YES
# The default value is: png.
# This tag requires that the tag HAVE_DOT is set to YES.
DOT_IMAGE_FORMAT = png
DOT_IMAGE_FORMAT = @DOT_IMAGE_FORMAT@
# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
# enable generation of interactive SVG images that allow zooming and panning.

View File

@ -1,11 +1,6 @@
Overview
========
.. warning::
If you are using a released version of LLVM, see `the download page
<http://llvm.org/releases/>`_ to find your documentation.
The LLVM compiler infrastructure supports a wide range of projects, from
industrial strength compilers to specialized JIT applications to small
research projects.
@ -425,12 +420,12 @@ Mailing Lists
If you can't find what you need in these docs, try consulting the mailing
lists.
`Developer's List (llvmdev)`__
`Developer's List (llvm-dev)`__
This list is for people who want to be included in technical discussions of
LLVM. People post to this list when they have questions about writing code
for or using the LLVM tools. It is relatively low volume.
.. __: http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
.. __: http://lists.llvm.org/mailman/listinfo/llvm-dev
`Commits Archive (llvm-commits)`__
This list contains all commit messages that are made when LLVM developers
@ -439,26 +434,26 @@ lists.
stay on the bleeding edge of LLVM development. This list is very high
volume.
.. __: http://lists.cs.uiuc.edu/pipermail/llvm-commits/
.. __: http://lists.llvm.org/pipermail/llvm-commits/
`Bugs & Patches Archive (llvmbugs)`__
`Bugs & Patches Archive (llvm-bugs)`__
This list gets emailed every time a bug is opened and closed. It is
higher volume than the LLVMdev list.
higher volume than the LLVM-dev list.
.. __: http://lists.cs.uiuc.edu/pipermail/llvmbugs/
.. __: http://lists.llvm.org/pipermail/llvm-bugs/
`Test Results Archive (llvm-testresults)`__
A message is automatically sent to this list by every active nightly tester
when it completes. As such, this list gets email several times each day,
making it a high volume list.
.. __: http://lists.cs.uiuc.edu/pipermail/llvm-testresults/
.. __: http://lists.llvm.org/pipermail/llvm-testresults/
`LLVM Announcements List (llvm-announce)`__
This is a low volume list that provides important announcements regarding
LLVM. It gets email about once a month.
.. __: http://lists.cs.uiuc.edu/mailman/listinfo/llvm-announce
.. __: http://lists.llvm.org/mailman/listinfo/llvm-announce
IRC
---

View File

@ -90,8 +90,8 @@ For example, try adding:
Have fun - try doing something crazy and unusual. Building a language
like everyone else always has, is much less fun than trying something a
little crazy or off the wall and seeing how it turns out. If you get
stuck or want to talk about it, feel free to email the `llvmdev mailing
list <http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_: it has lots
stuck or want to talk about it, feel free to email the `llvm-dev mailing
list <http://lists.llvm.org/mailman/listinfo/llvm-dev>`_: it has lots
of people who are interested in languages and are often willing to help
out.
@ -169,8 +169,8 @@ It is certainly possible to implement a safe language in LLVM, but LLVM
IR does not itself guarantee safety. The LLVM IR allows unsafe pointer
casts, use after free bugs, buffer over-runs, and a variety of other
problems. Safety needs to be implemented as a layer on top of LLVM and,
conveniently, several groups have investigated this. Ask on the `llvmdev
mailing list <http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ if
conveniently, several groups have investigated this. Ask on the `llvm-dev
mailing list <http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ if
you are interested in more details.
Language-Specific Optimizations
@ -220,7 +220,7 @@ safe to optimize that into "return 0;" because C specifies what the
In addition to simple library knowledge, it is possible to embed a
variety of other language-specific information into the LLVM IR. If you
have a specific need and run into a wall, please bring the topic up on
the llvmdev list. At the very worst, you can always treat LLVM as if it
the llvm-dev list. At the very worst, you can always treat LLVM as if it
were a "dumb code generator" and implement the high-level optimizations
you desire in your front-end, on the language-specific AST.

View File

@ -95,8 +95,8 @@ For example, try adding:
Have fun - try doing something crazy and unusual. Building a language
like everyone else always has, is much less fun than trying something a
little crazy or off the wall and seeing how it turns out. If you get
stuck or want to talk about it, feel free to email the `llvmdev mailing
list <http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_: it has lots
stuck or want to talk about it, feel free to email the `llvm-dev mailing
list <http://lists.llvm.org/mailman/listinfo/llvm-dev>`_: it has lots
of people who are interested in languages and are often willing to help
out.
@ -174,8 +174,8 @@ It is certainly possible to implement a safe language in LLVM, but LLVM
IR does not itself guarantee safety. The LLVM IR allows unsafe pointer
casts, use after free bugs, buffer over-runs, and a variety of other
problems. Safety needs to be implemented as a layer on top of LLVM and,
conveniently, several groups have investigated this. Ask on the `llvmdev
mailing list <http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ if
conveniently, several groups have investigated this. Ask on the `llvm-dev
mailing list <http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ if
you are interested in more details.
Language-Specific Optimizations
@ -225,7 +225,7 @@ safe to optimize that into "return 0;" because C specifies what the
In addition to simple library knowledge, it is possible to embed a
variety of other language-specific information into the LLVM IR. If you
have a specific need and run into a wall, please bring the topic up on
the llvmdev list. At the very worst, you can always treat LLVM as if it
the llvm-dev list. At the very worst, you can always treat LLVM as if it
were a "dumb code generator" and implement the high-level optimizations
you desire in your front-end, on the language-specific AST.

View File

@ -115,7 +115,7 @@ char *LLVMGetTargetMachineCPU(LLVMTargetMachineRef T);
LLVMDisposeMessage. */
char *LLVMGetTargetMachineFeatureString(LLVMTargetMachineRef T);
/** Returns the llvm::DataLayout used for this llvm:TargetMachine. */
/** Deprecated: use LLVMGetDataLayout(LLVMModuleRef M) instead. */
LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T);
/** Set the target machine's ASM verbosity. */
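Given this deprecation, a caller that still needs an LLVMTargetDataRef can
derive one from the module instead; a hedged sketch, assuming the 3.7 C API
where LLVMGetDataLayout returns the module's layout string:

::

    #include "llvm-c/Core.h"
    #include "llvm-c/Target.h"

    /* Sketch: instead of LLVMGetTargetMachineData(TM), rebuild target data
     * from the Module's data layout string. The caller owns the result and
     * should release it with LLVMDisposeTargetData. */
    LLVMTargetDataRef targetDataFromModule(LLVMModuleRef M) {
      return LLVMCreateTargetData(LLVMGetDataLayout(M));
    }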

View File

@ -315,8 +315,10 @@ class SmallVectorTemplateBase<T, true> : public SmallVectorTemplateCommon<T> {
T2>::value>::type * = nullptr) {
// Use memcpy for PODs iterated by pointers (which includes SmallVector
// iterators): std::uninitialized_copy optimizes to memmove, but we can
// use memcpy here.
memcpy(Dest, I, (E-I)*sizeof(T));
// use memcpy here. Note that I and E are iterators and thus might be
// invalid for memcpy if they are equal.
if (I != E)
memcpy(Dest, I, (E - I) * sizeof(T));
}
/// Double the size of the allocated memory, guaranteeing space for at

View File

@ -158,7 +158,8 @@ class StringMapEntry : public StringMapEntryBase {
// Copy the string information.
char *StrBuffer = const_cast<char*>(NewItem->getKeyData());
memcpy(StrBuffer, Key.data(), KeyLength);
if (KeyLength > 0)
memcpy(StrBuffer, Key.data(), KeyLength);
StrBuffer[KeyLength] = 0; // Null terminate for convenience of clients.
return NewItem;
}

View File

@ -32,11 +32,13 @@ namespace llvm {
class LiveInterval;
class LiveIntervalAnalysis;
class MachineRegisterInfo;
class TargetRegisterInfo;
class VirtRegMap;
class LiveRegMatrix : public MachineFunctionPass {
const TargetRegisterInfo *TRI;
MachineRegisterInfo *MRI;
LiveIntervals *LIS;
VirtRegMap *VRM;

View File

@ -95,8 +95,20 @@ class MachineRegisterInfo {
return MO->Contents.Reg.Next;
}
/// UsedRegUnits - This is a bit vector that is computed and set by the
/// register allocator, and must be kept up to date by passes that run after
/// register allocation (though most don't modify this). This is used
/// so that the code generator knows which callee save registers to save and
/// for other target specific uses.
/// This vector has bits set for register units that are modified in the
/// current function. It doesn't include registers clobbered by function
/// calls with register mask operands.
BitVector UsedRegUnits;
/// UsedPhysRegMask - Additional used physregs including aliases.
/// This bit vector represents all the registers clobbered by function calls.
/// It can model things that UsedRegUnits can't, such as function calls that
/// clobber ymm7 but preserve the low half in xmm7.
BitVector UsedPhysRegMask;
/// ReservedRegs - This is a bit vector of reserved registers. The target
@ -641,12 +653,55 @@ class MachineRegisterInfo {
/// ignored.
bool isPhysRegModified(unsigned PhysReg) const;
//===--------------------------------------------------------------------===//
// Physical Register Use Info
//===--------------------------------------------------------------------===//
/// isPhysRegUsed - Return true if the specified register is used in this
/// function. Also check for clobbered aliases and registers clobbered by
/// function calls with register mask operands.
///
/// This only works after register allocation.
bool isPhysRegUsed(unsigned Reg) const {
if (UsedPhysRegMask.test(Reg))
return true;
for (MCRegUnitIterator Units(Reg, getTargetRegisterInfo());
Units.isValid(); ++Units)
if (UsedRegUnits.test(*Units))
return true;
return false;
}
/// Mark the specified register unit as used in this function.
/// This should only be called during and after register allocation.
void setRegUnitUsed(unsigned RegUnit) {
UsedRegUnits.set(RegUnit);
}
/// setPhysRegUsed - Mark the specified register used in this function.
/// This should only be called during and after register allocation.
void setPhysRegUsed(unsigned Reg) {
for (MCRegUnitIterator Units(Reg, getTargetRegisterInfo());
Units.isValid(); ++Units)
UsedRegUnits.set(*Units);
}
/// addPhysRegsUsedFromRegMask - Mark any registers not in RegMask as used.
/// This corresponds to the bit mask attached to register mask operands.
void addPhysRegsUsedFromRegMask(const uint32_t *RegMask) {
UsedPhysRegMask.setBitsNotInMask(RegMask);
}
/// setPhysRegUnused - Mark the specified register unused in this function.
/// This should only be called during and after register allocation.
void setPhysRegUnused(unsigned Reg) {
UsedPhysRegMask.reset(Reg);
for (MCRegUnitIterator Units(Reg, getTargetRegisterInfo());
Units.isValid(); ++Units)
UsedRegUnits.reset(*Units);
}
//===--------------------------------------------------------------------===//
// Reserved Register Info
//===--------------------------------------------------------------------===//

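A rough sketch of the two-part query that isPhysRegUsed above implements: one bit per register unit written directly in the function, plus a separate mask for registers clobbered through call register masks. The sizes and register/unit numbering below are illustrative, not any real target's:

    #include <bitset>
    #include <vector>

    struct RegUsage {
      std::bitset<512> UsedUnits;    // units modified in the function body
      std::bitset<512> CallClobbers; // regs clobbered via call regmasks

      // A register counts as used if a call regmask clobbered it outright,
      // or if any of its constituent units was modified.
      bool isRegUsed(unsigned Reg, const std::vector<unsigned> &Units) const {
        if (CallClobbers.test(Reg))
          return true;
        for (unsigned U : Units)
          if (UsedUnits.test(U))
            return true;
        return false;
      }
    };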
View File

@ -125,10 +125,15 @@ class TargetMachine {
return *static_cast<const STC*>(getSubtargetImpl(F));
}
/// Deprecated in 3.7, will be removed in 3.8. Use createDataLayout() instead.
///
/// This method returns a pointer to the DataLayout for the target. It should
/// be unchanging for every subtarget.
const DataLayout *getDataLayout() const { return &DL; }
/// Create a DataLayout.
const DataLayout createDataLayout() const { return DL; }
/// \brief Reset the target options based on the function's attributes.
// FIXME: Remove TargetOptions that affect per-function code generation
// from TargetMachine.

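For callers, the deprecation above amounts to swapping a borrowed pointer for an owned copy. A hedged sketch of the migration; the Module setter shown is the usual way such a copy gets attached, but check the headers in your tree:

    #include "llvm/IR/Module.h"
    #include "llvm/Target/TargetMachine.h"

    // Attach the target's layout to a module by value; this replaces
    // borrowing a pointer via the deprecated TM.getDataLayout().
    void applyLayout(llvm::Module &M, const llvm::TargetMachine &TM) {
      M.setDataLayout(TM.createDataLayout());
    }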
View File

@ -206,14 +206,6 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset,
return V;
}
if (ConstantInt *Const = dyn_cast<ConstantInt>(V)) {
// If it's a constant, just convert it to an offset
// and remove the variable.
Offset += Const->getValue();
assert(Scale == 0 && "Constant values don't have a scale");
return V;
}
if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(V)) {
if (ConstantInt *RHSC = dyn_cast<ConstantInt>(BOp->getOperand(1))) {
switch (BOp->getOpcode()) {
@ -261,10 +253,7 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset,
Value *Result = GetLinearExpression(CastOp, Scale, Offset, Extension, DL,
Depth + 1, AC, DT);
Scale = Scale.zext(OldWidth);
// We have to sign-extend even if Extension == EK_ZeroExt as we can't
// decompose a sign extension (i.e. zext(x - 1) != zext(x) - zext(-1)).
Offset = Offset.sext(OldWidth);
Offset = Offset.zext(OldWidth);
return Result;
}
@ -1135,43 +1124,12 @@ AliasResult BasicAliasAnalysis::aliasGEP(
}
}
// Try to distinguish something like &A[i][1] against &A[42][0].
// Grab the least significant bit set in any of the scales.
if (!GEP1VariableIndices.empty()) {
uint64_t Modulo = 0;
bool AllPositive = true;
for (unsigned i = 0, e = GEP1VariableIndices.size(); i != e; ++i) {
// Try to distinguish something like &A[i][1] against &A[42][0].
// Grab the least significant bit set in any of the scales. We
// don't need std::abs here (even if the scale's negative) as we'll
// be ^'ing Modulo with itself later.
for (unsigned i = 0, e = GEP1VariableIndices.size(); i != e; ++i)
Modulo |= (uint64_t) GEP1VariableIndices[i].Scale;
if (AllPositive) {
// If the Value could change between cycles, then any reasoning about
// the Value this cycle may not hold in the next cycle. We'll just
// give up if we can't determine conditions that hold for every cycle:
const Value *V = GEP1VariableIndices[i].V;
bool SignKnownZero, SignKnownOne;
ComputeSignBit(const_cast<Value *>(V), SignKnownZero, SignKnownOne, *DL,
0, AC1, nullptr, DT);
// Zero-extension widens the variable, and so forces the sign
// bit to zero.
bool IsZExt = GEP1VariableIndices[i].Extension == EK_ZeroExt;
SignKnownZero |= IsZExt;
SignKnownOne &= !IsZExt;
// If the variable begins with a zero then we know it's
// positive, regardless of whether the value is signed or
// unsigned.
int64_t Scale = GEP1VariableIndices[i].Scale;
AllPositive =
(SignKnownZero && Scale >= 0) ||
(SignKnownOne && Scale < 0);
}
}
Modulo = Modulo ^ (Modulo & (Modulo - 1));
// We can compute the difference between the two addresses
@ -1182,12 +1140,6 @@ AliasResult BasicAliasAnalysis::aliasGEP(
V2Size != MemoryLocation::UnknownSize && ModOffset >= V2Size &&
V1Size <= Modulo - ModOffset)
return NoAlias;
// If we know all the variables are positive, then GEP1 >= GEP1BasePtr.
// If GEP1BasePtr > V2 (GEP1BaseOffset > 0) then we know the pointers
// don't alias if V2Size can fit in the gap between V2 and GEP1BasePtr.
if (AllPositive && GEP1BaseOffset > 0 && V2Size <= (uint64_t) GEP1BaseOffset)
return NoAlias;
}
// Statically, we can see that the base objects are the same, but the

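The bit trick in the hunk above, Modulo ^ (Modulo & (Modulo - 1)), isolates the least significant set bit of the OR of all scales; every combination of such indices then moves in multiples of that value. A tiny self-checking example of the arithmetic:

    #include <cassert>
    #include <cstdint>

    // x & (x - 1) clears the lowest set bit, so XOR-ing the result back
    // against x leaves exactly that bit.
    uint64_t lowestSetBit(uint64_t x) { return x ^ (x & (x - 1)); }

    int main() {
      // With index scales 8 and 12, Modulo = 8 | 12 = 12 (0b1100), whose
      // lowest set bit is 4: offsets built from them are congruent mod 4.
      assert(lowestSetBit(8 | 12) == 4);
      assert(lowestSetBit(22) == 2); // 0b10110 -> 0b00010
    }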
View File

@ -440,31 +440,40 @@ void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) {
}
// Scan the function bodies for explicit loads or stores.
for (unsigned i = 0, e = SCC.size(); i != e && FunctionEffect != ModRef;
++i)
for (inst_iterator II = inst_begin(SCC[i]->getFunction()),
E = inst_end(SCC[i]->getFunction());
II != E && FunctionEffect != ModRef; ++II)
if (LoadInst *LI = dyn_cast<LoadInst>(&*II)) {
FunctionEffect |= Ref;
if (LI->isVolatile())
// Volatile loads may have side-effects, so mark them as writing
// memory (for example, a flag inside the processor).
FunctionEffect |= Mod;
} else if (StoreInst *SI = dyn_cast<StoreInst>(&*II)) {
FunctionEffect |= Mod;
if (SI->isVolatile())
// Treat volatile stores as reading memory somewhere.
FunctionEffect |= Ref;
} else if (isAllocationFn(&*II, TLI) || isFreeCall(&*II, TLI)) {
FunctionEffect |= ModRef;
} else if (IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(&*II)) {
// The callgraph doesn't include intrinsic calls.
Function *Callee = Intrinsic->getCalledFunction();
ModRefBehavior Behaviour = AliasAnalysis::getModRefBehavior(Callee);
FunctionEffect |= (Behaviour & ModRef);
for (auto *Node : SCC) {
if (FunctionEffect == ModRef)
break; // The mod/ref lattice saturates here.
for (Instruction &I : inst_range(Node->getFunction())) {
if (FunctionEffect == ModRef)
break; // The mod/ref lattice saturates here.
// We handle calls specially because the graph-relevant aspects are
// handled above.
if (auto CS = CallSite(&I)) {
if (isAllocationFn(&I, TLI) || isFreeCall(&I, TLI)) {
// FIXME: It is completely unclear why this is necessary and not
// handled by the above graph code.
FunctionEffect |= ModRef;
} else if (Function *Callee = CS.getCalledFunction()) {
// The callgraph doesn't include intrinsic calls.
if (Callee->isIntrinsic()) {
ModRefBehavior Behaviour =
AliasAnalysis::getModRefBehavior(Callee);
FunctionEffect |= (Behaviour & ModRef);
}
}
continue;
}
// For all non-call instructions we use the primary predicates for whether
// they read or write memory.
if (I.mayReadFromMemory())
FunctionEffect |= Ref;
if (I.mayWriteToMemory())
FunctionEffect |= Mod;
}
}
if ((FunctionEffect & Mod) == 0)
++NumReadMemFunctions;
if (FunctionEffect == 0)

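The "lattice saturates" early exits above work because the mod/ref summary only ever grows and ModRef is its top element. A minimal sketch of that accumulation pattern (names illustrative, not LLVM's):

    enum Effect { None = 0, Ref = 1, Mod = 2, ModRef = Ref | Mod };

    // OR effects together; once both bits are set, nothing later in the
    // scan can change the answer, so stop early.
    Effect summarize(const Effect *Effects, int N) {
      int Acc = None;
      for (int I = 0; I < N && Acc != ModRef; ++I)
        Acc |= Effects[I];
      return static_cast<Effect>(Acc);
    }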
View File

@ -3574,18 +3574,9 @@ static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const Query &,
// If extracting a specified index from the vector, see if we can recursively
// find a previously computed scalar that was inserted into the vector.
if (auto *IdxC = dyn_cast<ConstantInt>(Idx)) {
unsigned IndexVal = IdxC->getZExtValue();
unsigned VectorWidth = Vec->getType()->getVectorNumElements();
// If this is extracting an invalid index, turn this into undef, to avoid
// crashing the code below.
if (IndexVal >= VectorWidth)
return UndefValue::get(Vec->getType()->getVectorElementType());
if (Value *Elt = findScalarElement(Vec, IndexVal))
if (auto *IdxC = dyn_cast<ConstantInt>(Idx))
if (Value *Elt = findScalarElement(Vec, IdxC->getZExtValue()))
return Elt;
}
return nullptr;
}

View File

@ -374,9 +374,10 @@ InsertPHITranslatedSubExpr(Value *InVal, BasicBlock *CurBB,
if (!Tmp.PHITranslateValue(CurBB, PredBB, &DT, /*MustDominate=*/true))
return Tmp.getAddr();
// If we don't have an available version of this value, it must be an
// instruction.
Instruction *Inst = cast<Instruction>(InVal);
// We don't need to PHI translate values which aren't instructions.
auto *Inst = dyn_cast<Instruction>(InVal);
if (!Inst)
return nullptr;
// Handle cast of PHI translatable value.
if (CastInst *Cast = dyn_cast<CastInst>(Inst)) {

View File

@ -402,8 +402,9 @@ llvm::Value *llvm::findScalarElement(llvm::Value *V, unsigned EltNo) {
if (match(V,
llvm::PatternMatch::m_Add(llvm::PatternMatch::m_Value(Val),
llvm::PatternMatch::m_Constant(Con)))) {
if (Con->getAggregateElement(EltNo)->isNullValue())
return findScalarElement(Val, EltNo);
if (Constant *Elt = Con->getAggregateElement(EltNo))
if (Elt->isNullValue())
return findScalarElement(Val, EltNo);
}
// Otherwise, we don't know.

View File

@ -733,14 +733,12 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
// If no relevant registers are used in the function, we can skip it
// completely.
bool anyregs = false;
const MachineRegisterInfo &MRI = mf.getRegInfo();
for (TargetRegisterClass::const_iterator I = RC->begin(), E = RC->end();
I != E && !anyregs; ++I)
for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI)
if (!MRI.reg_nodbg_empty(*AI)) {
anyregs = true;
break;
}
I != E; ++I)
if (MF->getRegInfo().isPhysRegUsed(*I)) {
anyregs = true;
break;
}
if (!anyregs) return false;
// Initialize the AliasMap on the first use.

View File

@ -15,12 +15,12 @@
#include "RegisterCoalescer.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
@ -49,6 +49,7 @@ void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const {
bool LiveRegMatrix::runOnMachineFunction(MachineFunction &MF) {
TRI = MF.getSubtarget().getRegisterInfo();
MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();
VRM = &getAnalysis<VirtRegMap>();
@ -100,6 +101,7 @@ void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) {
<< " to " << PrintReg(PhysReg, TRI) << ':');
assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment");
VRM->assignVirt2Phys(VirtReg.reg, PhysReg);
MRI->setPhysRegUsed(PhysReg);
foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit,
const LiveRange &Range) {

View File

@ -29,6 +29,7 @@ MachineRegisterInfo::MachineRegisterInfo(const MachineFunction *MF)
TracksSubRegLiveness(false) {
VRegInfo.reserve(256);
RegAllocHints.reserve(256);
UsedRegUnits.resize(getTargetRegisterInfo()->getNumRegUnits());
UsedPhysRegMask.resize(getTargetRegisterInfo()->getNumRegs());
// Create the physreg use/def lists.

View File

@ -624,6 +624,10 @@ struct DataDep {
static bool getDataDeps(const MachineInstr *UseMI,
SmallVectorImpl<DataDep> &Deps,
const MachineRegisterInfo *MRI) {
// Debug values should not be included in any calculations.
if (UseMI->isDebugValue())
return false;
bool HasPhysRegs = false;
for (MachineInstr::const_mop_iterator I = UseMI->operands_begin(),
E = UseMI->operands_end(); I != E; ++I) {

View File

@ -1026,8 +1026,12 @@ PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
// Replace this reference to the virtual register with the
// scratch register.
assert (ScratchReg && "Missing scratch register!");
MachineRegisterInfo &MRI = Fn.getRegInfo();
Fn.getRegInfo().replaceRegWith(Reg, ScratchReg);
// Make sure MRI now accounts this register as used.
MRI.setPhysRegUsed(ScratchReg);
// Because this instruction was processed by the RS before this
// register was allocated, make sure that the RS now records the
// register as being used.

View File

@ -986,6 +986,10 @@ void RAFast::AllocateBasicBlock() {
}
}
for (UsedInInstrSet::iterator
I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I)
MRI->setRegUnitUsed(*I);
// Track registers defined by instruction - early clobbers and tied uses at
// this point.
UsedInInstr.clear();
@ -1046,6 +1050,10 @@ void RAFast::AllocateBasicBlock() {
killVirtReg(VirtDead[i]);
VirtDead.clear();
for (UsedInInstrSet::iterator
I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I)
MRI->setRegUnitUsed(*I);
if (CopyDst && CopyDst == CopySrc && CopyDstSub == CopySrcSub) {
DEBUG(dbgs() << "-- coalescing: " << *MI);
Coalesced.push_back(MI);
@ -1095,6 +1103,12 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) {
AllocateBasicBlock();
}
// Add the clobber lists for all the instructions we skipped earlier.
for (const MCInstrDesc *Desc : SkippedInstrs)
if (const uint16_t *Defs = Desc->getImplicitDefs())
while (*Defs)
MRI->setPhysRegUsed(*Defs++);
// All machine operands and other references to virtual registers have been
// replaced. Remove the virtual registers.
MRI->clearVirtRegs();

View File

@ -1531,6 +1531,14 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
DEBUG(dbgs() << "\t\tInterference (read): " << *MI);
return false;
}
// We must also check for clobbers caused by regmasks.
for (const auto &MO : MI->operands()) {
if (MO.isRegMask() && MO.clobbersPhysReg(DstReg)) {
DEBUG(dbgs() << "\t\tInterference (regmask clobber): " << *MI);
return false;
}
}
}
// We're going to remove the copy which defines a physical reserved

View File

@ -8365,12 +8365,12 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
if (N0CFP && N0CFP->isExactlyValue(1.0))
return SDValue();
SmallVector<SDNode *, 4> Users;
// Find all FDIV users of the same divisor.
for (auto *U : N1->uses()) {
// Use a set because duplicates may be present in the user list.
SetVector<SDNode *> Users;
for (auto *U : N1->uses())
if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1)
Users.push_back(U);
}
Users.insert(U);
if (TLI.combineRepeatedFPDivisors(Users.size())) {
SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);

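For reference, the payoff of collecting the repeated-divisor users above: N divisions by one value become a single division plus N multiplies. A scalar sketch of the rewrite, legal only under relaxed floating-point semantics (which the DAG combine also requires):

    #include <cstddef>

    // a[i] / d  ->  a[i] * (1.0 / d): one divide total instead of n.
    void divideAllBy(double *a, std::size_t n, double d) {
      const double recip = 1.0 / d;
      for (std::size_t i = 0; i != n; ++i)
        a[i] *= recip;
    }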
View File

@ -163,6 +163,7 @@ class VirtRegRewriter : public MachineFunctionPass {
SlotIndexes *Indexes;
LiveIntervals *LIS;
VirtRegMap *VRM;
SparseSet<unsigned> PhysRegs;
void rewrite();
void addMBBLiveIns();
@ -318,15 +319,54 @@ void VirtRegRewriter::rewrite() {
SmallVector<unsigned, 8> SuperDeads;
SmallVector<unsigned, 8> SuperDefs;
SmallVector<unsigned, 8> SuperKills;
SmallPtrSet<const MachineInstr *, 4> NoReturnInsts;
// Here we have a SparseSet to hold which PhysRegs are actually encountered
// in the MF we are about to iterate over so that later when we call
// setPhysRegUsed, we are only doing it for physRegs that were actually found
// in the program and not for all of the possible physRegs for the given
// target architecture. If the target has a lot of physRegs, then for a small
// program there will be a significant compile time reduction here.
PhysRegs.clear();
PhysRegs.setUniverse(TRI->getNumRegs());
// A function with the uwtable attribute must guarantee that the stack
// unwinder can unwind to the previous frame. Thus, we can't apply the
// noreturn optimization if the caller function has the uwtable attribute.
bool HasUWTable = MF->getFunction()->hasFnAttribute(Attribute::UWTable);
for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
MBBI != MBBE; ++MBBI) {
DEBUG(MBBI->print(dbgs(), Indexes));
bool IsExitBB = MBBI->succ_empty();
for (MachineBasicBlock::instr_iterator
MII = MBBI->instr_begin(), MIE = MBBI->instr_end(); MII != MIE;) {
MachineInstr *MI = MII;
++MII;
// Check if this instruction is a call to a noreturn function. If this
// is a call to a noreturn function and we don't need the stack unwinding
// functionality (i.e. this function does not have uwtable attribute and
// the callee function has the nounwind attribute), then we can ignore
// the definitions set by this instruction.
if (!HasUWTable && IsExitBB && MI->isCall()) {
for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
MOE = MI->operands_end(); MOI != MOE; ++MOI) {
MachineOperand &MO = *MOI;
if (!MO.isGlobal())
continue;
const Function *Func = dyn_cast<Function>(MO.getGlobal());
if (!Func || !Func->hasFnAttribute(Attribute::NoReturn) ||
// We need to keep correct unwind information
// even if the function will not return, since the
// runtime may need it.
!Func->hasFnAttribute(Attribute::NoUnwind))
continue;
NoReturnInsts.insert(MI);
break;
}
}
for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
MOE = MI->operands_end(); MOI != MOE; ++MOI) {
MachineOperand &MO = *MOI;
@ -335,6 +375,15 @@ void VirtRegRewriter::rewrite() {
if (MO.isRegMask())
MRI->addPhysRegsUsedFromRegMask(MO.getRegMask());
// If we encounter a VirtReg or PhysReg then get at the PhysReg and add
// it to the physreg bitset. Later we use only the PhysRegs that were
// actually encountered in the MF to populate the MRI's used physregs.
if (MO.isReg() && MO.getReg())
PhysRegs.insert(
TargetRegisterInfo::isVirtualRegister(MO.getReg()) ?
VRM->getPhys(MO.getReg()) :
MO.getReg());
if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
continue;
unsigned VirtReg = MO.getReg();
@ -421,5 +470,29 @@ void VirtRegRewriter::rewrite() {
}
}
}
// Tell MRI about physical registers in use.
if (NoReturnInsts.empty()) {
for (SparseSet<unsigned>::iterator
RegI = PhysRegs.begin(), E = PhysRegs.end(); RegI != E; ++RegI)
if (!MRI->reg_nodbg_empty(*RegI))
MRI->setPhysRegUsed(*RegI);
} else {
for (SparseSet<unsigned>::iterator
I = PhysRegs.begin(), E = PhysRegs.end(); I != E; ++I) {
unsigned Reg = *I;
if (MRI->reg_nodbg_empty(Reg))
continue;
// Check if this register has a use that will impact the rest of the
// code. Uses in debug and noreturn instructions do not impact the
// generated code.
for (MachineInstr &It : MRI->reg_nodbg_instructions(Reg)) {
if (!NoReturnInsts.count(&It)) {
MRI->setPhysRegUsed(Reg);
break;
}
}
}
}
}

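The SparseSet comment above describes a classic compile-time trade: record only the physregs that actually occur in the function, then mark just those, rather than probing every register the target defines. A rough sketch with std::unordered_set standing in for LLVM's SparseSet:

    #include <unordered_set>
    #include <vector>

    // Mark as used only the registers that appeared as operands, instead
    // of iterating the target's full register file.
    void markEncounteredRegs(const std::vector<unsigned> &OperandRegs,
                             std::vector<bool> &UsedTable) {
      std::unordered_set<unsigned> Seen(OperandRegs.begin(),
                                        OperandRegs.end());
      for (unsigned Reg : Seen)
        if (Reg < UsedTable.size())
          UsedTable[Reg] = true;
    }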
View File

@ -180,10 +180,17 @@ uint64_t ExecutionEngineState::RemoveMapping(StringRef Name) {
}
std::string ExecutionEngine::getMangledName(const GlobalValue *GV) {
assert(GV->hasName() && "Global must have name.");
MutexGuard locked(lock);
Mangler Mang;
SmallString<128> FullName;
Mang.getNameWithPrefix(FullName, GV, false);
const DataLayout &DL =
GV->getParent()->getDataLayout().isDefault()
? *getDataLayout()
: GV->getParent()->getDataLayout();
Mangler::getNameWithPrefix(FullName, GV->getName(), DL);
return FullName.str();
}

View File

@ -266,6 +266,12 @@ void MCJIT::finalizeModule(Module *M) {
RuntimeDyld::SymbolInfo MCJIT::findExistingSymbol(const std::string &Name) {
SmallString<128> FullName;
Mangler::getNameWithPrefix(FullName, Name, *TM->getDataLayout());
if (void *Addr = getPointerToGlobalIfAvailable(FullName))
return RuntimeDyld::SymbolInfo(static_cast<uint64_t>(
reinterpret_cast<uintptr_t>(Addr)),
JITSymbolFlags::Exported);
return Dyld.getSymbol(FullName);
}

View File

@ -98,7 +98,7 @@ void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr,
uint64_t LoadAddr,
size_t Size) {
// On OS X, __register_frame takes a single FDE as an argument.
// See http://lists.cs.uiuc.edu/pipermail/llvmdev/2013-April/061768.html
// See http://lists.llvm.org/pipermail/llvm-dev/2013-April/061768.html
const char *P = (const char *)Addr;
const char *End = P + Size;
do {

View File

@ -613,6 +613,9 @@ bool StructType::isLayoutIdentical(StructType *Other) const {
if (isPacked() != Other->isPacked() ||
getNumElements() != Other->getNumElements())
return false;
if (!getNumElements())
return true;
return std::equal(element_begin(), element_end(), Other->element_begin());
}

View File

@ -57,7 +57,8 @@ void MemoryBuffer::init(const char *BufStart, const char *BufEnd,
/// CopyStringRef - Copies contents of a StringRef into a block of memory and
/// null-terminates it.
static void CopyStringRef(char *Memory, StringRef Data) {
memcpy(Memory, Data.data(), Data.size());
if (!Data.empty())
memcpy(Memory, Data.data(), Data.size());
Memory[Data.size()] = 0; // Null terminate string.
}

View File

@ -593,6 +593,7 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
if (Change) {
Substs[MO.getReg()] = Reg;
MO.setReg(Reg);
MRI->setPhysRegUsed(Reg);
Changed = true;
}

View File

@ -354,6 +354,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
if (NumBytes && NeedsRealignment) {
// Use the first callee-saved register as a scratch register.
scratchSPReg = AArch64::X9;
MF.getRegInfo().setPhysRegUsed(scratchSPReg);
}
// If we're a leaf function, try using the red zone.

View File

@ -123,6 +123,11 @@ def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
"true",
"VI SGPR initilization bug requiring a fixed SGPR allocation size">;
def FeatureEnableHugeScratchBuffer : SubtargetFeature<"huge-scratch-buffer",
"EnableHugeScratchBuffer",
"true",
"Enable scratch buffer sizes greater than 128 GB">;
class SubtargetFeatureFetchLimit <string Value> :
SubtargetFeature <"fetch"#Value,
"TexVTXClauseSize",

View File

@ -1029,6 +1029,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &SLC, SDValue &TFE) const {
SDValue Ptr, Offen, Idxen, Addr64;
// The addr64 bit was removed for Volcanic Islands.
if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return false;
SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
GLC, SLC, TFE);
@ -1095,13 +1099,16 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
// (add n0, c1)
if (CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
if (isLegalMUBUFImmOffset(C1)) {
VAddr = Addr.getOperand(0);
ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
return true;
// Offsets in vaddr must be positive.
if (CurDAG->SignBitIsZero(N0)) {
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
if (isLegalMUBUFImmOffset(C1)) {
VAddr = N0;
ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
return true;
}
}
}

View File

@ -73,7 +73,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
IsaVersion(ISAVersion0_0_0),
IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),
FrameLowering(TargetFrameLowering::StackGrowsUp,
64 * 16, // Maximum stack alignment (long16)
0),

View File

@ -89,6 +89,7 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
bool FeatureDisable;
int LDSBankCount;
unsigned IsaVersion;
bool EnableHugeScratchBuffer;
AMDGPUFrameLowering FrameLowering;
std::unique_ptr<AMDGPUTargetLowering> TLInfo;
@ -271,6 +272,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
return DevName;
}
bool enableHugeScratchBuffer() const {
return EnableHugeScratchBuffer;
}
bool dumpCode() const {
return DumpCode;
}

View File

@ -1719,7 +1719,6 @@ MachineBasicBlock *
AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
MachineBasicBlock *LoopHeader = LoopRep->getHeader();
MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch();
const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
if (!LoopHeader || !LoopLatch)
return nullptr;
@ -1732,18 +1731,9 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
FuncRep->push_back(DummyExitBlk); //insert to function
SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
MachineBasicBlock::iterator I = BranchMI;
unsigned ImmReg = FuncRep->getRegInfo().createVirtualRegister(I32RC);
llvm_unreachable("Extra register needed to handle CFG");
MachineInstr *NewMI = insertInstrBefore(I, AMDGPU::BRANCH_COND_i32);
MachineInstrBuilder MIB(*FuncRep, NewMI);
MIB.addMBB(LoopHeader);
MIB.addReg(ImmReg, false);
SHOWNEWINSTR(NewMI);
BranchMI->eraseFromParent();
LoopLatch->addSuccessor(DummyExitBlk);
return DummyExitBlk;
LLVMContext &Ctx = LoopHeader->getParent()->getFunction()->getContext();
Ctx.emitError("Extra register needed to handle CFG");
return nullptr;
}
void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {

View File

@ -138,3 +138,7 @@ def : ProcessorModel<"iceland", SIQuarterSpeedModel,
def : ProcessorModel<"carrizo", SIQuarterSpeedModel,
[FeatureVolcanicIslands, FeatureISAVersion8_0_1]
>;
def : ProcessorModel<"fiji", SIQuarterSpeedModel,
[FeatureVolcanicIslands, FeatureISAVersion8_0_1]
>;

View File

@ -254,6 +254,12 @@ bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
return false;
}
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
// Flat instructions do not have offsets, and only have the register
// address.
return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
}
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
@ -263,8 +269,21 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
switch (AS) {
case AMDGPUAS::GLOBAL_ADDRESS:
case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
// Assume that we will use FLAT for all global memory accesses
// on VI.
// FIXME: This assumption is currently wrong. On VI we still use
// MUBUF instructions for the r + i addressing mode. As currently
// implemented, the MUBUF instructions only work on buffer < 4GB.
// It may be possible to support > 4GB buffers with MUBUF instructions,
// by setting the stride value in the resource descriptor which would
// increase the size limit to (stride * 4GB). However, this is risky,
// because it has never been validated.
return isLegalFlatAddressingMode(AM);
}
// fall-through
case AMDGPUAS::PRIVATE_ADDRESS:
case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: {
// MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
// additionally can do r + r + i with addr64. 32-bit has more addressing
@ -324,11 +343,9 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return false;
}
case AMDGPUAS::FLAT_ADDRESS: {
// Flat instructions do not have offsets, and only have the register
// address.
return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
}
case AMDGPUAS::FLAT_ADDRESS:
return isLegalFlatAddressingMode(AM);
default:
llvm_unreachable("unhandled address space");
}
@ -812,10 +829,29 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FINode->getIndex();
return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
// A FrameIndex node represents a 32-bit offset into scratch memory. If
// the high bit of a frame index offset were to be set, this would mean
// that it represented an offset of ~2GB * 64 = ~128GB from the start of the
// scratch buffer, with 64 being the number of threads per wave.
//
// If we know the machine uses less than 128GB of scratch, then we can
// mark the high bit of the FrameIndex node as known zero,
// which is important, because it means in most situations we can
// prove that values derived from FrameIndex nodes are non-negative.
// This enables us to take advantage of more addressing modes when
// accessing scratch buffers, since for scratch reads/writes, the register
// offset must always be positive.
SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
if (Subtarget->enableHugeScratchBuffer())
return TFI;
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));
}
/// This transforms the control flow intrinsics to get the branch destination as
@ -2034,6 +2070,13 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
}
}
static bool isFrameIndexOp(SDValue Op) {
if (Op.getOpcode() == ISD::AssertZext)
Op = Op.getOperand(0);
return isa<FrameIndexSDNode>(Op);
}
/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
@ -2042,7 +2085,7 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
if (!isa<FrameIndexSDNode>(Node->getOperand(i))) {
if (!isFrameIndexOp(Node->getOperand(i))) {
Ops.push_back(Node->getOperand(i));
continue;
}

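The 128GB figure in the comment above is just sign-bit arithmetic: a 32-bit per-lane offset with its high bit clear stays below 2^31 bytes, and scratch is allocated per wave of 64 lanes. A self-checking restatement:

    #include <cassert>
    #include <cstdint>

    int main() {
      // If a per-lane frame-index offset ever had bit 31 set, it would be
      // at least 2 GB; across a 64-lane wave the scratch buffer would
      // therefore have to exceed 2 GB * 64 = 128 GB.
      const uint64_t perLane = uint64_t(1) << 31; // 2 GB
      const uint64_t lanesPerWave = 64;
      assert(perLane * lanesPerWave == uint64_t(128) << 30); // 128 GB
    }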
View File

@ -56,6 +56,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
public:
SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI);

View File

@ -1600,12 +1600,14 @@ multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern,
SIMCInstr <opName#"_e32", SISubtarget.SI> {
let Defs = !if(DefExec, [EXEC], []);
let hasSideEffects = DefExec;
let AssemblerPredicates = [isSICI];
}
def _vi : VOPC<op.VI, ins, asm, []>,
SIMCInstr <opName#"_e32", SISubtarget.VI> {
let Defs = !if(DefExec, [EXEC], []);
let hasSideEffects = DefExec;
let AssemblerPredicates = [isVI];
}
}

View File

@ -2910,9 +2910,6 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, constant_load>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, constant_load>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>;
} // End Predicates = [isSICI]
class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat <
@ -3273,13 +3270,13 @@ def : Pat <
(f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
(f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
(V_CNDMASK_B64_PSEUDO
$x,
(V_MIN_F64
SRCMODS.NONE,
(V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
SRCMODS.NONE,
(V_MOV_B64_PSEUDO 0x3fefffffffffffff),
DSTCLAMP.NONE, DSTOMOD.NONE),
$x,
(V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/))
>;
@ -3291,13 +3288,13 @@ def : Pat <
$x,
SRCMODS.NEG,
(V_CNDMASK_B64_PSEUDO
$x,
(V_MIN_F64
SRCMODS.NONE,
(V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
SRCMODS.NONE,
(V_MOV_B64_PSEUDO 0x3fefffffffffffff),
DSTCLAMP.NONE, DSTOMOD.NONE),
$x,
(V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)),
DSTCLAMP.NONE, DSTOMOD.NONE)
>;

View File

@ -53,6 +53,7 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
if (!LaneVGPRs.count(LaneVGPRIdx)) {
unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
MRI.setPhysRegUsed(LaneVGPR);
// Add this register as live-in to all blocks to avoid machine verifier
// complaining about use of an undefined physical register.

View File

@ -91,6 +91,7 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
if (ScratchOffsetReg != AMDGPU::NoRegister) {
// Found an SGPR to use
MRI.setPhysRegUsed(ScratchOffsetReg);
BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg)
.addReg(ScratchOffsetPreloadReg);
} else {

View File

@ -348,7 +348,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
&AMDGPU::SReg_128RegClass,
&AMDGPU::VReg_256RegClass,
&AMDGPU::SReg_256RegClass,
&AMDGPU::VReg_512RegClass
&AMDGPU::VReg_512RegClass,
&AMDGPU::SReg_512RegClass
};
for (const TargetRegisterClass *BaseClass : BaseClasses) {
@ -499,7 +500,7 @@ unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
I != E; ++I) {
if (MRI.reg_nodbg_empty(*I))
if (!MRI.isPhysRegUsed(*I))
return *I;
}
return AMDGPU::NoRegister;

View File

@ -103,4 +103,46 @@ def : Pat <
(S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
>;
// Patterns for global loads with no offset
class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
(vt (node i64:$addr)),
(inst $addr, 0, 0, 0)
>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>;
class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
(node vt:$data, i64:$addr),
(inst $data, $addr, 0, 0, 0)
>;
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>;
def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>;
def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>;
def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>;
def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>;
class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
(vt (node i64:$addr, vt:$data)),
(inst $addr, $data, 0, 0)
>;
def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
} // End Predicates = [isVI]

View File

@ -4583,6 +4583,12 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
SDLoc dl(Op);
if (CmpVT.getVectorElementType() == MVT::i64)
// 64-bit comparisons are not legal. We've marked SETCC as non-Custom,
// but it's possible that our operands are 64-bit but our result is 32-bit.
// Bail in this case.
return SDValue();
if (Op1.getValueType().isFloatingPoint()) {
switch (SetCCOpcode) {
default: llvm_unreachable("Illegal FP comparison");

View File

@ -118,7 +118,6 @@ namespace {
};
SpecificBumpPtrAllocator<MergeCandidate> Allocator;
SmallVector<const MergeCandidate*,4> Candidates;
SmallVector<MachineInstr*,4> MergeBaseCandidates;
void moveLiveRegsBefore(const MachineBasicBlock &MBB,
MachineBasicBlock::const_iterator Before);
@ -141,7 +140,6 @@ namespace {
MachineBasicBlock::iterator &MBBI);
bool MergeBaseUpdateLoadStore(MachineInstr *MI);
bool MergeBaseUpdateLSMultiple(MachineInstr *MI);
bool MergeBaseUpdateLSDouble(MachineInstr &MI) const;
bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
};
@ -933,6 +931,11 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1)
CanMergeToLSMulti = false;
// LDRD/STRD do not allow SP/PC. LDM/STM either do not support it or
// have it deprecated; LDM to PC is fine but cannot happen here.
if (PReg == ARM::SP || PReg == ARM::PC)
CanMergeToLSMulti = CanMergeToLSDouble = false;
// Merge following instructions where possible.
for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
int NewOffset = MemOps[I].Offset;
@ -940,16 +943,15 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
break;
const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI);
unsigned Reg = MO.getReg();
unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
if (Reg == ARM::SP || Reg == ARM::PC)
break;
// See if the current load/store may be part of a multi load/store.
unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
bool PartOfLSMulti = CanMergeToLSMulti;
if (PartOfLSMulti) {
// Cannot load from SP
if (Reg == ARM::SP)
PartOfLSMulti = false;
// Register numbers must be in ascending order.
else if (RegNum <= PRegNum)
if (RegNum <= PRegNum)
PartOfLSMulti = false;
// For VFP / NEON load/store multiples, the registers must be
// consecutive and within the limit on the number of registers per
@ -993,6 +995,76 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
} while (SIndex < EIndex);
}
static bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
unsigned Bytes, unsigned Limit,
ARMCC::CondCodes Pred, unsigned PredReg) {
unsigned MyPredReg = 0;
if (!MI)
return false;
bool CheckCPSRDef = false;
switch (MI->getOpcode()) {
default: return false;
case ARM::tSUBi8:
case ARM::t2SUBri:
case ARM::SUBri:
CheckCPSRDef = true;
break;
case ARM::tSUBspi:
break;
}
// Make sure the offset fits in 8 bits.
if (Bytes == 0 || (Limit && Bytes >= Limit))
return false;
unsigned Scale = (MI->getOpcode() == ARM::tSUBspi ||
MI->getOpcode() == ARM::tSUBi8) ? 4 : 1; // FIXME
if (!(MI->getOperand(0).getReg() == Base &&
MI->getOperand(1).getReg() == Base &&
(MI->getOperand(2).getImm() * Scale) == Bytes &&
getInstrPredicate(MI, MyPredReg) == Pred &&
MyPredReg == PredReg))
return false;
return CheckCPSRDef ? !definesCPSR(MI) : true;
}
static bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
unsigned Bytes, unsigned Limit,
ARMCC::CondCodes Pred, unsigned PredReg) {
unsigned MyPredReg = 0;
if (!MI)
return false;
bool CheckCPSRDef = false;
switch (MI->getOpcode()) {
default: return false;
case ARM::tADDi8:
case ARM::t2ADDri:
case ARM::ADDri:
CheckCPSRDef = true;
break;
case ARM::tADDspi:
break;
}
// Make sure the offset fits in 8 bits.
if (Bytes == 0 || (Limit && Bytes >= Limit))
return false;
unsigned Scale = (MI->getOpcode() == ARM::tADDspi ||
MI->getOpcode() == ARM::tADDi8) ? 4 : 1; // FIXME
if (!(MI->getOperand(0).getReg() == Base &&
MI->getOperand(1).getReg() == Base &&
(MI->getOperand(2).getImm() * Scale) == Bytes &&
getInstrPredicate(MI, MyPredReg) == Pred &&
MyPredReg == PredReg))
return false;
return CheckCPSRDef ? !definesCPSR(MI) : true;
}
static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
ARM_AM::AMSubMode Mode) {
switch (Opc) {
@ -1060,75 +1132,6 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
}
}
/// Check if the given instruction increments or decrements a register and
/// return the amount it is incremented/decremented. Returns 0 if the CPSR flags
/// generated by the instruction are possibly read as well.
static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
ARMCC::CondCodes Pred, unsigned PredReg) {
bool CheckCPSRDef;
int Scale;
switch (MI.getOpcode()) {
case ARM::tADDi8: Scale = 4; CheckCPSRDef = true; break;
case ARM::tSUBi8: Scale = -4; CheckCPSRDef = true; break;
case ARM::t2SUBri:
case ARM::SUBri: Scale = -1; CheckCPSRDef = true; break;
case ARM::t2ADDri:
case ARM::ADDri: Scale = 1; CheckCPSRDef = true; break;
case ARM::tADDspi: Scale = 4; CheckCPSRDef = false; break;
case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break;
default: return 0;
}
unsigned MIPredReg;
if (MI.getOperand(0).getReg() != Reg ||
MI.getOperand(1).getReg() != Reg ||
getInstrPredicate(&MI, MIPredReg) != Pred ||
MIPredReg != PredReg)
return 0;
if (CheckCPSRDef && definesCPSR(&MI))
return 0;
return MI.getOperand(2).getImm() * Scale;
}
/// Searches for an increment or decrement of \p Reg before \p MBBI.
static MachineBasicBlock::iterator
findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
Offset = 0;
MachineBasicBlock &MBB = *MBBI->getParent();
MachineBasicBlock::iterator BeginMBBI = MBB.begin();
MachineBasicBlock::iterator EndMBBI = MBB.end();
if (MBBI == BeginMBBI)
return EndMBBI;
// Skip debug values.
MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI)
--PrevMBBI;
Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg);
return Offset == 0 ? EndMBBI : PrevMBBI;
}
/// Searches for an increment or decrement of \p Reg after \p MBBI.
static MachineBasicBlock::iterator
findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg,
ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
Offset = 0;
MachineBasicBlock &MBB = *MBBI->getParent();
MachineBasicBlock::iterator EndMBBI = MBB.end();
MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
// Skip debug values.
while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
++NextMBBI;
if (NextMBBI == EndMBBI)
return EndMBBI;
Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg);
return Offset == 0 ? EndMBBI : NextMBBI;
}
/// Fold preceding/trailing inc/dec of base register into the
/// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
///
@ -1148,6 +1151,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
const MachineOperand &BaseOP = MI->getOperand(0);
unsigned Base = BaseOP.getReg();
bool BaseKill = BaseOP.isKill();
unsigned Bytes = getLSMultipleTransferSize(MI);
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
unsigned Opcode = MI->getOpcode();
@ -1159,24 +1163,49 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
if (MI->getOperand(i).getReg() == Base)
return false;
int Bytes = getLSMultipleTransferSize(MI);
MachineBasicBlock &MBB = *MI->getParent();
MachineBasicBlock::iterator MBBI(MI);
int Offset;
MachineBasicBlock::iterator MergeInstr
= findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
bool DoMerge = false;
ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);
if (Mode == ARM_AM::ia && Offset == -Bytes) {
Mode = ARM_AM::db;
} else if (Mode == ARM_AM::ib && Offset == -Bytes) {
Mode = ARM_AM::da;
} else {
MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) &&
((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes))
return false;
// Try merging with the previous instruction.
MachineBasicBlock &MBB = *MI->getParent();
MachineBasicBlock::iterator BeginMBBI = MBB.begin();
MachineBasicBlock::iterator MBBI(MI);
if (MBBI != BeginMBBI) {
MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
--PrevMBBI;
if (Mode == ARM_AM::ia &&
isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
Mode = ARM_AM::db;
DoMerge = true;
} else if (Mode == ARM_AM::ib &&
isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
Mode = ARM_AM::da;
DoMerge = true;
}
if (DoMerge)
MBB.erase(PrevMBBI);
}
MBB.erase(MergeInstr);
// Try merging with the next instruction.
MachineBasicBlock::iterator EndMBBI = MBB.end();
if (!DoMerge && MBBI != EndMBBI) {
MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
++NextMBBI;
if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
DoMerge = true;
} else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
DoMerge = true;
}
if (DoMerge)
MBB.erase(NextMBBI);
}
if (!DoMerge)
return false;
unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
@ -1254,6 +1283,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
unsigned Base = getLoadStoreBaseOp(*MI).getReg();
bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
unsigned Bytes = getLSMultipleTransferSize(MI);
unsigned Opcode = MI->getOpcode();
DebugLoc DL = MI->getDebugLoc();
bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
@ -1265,6 +1295,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
return false;
bool isLd = isLoadSingle(Opcode);
// Can't do the merge if the destination register is the same as the would-be
// writeback register.
if (MI->getOperand(0).getReg() == Base)
@ -1272,31 +1303,55 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
int Bytes = getLSMultipleTransferSize(MI);
bool DoMerge = false;
ARM_AM::AddrOpc AddSub = ARM_AM::add;
unsigned NewOpc = 0;
// AM2 - 12 bits, thumb2 - 8 bits.
unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
// Try merging with the previous instruction.
MachineBasicBlock &MBB = *MI->getParent();
MachineBasicBlock::iterator BeginMBBI = MBB.begin();
MachineBasicBlock::iterator MBBI(MI);
int Offset;
MachineBasicBlock::iterator MergeInstr
= findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
unsigned NewOpc;
if (!isAM5 && Offset == Bytes) {
NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
} else if (Offset == -Bytes) {
NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
} else {
MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
if (Offset == Bytes) {
NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
} else if (!isAM5 && Offset == -Bytes) {
NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
} else
return false;
if (MBBI != BeginMBBI) {
MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
--PrevMBBI;
if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
DoMerge = true;
AddSub = ARM_AM::sub;
} else if (!isAM5 &&
isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
DoMerge = true;
}
if (DoMerge) {
NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub);
MBB.erase(PrevMBBI);
}
}
MBB.erase(MergeInstr);
ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add;
// Try merging with the next instruction.
MachineBasicBlock::iterator EndMBBI = MBB.end();
if (!DoMerge && MBBI != EndMBBI) {
MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
++NextMBBI;
if (!isAM5 &&
isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
DoMerge = true;
AddSub = ARM_AM::sub;
} else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
DoMerge = true;
}
if (DoMerge) {
NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub);
MBB.erase(NextMBBI);
}
}
if (!DoMerge)
return false;
bool isLd = isLoadSingle(Opcode);
if (isAM5) {
// VLDM[SD]_UPD, VSTM[SD]_UPD
// (There are no base-updating versions of VLDR/VSTR instructions, but the
@ -1313,16 +1368,18 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
if (isAM2) {
// LDR_PRE, LDR_POST
if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
.addReg(Base, RegState::Define)
.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
} else {
int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
.addReg(Base, RegState::Define)
.addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
.addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
}
} else {
int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
// t2LDR_PRE, t2LDR_POST
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
.addReg(Base, RegState::Define)
@ -1334,12 +1391,13 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
// the vestigial zero-reg offset register. When that's fixed, this clause
// can be removed entirely.
if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
// STR_PRE, STR_POST
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
.addReg(MO.getReg(), getKillRegState(MO.isKill()))
.addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
.addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
} else {
int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
// t2STR_PRE, t2STR_POST
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
.addReg(MO.getReg(), getKillRegState(MO.isKill()))
@ -1351,66 +1409,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
return true;
}
bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) &&
"Must have t2STRDi8 or t2LDRDi8");
if (MI.getOperand(3).getImm() != 0)
return false;
// Behaviour for writeback is undefined if base register is the same as one
// of the others.
const MachineOperand &BaseOp = MI.getOperand(2);
unsigned Base = BaseOp.getReg();
const MachineOperand &Reg0Op = MI.getOperand(0);
const MachineOperand &Reg1Op = MI.getOperand(1);
if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base)
return false;
unsigned PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg);
MachineBasicBlock::iterator MBBI(MI);
MachineBasicBlock &MBB = *MI.getParent();
int Offset;
MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred,
PredReg, Offset);
unsigned NewOpc;
if (Offset == 8 || Offset == -8) {
NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE;
} else {
MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
if (Offset == 8 || Offset == -8) {
NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST;
} else
return false;
}
MBB.erase(MergeInstr);
DebugLoc DL = MI.getDebugLoc();
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) {
MIB.addOperand(Reg0Op).addOperand(Reg1Op)
.addReg(BaseOp.getReg(), RegState::Define);
} else {
assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST);
MIB.addReg(BaseOp.getReg(), RegState::Define)
.addOperand(Reg0Op).addOperand(Reg1Op);
}
MIB.addReg(BaseOp.getReg(), RegState::Kill)
.addImm(Offset).addImm(Pred).addReg(PredReg);
assert(TII->get(Opcode).getNumOperands() == 6 &&
TII->get(NewOpc).getNumOperands() == 7 &&
"Unexpected number of operands in Opcode specification.");
// Transfer implicit operands.
for (const MachineOperand &MO : MI.implicit_operands())
MIB.addOperand(MO);
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MBB.erase(MBBI);
return true;
}
/// Returns true if instruction is a memory operation that this pass is capable
/// of operating on.
static bool isMemoryOp(const MachineInstr *MI) {
@ -1618,7 +1616,6 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
ARMCC::CondCodes CurrPred = ARMCC::AL;
unsigned Position = 0;
assert(Candidates.size() == 0);
assert(MergeBaseCandidates.size() == 0);
LiveRegsValid = false;
for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin();
@ -1697,15 +1694,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
MBBI = I;
--Position;
// Fallthrough to look into existing chain.
} else if (MBBI->isDebugValue()) {
} else if (MBBI->isDebugValue())
continue;
} else if (MBBI->getOpcode() == ARM::t2LDRDi8 ||
MBBI->getOpcode() == ARM::t2STRDi8) {
// ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions
// remember them because we may still be able to merge add/sub into them.
MergeBaseCandidates.push_back(MBBI);
}
// If we are here then the chain is broken; Extract candidates for a merge.
if (MemOps.size() > 0) {
@ -1736,9 +1726,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
if (Merged) {
Changed = true;
unsigned Opcode = Merged->getOpcode();
if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8)
MergeBaseUpdateLSDouble(*Merged);
else
if (Opcode != ARM::t2STRDi8 && Opcode != ARM::t2LDRDi8)
MergeBaseUpdateLSMultiple(Merged);
} else {
for (MachineInstr *MI : Candidate->Instrs) {
@ -1753,10 +1741,6 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
}
}
Candidates.clear();
// Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt.
for (MachineInstr *MI : MergeBaseCandidates)
MergeBaseUpdateLSDouble(*MI);
MergeBaseCandidates.clear();
return Changed;
}

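The isMatchingIncrement/isMatchingDecrement helpers restored above implement a simple pattern: look next to a load/store for "Base += K" or "Base -= K" under the same predicate, so the update can be folded into a pre- or post-indexed instruction. A stripped-down sketch of the matching step (made-up opcode enum, no predicate or CPSR handling):

    #include <cstddef>
    #include <vector>

    enum Opcode { ADDri, SUBri, OTHER };
    struct Instr { Opcode Op; unsigned Dst, Src; int Imm; };

    // Return +K / -K if the instruction just before index I adjusts Base
    // in place by K, or 0 if there is nothing to fold.
    int incDecBefore(const std::vector<Instr> &Block, std::size_t I,
                     unsigned Base) {
      if (I == 0)
        return 0;
      const Instr &P = Block[I - 1];
      if (P.Dst != Base || P.Src != Base)
        return 0;
      if (P.Op == ADDri) return P.Imm;
      if (P.Op == SUBri) return -P.Imm;
      return 0;
    }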
View File

@ -566,7 +566,7 @@ Robert Muth started working on an alternate jump table implementation that
does not put the tables in-line in the text. This is more like the llvm
default jump table implementation. This might be useful sometime. Several
revisions of patches are on the mailing list, beginning at:
http://lists.cs.uiuc.edu/pipermail/llvmdev/2009-June/022763.html
http://lists.llvm.org/pipermail/llvm-dev/2009-June/022763.html
//===---------------------------------------------------------------------===//

View File

@ -57,7 +57,7 @@ void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Some things to try that should be better:
// * 'mov hi, $src; mov $dst, hi', with hi as either r10 or r11
// * 'movs $dst, $src' if cpsr isn't live
// See: http://lists.cs.uiuc.edu/pipermail/llvmdev/2014-August/075998.html
// See: http://lists.llvm.org/pipermail/llvm-dev/2014-August/075998.html
// 'MOV lo, lo' is unpredictable on < v6, so use the stack to do it
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tPUSH)))

View File

@ -864,13 +864,13 @@ static bool needToReserveScavengingSpillSlots(MachineFunction &MF,
// Check for an unused caller-saved register.
for ( ; *CallerSavedRegs; ++CallerSavedRegs) {
MCPhysReg FreeReg = *CallerSavedRegs;
if (!MRI.reg_nodbg_empty(FreeReg))
if (MRI.isPhysRegUsed(FreeReg))
continue;
// Check aliased register usage.
bool IsCurrentRegUsed = false;
for (MCRegAliasIterator AI(FreeReg, &HRI, false); AI.isValid(); ++AI)
if (!MRI.reg_nodbg_empty(*AI)) {
if (MRI.isPhysRegUsed(*AI)) {
IsCurrentRegUsed = true;
break;
}

View File

@ -500,14 +500,6 @@ def : MipsPat<(trunc (assertzext GPR64:$src)),
def : MipsPat<(i32 (trunc GPR64:$src)),
(SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>;
// Bypass trunc nodes for bitwise ops.
def : MipsPat<(i32 (trunc (and GPR64:$lhs, GPR64:$rhs))),
(EXTRACT_SUBREG (AND64 GPR64:$lhs, GPR64:$rhs), sub_32)>;
def : MipsPat<(i32 (trunc (or GPR64:$lhs, GPR64:$rhs))),
(EXTRACT_SUBREG (OR64 GPR64:$lhs, GPR64:$rhs), sub_32)>;
def : MipsPat<(i32 (trunc (xor GPR64:$lhs, GPR64:$rhs))),
(EXTRACT_SUBREG (XOR64 GPR64:$lhs, GPR64:$rhs), sub_32)>;
// variable shift instructions patterns
def : MipsPat<(shl GPR64:$rt, (i32 (trunc GPR64:$rs))),
(DSLLV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;

View File

@ -267,6 +267,9 @@ unsigned MipsFastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
}
unsigned MipsFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
if (!TargetSupported)
return 0;
assert(TLI.getValueType(DL, AI->getType(), true) == MVT::i32 &&
"Alloca should always return a pointer.");
@ -290,12 +293,7 @@ unsigned MipsFastISel::materializeInt(const Constant *C, MVT VT) {
return 0;
const TargetRegisterClass *RC = &Mips::GPR32RegClass;
const ConstantInt *CI = cast<ConstantInt>(C);
int64_t Imm;
if ((VT != MVT::i1) && CI->isNegative())
Imm = CI->getSExtValue();
else
Imm = CI->getZExtValue();
return materialize32BitInt(Imm, RC);
return materialize32BitInt(CI->getZExtValue(), RC);
}
unsigned MipsFastISel::materialize32BitInt(int64_t Imm,
@ -382,6 +380,9 @@ unsigned MipsFastISel::materializeExternalCallSym(MCSymbol *Sym) {
// Materialize a constant into a register, and return the register
// number (or zero if we failed to handle it).
unsigned MipsFastISel::fastMaterializeConstant(const Constant *C) {
if (!TargetSupported)
return 0;
EVT CEVT = TLI.getValueType(DL, C->getType(), true);
// Only handle simple types.
@ -981,6 +982,13 @@ bool MipsFastISel::selectSelect(const Instruction *I) {
if (!Src1Reg || !Src2Reg || !CondReg)
return false;
unsigned ZExtCondReg = createResultReg(&Mips::GPR32RegClass);
if (!ZExtCondReg)
return false;
if (!emitIntExt(MVT::i1, CondReg, MVT::i32, ZExtCondReg, true))
return false;
unsigned ResultReg = createResultReg(RC);
unsigned TempReg = createResultReg(RC);
@ -989,7 +997,7 @@ bool MipsFastISel::selectSelect(const Instruction *I) {
emitInst(TargetOpcode::COPY, TempReg).addReg(Src2Reg);
emitInst(CondMovOpc, ResultReg)
.addReg(Src1Reg).addReg(CondReg).addReg(TempReg);
.addReg(Src1Reg).addReg(ZExtCondReg).addReg(TempReg);
updateValueMap(I, ResultReg);
return true;
}
@ -1232,12 +1240,19 @@ bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT,
}
bool MipsFastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (!TargetSupported)
return false;
CallingConv::ID CC = CLI.CallConv;
bool IsTailCall = CLI.IsTailCall;
bool IsVarArg = CLI.IsVarArg;
const Value *Callee = CLI.Callee;
MCSymbol *Symbol = CLI.Symbol;
// Do not handle FastCC.
if (CC == CallingConv::Fast)
return false;
// Allow SelectionDAG isel to handle tail calls.
if (IsTailCall)
return false;
@ -1312,6 +1327,9 @@ bool MipsFastISel::fastLowerCall(CallLoweringInfo &CLI) {
}
bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (!TargetSupported)
return false;
switch (II->getIntrinsicID()) {
default:
return false;
@ -1415,6 +1433,11 @@ bool MipsFastISel::selectRet(const Instruction *I) {
if (Ret->getNumOperands() > 0) {
CallingConv::ID CC = F.getCallingConv();
// Do not handle FastCC.
if (CC == CallingConv::Fast)
return false;
SmallVector<ISD::OutputArg, 4> Outs;
GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);

View File

@ -27,6 +27,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallingConv.h"
@ -53,11 +54,6 @@ NoZeroDivCheck("mno-check-zero-division", cl::Hidden,
cl::desc("MIPS: Don't trap on integer division by zero."),
cl::init(false));
cl::opt<bool>
EnableMipsFastISel("mips-fast-isel", cl::Hidden,
cl::desc("Allow mips-fast-isel to be used"),
cl::init(false));
static const MCPhysReg Mips64DPRegs[8] = {
Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64,
Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64
@ -461,7 +457,7 @@ const MipsTargetLowering *MipsTargetLowering::create(const MipsTargetMachine &TM
FastISel *
MipsTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const {
if (!EnableMipsFastISel)
if (!funcInfo.MF->getTarget().Options.EnableFastISel)
return TargetLowering::createFastISel(funcInfo, libInfo);
return Mips::createFastISel(funcInfo, libInfo);
}

View File

@ -12,6 +12,7 @@
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@ -22,10 +23,12 @@ typedef MCDisassembler::DecodeStatus DecodeStatus;
namespace {
class PPCDisassembler : public MCDisassembler {
bool IsLittleEndian;
public:
PPCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
: MCDisassembler(STI, Ctx) {}
~PPCDisassembler() override {}
PPCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
bool IsLittleEndian)
: MCDisassembler(STI, Ctx), IsLittleEndian(IsLittleEndian) {}
DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
@ -37,7 +40,13 @@ class PPCDisassembler : public MCDisassembler {
static MCDisassembler *createPPCDisassembler(const Target &T,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
return new PPCDisassembler(STI, Ctx);
return new PPCDisassembler(STI, Ctx, /*IsLittleEndian=*/false);
}
static MCDisassembler *createPPCLEDisassembler(const Target &T,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
return new PPCDisassembler(STI, Ctx, /*IsLittleEndian=*/true);
}
extern "C" void LLVMInitializePowerPCDisassembler() {
@ -47,7 +56,7 @@ extern "C" void LLVMInitializePowerPCDisassembler() {
TargetRegistry::RegisterMCDisassembler(ThePPC64Target,
createPPCDisassembler);
TargetRegistry::RegisterMCDisassembler(ThePPC64LETarget,
createPPCDisassembler);
createPPCLEDisassembler);
}
// FIXME: These can be generated by TableGen from the existing register
@ -383,9 +392,9 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return MCDisassembler::Fail;
}
// The instruction is big-endian encoded.
uint32_t Inst =
(Bytes[0] << 24) | (Bytes[1] << 16) | (Bytes[2] << 8) | (Bytes[3] << 0);
// Read the instruction in the proper endianness.
uint32_t Inst = IsLittleEndian ? support::endian::read32le(Bytes.data())
: support::endian::read32be(Bytes.data());
if (STI.getFeatureBits()[PPC::FeatureQPX]) {
DecodeStatus result =
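
The real helpers here are support::endian::read32le and read32be from Support/Endian.h, included above. A standalone sketch of the byte swapping they perform (plain C++; the bytes are just an example word):

#include <cstdint>
#include <cstdio>

static uint32_t read32be(const uint8_t *B) {
  return (uint32_t(B[0]) << 24) | (uint32_t(B[1]) << 16) |
         (uint32_t(B[2]) << 8) | uint32_t(B[3]);
}
static uint32_t read32le(const uint8_t *B) {
  return (uint32_t(B[3]) << 24) | (uint32_t(B[2]) << 16) |
         (uint32_t(B[1]) << 8) | uint32_t(B[0]);
}

int main() {
  const uint8_t Bytes[4] = {0x7C, 0x08, 0x02, 0xA6};
  // The same four bytes yield different instruction words per byte order.
  printf("BE: 0x%08X  LE: 0x%08X\n", read32be(Bytes), read32le(Bytes));
}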

View File

@ -363,71 +363,85 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
SM.recordPatchPoint(MI);
PatchPointOpers Opers(&MI);
int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
unsigned EncodedBytes = 0;
if (CallTarget) {
assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
"High 16 bits of call target should be zero.");
unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
EncodedBytes = 0;
// Materialize the jump address:
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI8)
.addReg(ScratchReg)
.addImm((CallTarget >> 32) & 0xFFFF));
++EncodedBytes;
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::RLDIC)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm(32).addImm(16));
++EncodedBytes;
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORIS8)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm((CallTarget >> 16) & 0xFFFF));
++EncodedBytes;
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORI8)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm(CallTarget & 0xFFFF));
const MachineOperand &CalleeMO =
Opers.getMetaOper(PatchPointOpers::TargetPos);
// Save the current TOC pointer before the remote call.
int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40;
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::STD)
.addReg(PPC::X2)
.addImm(TOCSaveOffset)
.addReg(PPC::X1));
++EncodedBytes;
if (CalleeMO.isImm()) {
int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
if (CallTarget) {
assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
"High 16 bits of call target should be zero.");
unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
EncodedBytes = 0;
// Materialize the jump address:
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI8)
.addReg(ScratchReg)
.addImm((CallTarget >> 32) & 0xFFFF));
++EncodedBytes;
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::RLDIC)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm(32).addImm(16));
++EncodedBytes;
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORIS8)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm((CallTarget >> 16) & 0xFFFF));
++EncodedBytes;
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORI8)
.addReg(ScratchReg)
.addReg(ScratchReg)
.addImm(CallTarget & 0xFFFF));
// Save the current TOC pointer before the remote call.
int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40;
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::STD)
.addReg(PPC::X2)
.addImm(TOCSaveOffset)
.addReg(PPC::X1));
++EncodedBytes;
// If we're on ELFv1, then we need to load the actual function pointer from
// the function descriptor.
if (!Subtarget->isELFv2ABI()) {
// Load the new TOC pointer and the function address, but not r11
// (needing this is rare, and loading it here would prevent passing it
// via a 'nest' parameter).

// If we're on ELFv1, then we need to load the actual function pointer
// from the function descriptor.
if (!Subtarget->isELFv2ABI()) {
// Load the new TOC pointer and the function address, but not r11
// (needing this is rare, and loading it here would prevent passing it
// via a 'nest' parameter).
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD)
.addReg(PPC::X2)
.addImm(8)
.addReg(ScratchReg));
++EncodedBytes;
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD)
.addReg(ScratchReg)
.addImm(0)
.addReg(ScratchReg));
++EncodedBytes;
}
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR8)
.addReg(ScratchReg));
++EncodedBytes;
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTRL8));
++EncodedBytes;
// Restore the TOC pointer after the call.
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD)
.addReg(PPC::X2)
.addImm(8)
.addReg(ScratchReg));
++EncodedBytes;
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD)
.addReg(ScratchReg)
.addImm(0)
.addReg(ScratchReg));
.addImm(TOCSaveOffset)
.addReg(PPC::X1));
++EncodedBytes;
}
} else if (CalleeMO.isGlobal()) {
const GlobalValue *GValue = CalleeMO.getGlobal();
MCSymbol *MOSymbol = getSymbol(GValue);
const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, OutContext);
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR8).addReg(ScratchReg));
++EncodedBytes;
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTRL8));
++EncodedBytes;
// Restore the TOC pointer after the call.
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD)
.addReg(PPC::X2)
.addImm(TOCSaveOffset)
.addReg(PPC::X1));
++EncodedBytes;
EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL8_NOP)
.addExpr(SymVar));
EncodedBytes += 2;
}
// Each instruction is 4 bytes.
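
The materialization sequence assembles the 48-bit call target from three 16-bit pieces. A standalone recomputation of what the li/rldic/oris/ori sequence leaves in the scratch register (plain C++; the address is a made-up example):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t CallTarget = 0x123456789ABCULL; // high 16 bits must be zero
  assert((CallTarget & 0xFFFFFFFFFFFFULL) == CallTarget);
  uint64_t Scratch = (CallTarget >> 32) & 0xFFFF; // LI8: bits 47:32
  Scratch <<= 32;                                 // RLDIC 32,16: shift into place
  Scratch |= ((CallTarget >> 16) & 0xFFFF) << 16; // ORIS8: bits 31:16
  Scratch |= CallTarget & 0xFFFF;                 // ORI8: bits 15:0
  assert(Scratch == CallTarget);
}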

View File

@ -306,10 +306,9 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) {
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
DebugLoc dl = MI->getDebugLoc();
const MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned UsedRegMask = 0;
for (unsigned i = 0; i != 32; ++i)
if (MRI.isPhysRegModified(VRRegNo[i]))
if (MF->getRegInfo().isPhysRegUsed(VRRegNo[i]))
UsedRegMask |= 1 << (31-i);
// Live in and live out values already must be in the mask, so don't bother
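
The mask layout assumed by the shift is that vector register i occupies bit 31 - i, putting v0 in the most significant position. A quick standalone check (plain C++; which registers are "used" is made up):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t UsedRegMask = 0;
  for (unsigned i = 0; i != 32; ++i)
    if (i == 0 || i == 31)         // pretend only v0 and v31 are used
      UsedRegMask |= 1u << (31 - i);
  assert(UsedRegMask == 0x80000001u); // v0 -> bit 31, v31 -> bit 0
}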

View File

@ -2305,14 +2305,15 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
if (Swap)
std::swap(LHS, RHS);
EVT ResVT = VecVT.changeVectorElementTypeToInteger();
if (Negate) {
SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0);
SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, ResVT, LHS, RHS), 0);
return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR :
PPC::VNOR,
VecVT, VCmp, VCmp);
ResVT, VCmp, VCmp);
}
return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS);
return CurDAG->SelectNodeTo(N, VCmpInst, ResVT, LHS, RHS);
}
if (PPCSubTarget->useCRBits())
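
The fix selects the compare into its integer-element companion type rather than the floating-point operand type. A minimal sketch of that type relationship, assuming it is built against LLVM headers of this vintage:

#include "llvm/CodeGen/ValueTypes.h"
#include <cassert>
using namespace llvm;

int main() {
  EVT VecVT = MVT::v4f32;
  // A vector FP compare produces a mask with same-width integer elements.
  EVT ResVT = VecVT.changeVectorElementTypeToInteger();
  assert(ResVT == EVT(MVT::v4i32));
}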

View File

@ -580,6 +580,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
addRegisterClass(MVT::f64, &PPC::VSFRCRegClass);
addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass);
addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);
addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);
@ -1416,7 +1417,7 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
} else
return -1;
if (ShuffleKind == 2 && isLE)
if (isLE)
ShiftAmt = 16 - ShiftAmt;
return ShiftAmt;
@ -1429,6 +1430,11 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
assert(N->getValueType(0) == MVT::v16i8 &&
(EltSize == 1 || EltSize == 2 || EltSize == 4));
// The consecutive indices need to specify an element, not part of two
// different elements. So abandon ship early if this isn't the case.
if (N->getMaskElt(0) % EltSize != 0)
return false;
// This is a splat operation if each element of the permute is the same, and
// if the value doesn't reference the second vector.
unsigned ElementBase = N->getMaskElt(0);
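
A standalone illustration of the new guard (plain C++; the mask index is a made-up example): with 4-byte elements, a first byte index that is not a multiple of 4 pulls bytes from two different elements, so the mask can never be an element splat.

#include <cassert>

int main() {
  unsigned EltSize = 4;
  unsigned FirstIdx = 6; // bytes 6..9 straddle 32-bit elements 1 and 2
  assert(FirstIdx % EltSize != 0);
  assert(FirstIdx / EltSize != (FirstIdx + EltSize - 1) / EltSize);
}
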
@ -7011,17 +7017,20 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// t = vsplti c, result = vsldoi t, t, 1
if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG, dl);
unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
}
// t = vsplti c, result = vsldoi t, t, 2
if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG, dl);
unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
}
// t = vsplti c, result = vsldoi t, t, 3
if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG, dl);
unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
}
}
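
On little-endian subtargets the amount flips to 16 - Amt because vsldoi of a value with itself is a byte rotation, and the reversed lane order needs the inverse rotation. A standalone model of the rotation algebra (plain C++, not the ISel code):

#include <array>
#include <cassert>
#include <cstdint>

using V16 = std::array<uint8_t, 16>;

// Models vsldoi t,t,Amt as a rotate of the 16 bytes by Amt.
static V16 rotl(const V16 &A, unsigned Amt) {
  V16 R{};
  for (unsigned i = 0; i < 16; ++i)
    R[i] = A[(i + Amt) % 16];
  return R;
}

int main() {
  V16 T{};
  for (unsigned i = 0; i < 16; ++i)
    T[i] = uint8_t(i);
  // Rotating by Amt and then by 16 - Amt round-trips: the two amounts
  // are inverse rotations of each other.
  assert(rotl(rotl(T, 3), 13) == T);
}
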
@ -9957,6 +9966,9 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
if (Src.getValueType() == MVT::f32) {
Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
DCI.AddToWorklist(Src.getNode());
} else if (Src.getValueType() != MVT::f64) {
// Make sure that we don't pick up a ppc_fp128 source value.
return SDValue();
}
unsigned FCTOp =

View File

@ -106,7 +106,7 @@ for 1,2,4,8 bytes.
//===---------------------------------------------------------------------===//
It would be nice to revert this patch:
http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
And teach the dag combiner enough to simplify the code expanded before
legalize. It seems plausible that this knowledge would let it simplify other

View File

@ -190,11 +190,11 @@ static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI)
{
for (unsigned reg = SP::I0; reg <= SP::I7; ++reg)
if (!MRI->reg_nodbg_empty(reg))
if (MRI->isPhysRegUsed(reg))
return false;
for (unsigned reg = SP::L0; reg <= SP::L7; ++reg)
if (!MRI->reg_nodbg_empty(reg))
if (MRI->isPhysRegUsed(reg))
return false;
return true;
@ -206,10 +206,10 @@ bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const
MachineRegisterInfo &MRI = MF.getRegInfo();
MachineFrameInfo *MFI = MF.getFrameInfo();
return !(MFI->hasCalls() // has calls
|| !MRI.reg_nodbg_empty(SP::L0) // Too many registers needed
|| !MRI.reg_nodbg_empty(SP::O6) // %SP is used
|| hasFP(MF)); // need %FP
return !(MFI->hasCalls() // has calls
|| MRI.isPhysRegUsed(SP::L0) // Too many registers needed
|| MRI.isPhysRegUsed(SP::O6) // %SP is used
|| hasFP(MF)); // need %FP
}
void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
@ -218,13 +218,16 @@ void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
// Remap %i[0-7] to %o[0-7].
for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
if (MRI.reg_nodbg_empty(reg))
if (!MRI.isPhysRegUsed(reg))
continue;
unsigned mapped_reg = (reg - SP::I0 + SP::O0);
assert(MRI.reg_nodbg_empty(mapped_reg));
assert(!MRI.isPhysRegUsed(mapped_reg));
// Replace I register with O register.
MRI.replaceRegWith(reg, mapped_reg);
// Mark the reg unused.
MRI.setPhysRegUnused(reg);
}
// Rewrite MBB's Live-ins.

View File

@ -53,10 +53,6 @@ def RetCC_SystemZ : CallingConv<[
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>
// ABI-compliant code returns long double by reference, but that conversion
// is left to higher-level code. Perhaps we could add an f128 definition
// here for code that doesn't care about the ABI?
]>;
//===----------------------------------------------------------------------===//

View File

@ -1175,6 +1175,20 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
return Chain;
}
bool SystemZTargetLowering::
CanLowerReturn(CallingConv::ID CallConv,
MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const {
// Detect unsupported vector return types.
if (Subtarget.hasVector())
VerifyVectorTypes(Outs);
SmallVector<CCValAssign, 16> RetLocs;
CCState RetCCInfo(CallConv, isVarArg, MF, RetLocs, Context);
return RetCCInfo.CheckReturn(Outs, RetCC_SystemZ);
}
SDValue
SystemZTargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool IsVarArg,

View File

@ -423,6 +423,10 @@ class SystemZTargetLowering : public TargetLowering {
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,

View File

@ -681,6 +681,9 @@ class X86AsmParser : public MCTargetAsmParser {
std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc);
std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc);
void AddDefaultSrcDestOperands(
OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src,
std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst);
std::unique_ptr<X86Operand> ParseOperand();
std::unique_ptr<X86Operand> ParseATTOperand();
std::unique_ptr<X86Operand> ParseIntelOperand();
@ -1014,6 +1017,19 @@ std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
Loc, Loc, 0);
}
void X86AsmParser::AddDefaultSrcDestOperands(
OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src,
std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst) {
if (isParsingIntelSyntax()) {
Operands.push_back(std::move(Dst));
Operands.push_back(std::move(Src));
}
else {
Operands.push_back(std::move(Src));
Operands.push_back(std::move(Dst));
}
}
std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() {
if (isParsingIntelSyntax())
return ParseIntelOperand();
@ -2228,26 +2244,18 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
if (Name.startswith("ins") && Operands.size() == 1 &&
(Name == "insb" || Name == "insw" || Name == "insl" ||
Name == "insd" )) {
if (isParsingIntelSyntax()) {
Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
Operands.push_back(DefaultMemDIOperand(NameLoc));
} else {
Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
Operands.push_back(DefaultMemDIOperand(NameLoc));
}
AddDefaultSrcDestOperands(Operands,
X86Operand::CreateReg(X86::DX, NameLoc, NameLoc),
DefaultMemDIOperand(NameLoc));
}
// Append default arguments to "outs[bwld]"
if (Name.startswith("outs") && Operands.size() == 1 &&
(Name == "outsb" || Name == "outsw" || Name == "outsl" ||
Name == "outsd" )) {
if (isParsingIntelSyntax()) {
Operands.push_back(DefaultMemSIOperand(NameLoc));
Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
} else {
Operands.push_back(DefaultMemSIOperand(NameLoc));
Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
}
AddDefaultSrcDestOperands(Operands,
DefaultMemSIOperand(NameLoc),
X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
}
// Transform "lods[bwlq]" into "lods[bwlq] ($SIREG)" for appropriate
@ -2279,13 +2287,9 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
(Name == "cmps" || Name == "cmpsb" || Name == "cmpsw" ||
Name == "cmpsl" || Name == "cmpsd" || Name == "cmpsq")) {
if (Operands.size() == 1) {
if (isParsingIntelSyntax()) {
Operands.push_back(DefaultMemSIOperand(NameLoc));
Operands.push_back(DefaultMemDIOperand(NameLoc));
} else {
Operands.push_back(DefaultMemDIOperand(NameLoc));
Operands.push_back(DefaultMemSIOperand(NameLoc));
}
AddDefaultSrcDestOperands(Operands,
DefaultMemDIOperand(NameLoc),
DefaultMemSIOperand(NameLoc));
} else if (Operands.size() == 3) {
X86Operand &Op = (X86Operand &)*Operands[1];
X86Operand &Op2 = (X86Operand &)*Operands[2];
@ -2305,13 +2309,9 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
if (Operands.size() == 1) {
if (Name == "movsd")
Operands.back() = X86Operand::CreateToken("movsl", NameLoc);
if (isParsingIntelSyntax()) {
Operands.push_back(DefaultMemDIOperand(NameLoc));
Operands.push_back(DefaultMemSIOperand(NameLoc));
} else {
Operands.push_back(DefaultMemSIOperand(NameLoc));
Operands.push_back(DefaultMemDIOperand(NameLoc));
}
AddDefaultSrcDestOperands(Operands,
DefaultMemSIOperand(NameLoc),
DefaultMemDIOperand(NameLoc));
} else if (Operands.size() == 3) {
X86Operand &Op = (X86Operand &)*Operands[1];
X86Operand &Op2 = (X86Operand &)*Operands[2];

View File

@ -301,9 +301,8 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) {
bool FPIsUsed = false;
static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!");
const MachineRegisterInfo &MRI = MF.getRegInfo();
for (unsigned i = 0; i <= 6; ++i)
if (!MRI.reg_nodbg_empty(X86::FP0 + i)) {
if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) {
FPIsUsed = true;
break;
}

View File

@ -1682,6 +1682,8 @@ void X86FrameLowering::adjustForSegmentedStacks(
.addImm(StackSize);
BuildMI(allocMBB, DL, TII.get(MOVri), Reg11)
.addImm(X86FI->getArgumentStackSize());
MF.getRegInfo().setPhysRegUsed(Reg10);
MF.getRegInfo().setPhysRegUsed(Reg11);
} else {
BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
.addImm(X86FI->getArgumentStackSize());

View File

@ -12640,24 +12640,29 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
if (User->getOpcode() == ISD::FNEG)
return Op;
SDValue Op0 = Op.getOperand(0);
bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
// Assume scalar op for initialization; update for vector if needed.
// Note that there are no scalar bitwise logical SSE/AVX instructions, so we
// generate a 16-byte vector constant and logic op even for the scalar case.
// Using a 16-byte mask allows folding the load of the mask with
// the logic op, so it can save (~4 bytes) on code size.
MVT EltVT = VT;
unsigned NumElts = VT == MVT::f64 ? 2 : 4;
// FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
// decide if we should generate a 16-byte constant mask when we only need 4 or
// 8 bytes for the scalar case.
MVT LogicVT;
MVT EltVT;
unsigned NumElts;
if (VT.isVector()) {
LogicVT = VT;
EltVT = VT.getVectorElementType();
NumElts = VT.getVectorNumElements();
} else {
// There are no scalar bitwise logical SSE/AVX instructions, so we
// generate a 16-byte vector constant and logic op even for the scalar case.
// Using a 16-byte mask allows folding the load of the mask with
// the logic op, so it can save (~4 bytes) on code size.
LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
EltVT = VT;
NumElts = (VT == MVT::f64) ? 2 : 4;
}
unsigned EltBits = EltVT.getSizeInBits();
@ -12670,26 +12675,25 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
SDValue Mask = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, Alignment);
if (VT.isVector()) {
// For a vector, cast operands to a vector type, perform the logic op,
// and cast the result back to the original value type.
MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
SDValue MaskCasted = DAG.getBitcast(VecVT, Mask);
SDValue Operand = IsFNABS ? DAG.getBitcast(VecVT, Op0.getOperand(0))
: DAG.getBitcast(VecVT, Op0);
unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
return DAG.getBitcast(VT,
DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
}
// If not vector, then scalar.
unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
SDValue Op0 = Op.getOperand(0);
bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
unsigned LogicOp =
IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
return DAG.getNode(BitOp, dl, VT, Operand, Mask);
if (VT.isVector())
return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
// For the scalar case extend to a 128-bit vector, perform the logic op,
// and extract the scalar result back out.
Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
DAG.getIntPtrConstant(0, dl));
}
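
Whether performed as a 16-byte vector op or extracted back to a scalar, the lowering rests on plain sign-bit algebra: fabs ANDs the sign bit away, fneg XORs it, and fnabs ORs it in. A standalone check of those identities (plain C++):

#include <cassert>
#include <cstdint>
#include <cstring>

static uint64_t bits(double D) { uint64_t U; std::memcpy(&U, &D, 8); return U; }
static double fromBits(uint64_t U) { double D; std::memcpy(&D, &U, 8); return D; }

int main() {
  const uint64_t SignMask = 1ULL << 63;
  double X = -3.5;
  assert(fromBits(bits(X) & ~SignMask) == 3.5);    // fabs: clear the sign bit
  assert(fromBits(bits(X) ^ SignMask) == 3.5);     // fneg: flip the sign bit
  assert(fromBits(bits(3.5) | SignMask) == -3.5);  // fnabs: set the sign bit
}
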
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
@ -12729,10 +12733,16 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
Constant *C = ConstantVector::get(CV);
auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
// Perform all logic operations as 16-byte vectors because there are no
// scalar FP logic instructions in SSE. This allows load folding of the
// constants into the logic instructions.
MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
SDValue Mask1 = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
// Next, clear the sign bit from the first operand (magnitude).
// If it's a constant, we can clear it here.
@ -12740,7 +12750,8 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
APFloat APF = Op0CN->getValueAPF();
// If the magnitude is a positive zero, the sign bit alone is enough.
if (APF.isPosZero())
return SignBit;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
DAG.getIntPtrConstant(0, dl));
APF.clearSign();
CV[0] = ConstantFP::get(*Context, APF);
} else {
@ -12750,15 +12761,18 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
}
C = ConstantVector::get(CV);
CPIdx = DAG.getConstantPool(C, PtrVT, 16);
SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
SDValue Val = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
// If the magnitude operand wasn't a constant, we need to AND out the sign.
if (!isa<ConstantFPSDNode>(Op0))
Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
if (!isa<ConstantFPSDNode>(Op0)) {
Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
}
// OR the magnitude value with the sign bit.
return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {

View File

@ -956,18 +956,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
{ X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
// FIXME: We should not be folding Fs* scalar loads into vector
// instructions because the vector instructions require vector-sized
// loads. Lowering should create vector-sized instructions (the Fv*
// variants below) to allow load folding.
{ X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 },
{ X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 },
{ X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 },
{ X86::FsANDPSrr, X86::FsANDPSrm, TB_ALIGN_16 },
{ X86::FsORPDrr, X86::FsORPDrm, TB_ALIGN_16 },
{ X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 },
{ X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 },
{ X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 },
// Do not fold Fs* scalar logical op loads because there are no scalar
// load variants for these instructions. When folded, the load is required
// to be 128 bits wide, so the load size would not match.
{ X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 },
{ X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 },

View File

@ -2919,6 +2919,14 @@ multiclass sse12_fp_packed_vector_logical_alias<
defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>,
PD, VEX_4V;
defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>,
PS, VEX_4V, VEX_L;
defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>,
PD, VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {

View File

@ -93,7 +93,8 @@ static Value *getFCmpValue(bool isordered, unsigned code,
case 5: Pred = isordered ? FCmpInst::FCMP_ONE : FCmpInst::FCMP_UNE; break;
case 6: Pred = isordered ? FCmpInst::FCMP_OLE : FCmpInst::FCMP_ULE; break;
case 7:
if (!isordered) return ConstantInt::getTrue(LHS->getContext());
if (!isordered)
return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1);
Pred = FCmpInst::FCMP_ORD; break;
}
return Builder->CreateFCmp(Pred, LHS, RHS);
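
The point of the fix is that an "always true" result must match the compare's result type, which is a vector of i1 when the operands are vectors; the old getTrue(Context) was always plain i1. A minimal sketch, assuming LLVM headers of this vintage are available:

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Type *ScalarTy = Type::getFloatTy(Ctx);
  Type *VecTy = VectorType::get(ScalarTy, 4);
  // i1 for a scalar compare, <4 x i1> for a vector compare:
  Type *VecRes = CmpInst::makeCmpResultType(VecTy);
  // "All true" in the correct shape (a splat for the vector case).
  Constant *TrueVal = ConstantInt::get(VecRes, 1);
  (void)TrueVal;
}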

View File

@ -2112,9 +2112,8 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,
Value *RHS, Instruction &OrigI,
Value *&Result, Constant *&Overflow) {
assert((!OrigI.isCommutative() ||
!(isa<Constant>(LHS) && !isa<Constant>(RHS))) &&
"call with a constant RHS if possible!");
if (OrigI.isCommutative() && isa<Constant>(LHS) && !isa<Constant>(RHS))
std::swap(LHS, RHS);
auto SetResult = [&](Value *OpResult, Constant *OverflowVal, bool ReuseName) {
Result = OpResult;

View File

@ -658,7 +658,7 @@ bool EarlyCSE::run() {
// gains over vector when the container becomes very large due to the
// specific access patterns. For more information see the mailing list
// discussion on this:
// http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html
// http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html
std::deque<StackNode *> nodesToProcess;
bool Changed = false;

View File

@ -1847,10 +1847,17 @@ static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
if (OldTy == NewTy)
return true;
if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy))
if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy))
if (NewITy->getBitWidth() >= OldITy->getBitWidth())
return true;
// For integer types, we can't handle any bit-width differences. This would
// break both vector conversions with extension and introduce endianness
// issues when used in conjunction with loads and stores.
if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
assert(cast<IntegerType>(OldTy)->getBitWidth() !=
cast<IntegerType>(NewTy)->getBitWidth() &&
"We can't have the same bitwidth for different int types");
return false;
}
if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy))
return false;
if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
@ -1885,10 +1892,8 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
if (OldTy == NewTy)
return V;
if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy))
if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy))
if (NewITy->getBitWidth() > OldITy->getBitWidth())
return IRB.CreateZExt(V, NewITy);
assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
"Integer types must be the exact same to convert.");
// See if we need inttoptr for this type pair. A cast involving both scalars
// and vectors requires an additional bitcast.
@ -2134,6 +2139,9 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
if (LI->isVolatile())
return false;
// We can't handle loads that extend past the allocated memory.
if (DL.getTypeStoreSize(LI->getType()) > Size)
return false;
// Note that we don't count vector loads or stores as whole-alloca
// operations which enable integer widening because we would prefer to use
// vector widening instead.
@ -2152,6 +2160,9 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
Type *ValueTy = SI->getValueOperand()->getType();
if (SI->isVolatile())
return false;
// We can't handle stores that extend past the allocated memory.
if (DL.getTypeStoreSize(ValueTy) > Size)
return false;
// Note that we don't count vector loads or stores as whole-alloca
// operations which enable integer widening because we would prefer to use
// vector widening instead.
@ -2585,6 +2596,7 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
: LI.getType();
const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize;
bool IsPtrAdjusted = false;
Value *V;
if (VecTy) {
@ -2592,13 +2604,27 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
} else if (IntTy && LI.getType()->isIntegerTy()) {
V = rewriteIntegerLoad(LI);
} else if (NewBeginOffset == NewAllocaBeginOffset &&
canConvertValue(DL, NewAllocaTy, LI.getType())) {
NewEndOffset == NewAllocaEndOffset &&
(canConvertValue(DL, NewAllocaTy, TargetTy) ||
(IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
TargetTy->isIntegerTy()))) {
LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
LI.isVolatile(), LI.getName());
if (LI.isVolatile())
NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
V = NewLI;
// If this is an integer load past the end of the slice (which means the
// bytes outside the slice are undef or this load is dead) just forcibly
// fix the integer size with correct handling of endianness.
if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
if (AITy->getBitWidth() < TITy->getBitWidth()) {
V = IRB.CreateZExt(V, TITy, "load.ext");
if (DL.isBigEndian())
V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
"endian_shift");
}
} else {
Type *LTy = TargetTy->getPointerTo();
LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
@ -2718,10 +2744,25 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
if (IntTy && V->getType()->isIntegerTy())
return rewriteIntegerStore(V, SI);
const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize;
StoreInst *NewSI;
if (NewBeginOffset == NewAllocaBeginOffset &&
NewEndOffset == NewAllocaEndOffset &&
canConvertValue(DL, V->getType(), NewAllocaTy)) {
(canConvertValue(DL, V->getType(), NewAllocaTy) ||
(IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
V->getType()->isIntegerTy()))) {
// If this is an integer store past the end of the slice (and thus the bytes
// past that point are irrelevant or this is unreachable), truncate the
// value prior to storing.
if (auto *VITy = dyn_cast<IntegerType>(V->getType()))
if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
if (VITy->getBitWidth() > AITy->getBitWidth()) {
if (DL.isBigEndian())
V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(),
"endian_shift");
V = IRB.CreateTrunc(V, AITy, "load.trunc");
}
V = convertValue(DL, IRB, V, NewAllocaTy);
NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
SI.isVolatile());
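
The endian_shift exists because on a big-endian target the defined bytes of a narrow slice sit in the high end of the widened integer, while on little-endian a plain zext or trunc already lines up. A standalone illustration (plain C++; the values are made up):

#include <cassert>
#include <cstdint>

int main() {
  uint16_t Slice = 0xBEEF;                            // the two defined bytes
  uint32_t BigEndianWidened = uint32_t(Slice) << 16;  // shl by 32 - 16
  assert(BigEndianWidened == 0xBEEF0000u);            // bytes stay low-addressed
  uint32_t LittleEndianWidened = Slice;               // plain zext suffices
  assert(LittleEndianWidened == 0x0000BEEFu);
}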

View File

@ -227,10 +227,16 @@ Value *Scatterer::operator[](unsigned I) {
if (!Idx)
break;
unsigned J = Idx->getZExtValue();
CV[J] = Insert->getOperand(1);
V = Insert->getOperand(0);
if (I == J)
if (I == J) {
CV[J] = Insert->getOperand(1);
return CV[J];
} else if (!CV[J]) {
// Only cache the first entry we find for each index we're not actively
// searching for. This prevents us from going too far up the chain and
// caching incorrect entries.
CV[J] = Insert->getOperand(1);
}
}
CV[I] = Builder.CreateExtractElement(V, Builder.getInt32(I),
V->getName() + ".i" + Twine(I));
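
In other words, walking the insertelement chain from the newest instruction downward, only the first value seen for a lane is live; caching anything found deeper would record an overwritten value. A plain C++ model (the chain contents are made up):

#include <cassert>
#include <utility>
#include <vector>

int main() {
  // Chain recorded newest-first: lane 0 written twice, newest value is 7.
  std::vector<std::pair<unsigned, int>> Chain = {{0, 7}, {0, 3}, {1, 5}};
  int CV[4];
  bool Seen[4] = {};
  for (const auto &E : Chain)
    if (!Seen[E.first]) { // cache only the first (newest) entry per lane
      CV[E.first] = E.second;
      Seen[E.first] = true;
    }
  assert(CV[0] == 7 && CV[1] == 5); // the stale 3 is never cached
}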

View File

@ -228,3 +228,51 @@ define i32 @test12(i32 %x, i32 %y, i8* %p) nounwind {
; CHECK-LABEL: @test12(
; CHECK: ret i32 %r
}
@P = internal global i32 715827882, align 4
@Q = internal global i32 715827883, align 4
@.str = private unnamed_addr constant [7 x i8] c"%u %u\0A\00", align 1
; Make sure we recognize that u[0] and u[Global + Cst] may alias
; when the addition has wrapping semantics.
; PR24468.
; CHECK-LABEL: @test13(
; Make sure the stores appear before the related loads.
; CHECK: store i8 42,
; CHECK: store i8 99,
; Find the loads and make sure they are used in the arguments to the printf.
; CHECK: [[T0ADDR:%[a-zA-Z0-9_]+]] = getelementptr inbounds [3 x i8], [3 x i8]* %t, i32 0, i32 0
; CHECK: [[T0:%[a-zA-Z0-9_]+]] = load i8, i8* [[T0ADDR]], align 1
; CHECK: [[T0ARG:%[a-zA-Z0-9_]+]] = zext i8 [[T0]] to i32
; CHECK: [[U0ADDR:%[a-zA-Z0-9_]+]] = getelementptr inbounds [3 x i8], [3 x i8]* %u, i32 0, i32 0
; CHECK: [[U0:%[a-zA-Z0-9_]+]] = load i8, i8* [[U0ADDR]], align 1
; CHECK: [[U0ARG:%[a-zA-Z0-9_]+]] = zext i8 [[U0]] to i32
; CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), i32 [[T0ARG]], i32 [[U0ARG]])
; CHECK: ret
define void @test13() {
entry:
%t = alloca [3 x i8], align 1
%u = alloca [3 x i8], align 1
%tmp = load i32, i32* @P, align 4
%tmp1 = mul i32 %tmp, 3
%mul = add i32 %tmp1, -2147483646
%idxprom = zext i32 %mul to i64
%arrayidx = getelementptr inbounds [3 x i8], [3 x i8]* %t, i64 0, i64 %idxprom
store i8 42, i8* %arrayidx, align 1
%tmp2 = load i32, i32* @Q, align 4
%tmp3 = mul i32 %tmp2, 3
%mul2 = add i32 %tmp3, 2147483647
%idxprom3 = zext i32 %mul2 to i64
%arrayidx4 = getelementptr inbounds [3 x i8], [3 x i8]* %u, i64 0, i64 %idxprom3
store i8 99, i8* %arrayidx4, align 1
%arrayidx5 = getelementptr inbounds [3 x i8], [3 x i8]* %t, i64 0, i64 0
%tmp4 = load i8, i8* %arrayidx5, align 1
%conv = zext i8 %tmp4 to i32
%arrayidx6 = getelementptr inbounds [3 x i8], [3 x i8]* %u, i64 0, i64 0
%tmp5 = load i8, i8* %arrayidx6, align 1
%conv7 = zext i8 %tmp5 to i32
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i64 0, i64 0), i32 %conv, i32 %conv7)
ret void
}
declare i32 @printf(i8*, ...)
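
For reference, both wrapped index computations in this test come out to zero, so the stores really do write t[0] and u[0]. A standalone arithmetic check (plain C++, using the test's constants):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t P = 715827882, Q = 715827883;
  uint32_t IdxT = P * 3 + uint32_t(-2147483646); // wraps around to 0
  uint32_t IdxU = Q * 3 + 2147483647u;           // also wraps around to 0
  assert(IdxT == 0 && IdxU == 0);
}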

View File

@ -39,7 +39,6 @@ return:
; CHECK-LABEL: pr18068
; CHECK: MayAlias: i32* %0, i32* %arrayidx5
; CHECK: NoAlias: i32* %arrayidx13, i32* %arrayidx5
define i32 @pr18068(i32* %jj7, i32* %j) {
entry:

Some files were not shown because too many files have changed in this diff.