Vendor import of llvm release_39 branch r278877:
https://llvm.org/svn/llvm-project/llvm/branches/release_39@278877
commit a7fe922b98
parent c3aee98e72
@@ -293,6 +293,7 @@ endif()
 option(LLVM_ENABLE_CXX1Y "Compile with C++1y enabled." OFF)
 option(LLVM_ENABLE_LIBCXX "Use libc++ if available." OFF)
 option(LLVM_ENABLE_LIBCXXABI "Use libc++abi when using libc++." OFF)
+option(LLVM_ENABLE_LLD "Use lld as C and C++ linker." OFF)
 option(LLVM_ENABLE_PEDANTIC "Compile with pedantic enabled." ON)
 option(LLVM_ENABLE_WERROR "Fail and stop if a warning is triggered." OFF)
@@ -61,8 +61,6 @@ licenses, and/or restrictions:

 Program             Directory
 -------             ---------
-Autoconf            llvm/autoconf
-                    llvm/projects/ModuleMaker/autoconf
 Google Test         llvm/utils/unittest/googletest
 OpenBSD regex       llvm/lib/Support/{reg*, COPYRIGHT.regex}
 pyyaml tests        llvm/test/YAMLParser/{*.data, LICENSE.TXT}
@@ -144,6 +144,12 @@ function(add_flag_or_print_warning flag name)
   endif()
 endfunction()

+if(LLVM_ENABLE_LLD)
+  check_cxx_compiler_flag("-fuse-ld=lld" CXX_SUPPORTS_LLD)
+  append_if(CXX_SUPPORTS_LLD "-fuse-ld=lld"
+    CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
+endif()
+
 if( LLVM_ENABLE_PIC )
   if( XCODE )
     # Xcode has -mdynamic-no-pic on by default, which overrides -fPIC. I don't
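As a rough, hedged sketch of how the new option would be consumed at configure
time (the build directory layout and generator below are assumptions, not part
of this import):

.. code-block:: console

  $ cmake -G Ninja -DLLVM_ENABLE_LLD=ON ../llvm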
@@ -436,7 +436,7 @@ For example, consider this simple LLVM example:
 The X86 instruction selector might produce this machine code for the ``div`` and
 ``ret``:

-.. code-block:: llvm
+.. code-block:: text

 ;; Start of div
 %EAX = mov %reg1024 ;; Copy X (in reg1024) into EAX

@@ -453,7 +453,7 @@ By the end of code generation, the register allocator would coalesce the
 registers and delete the resultant identity moves producing the following
 code:

-.. code-block:: llvm
+.. code-block:: text

 ;; X is in EAX, Y is in ECX
 mov %EAX, %EDX

@@ -965,7 +965,7 @@ target code. For example, consider the following LLVM fragment:

 This LLVM code corresponds to a SelectionDAG that looks basically like this:

-.. code-block:: llvm
+.. code-block:: text

 (fadd:f32 (fmul:f32 (fadd:f32 W, X), Y), Z)
@@ -144,7 +144,7 @@ exists anywhere in the file.
 The FileCheck -check-prefix option
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The FileCheck :option:`-check-prefix` option allows multiple test
+The FileCheck `-check-prefix` option allows multiple test
 configurations to be driven from one `.ll` file. This is useful in many
 circumstances, for example, testing different architectural variants with
 :program:`llc`. Here's a simple example:
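A minimal sketch in the spirit of the example this hunk refers to (the march
values, prefixes and checked mnemonics are assumptions, not taken from this
diff):

.. code-block:: llvm

  ; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X32
  ; RUN: llc < %s -march=x86-64 | FileCheck %s -check-prefix=X64
  define i32 @f(i32 %a, i32 %b) {
    %r = add i32 %a, %b
    ret i32 %r
  }
  ; X32: retl
  ; X64: retq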
@@ -303,7 +303,7 @@ be aware that the definition rule can match `after` its use.

 So, for instance, the code below will pass:

-.. code-block:: llvm
+.. code-block:: text

 ; CHECK-DAG: vmov.32 [[REG2:d[0-9]+]][0]
 ; CHECK-DAG: vmov.32 [[REG2]][1]

@@ -312,7 +312,7 @@ So, for instance, the code below will pass:

 While this other code, will not:

-.. code-block:: llvm
+.. code-block:: text

 ; CHECK-DAG: vmov.32 [[REG2:d[0-9]+]][0]
 ; CHECK-DAG: vmov.32 [[REG2]][1]

@@ -473,7 +473,7 @@ To match newline characters in regular expressions the character class

 matches output of the form (from llvm-dwarfdump):

-.. code-block:: llvm
+.. code-block:: text

 DW_AT_location [DW_FORM_sec_offset] (0x00000233)
 DW_AT_name [DW_FORM_strp] ( .debug_str[0x000000c9] = "intd")
@@ -68,11 +68,11 @@ OPTIONS

 .. option:: -B (default)

-Use BSD output format. Alias for :option:`--format=bsd`.
+Use BSD output format. Alias for `--format=bsd`.

 .. option:: -P

-Use POSIX.2 output format. Alias for :option:`--format=posix`.
+Use POSIX.2 output format. Alias for `--format=posix`.

 .. option:: --debug-syms, -a
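For illustration only (the archive name is a placeholder), the two aliases
amount to:

.. code-block:: console

  $ llvm-nm --format=bsd libfoo.a     # same as -B, the default
  $ llvm-nm --format=posix libfoo.a   # same as -P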
@@ -12,16 +12,16 @@ DESCRIPTION
 The :program:`opt` command is the modular LLVM optimizer and analyzer. It
 takes LLVM source files as input, runs the specified optimizations or analyses
 on it, and then outputs the optimized file or the analysis results. The
-function of :program:`opt` depends on whether the :option:`-analyze` option is
+function of :program:`opt` depends on whether the `-analyze` option is
 given.

-When :option:`-analyze` is specified, :program:`opt` performs various analyses
+When `-analyze` is specified, :program:`opt` performs various analyses
 of the input source. It will usually print the results on standard output, but
 in a few cases, it will print output to standard error or generate a file with
 the analysis output, which is usually done when the output is meant for another
 program.

-While :option:`-analyze` is *not* given, :program:`opt` attempts to produce an
+While `-analyze` is *not* given, :program:`opt` attempts to produce an
 optimized output file. The optimizations available via :program:`opt` depend
 upon what libraries were linked into it as well as any additional libraries
 that have been loaded with the :option:`-load` option. Use the :option:`-help`
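A hedged sketch of the two modes described above (the pass selection and file
names are arbitrary examples, not taken from this diff):

.. code-block:: console

  $ opt -S -instcombine input.ll -o output.ll   # produce an optimized module
  $ opt -analyze -domtree input.ll              # print an analysis result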
@@ -68,19 +68,19 @@ OPTIONS

 .. option:: -disable-opt

-This option is only meaningful when :option:`-std-link-opts` is given. It
+This option is only meaningful when `-std-link-opts` is given. It
 disables most passes.

 .. option:: -strip-debug

 This option causes opt to strip debug information from the module before
-applying other optimizations. It is essentially the same as :option:`-strip`
+applying other optimizations. It is essentially the same as `-strip`
 but it ensures that stripping of debug information is done first.

 .. option:: -verify-each

 This option causes opt to add a verify pass after every pass otherwise
-specified on the command line (including :option:`-verify`). This is useful
+specified on the command line (including `-verify`). This is useful
 for cases where it is suspected that a pass is creating an invalid module but
 it is not clear which pass is doing it.
@@ -406,7 +406,7 @@ outlined. After the handler is outlined, this intrinsic is simply removed.
 ``llvm.eh.exceptionpointer``
 ----------------------------

-.. code-block:: llvm
+.. code-block:: text

 i8 addrspace(N)* @llvm.eh.padparam.pNi8(token %catchpad)

@@ -427,7 +427,7 @@ backend. Uses of them are generated by the backend's
 ``llvm.eh.sjlj.setjmp``
 ~~~~~~~~~~~~~~~~~~~~~~~

-.. code-block:: llvm
+.. code-block:: text

 i32 @llvm.eh.sjlj.setjmp(i8* %setjmp_buf)

@@ -664,7 +664,7 @@ all of the new IR instructions:
 return 0;
 }

-.. code-block:: llvm
+.. code-block:: text

 define i32 @f() nounwind personality i32 (...)* @__CxxFrameHandler3 {
 entry:

@@ -741,7 +741,7 @@ C++ code:
 }
 }

-.. code-block:: llvm
+.. code-block:: text

 define void @f() #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
 entry:
@@ -43,7 +43,7 @@ The following additional relocation types are supported:
 corresponds to the COFF relocation types ``IMAGE_REL_I386_DIR32NB`` (32-bit) or
 ``IMAGE_REL_AMD64_ADDR32NB`` (64-bit).

-.. code-block:: gas
+.. code-block:: text

 .text
 fun:

@@ -204,7 +204,7 @@ IR features is specified by the selected :ref:`GC strategy description
 Specifying GC code generation: ``gc "..."``
 -------------------------------------------

-.. code-block:: llvm
+.. code-block:: text

 define <returntype> @name(...) gc "name" { ... }

@@ -105,7 +105,7 @@ memory, or a global variable.

 To make this clear, let's consider a more obtuse example:

-.. code-block:: llvm
+.. code-block:: text

 %MyVar = uninitialized global i32
 ...
@@ -142,7 +142,7 @@ Quick answer: there are no superfluous indices.
 This question arises most often when the GEP instruction is applied to a global
 variable which is always a pointer type. For example, consider this:

-.. code-block:: llvm
+.. code-block:: text

 %MyStruct = uninitialized global { float*, i32 }
 ...

@@ -178,7 +178,7 @@ The GetElementPtr instruction dereferences nothing. That is, it doesn't access
 memory in any way. That's what the Load and Store instructions are for. GEP is
 only involved in the computation of addresses. For example, consider this:

-.. code-block:: llvm
+.. code-block:: text

 %MyVar = uninitialized global { [40 x i32 ]* }
 ...

@@ -195,7 +195,7 @@ illegal.
 In order to access the 18th integer in the array, you would need to do the
 following:

-.. code-block:: llvm
+.. code-block:: text

 %idx = getelementptr { [40 x i32]* }, { [40 x i32]* }* %, i64 0, i32 0
 %arr = load [40 x i32]** %idx

@@ -204,7 +204,7 @@ following:
 In this case, we have to load the pointer in the structure with a load
 instruction before we can index into the array. If the example was changed to:

-.. code-block:: llvm
+.. code-block:: text

 %MyVar = uninitialized global { [40 x i32 ] }
 ...
@@ -30,7 +30,7 @@ instructions with each other. These tables are emitted in the
 ``XXXInstrInfo.inc`` file along with the functions to query them. Following
 is the definition of ``InstrMapping`` class definied in Target.td file:

-.. code-block:: llvm
+.. code-block:: text

 class InstrMapping {
 // Used to reduce search space only to the instructions using this

@@ -69,7 +69,7 @@ non-predicated form by assigning appropriate values to the ``InstrMapping``
 fields. For this relationship, non-predicated instructions are treated as key
 instruction since they are the one used to query the interface function.

-.. code-block:: llvm
+.. code-block:: text

 def getPredOpcode : InstrMapping {
 // Choose a FilterClass that is used as a base class for all the

@@ -116,7 +116,7 @@ to include relevant information in its definition. For example, consider
 following to be the current definitions of ADD, ADD_pt (true) and ADD_pf (false)
 instructions:

-.. code-block:: llvm
+.. code-block:: text

 def ADD : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$a, IntRegs:$b),
 "$dst = add($a, $b)",

@@ -137,7 +137,7 @@ In this step, we modify these instructions to include the information
 required by the relationship model, <tt>getPredOpcode</tt>, so that they can
 be related.

-.. code-block:: llvm
+.. code-block:: text

 def ADD : PredRel, ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$a, IntRegs:$b),
 "$dst = add($a, $b)",

@@ -41,7 +41,7 @@ that passes two default-constructed ``Foo`` objects to ``g`` in the
 g(Foo(), Foo());
 }

-.. code-block:: llvm
+.. code-block:: text

 %struct.Foo = type { i32, i32 }
 declare void @Foo_ctor(%struct.Foo* %this)
docs/LangRef.rst
@@ -839,7 +839,7 @@ Note that the Mach-O platform doesn't support COMDATs and ELF only supports
 Here is an example of a COMDAT group where a function will only be selected if
 the COMDAT key's section is the largest:

-.. code-block:: llvm
+.. code-block:: text

 $foo = comdat largest
 @foo = global i32 2, comdat($foo)

@@ -851,7 +851,7 @@ the COMDAT key's section is the largest:
 As a syntactic sugar the ``$name`` can be omitted if the name is the same as
 the global name:

-.. code-block:: llvm
+.. code-block:: text

 $foo = comdat any
 @foo = global i32 2, comdat

@@ -875,7 +875,7 @@ if a collision occurs in the symbol table.
 The combined use of COMDATS and section attributes may yield surprising results.
 For example:

-.. code-block:: llvm
+.. code-block:: text

 $foo = comdat any
 $bar = comdat any

@@ -1205,7 +1205,7 @@ makes the format of the prologue data highly target dependent.
 A trivial example of valid prologue data for the x86 architecture is ``i8 144``,
 which encodes the ``nop`` instruction:

-.. code-block:: llvm
+.. code-block:: text

 define void @f() prologue i8 144 { ... }

@@ -1213,7 +1213,7 @@ Generally prologue data can be formed by encoding a relative branch instruction
 which skips the metadata, as in this example of valid prologue data for the
 x86_64 architecture, where the first two bytes encode ``jmp .+10``:

-.. code-block:: llvm
+.. code-block:: text

 %0 = type <{ i8, i8, i8* }>

@@ -2237,7 +2237,7 @@ source file name to the local function name.

 The syntax for the source file name is simply:

-.. code-block:: llvm
+.. code-block:: text

 source_filename = "/path/to/source.c"
@@ -2847,7 +2847,7 @@ cleared low bit. However, in the ``%C`` example, the optimizer is
 allowed to assume that the '``undef``' operand could be the same as
 ``%Y``, allowing the whole '``select``' to be eliminated.

-.. code-block:: llvm
+.. code-block:: text

 %A = xor undef, undef

@@ -2899,7 +2899,7 @@ does not execute at all. This allows us to delete the divide and all
 code after it. Because the undefined operation "can't happen", the
 optimizer can assume that it occurs in dead code.

-.. code-block:: llvm
+.. code-block:: text

 a: store undef -> %X
 b: store %X -> undef

@@ -3884,7 +3884,7 @@ their operand. For example:

 Metadata nodes that aren't uniqued use the ``distinct`` keyword. For example:

-.. code-block:: llvm
+.. code-block:: text

 !0 = distinct !{!"test\00", i32 10}

@@ -3949,7 +3949,7 @@ fields are tuples containing the debug info to be emitted along with the compile
 unit, regardless of code optimizations (some nodes are only emitted if there are
 references to them from instructions).

-.. code-block:: llvm
+.. code-block:: text

 !0 = !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang",
 isOptimized: true, flags: "-O2", runtimeVersion: 2,

@@ -3985,7 +3985,7 @@ DIBasicType
 ``DIBasicType`` nodes represent primitive types, such as ``int``, ``bool`` and
 ``float``. ``tag:`` defaults to ``DW_TAG_base_type``.

-.. code-block:: llvm
+.. code-block:: text

 !0 = !DIBasicType(name: "unsigned char", size: 8, align: 8,
 encoding: DW_ATE_unsigned_char)

@@ -3994,7 +3994,7 @@ DIBasicType
 The ``encoding:`` describes the details of the type. Usually it's one of the
 following:

-.. code-block:: llvm
+.. code-block:: text

 DW_ATE_address = 1
 DW_ATE_boolean = 2

@@ -4014,7 +4014,7 @@ refers to a tuple; the first operand is the return type, while the rest are the
 types of the formal arguments in order. If the first operand is ``null``, that
 represents a function with no return value (such as ``void foo() {}`` in C++).

-.. code-block:: llvm
+.. code-block:: text

 !0 = !BasicType(name: "int", size: 32, align: 32, DW_ATE_signed)
 !1 = !BasicType(name: "char", size: 8, align: 8, DW_ATE_signed_char)
@@ -4028,7 +4028,7 @@ DIDerivedType
 ``DIDerivedType`` nodes represent types derived from other types, such as
 qualified types.

-.. code-block:: llvm
+.. code-block:: text

 !0 = !DIBasicType(name: "unsigned char", size: 8, align: 8,
 encoding: DW_ATE_unsigned_char)

@@ -4037,7 +4037,7 @@ qualified types.

 The following ``tag:`` values are valid:

-.. code-block:: llvm
+.. code-block:: text

 DW_TAG_member = 13
 DW_TAG_pointer_type = 15

@@ -4089,7 +4089,7 @@ does not have ``flags: DIFlagFwdDecl`` set. LLVM tools that link modules
 together will unique such definitions at parse time via the ``identifier:``
 field, even if the nodes are ``distinct``.

-.. code-block:: llvm
+.. code-block:: text

 !0 = !DIEnumerator(name: "SixKind", value: 7)
 !1 = !DIEnumerator(name: "SevenKind", value: 7)

@@ -4100,7 +4100,7 @@ field, even if the nodes are ``distinct``.

 The following ``tag:`` values are valid:

-.. code-block:: llvm
+.. code-block:: text

 DW_TAG_array_type = 1
 DW_TAG_class_type = 2

@@ -4219,7 +4219,7 @@ type with an ODR ``identifier:`` and that does not set ``flags: DIFwdDecl``,
 then the subprogram declaration is uniqued based only on its ``linkageName:``
 and ``scope:``.

-.. code-block:: llvm
+.. code-block:: text

 define void @_Z3foov() !dbg !0 {
 ...

@@ -4244,7 +4244,7 @@ DILexicalBlock
 two lexical blocks at same depth. They are valid targets for ``scope:``
 fields.

-.. code-block:: llvm
+.. code-block:: text

 !0 = distinct !DILexicalBlock(scope: !1, file: !2, line: 7, column: 35)
@@ -4290,7 +4290,7 @@ the ``arg:`` field is set to non-zero, then this variable is a subprogram
 parameter, and it will be included in the ``variables:`` field of its
 :ref:`DISubprogram`.

-.. code-block:: llvm
+.. code-block:: text

 !0 = !DILocalVariable(name: "this", arg: 1, scope: !3, file: !2, line: 7,
 type: !3, flags: DIFlagArtificial)

@@ -4313,7 +4313,7 @@ The current supported vocabulary is limited:
 - ``DW_OP_bit_piece, 16, 8`` specifies the offset and size (``16`` and ``8``
 here, respectively) of the variable piece from the working expression.

-.. code-block:: llvm
+.. code-block:: text

 !0 = !DIExpression(DW_OP_deref)
 !1 = !DIExpression(DW_OP_plus, 3)

@@ -4336,7 +4336,7 @@ DIImportedEntity
 ``DIImportedEntity`` nodes represent entities (such as modules) imported into a
 compile unit.

-.. code-block:: llvm
+.. code-block:: text

 !2 = !DIImportedEntity(tag: DW_TAG_imported_module, name: "foo", scope: !0,
 entity: !1, line: 7)

@@ -4349,7 +4349,7 @@ The ``name:`` field is the macro identifier, followed by macro parameters when
 defining a function-like macro, and the ``value`` field is the token-string
 used to expand the macro identifier.

-.. code-block:: llvm
+.. code-block:: text

 !2 = !DIMacro(macinfo: DW_MACINFO_define, line: 7, name: "foo(x)",
 value: "((x) + 1)")

@@ -4362,7 +4362,7 @@ DIMacroFile
 The ``nodes:`` field is a list of ``DIMacro`` and ``DIMacroFile`` nodes that
 appear in the included source file.

-.. code-block:: llvm
+.. code-block:: text

 !2 = !DIMacroFile(macinfo: DW_MACINFO_start_file, line: 7, file: !2,
 nodes: !3)

@@ -5660,7 +5660,7 @@ block. Therefore, it must be the only non-phi instruction in the block.
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 dispatch1:
 %cs1 = catchswitch within none [label %handler0, label %handler1] unwind to caller
@@ -5711,7 +5711,7 @@ the ``catchret``'s behavior is undefined.
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 catchret from %catch label %continue

@@ -5761,7 +5761,7 @@ It transfers control to ``continue`` or unwinds out of the function.
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 cleanupret from %cleanup unwind to caller
 cleanupret from %cleanup unwind label %continue

@@ -5851,7 +5851,7 @@ unsigned and/or signed overflow, respectively, occurs.
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = add i32 4, %var ; yields i32:result = 4 + %var

@@ -5890,7 +5890,7 @@ optimizations:
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = fadd float 4.0, %var ; yields float:result = 4.0 + %var

@@ -5942,7 +5942,7 @@ unsigned and/or signed overflow, respectively, occurs.
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = sub i32 4, %var ; yields i32:result = 4 - %var
 <result> = sub i32 0, %val ; yields i32:result = -%var

@@ -5985,7 +5985,7 @@ unsafe floating point optimizations:
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = fsub float 4.0, %var ; yields float:result = 4.0 - %var
 <result> = fsub float -0.0, %val ; yields float:result = -%var
@@ -6039,7 +6039,7 @@ unsigned and/or signed overflow, respectively, occurs.
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = mul i32 4, %var ; yields i32:result = 4 * %var

@@ -6078,7 +6078,7 @@ unsafe floating point optimizations:
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = fmul float 4.0, %var ; yields float:result = 4.0 * %var

@@ -6122,7 +6122,7 @@ such, "((a udiv exact b) mul b) == a").
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = udiv i32 4, %var ; yields i32:result = 4 / %var

@@ -6168,7 +6168,7 @@ a :ref:`poison value <poisonvalues>` if the result would be rounded.
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = sdiv i32 4, %var ; yields i32:result = 4 / %var

@@ -6207,7 +6207,7 @@ unsafe floating point optimizations:
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = fdiv float 4.0, %var ; yields float:result = 4.0 / %var

@@ -6249,7 +6249,7 @@ Taking the remainder of a division by zero leads to undefined behavior.
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = urem i32 4, %var ; yields i32:result = 4 % %var
@@ -6304,7 +6304,7 @@ result of the division and the remainder.)
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = srem i32 4, %var ; yields i32:result = 4 % %var

@@ -6344,7 +6344,7 @@ to enable otherwise unsafe floating point optimizations:
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = frem float 4.0, %var ; yields float:result = 4.0 % %var

@@ -6406,7 +6406,7 @@ nsw/nuw bits in (mul %op1, (shl 1, %op2)).
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = shl i32 4, %var ; yields i32: 4 << %var
 <result> = shl i32 4, 2 ; yields i32: 16

@@ -6455,7 +6455,7 @@ non-zero.
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = lshr i32 4, 1 ; yields i32:result = 2
 <result> = lshr i32 4, 2 ; yields i32:result = 1

@@ -6506,7 +6506,7 @@ non-zero.
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = ashr i32 4, 1 ; yields i32:result = 2
 <result> = ashr i32 4, 2 ; yields i32:result = 1

@@ -6558,7 +6558,7 @@ The truth table used for the '``and``' instruction is:
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = and i32 4, %var ; yields i32:result = 4 & %var
 <result> = and i32 15, 40 ; yields i32:result = 8
@@ -6657,7 +6657,7 @@ The truth table used for the '``xor``' instruction is:
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = xor i32 4, %var ; yields i32:result = 4 ^ %var
 <result> = xor i32 15, 40 ; yields i32:result = 39

@@ -6710,7 +6710,7 @@ exceeds the length of ``val``, the results are undefined.
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = extractelement <4 x i32> %vec, i32 0 ; yields i32

@@ -6752,7 +6752,7 @@ undefined.
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = insertelement <4 x i32> %vec, i32 1, i32 0 ; yields <4 x i32>

@@ -6800,7 +6800,7 @@ only one vector.
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = shufflevector <4 x i32> %v1, <4 x i32> %v2,
 <4 x i32> <i32 0, i32 4, i32 1, i32 5> ; yields <4 x i32>

@@ -6859,7 +6859,7 @@ the index operands.
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = extractvalue {i32, float} %agg, 0 ; yields i32

@@ -8126,7 +8126,7 @@ or :ref:`ptrtoint <i_ptrtoint>` instructions first.
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 %X = bitcast i8 255 to i8 ; yields i8 :-1
 %Y = bitcast i32* %x to sint* ; yields sint*:%x
@@ -8265,7 +8265,7 @@ as the values being compared. Otherwise, the result is an ``i1``.
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = icmp eq i32 4, 5 ; yields: result=false
 <result> = icmp ne float* %X, %X ; yields: result=false

@@ -8379,7 +8379,7 @@ assumptions to be made about the values of input arguments; namely
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 <result> = fcmp oeq float 4.0, 5.0 ; yields: result=false
 <result> = fcmp one float 4.0, 5.0 ; yields: result=true

@@ -8815,7 +8815,7 @@ that does not carry an appropriate :ref:`"funclet" bundle <ob_funclet>`.
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 dispatch:
 %cs = catchswitch within none [label %handler0] unwind to caller

@@ -8885,7 +8885,7 @@ that does not carry an appropriate :ref:`"funclet" bundle <ob_funclet>`.
 Example:
 """"""""

-.. code-block:: llvm
+.. code-block:: text

 %tok = cleanuppad within %cs []
@@ -12481,19 +12481,19 @@ optimistic assumptions made during compilation. The semantics of
 ``@llvm.experimental.deoptimize`` -- its body is defined to be
 equivalent to:

-.. code-block:: llvm
+.. code-block:: text

-define void @llvm.experimental.guard(i1 %pred, <args...>) {
-%realPred = and i1 %pred, undef
-br i1 %realPred, label %continue, label %leave [, !make.implicit !{}]
+define void @llvm.experimental.guard(i1 %pred, <args...>) {
+%realPred = and i1 %pred, undef
+br i1 %realPred, label %continue, label %leave [, !make.implicit !{}]

-leave:
-call void @llvm.experimental.deoptimize(<args...>) [ "deopt"() ]
-ret void
+leave:
+call void @llvm.experimental.deoptimize(<args...>) [ "deopt"() ]
+ret void

-continue:
-ret void
-}
+continue:
+ret void
+}

 with the optional ``[, !make.implicit !{}]`` present if and only if it
@@ -111,7 +111,6 @@ Here is an example of a YAML document that contains an LLVM module:

.. code-block:: llvm

--- |
define i32 @inc(i32* %x) {
entry:
%0 = load i32, i32* %x
@@ -119,7 +118,6 @@ Here is an example of a YAML document that contains an LLVM module:
store i32 %1, i32* %x
ret i32 %1
}
...

.. _YAML block literal string: http://www.yaml.org/spec/1.2/spec.html#id2795688
@@ -129,7 +127,7 @@ Machine Functions
 The remaining YAML documents contain the machine functions. This is an example
 of such YAML document:

-.. code-block:: llvm
+.. code-block:: text

 ---
 name: inc

@@ -172,7 +170,7 @@ A machine basic block is defined in a single block definition source construct
 that contains the block's ID.
 The example below defines two blocks that have an ID of zero and one:

-.. code-block:: llvm
+.. code-block:: text

 bb.0:
 <instructions>

@@ -182,7 +180,7 @@ The example below defines two blocks that have an ID of zero and one:
 A machine basic block can also have a name. It should be specified after the ID
 in the block's definition:

-.. code-block:: llvm
+.. code-block:: text

 bb.0.entry: ; This block's name is "entry"
 <instructions>

@@ -196,7 +194,7 @@ Block References
 The machine basic blocks are identified by their ID numbers. Individual
 blocks are referenced using the following syntax:

-.. code-block:: llvm
+.. code-block:: text

 %bb.<id>[.<name>]

@@ -213,7 +211,7 @@ Successors
 The machine basic block's successors have to be specified before any of the
 instructions:

-.. code-block:: llvm
+.. code-block:: text

 bb.0.entry:
 successors: %bb.1.then, %bb.2.else

@@ -227,7 +225,7 @@ The branch weights can be specified in brackets after the successor blocks.
 The example below defines a block that has two successors with branch weights
 of 32 and 16:

-.. code-block:: llvm
+.. code-block:: text

 bb.0.entry:
 successors: %bb.1.then(32), %bb.2.else(16)

@@ -240,7 +238,7 @@ Live In Registers
 The machine basic block's live in registers have to be specified before any of
 the instructions:

-.. code-block:: llvm
+.. code-block:: text

 bb.0.entry:
 liveins: %edi, %esi
@@ -255,7 +253,7 @@ Miscellaneous Attributes
 The attributes ``IsAddressTaken``, ``IsLandingPad`` and ``Alignment`` can be
 specified in brackets after the block's definition:

-.. code-block:: llvm
+.. code-block:: text

 bb.0.entry (address-taken):
 <instructions>

@@ -278,7 +276,7 @@ The instruction's name is usually specified before the operands. The example
 below shows an instance of the X86 ``RETQ`` instruction with a single machine
 operand:

-.. code-block:: llvm
+.. code-block:: text

 RETQ %eax

@@ -287,7 +285,7 @@ operands, the instruction's name has to be specified after them. The example
 below shows an instance of the AArch64 ``LDPXpost`` instruction with three
 defined register operands:

-.. code-block:: llvm
+.. code-block:: text

 %sp, %fp, %lr = LDPXpost %sp, 2

@@ -303,7 +301,7 @@ Instruction Flags

 The flag ``frame-setup`` can be specified before the instruction's name:

-.. code-block:: llvm
+.. code-block:: text

 %fp = frame-setup ADDXri %sp, 0, 0
@@ -321,13 +319,13 @@ but they can also be used in a number of other places, like the
 The physical registers are identified by their name. They use the following
 syntax:

-.. code-block:: llvm
+.. code-block:: text

 %<name>

 The example below shows three X86 physical registers:

-.. code-block:: llvm
+.. code-block:: text

 %eax
 %r15

@@ -336,13 +334,13 @@ The example below shows three X86 physical registers:
 The virtual registers are identified by their ID number. They use the following
 syntax:

-.. code-block:: llvm
+.. code-block:: text

 %<id>

 Example:

-.. code-block:: llvm
+.. code-block:: text

 %0

@@ -366,7 +364,7 @@ The immediate machine operands are untyped, 64-bit signed integers. The
 example below shows an instance of the X86 ``MOV32ri`` instruction that has an
 immediate machine operand ``-42``:

-.. code-block:: llvm
+.. code-block:: text

 %eax = MOV32ri -42
@@ -384,14 +382,14 @@ machine operands. The register operands can also have optional
 and a reference to the tied register operand.
 The full syntax of a register operand is shown below:

-.. code-block:: llvm
+.. code-block:: text

 [<flags>] <register> [ :<subregister-idx-name> ] [ (tied-def <tied-op>) ]

 This example shows an instance of the X86 ``XOR32rr`` instruction that has
 5 register operands with different register flags:

-.. code-block:: llvm
+.. code-block:: text

 dead %eax = XOR32rr undef %eax, undef %eax, implicit-def dead %eflags, implicit-def %al

@@ -446,7 +444,7 @@ the subregister indices. The example below shows an instance of the ``COPY``
 pseudo instruction that uses the X86 ``sub_8bit`` subregister index to copy 8
 lower bits from the 32-bit virtual register 0 to the 8-bit virtual register 1:

-.. code-block:: llvm
+.. code-block:: text

 %1 = COPY %0:sub_8bit

@@ -461,7 +459,7 @@ The global value machine operands reference the global values from the
 The example below shows an instance of the X86 ``MOV64rm`` instruction that has
 a global value operand named ``G``:

-.. code-block:: llvm
+.. code-block:: text

 %rax = MOV64rm %rip, 1, _, @G, _
@@ -70,7 +70,7 @@ clients.
 For example, a possible annotation of an ARM load of a stack-relative location
 might be annotated as:

-.. code-block:: nasm
+.. code-block:: text

 ldr <reg gpr:r0>, <mem regoffset:[<reg gpr:sp>, <imm:#4>]>

@@ -394,7 +394,7 @@ and in right function "*FR*". And every part of *left* place is equal to the
 corresponding part of *right* place, and (!) both parts use *Value* instances,
 for example:

-.. code-block:: llvm
+.. code-block:: text

 instr0 i32 %LV ; left side, function FL
 instr0 i32 %RV ; right side, function FR

@@ -409,13 +409,13 @@ in "*FL*" and "*FR*".

 Consider small example here:

-.. code-block:: llvm
+.. code-block:: text

 define void %f(i32 %pf0, i32 %pf1) {
 instr0 i32 %pf0 instr1 i32 %pf1 instr2 i32 123
 }

-.. code-block:: llvm
+.. code-block:: text

 define void %g(i32 %pg0, i32 %pg1) {
 instr0 i32 %pg0 instr1 i32 %pg0 instr2 i32 123
@@ -37,7 +37,7 @@ code. By default, the back-end will emit device functions. Metadata is used to
 declare a function as a kernel function. This metadata is attached to the
 ``nvvm.annotations`` named metadata object, and has the following format:

-.. code-block:: llvm
+.. code-block:: text

 !0 = !{<function-ref>, metadata !"kernel", i32 1}
@@ -40,7 +40,10 @@ Non-comprehensive list of changes in this release

 * There is no longer a "global context" available in LLVM, except for the C API.

-* .. note about autoconf build having been removed.
+* The autoconf build system has been removed in favor of CMake. LLVM 3.9
+requires CMake 3.4.3 or later to build. For information about using CMake
+please see the documentation on :doc:`CMake`. For information about the CMake
+language there is also a :doc:`CMakePrimer` document available.

 * .. note about C API functions LLVMParseBitcode,
 LLVMParseBitcodeInContext, LLVMGetBitcodeModuleInContext and
@@ -69,11 +72,13 @@ Non-comprehensive list of changes in this release
 need to be updated to replace the argument node and remove any dead nodes in
 cases where they currently return an ``SDNode *`` from this interface.

-* Introduction of ThinLTO: [FIXME: needs to be documented more extensively in
-/docs/ ; ping Mehdi/Teresa before the release if not done]

 * Raised the minimum required CMake version to 3.4.3.

+* Added the MemorySSA analysis, which hopes to replace MemoryDependenceAnalysis.
+It should provide higher-quality results than MemDep, and be algorithmically
+faster than MemDep. Currently, GVNHoist (which is off by default) makes use of
+MemorySSA.

 .. NOTE
 For small 1-3 sentence descriptions, just add an entry at the end of
 this list. If your description won't fit comfortably in one bullet
@@ -93,6 +98,32 @@ Non-comprehensive list of changes in this release

 Makes programs 10x faster by doing Special New Thing.

+GCC ABI Tag
+-----------
+
+Recently, many of the Linux distributions (ex. `Fedora <http://developerblog.redhat.com/2015/02/10/gcc-5-in-fedora/>`_,
+`Debian <https://wiki.debian.org/GCC5>`_, `Ubuntu <https://wiki.ubuntu.com/GCC5>`_)
+have moved on to use the new `GCC ABI <https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Attributes.html>`_
+to work around `C++11 incompatibilities in libstdc++ <https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html>`_.
+This caused `incompatibility problems <https://gcc.gnu.org/ml/gcc-patches/2015-04/msg00153.html>`_
+with other compilers (ex. Clang), which needed to be fixed, but due to the
+experimental nature of GCC's own implementation, it took a long time for it to
+land in LLVM (`here <https://reviews.llvm.org/D18035>`_ and
+`here <https://reviews.llvm.org/D17567>`_), not in time for the 3.8 release.
+
+Those patches are now present in the 3.9.0 release and should be working on the
+majority of cases, as they have been tested thoroughly. However, some bugs were
+`filled in GCC <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71712>`_ and have not
+yet been fixed, so there may be corner cases not covered by either GCC or Clang.
+Bug fixes to those problems should be reported in Bugzilla (either LLVM or GCC),
+and patches to LLVM's trunk are very likely to be back-ported to future 3.9.x
+releases (depends on how destructive it is).
+
+Unfortunately, these patches won't be back-ported to 3.8.x or earlier, so we
+strongly recommend people to use 3.9.x when GCC ABI cases are at stake.
+
+For a more in-depth view of the issue, check our `Bugzilla entry <https://llvm.org/bugs/show_bug.cgi?id=23529>`_.

 Changes to the LLVM IR
 ----------------------
@@ -110,16 +141,98 @@ link-time may be differently optimized than the one what was visible
 during optimization, and may have arbitrarily different observable
 behavior. See `PR26774 <http://llvm.org/PR26774>`_ for more details.

-Changes to the ARM Backend
+Support for ThinLTO
+-------------------
+
+LLVM now supports ThinLTO compilation, which can be invoked by compiling
+and linking with -flto=thin. The gold linker plugin, as well as linkers
+that use the new ThinLTO API in libLTO (like ld64), will transparently
+execute the ThinLTO backends in parallel threads.
+For more information on ThinLTO and the LLVM implementation, see the
+`ThinLTO blog post <http://blog.llvm.org/2016/06/thinlto-scalable-and-incremental-lto.html>`_.
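As a rough illustration of the workflow described above (a hedged sketch; the
file names are placeholders and the exact driver invocation depends on the
platform and linker in use):

.. code-block:: console

  $ clang -flto=thin -O2 -c a.c b.c
  $ clang -flto=thin -fuse-ld=gold -O2 a.o b.o -o app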
+Changes to the ARM Targets
 --------------------------

-During this release ...
+**During this release the AArch64 backend has:**
+
+* Gained support for Qualcomm's Kryo and Broadcom's Vulcan CPUs, including
+scheduling models.
+* Landed a scheduling model for Samsung's Exynos M1.
+* Seen a lot of work on GlobalISel.
+* Learned a few more useful combines (fadd and fmul into fmadd, adjustments to the
+stack pointer for callee-save stack memory and local stack memory etc).
+* Gained support for the Swift calling convention.
+* Switched to using SubtargetFeatures rather than testing for specific CPUs and
+to using TableGen for handling system instruction operands.
+* Like ARM, AArch64 is now using the TargetParser, so no more StringSwitches
+matching CPU, FPU or feature names will be accepted in normal code.
+* Clang can now self-host itself using LLD on AArch64.
+* Gained a big batch of tests from Halide.
+
+Furthermore, LLDB now supports AArch64 compact unwind tables, as used on iOS,
+tvos and watchos.
+
+**During this release the ARM target has:**
+
+* ARMv8.2-A can now be targeted directly via Clang flags.
+* Adding preliminary support for Cortex-R8.
+* LLDB can now parse EABI attributes for an ELF input.
+* Initial ARM/Thumb support was added to LLD.
+* The ExecutionEngine now supports COFF/ARM.
+* Swift calling convention was ported to ARM.
+* A large number of codegen fixes around ARMv8, DSP, correct sub-target support,
+relocations, EABI, EHABI, Windows on ARM, atomics..
+* Improved assembler support for Linux/Android/Chromium sub-projects.
+* Initial support for MUSL (libc) on ARM.
+* Support for Thumb1 targets in libunwind.
+* Gained a big batch of tests from Halide.

 Changes to the MIPS Target
 --------------------------

-During this release ...
+**During this release the MIPS target has:**
+
+* Enabled the Integrated Assembler by default for all ``mips-*`` and
+``mipsel-*`` triples.
+* Significantly improved the Integrated Assembler support for the n64 ABI.
+* Added the Clang frontend ``-mcompact-branches={never,optimal,always}`` option
+that controls how LLVM generates compact branches for MIPS targets.
+* Improved performance and code size for stack pointer adjustments in functions
+with large frames.
+* Implemented many instructions from the microMIPS32R6 ISA and added CodeGen
+support for most of them.
+* Added support for the triple used by Debian Stretch for little endian
+MIPS64, ie. ``mips64el-linux-gnuabi64``.
+* Removed EABI which was neither tested nor properly supported.
+* Gained the ability to self-host on MIPS32R6.
+* Gained the ability to self-host on MIPS64R2 and MIPS64R6 when using the n64
+ABI.
+* Added support for the ``LA`` macro in PIC mode for o32.
+* Added support for safestack in compiler-rt.
+* Added support for the MIPS n64 ABI in LLD.
+* Added LLD support for TLS relocations for both o32 and n64 MIPS ABIs.
+
+**The MIPS target has also fixed various bugs including the following notable
+fixes:**
+
+* Delay slots are no longer filled multiple times when either ``-save-temps``
+or ``-via-file-asm`` are used.
+* Updated n32 and n64 to follow the standard ELF conventions for label prefixes
+(``.L``), whereas o32 still uses its own (``$``).
+* Properly sign-extend values to GPR width for instructions that expect 32-bit
+values on 64-bit ISAs.
+* Several fixes for the delay-slot filler pass, including correct
+forbidden-slot hazard handling.
+* Fixed several errors caught by the machine verifier when turned on for MIPS.
+* Fixed broken predicate for ``SELECT`` patterns in MIPS64.
+* Fixed wrong truncation of memory address for ``LL``/``SC`` seqeuences in
+MIPS64.
+* Fixed the o32, n32 and n64 handling of ``.cprestore`` directives when inside
+a ``.set noat`` region by the Integrated Assembler.
+* Fixed the ordering of ``HI``/``LO`` pairs in the relocation table.
+* Fixed the generated ELF ``EFlags`` when Octeon is the target.

 Changes to the PowerPC Target
@@ -140,9 +253,16 @@ Changes to the X86 Target
 extensions using ``-march=knl``. The switch enables the ISA extensions
 AVX-512{F, CD, ER, PF}.

 * LLVM will now prefer ``PUSH`` instructions rather than ``%esp``-relative
 ``MOV`` instructions for function calls at all optimization levels greater
 than ``-O0``. Previously this transformation only occurred at ``-Os``.

+Changes to the AMDGPU Target
+-----------------------------
+
+* Added backend support for OpenGL shader image, buffer storage, atomic
+counter, and compute shader extensions (supported since Mesa 12)
+
+* Mesa 11.0.x is no longer supported
@@ -167,6 +287,21 @@ projects that have already been updated to work with LLVM 3.9.

 * A project

+LDC - the LLVM-based D compiler
+-------------------------------
+
+`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It
+pragmatically combines efficiency, control, and modeling power, with safety and
+programmer productivity. D supports powerful concepts like Compile-Time Function
+Execution (CTFE) and Template Meta-Programming, provides an innovative approach
+to concurrency and offers many classical paradigms.
+
+`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler
+combined with LLVM as backend to produce efficient native code. LDC targets
+x86/x86_64 systems like Linux, OS X, FreeBSD and Windows and also Linux on ARM
+and PowerPC (32/64 bit). Ports to other architectures like AArch64 and MIPS64
+are underway.

 Additional Information
 ======================
@@ -33,7 +33,7 @@ current stack limit (minus the amount of space needed to allocate a new block) -
 this slot's offset is again dictated by ``libgcc``. The generated
 assembly looks like this on x86-64:

-.. code-block:: nasm
+.. code-block:: text

 leaq -8(%rsp), %r10
 cmpq %fs:112, %r10
@@ -230,7 +230,7 @@ following C fragment, for example:

 Compiled to LLVM, this function would be represented like this:

-.. code-block:: llvm
+.. code-block:: text

 ; Function Attrs: nounwind ssp uwtable
 define void @foo() #0 !dbg !4 {

@@ -303,7 +303,7 @@ The first intrinsic ``%llvm.dbg.declare`` encodes debugging information for the
 variable ``X``. The metadata ``!dbg !14`` attached to the intrinsic provides
 scope information for the variable ``X``.

-.. code-block:: llvm
+.. code-block:: text

 !14 = !DILocation(line: 2, column: 9, scope: !4)
 !4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !5,

@@ -327,7 +327,7 @@ The third intrinsic ``%llvm.dbg.declare`` encodes debugging information for
 variable ``Z``. The metadata ``!dbg !19`` attached to the intrinsic provides
 scope information for the variable ``Z``.

-.. code-block:: llvm
+.. code-block:: text

 !18 = distinct !DILexicalBlock(scope: !4, file: !1, line: 4, column: 5)
 !19 = !DILocation(line: 5, column: 11, scope: !18)

@@ -390,7 +390,7 @@ Given an integer global variable declared as follows:

 a C/C++ front-end would generate the following descriptors:

-.. code-block:: llvm
+.. code-block:: text

 ;;
 ;; Define the global itself.

@@ -456,7 +456,7 @@ Given a function declared as follows:

 a C/C++ front-end would generate the following descriptors:

-.. code-block:: llvm
+.. code-block:: text

 ;;
 ;; Define the anchor for subprograms.
@@ -138,7 +138,7 @@ SSA value ``%obj.relocated`` which represents the potentially changed value of
 ``%obj`` after the safepoint and update any following uses appropriately. The
 resulting relocation sequence is:

-.. code-block:: llvm
+.. code-block:: text

 define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj)
 gc "statepoint-example" {

@@ -237,7 +237,7 @@ afterwards.
 If we extend our previous example to include a pointless derived pointer,
 we get:

-.. code-block:: llvm
+.. code-block:: text

 define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj)
 gc "statepoint-example" {

@@ -283,7 +283,7 @@ Let's assume a hypothetical GC--somewhat unimaginatively named "hypothetical-gc"
 --that requires that a TLS variable must be written to before and after a call
 to unmanaged code. The resulting relocation sequence is:

-.. code-block:: llvm
+.. code-block:: text

 @flag = thread_local global i32 0, align 4

@@ -662,7 +662,7 @@ distinguish between GC references and non-GC references in IR it is given.

 As an example, given this code:

-.. code-block:: llvm
+.. code-block:: text

 define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj)
 gc "statepoint-example" {

@@ -672,7 +672,7 @@ As an example, given this code:

 The pass would produce this IR:

-.. code-block:: llvm
+.. code-block:: text

 define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj)
 gc "statepoint-example" {

@@ -737,7 +737,7 @@ As an example, given input IR of the following:

 This pass would produce the following IR:

-.. code-block:: llvm
+.. code-block:: text

 define void @test() gc "statepoint-example" {
 %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0)
@@ -232,7 +232,7 @@ the record ends with a semicolon.

 Here is a simple TableGen file:

-.. code-block:: llvm
+.. code-block:: text

 class C { bit V = 1; }
 def X : C;

@@ -276,7 +276,7 @@ derived class or definition wants to override. Let expressions consist of the
 value. For example, a new class could be added to the example above, redefining
 the ``V`` field for all of its subclasses:

-.. code-block:: llvm
+.. code-block:: text

 class D : C { let V = 0; }
 def Z : D;

@@ -295,7 +295,7 @@ concrete classes. Parameterized TableGen classes specify a list of variable
 bindings (which may optionally have defaults) that are bound when used. Here is
 a simple example:

-.. code-block:: llvm
+.. code-block:: text

 class FPFormat<bits<3> val> {
 bits<3> Value = val;

@@ -316,7 +316,7 @@ integer.
 The more esoteric forms of `TableGen expressions`_ are useful in conjunction
 with template arguments. As an example:

-.. code-block:: llvm
+.. code-block:: text

 class ModRefVal<bits<2> val> {
 bits<2> Value = val;

@@ -346,7 +346,7 @@ be used to decouple the interface provided to the user of the class from the
 actual internal data representation expected by the class. In this case,
 running ``llvm-tblgen`` on the example prints the following definitions:

-.. code-block:: llvm
+.. code-block:: text

 def bork { // Value
 bit isMod = 1;
@@ -379,7 +379,7 @@ commonality exists, then in a separate place indicate what all the ops are.

 Here is an example TableGen fragment that shows this idea:

-.. code-block:: llvm
+.. code-block:: text

 def ops;
 def GPR;

@@ -405,7 +405,7 @@ inherit from multiple multiclasses, instantiating definitions from each
 multiclass. Using a multiclass this way is exactly equivalent to instantiating
 the classes multiple times yourself, e.g. by writing:

-.. code-block:: llvm
+.. code-block:: text

 def ops;
 def GPR;

@@ -432,7 +432,7 @@ the classes multiple times yourself, e.g. by writing:
 A ``defm`` can also be used inside a multiclass providing several levels of
 multiclass instantiations.

-.. code-block:: llvm
+.. code-block:: text

 class Instruction<bits<4> opc, string Name> {
 bits<4> opcode = opc;

@@ -473,7 +473,7 @@ multiclass instantiations.
 the class list must start after the last multiclass, and there must be at least
 one multiclass before them.

-.. code-block:: llvm
+.. code-block:: text

 class XD { bits<4> Prefix = 11; }
 class XS { bits<4> Prefix = 12; }

@@ -516,7 +516,7 @@ specified file in place of the include directive. The filename should be
 specified as a double quoted string immediately after the '``include``' keyword.
 Example:

-.. code-block:: llvm
+.. code-block:: text

 include "foo.td"
@@ -532,7 +532,7 @@ commonality from the records.
 File-scope "let" expressions take a comma-separated list of bindings to apply,
 and one or more records to bind the values in. Here are some examples:

-.. code-block:: llvm
+.. code-block:: text

 let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 in
 def RET : I<0xC3, RawFrm, (outs), (ins), "ret", [(X86retflag 0)]>;

@@ -559,7 +559,7 @@ ways to factor out commonality from the records, specially if using several
 levels of multiclass instantiations. This also avoids the need of using "let"
 expressions within subsequent records inside a multiclass.

-.. code-block:: llvm
+.. code-block:: text

 multiclass basic_r<bits<4> opc> {
 let Predicates = [HasSSE2] in {

@@ -587,7 +587,7 @@ TableGen supports the '``foreach``' block, which textually replicates the loop
 body, substituting iterator values for iterator references in the body.
 Example:

-.. code-block:: llvm
+.. code-block:: text

 foreach i = [0, 1, 2, 3] in {
 def R#i : Register<...>;

@@ -598,7 +598,7 @@ This will create objects ``R0``, ``R1``, ``R2`` and ``R3``. ``foreach`` blocks
 may be nested. If there is only one item in the body the braces may be
 elided:

-.. code-block:: llvm
+.. code-block:: text

 foreach i = [0, 1, 2, 3] in
 def R#i : Register<...>;
@@ -90,7 +90,7 @@ of the classes, then all of the definitions. This is a good way to see what the
 various definitions expand to fully. Running this on the ``X86.td`` file prints
 this (at the time of this writing):

-.. code-block:: llvm
+.. code-block:: text

 ...
 def ADD32rr { // Instruction X86Inst I

@@ -155,7 +155,7 @@ by the code generator, and specifying it all manually would be unmaintainable,
 prone to bugs, and tiring to do in the first place. Because we are using
 TableGen, all of the information was derived from the following definition:

-.. code-block:: llvm
+.. code-block:: text

 let Defs = [EFLAGS],
 isCommutable = 1, // X = ADD Y,Z --> X = ADD Z,Y

@@ -201,7 +201,7 @@ TableGen.
 **TableGen definitions** are the concrete form of 'records'. These generally do
 not have any undefined values, and are marked with the '``def``' keyword.

-.. code-block:: llvm
+.. code-block:: text

 def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true",
 "Enable ARMv8 FP">;

@@ -220,7 +220,7 @@ floating point instructions in the X86 backend). TableGen keeps track of all of
 the classes that are used to build up a definition, so the backend can find all
 definitions of a particular class, such as "Instruction".

-.. code-block:: llvm
+.. code-block:: text

 class ProcNoItin<string Name, list<SubtargetFeature> Features>
 : Processor<Name, NoItineraries, Features>;

@@ -235,7 +235,7 @@ If a multiclass inherits from another multiclass, the definitions in the
 sub-multiclass become part of the current multiclass, as if they were declared
 in the current multiclass.

-.. code-block:: llvm
+.. code-block:: text

 multiclass ro_signed_pats<string T, string Rm, dag Base, dag Offset, dag Extend,
 dag address, ValueType sty> {
@ -345,7 +345,7 @@ to define an object for each register. The specified string ``n`` becomes the
``Name`` of the register. The basic ``Register`` object does not have any
subregisters and does not specify any aliases.

.. code-block:: llvm
.. code-block:: text

class Register<string n> {
string Namespace = "";

@ -361,7 +361,7 @@ subregisters and does not specify any aliases.
For example, in the ``X86RegisterInfo.td`` file, there are register definitions
that utilize the ``Register`` class, such as:

.. code-block:: llvm
.. code-block:: text

def AL : Register<"AL">, DwarfRegNum<[0, 0, 0]>;

@ -414,7 +414,7 @@ classes. In ``Target.td``, the ``Register`` class is the base for the
``RegisterWithSubRegs`` class that is used to define registers that need to
specify subregisters in the ``SubRegs`` list, as shown here:

.. code-block:: llvm
.. code-block:: text

class RegisterWithSubRegs<string n, list<Register> subregs> : Register<n> {
let SubRegs = subregs;

@ -427,7 +427,7 @@ feature common to these subclasses. Note the use of "``let``" expressions to
override values that are initially defined in a superclass (such as ``SubRegs``
field in the ``Rd`` class).

.. code-block:: llvm
.. code-block:: text

class SparcReg<string n> : Register<n> {
field bits<5> Num;

@ -452,7 +452,7 @@ field in the ``Rd`` class).
In the ``SparcRegisterInfo.td`` file, there are register definitions that
utilize these subclasses of ``Register``, such as:

.. code-block:: llvm
.. code-block:: text

def G0 : Ri< 0, "G0">, DwarfRegNum<[0]>;
def G1 : Ri< 1, "G1">, DwarfRegNum<[1]>;

@ -478,7 +478,7 @@ default allocation order of the registers. A target description file
``XXXRegisterInfo.td`` that uses ``Target.td`` can construct register classes
using the following class:

.. code-block:: llvm
.. code-block:: text

class RegisterClass<string namespace,
list<ValueType> regTypes, int alignment, dag regList> {

@ -532,7 +532,7 @@ defines a group of 32 single-precision floating-point registers (``F0`` to
``F31``); ``DFPRegs`` defines a group of 16 double-precision registers
(``D0-D15``).

.. code-block:: llvm
.. code-block:: text

// F0, F1, F2, ..., F31
def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>;

@ -703,7 +703,7 @@ which describes one instruction. An instruction descriptor defines:
The Instruction class (defined in ``Target.td``) is mostly used as a base for
more complex instruction classes.

.. code-block:: llvm
.. code-block:: text

class Instruction {
string Namespace = "";

@ -760,7 +760,7 @@ specific operation value for ``LD``/Load Word. The third parameter is the
output destination, which is a register operand and defined in the ``Register``
target description file (``IntRegs``).

.. code-block:: llvm
.. code-block:: text

def LDrr : F3_1 <3, 0b000000, (outs IntRegs:$dst), (ins MEMrr:$addr),
"ld [$addr], $dst",

@ -769,7 +769,7 @@ target description file (``IntRegs``).
The fourth parameter is the input source, which uses the address operand
``MEMrr`` that is defined earlier in ``SparcInstrInfo.td``:

.. code-block:: llvm
.. code-block:: text

def MEMrr : Operand<i32> {
let PrintMethod = "printMemOperand";

@ -788,7 +788,7 @@ immediate value operands. For example, to perform a Load Integer instruction
for a Word from an immediate operand to a register, the following instruction
class is defined:

.. code-block:: llvm
.. code-block:: text

def LDri : F3_2 <3, 0b000000, (outs IntRegs:$dst), (ins MEMri:$addr),
"ld [$addr], $dst",

@ -801,7 +801,7 @@ creation of templates to define several instruction classes at once (using the
pattern ``F3_12`` is defined to create 2 instruction classes each time
``F3_12`` is invoked:

.. code-block:: llvm
.. code-block:: text

multiclass F3_12 <string OpcStr, bits<6> Op3Val, SDNode OpNode> {
def rr : F3_1 <2, Op3Val,

@ -818,7 +818,7 @@ So when the ``defm`` directive is used for the ``XOR`` and ``ADD``
instructions, as seen below, it creates four instruction objects: ``XORrr``,
``XORri``, ``ADDrr``, and ``ADDri``.

.. code-block:: llvm
.. code-block:: text

defm XOR : F3_12<"xor", 0b000011, xor>;
defm ADD : F3_12<"add", 0b000000, add>;

@ -830,7 +830,7 @@ For example, the 10\ :sup:`th` bit represents the "greater than" condition for
integers, and the 22\ :sup:`nd` bit represents the "greater than" condition for
floats.

.. code-block:: llvm
.. code-block:: text

def ICC_NE : ICC_VAL< 9>; // Not Equal
def ICC_E : ICC_VAL< 1>; // Equal

@ -855,7 +855,7 @@ order they are defined. Fields are bound when they are assigned a value. For
example, the Sparc target defines the ``XNORrr`` instruction as a ``F3_1``
format instruction having three operands.

.. code-block:: llvm
.. code-block:: text

def XNORrr : F3_1<2, 0b000111,
(outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),

@ -865,7 +865,7 @@ format instruction having three operands.
The instruction templates in ``SparcInstrFormats.td`` show the base class for
``F3_1`` is ``InstSP``.

.. code-block:: llvm
.. code-block:: text

class InstSP<dag outs, dag ins, string asmstr, list<dag> pattern> : Instruction {
field bits<32> Inst;

@ -880,7 +880,7 @@ The instruction templates in ``SparcInstrFormats.td`` show the base class for

``InstSP`` leaves the ``op`` field unbound.

.. code-block:: llvm
.. code-block:: text

class F3<dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSP<outs, ins, asmstr, pattern> {

@ -897,7 +897,7 @@ The instruction templates in ``SparcInstrFormats.td`` show the base class for
fields. ``F3`` format instructions will bind the operands ``rd``, ``op3``, and
``rs1`` fields.

.. code-block:: llvm
.. code-block:: text

class F3_1<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {

@ -925,7 +925,7 @@ TableGen definition will add all of its operands to an enumeration in the
llvm::XXX:OpName namespace and also add an entry for it into the OperandMap
table, which can be queried using getNamedOperandIdx()

.. code-block:: llvm
.. code-block:: text

int DstIndex = SP::getNamedOperandIdx(SP::XNORrr, SP::OpName::dst); // => 0
int BIndex = SP::getNamedOperandIdx(SP::XNORrr, SP::OpName::b); // => 1

@ -972,7 +972,7 @@ For example, the X86 backend defines ``brtarget`` and ``brtarget8``, both
instances of the TableGen ``Operand`` class, which represent branch target
operands:

.. code-block:: llvm
.. code-block:: text

def brtarget : Operand<OtherVT>;
def brtarget8 : Operand<OtherVT>;

@ -1222,14 +1222,14 @@ definitions in ``XXXInstrInfo.td``. For example, in ``SparcInstrInfo.td``,
this entry defines a register store operation, and the last parameter describes
a pattern with the store DAG operator.

.. code-block:: llvm
.. code-block:: text

def STrr : F3_1< 3, 0b000100, (outs), (ins MEMrr:$addr, IntRegs:$src),
"st $src, [$addr]", [(store i32:$src, ADDRrr:$addr)]>;

``ADDRrr`` is a memory mode that is also defined in ``SparcInstrInfo.td``:

.. code-block:: llvm
.. code-block:: text

def ADDRrr : ComplexPattern<i32, 2, "SelectADDRrr", [], []>;

@ -1240,7 +1240,7 @@ defined in an implementation of the Instructor Selector (such as
In ``lib/Target/TargetSelectionDAG.td``, the DAG operator for store is defined
below:

.. code-block:: llvm
.. code-block:: text

def store : PatFrag<(ops node:$val, node:$ptr),
(st node:$val, node:$ptr), [{

@ -1458,7 +1458,7 @@ if the current argument is of type ``f32`` or ``f64``), then the action is
performed. In this case, the ``CCAssignToReg`` action assigns the argument
value to the first available register: either ``R0`` or ``R1``.

.. code-block:: llvm
.. code-block:: text

CCIfType<[f32,f64], CCAssignToReg<[R0, R1]>>

@ -1469,7 +1469,7 @@ which registers are used for specified scalar return types. A single-precision
float is returned to register ``F0``, and a double-precision float goes to
register ``D0``. A 32-bit integer is returned in register ``I0`` or ``I1``.

.. code-block:: llvm
.. code-block:: text

def RetCC_Sparc32 : CallingConv<[
CCIfType<[i32], CCAssignToReg<[I0, I1]>>,

@ -1484,7 +1484,7 @@ the size of the slot, and the second parameter, also 4, indicates the stack
alignment along 4-byte units. (Special cases: if size is zero, then the ABI
size is used; if alignment is zero, then the ABI alignment is used.)

.. code-block:: llvm
.. code-block:: text

def CC_Sparc32 : CallingConv<[
// All arguments get passed in integer registers if there is space.

@ -1499,7 +1499,7 @@ the following example (in ``X86CallingConv.td``), the definition of
assigned to the register ``ST0`` or ``ST1``, the ``RetCC_X86Common`` is
invoked.

.. code-block:: llvm
.. code-block:: text

def RetCC_X86_32_C : CallingConv<[
CCIfType<[f32], CCAssignToReg<[ST0, ST1]>>,

@ -1514,7 +1514,7 @@ then a specified action is invoked. In the following example (in
``RetCC_X86_32_Fast`` is invoked. If the ``SSECall`` calling convention is in
use, then ``RetCC_X86_32_SSE`` is invoked.

.. code-block:: llvm
.. code-block:: text

def RetCC_X86_32 : CallingConv<[
CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,

@ -1682,7 +1682,7 @@ feature, the value of the attribute, and a description of the feature. (The
fifth parameter is a list of features whose presence is implied, and its
default value is an empty array.)

.. code-block:: llvm
.. code-block:: text

class SubtargetFeature<string n, string a, string v, string d,
list<SubtargetFeature> i = []> {

@ -1696,7 +1696,7 @@ default value is an empty array.)
In the ``Sparc.td`` file, the ``SubtargetFeature`` is used to define the
following features.

.. code-block:: llvm
.. code-block:: text

def FeatureV9 : SubtargetFeature<"v9", "IsV9", "true",
"Enable SPARC-V9 instructions">;

@ -1710,7 +1710,7 @@ Elsewhere in ``Sparc.td``, the ``Proc`` class is defined and then is used to
define particular SPARC processor subtypes that may have the previously
described features.

.. code-block:: llvm
.. code-block:: text

class Proc<string Name, list<SubtargetFeature> Features>
: Processor<Name, NoItineraries, Features>;

@ -747,7 +747,7 @@ template parameter is the name of the pass that is to be used on the command
line to specify that the pass should be added to a program (for example, with
:program:`opt` or :program:`bugpoint`). The first argument is the name of the
pass, which is to be used for the :option:`-help` output of programs, as well
as for debug output generated by the :option:`--debug-pass` option.
as for debug output generated by the `--debug-pass` option.

If you want your pass to be easily dumpable, you should implement the virtual
print method:

@ -1,11 +1,6 @@
Overview
========

.. warning::

If you are using a released version of LLVM, see `the download page
<http://llvm.org/releases/>`_ to find your documentation.

The LLVM compiler infrastructure supports a wide range of projects, from
industrial strength compilers to specialized JIT applications to small
research projects.

@ -2014,6 +2014,9 @@ void LLVMAddFunctionAttr(LLVMValueRef Fn, LLVMAttribute PA);

void LLVMAddAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
LLVMAttributeRef A);
unsigned LLVMGetAttributeCountAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx);
void LLVMGetAttributesAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
LLVMAttributeRef *Attrs);
LLVMAttributeRef LLVMGetEnumAttributeAtIndex(LLVMValueRef F,
LLVMAttributeIndex Idx,
unsigned KindID);

@ -2600,6 +2603,9 @@ void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index,

void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
LLVMAttributeRef A);
unsigned LLVMGetCallSiteAttributeCount(LLVMValueRef C, LLVMAttributeIndex Idx);
void LLVMGetCallSiteAttributes(LLVMValueRef C, LLVMAttributeIndex Idx,
LLVMAttributeRef *Attrs);
LLVMAttributeRef LLVMGetCallSiteEnumAttribute(LLVMValueRef C,
LLVMAttributeIndex Idx,
unsigned KindID);

@ -27,19 +27,24 @@ template<class GraphType>
struct GraphTraits {
// Elements to provide:

// NOTICE: We are in a transition from migration interfaces that require
// NodeType *, to NodeRef. NodeRef is required to be cheap to copy, but does
// not have to be a raw pointer. In the transition, user should define
// NodeType, and NodeRef = NodeType *.
//
// typedef NodeType - Type of Node in the graph
// typedef NodeRef - NodeType *
// typedef ChildIteratorType - Type used to iterate over children in graph

// static NodeType *getEntryNode(const GraphType &)
// static NodeRef getEntryNode(const GraphType &)
// Return the entry node of the graph

// static ChildIteratorType child_begin(NodeType *)
// static ChildIteratorType child_end (NodeType *)
// static ChildIteratorType child_begin(NodeRef)
// static ChildIteratorType child_end (NodeRef)
// Return iterators that point to the beginning and ending of the child
// node list for the specified node.
//

// typedef ...iterator nodes_iterator;
// static nodes_iterator nodes_begin(GraphType *G)
// static nodes_iterator nodes_end (GraphType *G)

@ -57,7 +62,7 @@ struct GraphTraits {
// your argument to XXX_begin(...) is unknown or needs to have the proper .h
// file #include'd.
//
typedef typename GraphType::UnknownGraphTypeError NodeType;
typedef typename GraphType::UnknownGraphTypeError NodeRef;
};

@ -37,23 +37,22 @@ namespace llvm {
|
||||
/// build up a vector of nodes in a particular SCC. Note that it is a forward
|
||||
/// iterator and thus you cannot backtrack or re-visit nodes.
|
||||
template <class GraphT, class GT = GraphTraits<GraphT>>
|
||||
class scc_iterator
|
||||
: public iterator_facade_base<
|
||||
scc_iterator<GraphT, GT>, std::forward_iterator_tag,
|
||||
const std::vector<typename GT::NodeType *>, ptrdiff_t> {
|
||||
typedef typename GT::NodeType NodeType;
|
||||
class scc_iterator : public iterator_facade_base<
|
||||
scc_iterator<GraphT, GT>, std::forward_iterator_tag,
|
||||
const std::vector<typename GT::NodeRef>, ptrdiff_t> {
|
||||
typedef typename GT::NodeRef NodeRef;
|
||||
typedef typename GT::ChildIteratorType ChildItTy;
|
||||
typedef std::vector<NodeType *> SccTy;
|
||||
typedef std::vector<NodeRef> SccTy;
|
||||
typedef typename scc_iterator::reference reference;
|
||||
|
||||
/// Element of VisitStack during DFS.
|
||||
struct StackElement {
|
||||
NodeType *Node; ///< The current node pointer.
|
||||
NodeRef Node; ///< The current node pointer.
|
||||
ChildItTy NextChild; ///< The next child, modified inplace during DFS.
|
||||
unsigned MinVisited; ///< Minimum uplink value of all children of Node.
|
||||
|
||||
StackElement(NodeType *Node, const ChildItTy &Child, unsigned Min)
|
||||
: Node(Node), NextChild(Child), MinVisited(Min) {}
|
||||
StackElement(NodeRef Node, const ChildItTy &Child, unsigned Min)
|
||||
: Node(Node), NextChild(Child), MinVisited(Min) {}
|
||||
|
||||
bool operator==(const StackElement &Other) const {
|
||||
return Node == Other.Node &&
|
||||
@ -67,10 +66,10 @@ class scc_iterator
|
||||
///
|
||||
/// nodeVisitNumbers are per-node visit numbers, also used as DFS flags.
|
||||
unsigned visitNum;
|
||||
DenseMap<NodeType *, unsigned> nodeVisitNumbers;
|
||||
DenseMap<NodeRef, unsigned> nodeVisitNumbers;
|
||||
|
||||
/// Stack holding nodes of the SCC.
|
||||
std::vector<NodeType *> SCCNodeStack;
|
||||
std::vector<NodeRef> SCCNodeStack;
|
||||
|
||||
/// The current SCC, retrieved using operator*().
|
||||
SccTy CurrentSCC;
|
||||
@ -80,7 +79,7 @@ class scc_iterator
|
||||
std::vector<StackElement> VisitStack;
|
||||
|
||||
/// A single "visit" within the non-recursive DFS traversal.
|
||||
void DFSVisitOne(NodeType *N);
|
||||
void DFSVisitOne(NodeRef N);
|
||||
|
||||
/// The stack-based DFS traversal; defined below.
|
||||
void DFSVisitChildren();
|
||||
@ -88,7 +87,7 @@ class scc_iterator
|
||||
/// Compute the next SCC using the DFS traversal.
|
||||
void GetNextSCC();
|
||||
|
||||
scc_iterator(NodeType *entryN) : visitNum(0) {
|
||||
scc_iterator(NodeRef entryN) : visitNum(0) {
|
||||
DFSVisitOne(entryN);
|
||||
GetNextSCC();
|
||||
}
|
||||
@ -131,7 +130,7 @@ public:
|
||||
|
||||
/// This informs the \c scc_iterator that the specified \c Old node
|
||||
/// has been deleted, and \c New is to be used in its place.
|
||||
void ReplaceNode(NodeType *Old, NodeType *New) {
|
||||
void ReplaceNode(NodeRef Old, NodeRef New) {
|
||||
assert(nodeVisitNumbers.count(Old) && "Old not in scc_iterator?");
|
||||
nodeVisitNumbers[New] = nodeVisitNumbers[Old];
|
||||
nodeVisitNumbers.erase(Old);
|
||||
@ -139,7 +138,7 @@ public:
|
||||
};
|
||||
|
||||
template <class GraphT, class GT>
|
||||
void scc_iterator<GraphT, GT>::DFSVisitOne(NodeType *N) {
|
||||
void scc_iterator<GraphT, GT>::DFSVisitOne(NodeRef N) {
|
||||
++visitNum;
|
||||
nodeVisitNumbers[N] = visitNum;
|
||||
SCCNodeStack.push_back(N);
|
||||
@ -155,8 +154,8 @@ void scc_iterator<GraphT, GT>::DFSVisitChildren() {
|
||||
assert(!VisitStack.empty());
|
||||
while (VisitStack.back().NextChild != GT::child_end(VisitStack.back().Node)) {
|
||||
// TOS has at least one more child so continue DFS
|
||||
NodeType *childN = *VisitStack.back().NextChild++;
|
||||
typename DenseMap<NodeType *, unsigned>::iterator Visited =
|
||||
NodeRef childN = *VisitStack.back().NextChild++;
|
||||
typename DenseMap<NodeRef, unsigned>::iterator Visited =
|
||||
nodeVisitNumbers.find(childN);
|
||||
if (Visited == nodeVisitNumbers.end()) {
|
||||
// this node has never been seen.
|
||||
@ -176,7 +175,7 @@ template <class GraphT, class GT> void scc_iterator<GraphT, GT>::GetNextSCC() {
|
||||
DFSVisitChildren();
|
||||
|
||||
// Pop the leaf on top of the VisitStack.
|
||||
NodeType *visitingN = VisitStack.back().Node;
|
||||
NodeRef visitingN = VisitStack.back().Node;
|
||||
unsigned minVisitNum = VisitStack.back().MinVisited;
|
||||
assert(VisitStack.back().NextChild == GT::child_end(visitingN));
|
||||
VisitStack.pop_back();
|
||||
@ -212,7 +211,7 @@ bool scc_iterator<GraphT, GT>::hasLoop() const {
|
||||
assert(!CurrentSCC.empty() && "Dereferencing END SCC iterator!");
|
||||
if (CurrentSCC.size() > 1)
|
||||
return true;
|
||||
NodeType *N = CurrentSCC.front();
|
||||
NodeRef N = CurrentSCC.front();
|
||||
for (ChildItTy CI = GT::child_begin(N), CE = GT::child_end(N); CI != CE;
|
||||
++CI)
|
||||
if (*CI == N)
|
||||
|
@ -26,10 +26,18 @@
|
||||
#include <memory>
|
||||
#include <utility> // for std::pair
|
||||
|
||||
#include "llvm/ADT/Optional.h"
|
||||
#include "llvm/ADT/iterator.h"
|
||||
#include "llvm/ADT/iterator_range.h"
|
||||
#include "llvm/Support/Compiler.h"
|
||||
|
||||
namespace llvm {
|
||||
namespace detail {
|
||||
|
||||
template <typename RangeT>
|
||||
using IterOfRange = decltype(std::begin(std::declval<RangeT>()));
|
||||
|
||||
} // End detail namespace
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Extra additions to <functional>
|
||||
@ -235,6 +243,90 @@ auto reverse(
|
||||
llvm::make_reverse_iterator(std::begin(C)));
|
||||
}
|
||||
|
||||
/// An iterator adaptor that filters the elements of given inner iterators.
|
||||
///
|
||||
/// The predicate parameter should be a callable object that accepts the wrapped
|
||||
/// iterator's reference type and returns a bool. When incrementing or
|
||||
/// decrementing the iterator, it will call the predicate on each element and
|
||||
/// skip any where it returns false.
|
||||
///
|
||||
/// \code
|
||||
/// int A[] = { 1, 2, 3, 4 };
|
||||
/// auto R = make_filter_range(A, [](int N) { return N % 2 == 1; });
|
||||
/// // R contains { 1, 3 }.
|
||||
/// \endcode
|
||||
template <typename WrappedIteratorT, typename PredicateT>
|
||||
class filter_iterator
|
||||
: public iterator_adaptor_base<
|
||||
filter_iterator<WrappedIteratorT, PredicateT>, WrappedIteratorT,
|
||||
typename std::common_type<
|
||||
std::forward_iterator_tag,
|
||||
typename std::iterator_traits<
|
||||
WrappedIteratorT>::iterator_category>::type> {
|
||||
using BaseT = iterator_adaptor_base<
|
||||
filter_iterator<WrappedIteratorT, PredicateT>, WrappedIteratorT,
|
||||
typename std::common_type<
|
||||
std::forward_iterator_tag,
|
||||
typename std::iterator_traits<WrappedIteratorT>::iterator_category>::
|
||||
type>;
|
||||
|
||||
struct PayloadType {
|
||||
WrappedIteratorT End;
|
||||
PredicateT Pred;
|
||||
};
|
||||
|
||||
Optional<PayloadType> Payload;
|
||||
|
||||
void findNextValid() {
|
||||
assert(Payload && "Payload should be engaged when findNextValid is called");
|
||||
while (this->I != Payload->End && !Payload->Pred(*this->I))
|
||||
BaseT::operator++();
|
||||
}
|
||||
|
||||
// Construct the begin iterator. The begin iterator requires to know where end
|
||||
// is, so that it can properly stop when it hits end.
|
||||
filter_iterator(WrappedIteratorT Begin, WrappedIteratorT End, PredicateT Pred)
|
||||
: BaseT(std::move(Begin)),
|
||||
Payload(PayloadType{std::move(End), std::move(Pred)}) {
|
||||
findNextValid();
|
||||
}
|
||||
|
||||
// Construct the end iterator. It's not incrementable, so Payload doesn't
|
||||
// have to be engaged.
|
||||
filter_iterator(WrappedIteratorT End) : BaseT(End) {}
|
||||
|
||||
public:
|
||||
using BaseT::operator++;
|
||||
|
||||
filter_iterator &operator++() {
|
||||
BaseT::operator++();
|
||||
findNextValid();
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename RT, typename PT>
|
||||
friend iterator_range<filter_iterator<detail::IterOfRange<RT>, PT>>
|
||||
make_filter_range(RT &&, PT);
|
||||
};
|
||||
|
||||
/// Convenience function that takes a range of elements and a predicate,
|
||||
/// and return a new filter_iterator range.
|
||||
///
|
||||
/// FIXME: Currently if RangeT && is a rvalue reference to a temporary, the
|
||||
/// lifetime of that temporary is not kept by the returned range object, and the
|
||||
/// temporary is going to be dropped on the floor after the make_iterator_range
|
||||
/// full expression that contains this function call.
|
||||
template <typename RangeT, typename PredicateT>
|
||||
iterator_range<filter_iterator<detail::IterOfRange<RangeT>, PredicateT>>
|
||||
make_filter_range(RangeT &&Range, PredicateT Pred) {
|
||||
using FilterIteratorT =
|
||||
filter_iterator<detail::IterOfRange<RangeT>, PredicateT>;
|
||||
return make_range(FilterIteratorT(std::begin(std::forward<RangeT>(Range)),
|
||||
std::end(std::forward<RangeT>(Range)),
|
||||
std::move(Pred)),
|
||||
FilterIteratorT(std::end(std::forward<RangeT>(Range))));
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Extra additions to <utility>
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -174,6 +174,7 @@ public:
|
||||
UnknownEnvironment,
|
||||
|
||||
GNU,
|
||||
GNUABI64,
|
||||
GNUEABI,
|
||||
GNUEABIHF,
|
||||
GNUX32,
|
||||
@ -476,8 +477,9 @@ public:
|
||||
|
||||
bool isGNUEnvironment() const {
|
||||
EnvironmentType Env = getEnvironment();
|
||||
return Env == Triple::GNU || Env == Triple::GNUEABI ||
|
||||
Env == Triple::GNUEABIHF || Env == Triple::GNUX32;
|
||||
return Env == Triple::GNU || Env == Triple::GNUABI64 ||
|
||||
Env == Triple::GNUEABI || Env == Triple::GNUEABIHF ||
|
||||
Env == Triple::GNUX32;
|
||||
}
|
||||
|
||||
/// Checks if the environment could be MSVC.
|
||||
|
@ -155,7 +155,14 @@ template <
|
||||
typename T = typename std::iterator_traits<WrappedIteratorT>::value_type,
|
||||
typename DifferenceTypeT =
|
||||
typename std::iterator_traits<WrappedIteratorT>::difference_type,
|
||||
typename PointerT = T *, typename ReferenceT = T &,
|
||||
typename PointerT = typename std::conditional<
|
||||
std::is_same<T, typename std::iterator_traits<
|
||||
WrappedIteratorT>::value_type>::value,
|
||||
typename std::iterator_traits<WrappedIteratorT>::pointer, T *>::type,
|
||||
typename ReferenceT = typename std::conditional<
|
||||
std::is_same<T, typename std::iterator_traits<
|
||||
WrappedIteratorT>::value_type>::value,
|
||||
typename std::iterator_traits<WrappedIteratorT>::reference, T &>::type,
|
||||
// Don't provide these, they are mostly to act as aliases below.
|
||||
typename WrappedTraitsT = std::iterator_traits<WrappedIteratorT>>
|
||||
class iterator_adaptor_base
|
||||
@ -168,15 +175,7 @@ protected:
|
||||
|
||||
iterator_adaptor_base() = default;
|
||||
|
||||
template <typename U>
|
||||
explicit iterator_adaptor_base(
|
||||
U &&u,
|
||||
typename std::enable_if<
|
||||
!std::is_base_of<typename std::remove_cv<
|
||||
typename std::remove_reference<U>::type>::type,
|
||||
DerivedT>::value,
|
||||
int>::type = 0)
|
||||
: I(std::forward<U &&>(u)) {}
|
||||
explicit iterator_adaptor_base(WrappedIteratorT u) : I(std::move(u)) {}
|
||||
|
||||
const WrappedIteratorT &wrapped() const { return I; }
|
||||
|
||||
|
@ -410,6 +410,7 @@ public:
|
||||
// traversals.
|
||||
template <> struct GraphTraits<CallGraphNode *> {
|
||||
typedef CallGraphNode NodeType;
|
||||
typedef CallGraphNode *NodeRef;
|
||||
|
||||
typedef CallGraphNode::CallRecord CGNPairTy;
|
||||
typedef std::pointer_to_unary_function<CGNPairTy, CallGraphNode *>
|
||||
@ -431,6 +432,7 @@ template <> struct GraphTraits<CallGraphNode *> {
|
||||
|
||||
template <> struct GraphTraits<const CallGraphNode *> {
|
||||
typedef const CallGraphNode NodeType;
|
||||
typedef const CallGraphNode *NodeRef;
|
||||
|
||||
typedef CallGraphNode::CallRecord CGNPairTy;
|
||||
typedef std::pointer_to_unary_function<CGNPairTy, const CallGraphNode *>
|
||||
|
@ -196,6 +196,13 @@ namespace llvm {
|
||||
/// block.
|
||||
Value *expandCodeFor(const SCEV *SH, Type *Ty, Instruction *I);
|
||||
|
||||
/// \brief Insert code to directly compute the specified SCEV expression
|
||||
/// into the program. The inserted code is inserted into the SCEVExpander's
|
||||
/// current insertion point. If a type is specified, the result will be
|
||||
/// expanded to have that type, with a cast if necessary.
|
||||
Value *expandCodeFor(const SCEV *SH, Type *Ty = nullptr);
|
||||
|
||||
|
||||
/// \brief Generates a code sequence that evaluates this predicate.
|
||||
/// The inserted instructions will be at position \p Loc.
|
||||
/// The result will be of type i1 and will have a value of 0 when the
|
||||
@ -253,6 +260,15 @@ namespace llvm {
|
||||
|
||||
void enableLSRMode() { LSRMode = true; }
|
||||
|
||||
/// \brief Set the current insertion point. This is useful if multiple calls
|
||||
/// to expandCodeFor() are going to be made with the same insert point and
|
||||
/// the insert point may be moved during one of the expansions (e.g. if the
|
||||
/// insert point is not a block terminator).
|
||||
void setInsertPoint(Instruction *IP) {
|
||||
assert(IP);
|
||||
Builder.SetInsertPoint(IP);
|
||||
}
|
||||
|
||||
/// \brief Clear the current insertion point. This is useful if the
|
||||
/// instruction that had been serving as the insertion point may have been
|
||||
/// deleted.
|
||||
@ -313,12 +329,6 @@ namespace llvm {
|
||||
|
||||
Value *expand(const SCEV *S);
|
||||
|
||||
/// \brief Insert code to directly compute the specified SCEV expression
|
||||
/// into the program. The inserted code is inserted into the SCEVExpander's
|
||||
/// current insertion point. If a type is specified, the result will be
|
||||
/// expanded to have that type, with a cast if necessary.
|
||||
Value *expandCodeFor(const SCEV *SH, Type *Ty = nullptr);
|
||||
|
||||
/// \brief Determine the most "relevant" loop for the given SCEV.
|
||||
const Loop *getRelevantLoop(const SCEV *);
|
||||
|
||||
|
@ -740,6 +740,7 @@ struct MBB2NumberFunctor :
|
||||
|
||||
template <> struct GraphTraits<MachineBasicBlock *> {
|
||||
typedef MachineBasicBlock NodeType;
|
||||
typedef MachineBasicBlock *NodeRef;
|
||||
typedef MachineBasicBlock::succ_iterator ChildIteratorType;
|
||||
|
||||
static NodeType *getEntryNode(MachineBasicBlock *BB) { return BB; }
|
||||
@ -753,6 +754,7 @@ template <> struct GraphTraits<MachineBasicBlock *> {
|
||||
|
||||
template <> struct GraphTraits<const MachineBasicBlock *> {
|
||||
typedef const MachineBasicBlock NodeType;
|
||||
typedef const MachineBasicBlock *NodeRef;
|
||||
typedef MachineBasicBlock::const_succ_iterator ChildIteratorType;
|
||||
|
||||
static NodeType *getEntryNode(const MachineBasicBlock *BB) { return BB; }
|
||||
@ -772,6 +774,7 @@ template <> struct GraphTraits<const MachineBasicBlock *> {
|
||||
//
|
||||
template <> struct GraphTraits<Inverse<MachineBasicBlock*> > {
|
||||
typedef MachineBasicBlock NodeType;
|
||||
typedef MachineBasicBlock *NodeRef;
|
||||
typedef MachineBasicBlock::pred_iterator ChildIteratorType;
|
||||
static NodeType *getEntryNode(Inverse<MachineBasicBlock *> G) {
|
||||
return G.Graph;
|
||||
@ -786,6 +789,7 @@ template <> struct GraphTraits<Inverse<MachineBasicBlock*> > {
|
||||
|
||||
template <> struct GraphTraits<Inverse<const MachineBasicBlock*> > {
|
||||
typedef const MachineBasicBlock NodeType;
|
||||
typedef const MachineBasicBlock *NodeRef;
|
||||
typedef MachineBasicBlock::const_pred_iterator ChildIteratorType;
|
||||
static NodeType *getEntryNode(Inverse<const MachineBasicBlock*> G) {
|
||||
return G.Graph;
|
||||
|
@ -210,6 +210,7 @@ public:
|
||||
private:
|
||||
friend class AttrBuilder;
|
||||
friend class AttributeSetImpl;
|
||||
friend class AttributeSetNode;
|
||||
template <typename Ty> friend struct DenseMapInfo;
|
||||
|
||||
/// \brief The attributes that we are managing. This can be null to represent
|
||||
|
@ -155,6 +155,7 @@ struct isPodLike<TerminatorInst::SuccIterator<T, U>> {
|
||||
|
||||
template <> struct GraphTraits<BasicBlock*> {
|
||||
typedef BasicBlock NodeType;
|
||||
typedef BasicBlock *NodeRef;
|
||||
typedef succ_iterator ChildIteratorType;
|
||||
|
||||
static NodeType *getEntryNode(BasicBlock *BB) { return BB; }
|
||||
@ -168,6 +169,7 @@ template <> struct GraphTraits<BasicBlock*> {
|
||||
|
||||
template <> struct GraphTraits<const BasicBlock*> {
|
||||
typedef const BasicBlock NodeType;
|
||||
typedef const BasicBlock *NodeRef;
|
||||
typedef succ_const_iterator ChildIteratorType;
|
||||
|
||||
static NodeType *getEntryNode(const BasicBlock *BB) { return BB; }
|
||||
@ -187,6 +189,7 @@ template <> struct GraphTraits<const BasicBlock*> {
|
||||
//
|
||||
template <> struct GraphTraits<Inverse<BasicBlock*> > {
|
||||
typedef BasicBlock NodeType;
|
||||
typedef BasicBlock *NodeRef;
|
||||
typedef pred_iterator ChildIteratorType;
|
||||
static NodeType *getEntryNode(Inverse<BasicBlock *> G) { return G.Graph; }
|
||||
static inline ChildIteratorType child_begin(NodeType *N) {
|
||||
@ -199,6 +202,7 @@ template <> struct GraphTraits<Inverse<BasicBlock*> > {
|
||||
|
||||
template <> struct GraphTraits<Inverse<const BasicBlock*> > {
|
||||
typedef const BasicBlock NodeType;
|
||||
typedef const BasicBlock *NodeRef;
|
||||
typedef const_pred_iterator ChildIteratorType;
|
||||
static NodeType *getEntryNode(Inverse<const BasicBlock*> G) {
|
||||
return G.Graph;
|
||||
|
@ -479,6 +479,8 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
||||
Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
|
||||
def int_x86_sse2_cvtps2dq : GCCBuiltin<"__builtin_ia32_cvtps2dq">,
|
||||
Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
|
||||
def int_x86_sse2_cvttps2dq : GCCBuiltin<"__builtin_ia32_cvttps2dq">,
|
||||
Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
|
||||
def int_x86_sse2_cvtsd2si : GCCBuiltin<"__builtin_ia32_cvtsd2si">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
|
||||
def int_x86_sse2_cvtsd2si64 : GCCBuiltin<"__builtin_ia32_cvtsd2si64">,
|
||||
@ -1512,8 +1514,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
||||
Intrinsic<[llvm_v4f32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
|
||||
def int_x86_avx_cvt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvtps2dq256">,
|
||||
Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
|
||||
def int_x86_avx_cvtt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvttpd2dq256">,
|
||||
Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
|
||||
def int_x86_avx_cvt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvtpd2dq256">,
|
||||
Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
|
||||
def int_x86_avx_cvtt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvttps2dq256">,
|
||||
Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
|
||||
}
|
||||
|
||||
// Vector bit test
|
||||
|
@ -2349,6 +2349,10 @@ public:
|
||||
/// from getBooleanContents().
|
||||
bool isConstFalseVal(const SDNode *N) const;
|
||||
|
||||
/// Return a constant of type VT that contains a true value that respects
|
||||
/// getBooleanContents()
|
||||
SDValue getConstTrueVal(SelectionDAG &DAG, EVT VT, const SDLoc &DL) const;
|
||||
|
||||
/// Return if \p N is a True value when extended to \p VT.
|
||||
bool isExtendedTrueVal(const ConstantSDNode *N, EVT VT, bool Signed) const;
|
||||
|
||||
|
@ -623,6 +623,7 @@ template <> struct GraphTraits<IrreducibleGraph> {
|
||||
typedef bfi_detail::IrreducibleGraph GraphT;
|
||||
|
||||
typedef const GraphT::IrrNode NodeType;
|
||||
typedef const GraphT::IrrNode *NodeRef;
|
||||
typedef GraphT::IrrNode::iterator ChildIteratorType;
|
||||
|
||||
static const NodeType *getEntryNode(const GraphT &G) {
|
||||
|
@ -1424,8 +1424,8 @@ Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), double V,
|
||||
/// integer type Ty is used to select how many bits are available for the
|
||||
/// result. Returns null if the conversion cannot be performed, otherwise
|
||||
/// returns the Constant value resulting from the conversion.
|
||||
Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero,
|
||||
Type *Ty) {
|
||||
Constant *ConstantFoldSSEConvertToInt(const APFloat &Val, bool roundTowardZero,
|
||||
Type *Ty) {
|
||||
// All of these conversion intrinsics form an integer of at most 64bits.
|
||||
unsigned ResultWidth = Ty->getIntegerBitWidth();
|
||||
assert(ResultWidth <= 64 &&
|
||||
@ -1438,7 +1438,8 @@ Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero,
|
||||
APFloat::opStatus status = Val.convertToInteger(&UIntVal, ResultWidth,
|
||||
/*isSigned=*/true, mode,
|
||||
&isExact);
|
||||
if (status != APFloat::opOK && status != APFloat::opInexact)
|
||||
if (status != APFloat::opOK &&
|
||||
(!roundTowardZero || status != APFloat::opInexact))
|
||||
return nullptr;
|
||||
return ConstantInt::get(Ty, UIntVal, /*isSigned=*/true);
|
||||
}
|
||||
@ -1676,17 +1677,17 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
|
||||
case Intrinsic::x86_sse2_cvtsd2si:
|
||||
case Intrinsic::x86_sse2_cvtsd2si64:
|
||||
if (ConstantFP *FPOp =
|
||||
dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
|
||||
return ConstantFoldConvertToInt(FPOp->getValueAPF(),
|
||||
/*roundTowardZero=*/false, Ty);
|
||||
dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
|
||||
return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(),
|
||||
/*roundTowardZero=*/false, Ty);
|
||||
case Intrinsic::x86_sse_cvttss2si:
|
||||
case Intrinsic::x86_sse_cvttss2si64:
|
||||
case Intrinsic::x86_sse2_cvttsd2si:
|
||||
case Intrinsic::x86_sse2_cvttsd2si64:
|
||||
if (ConstantFP *FPOp =
|
||||
dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
|
||||
return ConstantFoldConvertToInt(FPOp->getValueAPF(),
|
||||
/*roundTowardZero=*/true, Ty);
|
||||
dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
|
||||
return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(),
|
||||
/*roundTowardZero=*/true, Ty);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3400,7 +3400,10 @@ static Value *SimplifySelectInst(Value *CondVal, Value *TrueVal,
|
||||
return TrueVal;
|
||||
|
||||
if (const auto *ICI = dyn_cast<ICmpInst>(CondVal)) {
|
||||
unsigned BitWidth = Q.DL.getTypeSizeInBits(TrueVal->getType());
|
||||
// FIXME: This code is nearly duplicated in InstCombine. Using/refactoring
|
||||
// decomposeBitTestICmp() might help.
|
||||
unsigned BitWidth =
|
||||
Q.DL.getTypeSizeInBits(TrueVal->getType()->getScalarType());
|
||||
ICmpInst::Predicate Pred = ICI->getPredicate();
|
||||
Value *CmpLHS = ICI->getOperand(0);
|
||||
Value *CmpRHS = ICI->getOperand(1);
|
||||
@ -4274,7 +4277,8 @@ static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV,
|
||||
|
||||
// Gracefully handle edge cases where the instruction is not wired into any
|
||||
// parent block.
|
||||
if (I->getParent())
|
||||
if (I->getParent() && !I->isEHPad() && !isa<TerminatorInst>(I) &&
|
||||
!I->mayHaveSideEffects())
|
||||
I->eraseFromParent();
|
||||
} else {
|
||||
Worklist.insert(I);
|
||||
@ -4302,7 +4306,8 @@ static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV,
|
||||
|
||||
// Gracefully handle edge cases where the instruction is not wired into any
|
||||
// parent block.
|
||||
if (I->getParent())
|
||||
if (I->getParent() && !I->isEHPad() && !isa<TerminatorInst>(I) &&
|
||||
!I->mayHaveSideEffects())
|
||||
I->eraseFromParent();
|
||||
}
|
||||
return Simplified;
|
||||
|
@ -115,13 +115,19 @@ bool UnrolledInstAnalyzer::visitLoad(LoadInst &I) {
|
||||
// We might have a vector load from an array. FIXME: for now we just bail
|
||||
// out in this case, but we should be able to resolve and simplify such
|
||||
// loads.
|
||||
if(CDS->getElementType() != I.getType())
|
||||
if (CDS->getElementType() != I.getType())
|
||||
return false;
|
||||
|
||||
int ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U;
|
||||
if (SimplifiedAddrOp->getValue().getActiveBits() >= 64)
|
||||
unsigned ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U;
|
||||
if (SimplifiedAddrOp->getValue().getActiveBits() > 64)
|
||||
return false;
|
||||
int64_t Index = SimplifiedAddrOp->getSExtValue() / ElemSize;
|
||||
int64_t SimplifiedAddrOpV = SimplifiedAddrOp->getSExtValue();
|
||||
if (SimplifiedAddrOpV < 0) {
|
||||
// FIXME: For now we conservatively ignore out of bound accesses, but
|
||||
// we're allowed to perform the optimization in this case.
|
||||
return false;
|
||||
}
|
||||
uint64_t Index = static_cast<uint64_t>(SimplifiedAddrOpV) / ElemSize;
|
||||
if (Index >= CDS->getNumElements()) {
|
||||
// FIXME: For now we conservatively ignore out of bound accesses, but
|
||||
// we're allowed to perform the optimization in this case.
|
||||
|
@ -1610,8 +1610,7 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
|
||||
|
||||
Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty,
|
||||
Instruction *IP) {
|
||||
assert(IP);
|
||||
Builder.SetInsertPoint(IP);
|
||||
setInsertPoint(IP);
|
||||
return expandCodeFor(SH, Ty);
|
||||
}
|
||||
|
||||
|
@ -214,10 +214,7 @@ TypeIndex CodeViewDebug::getScopeIndex(const DIScope *Scope) {
|
||||
}
|
||||
|
||||
TypeIndex CodeViewDebug::getFuncIdForSubprogram(const DISubprogram *SP) {
|
||||
// It's possible to ask for the FuncId of a function which doesn't have a
|
||||
// subprogram: inlining a function with debug info into a function with none.
|
||||
if (!SP)
|
||||
return TypeIndex::None();
|
||||
assert(SP);
|
||||
|
||||
// Check if we've already translated this subprogram.
|
||||
auto I = TypeIndices.find({SP, nullptr});
|
||||
@ -621,11 +618,12 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
|
||||
|
||||
std::string FuncName;
|
||||
auto *SP = GV->getSubprogram();
|
||||
assert(SP);
|
||||
setCurrentSubprogram(SP);
|
||||
|
||||
// If we have a display name, build the fully qualified name by walking the
|
||||
// chain of scopes.
|
||||
if (SP != nullptr && !SP->getDisplayName().empty())
|
||||
if (!SP->getDisplayName().empty())
|
||||
FuncName =
|
||||
getFullyQualifiedName(SP->getScope().resolve(), SP->getDisplayName());
|
||||
|
||||
@ -864,7 +862,7 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) {
|
||||
void CodeViewDebug::beginFunction(const MachineFunction *MF) {
|
||||
assert(!CurFn && "Can't process two functions at once!");
|
||||
|
||||
if (!Asm || !MMI->hasDebugInfo())
|
||||
if (!Asm || !MMI->hasDebugInfo() || !MF->getFunction()->getSubprogram())
|
||||
return;
|
||||
|
||||
DebugHandlerBase::beginFunction(MF);
|
||||
@ -1939,7 +1937,8 @@ void CodeViewDebug::beginInstruction(const MachineInstr *MI) {
|
||||
DebugHandlerBase::beginInstruction(MI);
|
||||
|
||||
// Ignore DBG_VALUE locations and function prologue.
|
||||
if (!Asm || MI->isDebugValue() || MI->getFlag(MachineInstr::FrameSetup))
|
||||
if (!Asm || !CurFn || MI->isDebugValue() ||
|
||||
MI->getFlag(MachineInstr::FrameSetup))
|
||||
return;
|
||||
DebugLoc DL = MI->getDebugLoc();
|
||||
if (DL == PrevInstLoc || !DL)
|
||||
|
@ -996,6 +996,24 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
|
||||
MachineBasicBlock *IBB = &*I;
|
||||
MachineBasicBlock *PredBB = &*std::prev(I);
|
||||
MergePotentials.clear();
|
||||
MachineLoop *ML;
|
||||
|
||||
// Bail if merging after placement and IBB is the loop header because
|
||||
// -- If merging predecessors that belong to the same loop as IBB, the
|
||||
// common tail of merged predecessors may become the loop top if block
|
||||
// placement is called again and the predecessors may branch to this common
|
||||
// tail and require more branches. This can be relaxed if
|
||||
// MachineBlockPlacement::findBestLoopTop is more flexible.
|
||||
// --If merging predecessors that do not belong to the same loop as IBB, the
|
||||
// loop info of IBB's loop and the other loops may be affected. Calling the
|
||||
// block placement again may make big change to the layout and eliminate the
|
||||
// reason to do tail merging here.
|
||||
if (AfterBlockPlacement && MLI) {
|
||||
ML = MLI->getLoopFor(IBB);
|
||||
if (ML && IBB == ML->getHeader())
|
||||
continue;
|
||||
}
|
||||
|
||||
for (MachineBasicBlock *PBB : I->predecessors()) {
|
||||
if (MergePotentials.size() == TailMergeThreshold)
|
||||
break;
|
||||
@ -1015,16 +1033,12 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
|
||||
if (PBB->hasEHPadSuccessor())
|
||||
continue;
|
||||
|
||||
// Bail out if the loop header (IBB) is not the top of the loop chain
|
||||
// after the block placement. Otherwise, the common tail of IBB's
|
||||
// predecessors may become the loop top if block placement is called again
|
||||
// and the predecessors may branch to this common tail.
|
||||
// FIXME: Relaxed this check if the algorithm of finding loop top is
|
||||
// changed in MBP.
|
||||
// After block placement, only consider predecessors that belong to the
|
||||
// same loop as IBB. The reason is the same as above when skipping loop
|
||||
// header.
|
||||
if (AfterBlockPlacement && MLI)
|
||||
if (MachineLoop *ML = MLI->getLoopFor(IBB))
|
||||
if (IBB == ML->getHeader() && ML == MLI->getLoopFor(PBB))
|
||||
continue;
|
||||
if (ML != MLI->getLoopFor(PBB))
|
||||
continue;
|
||||
|
||||
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
|
||||
SmallVector<MachineOperand, 4> Cond;
|
||||
|
@ -530,7 +530,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
|
||||
unsigned Align =
|
||||
std::max(DL->getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment());
|
||||
SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot),
|
||||
Align, SSC.getLiveRange(StackGuardSlot));
|
||||
Align, SSC.getFullLiveRange());
|
||||
}
|
||||
|
||||
for (Argument *Arg : ByValArguments) {
|
||||
|
@ -25,7 +25,9 @@ static cl::opt<bool> ClColoring("safe-stack-coloring",
|
||||
cl::Hidden, cl::init(true));
|
||||
|
||||
const StackColoring::LiveRange &StackColoring::getLiveRange(AllocaInst *AI) {
|
||||
return LiveRanges[AllocaNumbering[AI]];
|
||||
const auto IT = AllocaNumbering.find(AI);
|
||||
assert(IT != AllocaNumbering.end());
|
||||
return LiveRanges[IT->second];
|
||||
}
|
||||
|
||||
bool StackColoring::readMarker(Instruction *I, bool *IsStart) {
|
||||
|
@ -100,7 +100,8 @@ void StackLayout::layoutObject(StackObject &Obj) {
|
||||
}
|
||||
|
||||
// Split starting and ending regions if necessary.
|
||||
for (StackRegion &R : Regions) {
|
||||
for (unsigned i = 0; i < Regions.size(); ++i) {
|
||||
StackRegion &R = Regions[i];
|
||||
if (Start > R.Start && Start < R.End) {
|
||||
StackRegion R0 = R;
|
||||
R.Start = R0.End = Start;
|
||||
|
@ -6198,13 +6198,27 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
|
||||
}
|
||||
}
|
||||
|
||||
// sext(setcc x, y, cc) -> (select (setcc x, y, cc), -1, 0)
|
||||
unsigned ElementWidth = VT.getScalarType().getSizeInBits();
|
||||
// sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0)
|
||||
// Here, T can be 1 or -1, depending on the type of the setcc and
|
||||
// getBooleanContents().
|
||||
unsigned SetCCWidth = N0.getValueType().getScalarSizeInBits();
|
||||
|
||||
SDLoc DL(N);
|
||||
SDValue NegOne =
|
||||
DAG.getConstant(APInt::getAllOnesValue(ElementWidth), DL, VT);
|
||||
// To determine the "true" side of the select, we need to know the high bit
|
||||
// of the value returned by the setcc if it evaluates to true.
|
||||
// If the type of the setcc is i1, then the true case of the select is just
|
||||
// sext(i1 1), that is, -1.
|
||||
// If the type of the setcc is larger (say, i8) then the value of the high
|
||||
// bit depends on getBooleanContents(). So, ask TLI for a real "true" value
|
||||
// of the appropriate width.
|
||||
SDValue ExtTrueVal =
|
||||
(SetCCWidth == 1)
|
||||
? DAG.getConstant(APInt::getAllOnesValue(VT.getScalarSizeInBits()),
|
||||
DL, VT)
|
||||
: TLI.getConstTrueVal(DAG, VT, DL);
|
||||
|
||||
if (SDValue SCC = SimplifySelectCC(
|
||||
DL, N0.getOperand(0), N0.getOperand(1), NegOne,
|
||||
DL, N0.getOperand(0), N0.getOperand(1), ExtTrueVal,
|
||||
DAG.getConstant(0, DL, VT),
|
||||
cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
|
||||
return SCC;
|
||||
@ -6215,10 +6229,10 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
|
||||
TLI.isOperationLegal(ISD::SETCC, N0.getOperand(0).getValueType())) {
|
||||
SDLoc DL(N);
|
||||
ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
|
||||
SDValue SetCC = DAG.getSetCC(DL, SetCCVT,
|
||||
N0.getOperand(0), N0.getOperand(1), CC);
|
||||
return DAG.getSelect(DL, VT, SetCC,
|
||||
NegOne, DAG.getConstant(0, DL, VT));
|
||||
SDValue SetCC =
|
||||
DAG.getSetCC(DL, SetCCVT, N0.getOperand(0), N0.getOperand(1), CC);
|
||||
return DAG.getSelect(DL, VT, SetCC, ExtTrueVal,
|
||||
DAG.getConstant(0, DL, VT));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -6639,19 +6639,26 @@ void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) {
|
||||
SDNode *FromNode = From.getNode();
|
||||
SDNode *ToNode = To.getNode();
|
||||
ArrayRef<SDDbgValue *> DVs = GetDbgValues(FromNode);
|
||||
SmallVector<SDDbgValue *, 2> ClonedDVs;
|
||||
for (ArrayRef<SDDbgValue *>::iterator I = DVs.begin(), E = DVs.end();
|
||||
I != E; ++I) {
|
||||
SDDbgValue *Dbg = *I;
|
||||
// Only add Dbgvalues attached to same ResNo.
|
||||
if (Dbg->getKind() == SDDbgValue::SDNODE &&
|
||||
Dbg->getResNo() == From.getResNo()) {
|
||||
Dbg->getSDNode() == From.getNode() &&
|
||||
Dbg->getResNo() == From.getResNo() && !Dbg->isInvalidated()) {
|
||||
assert(FromNode != ToNode &&
|
||||
"Should not transfer Debug Values intranode");
|
||||
SDDbgValue *Clone =
|
||||
getDbgValue(Dbg->getVariable(), Dbg->getExpression(), ToNode,
|
||||
To.getResNo(), Dbg->isIndirect(), Dbg->getOffset(),
|
||||
Dbg->getDebugLoc(), Dbg->getOrder());
|
||||
AddDbgValue(Clone, ToNode, false);
|
||||
ClonedDVs.push_back(Clone);
|
||||
Dbg->setIsInvalidated();
|
||||
}
|
||||
}
|
||||
for (SDDbgValue *I : ClonedDVs)
|
||||
AddDbgValue(I, ToNode, false);
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -1234,6 +1234,16 @@ bool TargetLowering::isConstTrueVal(const SDNode *N) const {
|
||||
llvm_unreachable("Invalid boolean contents");
|
||||
}
|
||||
|
||||
SDValue TargetLowering::getConstTrueVal(SelectionDAG &DAG, EVT VT,
|
||||
const SDLoc &DL) const {
|
||||
unsigned ElementWidth = VT.getScalarSizeInBits();
|
||||
APInt TrueInt =
|
||||
getBooleanContents(VT) == TargetLowering::ZeroOrOneBooleanContent
|
||||
? APInt(ElementWidth, 1)
|
||||
: APInt::getAllOnesValue(ElementWidth);
|
||||
return DAG.getConstant(TrueInt, DL, VT);
|
||||
}
|
||||
|
||||
bool TargetLowering::isConstFalseVal(const SDNode *N) const {
|
||||
if (!N)
|
||||
return false;
|
||||
|
@ -29,7 +29,7 @@
|
||||
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/ADT/SmallSet.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/Analysis/AliasAnalysis.h"
|
||||
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
|
||||
@ -539,6 +539,16 @@ regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) {
|
||||
return TRI->regsOverlap(RegA, RegB);
|
||||
}
|
||||
|
||||
// Returns true if Reg is equal or aliased to at least one register in Set.
|
||||
static bool regOverlapsSet(const SmallVectorImpl<unsigned> &Set, unsigned Reg,
|
||||
const TargetRegisterInfo *TRI) {
|
||||
for (unsigned R : Set)
|
||||
if (TRI->regsOverlap(R, Reg))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Return true if it's potentially profitable to commute the two-address
|
||||
/// instruction that's being processed.
|
||||
bool
|
||||
@ -864,9 +874,9 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
|
||||
// FIXME: Needs more sophisticated heuristics.
|
||||
return false;
|
||||
|
||||
SmallSet<unsigned, 2> Uses;
|
||||
SmallSet<unsigned, 2> Kills;
|
||||
SmallSet<unsigned, 2> Defs;
|
||||
SmallVector<unsigned, 2> Uses;
|
||||
SmallVector<unsigned, 2> Kills;
|
||||
SmallVector<unsigned, 2> Defs;
|
||||
for (const MachineOperand &MO : MI->operands()) {
|
||||
if (!MO.isReg())
|
||||
continue;
|
||||
@ -874,12 +884,12 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
|
||||
if (!MOReg)
|
||||
continue;
|
||||
if (MO.isDef())
|
||||
Defs.insert(MOReg);
|
||||
Defs.push_back(MOReg);
|
||||
else {
|
||||
Uses.insert(MOReg);
|
||||
Uses.push_back(MOReg);
|
||||
if (MOReg != Reg && (MO.isKill() ||
|
||||
(LIS && isPlainlyKilled(MI, MOReg, LIS))))
|
||||
Kills.insert(MOReg);
|
||||
Kills.push_back(MOReg);
|
||||
}
|
||||
}
|
||||
|
||||
@ -888,8 +898,9 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
|
||||
MachineBasicBlock::iterator AfterMI = std::next(Begin);
|
||||
|
||||
MachineBasicBlock::iterator End = AfterMI;
|
||||
while (End->isCopy() && Defs.count(End->getOperand(1).getReg())) {
|
||||
Defs.insert(End->getOperand(0).getReg());
|
||||
while (End->isCopy() &&
|
||||
regOverlapsSet(Defs, End->getOperand(1).getReg(), TRI)) {
|
||||
Defs.push_back(End->getOperand(0).getReg());
|
||||
++End;
|
||||
}
|
||||
|
||||
@ -915,21 +926,21 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
|
||||
if (!MOReg)
|
||||
continue;
|
||||
if (MO.isDef()) {
|
||||
if (Uses.count(MOReg))
|
||||
if (regOverlapsSet(Uses, MOReg, TRI))
|
||||
// Physical register use would be clobbered.
|
||||
return false;
|
||||
if (!MO.isDead() && Defs.count(MOReg))
|
||||
if (!MO.isDead() && regOverlapsSet(Defs, MOReg, TRI))
|
||||
// May clobber a physical register def.
|
||||
// FIXME: This may be too conservative. It's ok if the instruction
|
||||
// is sunken completely below the use.
|
||||
return false;
|
||||
} else {
|
||||
if (Defs.count(MOReg))
|
||||
if (regOverlapsSet(Defs, MOReg, TRI))
|
||||
return false;
|
||||
bool isKill =
|
||||
MO.isKill() || (LIS && isPlainlyKilled(&OtherMI, MOReg, LIS));
|
||||
if (MOReg != Reg &&
|
||||
((isKill && Uses.count(MOReg)) || Kills.count(MOReg)))
|
||||
if (MOReg != Reg && ((isKill && regOverlapsSet(Uses, MOReg, TRI)) ||
|
||||
regOverlapsSet(Kills, MOReg, TRI)))
|
||||
// Don't want to extend other live ranges and update kills.
|
||||
return false;
|
||||
if (MOReg == Reg && !isKill)
|
||||
|
@ -19,8 +19,8 @@
|
||||
#include "llvm/ADT/FoldingSet.h"
|
||||
#include "llvm/ADT/Optional.h"
|
||||
#include "llvm/IR/Attributes.h"
|
||||
#include "AttributeSetNode.h"
|
||||
#include "llvm/Support/DataTypes.h"
|
||||
#include "llvm/Support/TrailingObjects.h"
|
||||
#include <climits>
|
||||
#include <string>
|
||||
|
||||
@ -142,73 +142,6 @@ public:
|
||||
StringRef getStringValue() const { return Val; }
|
||||
};
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
/// \class
|
||||
/// \brief This class represents a group of attributes that apply to one
|
||||
/// element: function, return type, or parameter.
|
||||
class AttributeSetNode final
|
||||
: public FoldingSetNode,
|
||||
private TrailingObjects<AttributeSetNode, Attribute> {
|
||||
friend TrailingObjects;
|
||||
|
||||
unsigned NumAttrs; ///< Number of attributes in this node.
|
||||
/// Bitset with a bit for each available attribute Attribute::AttrKind.
|
||||
uint64_t AvailableAttrs;
|
||||
|
||||
AttributeSetNode(ArrayRef<Attribute> Attrs)
|
||||
: NumAttrs(Attrs.size()), AvailableAttrs(0) {
|
||||
static_assert(Attribute::EndAttrKinds <= sizeof(AvailableAttrs) * CHAR_BIT,
|
||||
"Too many attributes for AvailableAttrs");
|
||||
// There's memory after the node where we can store the entries in.
|
||||
std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<Attribute>());
|
||||
|
||||
for (Attribute I : *this) {
|
||||
if (!I.isStringAttribute()) {
|
||||
AvailableAttrs |= ((uint64_t)1) << I.getKindAsEnum();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// AttributesSetNode is uniqued, these should not be publicly available.
|
||||
void operator=(const AttributeSetNode &) = delete;
|
||||
AttributeSetNode(const AttributeSetNode &) = delete;
|
||||
public:
|
||||
void operator delete(void *p) { ::operator delete(p); }
|
||||
|
||||
static AttributeSetNode *get(LLVMContext &C, ArrayRef<Attribute> Attrs);
|
||||
|
||||
/// \brief Return the number of attributes this AttributeSet contains.
|
||||
unsigned getNumAttributes() const { return NumAttrs; }
|
||||
|
||||
bool hasAttribute(Attribute::AttrKind Kind) const {
|
||||
return AvailableAttrs & ((uint64_t)1) << Kind;
|
||||
}
|
||||
bool hasAttribute(StringRef Kind) const;
|
||||
bool hasAttributes() const { return NumAttrs != 0; }
|
||||
|
||||
Attribute getAttribute(Attribute::AttrKind Kind) const;
|
||||
Attribute getAttribute(StringRef Kind) const;
|
||||
|
||||
unsigned getAlignment() const;
|
||||
unsigned getStackAlignment() const;
|
||||
uint64_t getDereferenceableBytes() const;
|
||||
uint64_t getDereferenceableOrNullBytes() const;
|
||||
std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const;
|
||||
std::string getAsString(bool InAttrGrp) const;
|
||||
|
||||
typedef const Attribute *iterator;
|
||||
iterator begin() const { return getTrailingObjects<Attribute>(); }
|
||||
iterator end() const { return begin() + NumAttrs; }
|
||||
|
||||
void Profile(FoldingSetNodeID &ID) const {
|
||||
Profile(ID, makeArrayRef(begin(), end()));
|
||||
}
|
||||
static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) {
|
||||
for (unsigned I = 0, E = AttrList.size(); I != E; ++I)
|
||||
AttrList[I].Profile(ID);
|
||||
}
|
||||
};
|
||||
|
||||
typedef std::pair<unsigned, AttributeSetNode *> IndexAttrPair;
|
||||
|
||||
//===----------------------------------------------------------------------===//
lib/IR/AttributeSetNode.h (new file, 98 lines)
@ -0,0 +1,98 @@
|
||||
//===-- AttributeSetNode.h - AttributeSet Internal Node ---------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// \brief This file defines the node class used internally by AttributeSet.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_IR_ATTRIBUTESETNODE_H
|
||||
#define LLVM_IR_ATTRIBUTESETNODE_H
|
||||
|
||||
#include "llvm/ADT/FoldingSet.h"
|
||||
#include "llvm/IR/Attributes.h"
|
||||
#include "llvm/Support/TrailingObjects.h"
|
||||
#include <climits>
|
||||
|
||||
namespace llvm {
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
/// \class
|
||||
/// \brief This class represents a group of attributes that apply to one
|
||||
/// element: function, return type, or parameter.
|
||||
class AttributeSetNode final
|
||||
: public FoldingSetNode,
|
||||
private TrailingObjects<AttributeSetNode, Attribute> {
|
||||
friend TrailingObjects;
|
||||
|
||||
unsigned NumAttrs; ///< Number of attributes in this node.
|
||||
/// Bitset with a bit for each available attribute Attribute::AttrKind.
|
||||
uint64_t AvailableAttrs;
|
||||
|
||||
AttributeSetNode(ArrayRef<Attribute> Attrs)
|
||||
: NumAttrs(Attrs.size()), AvailableAttrs(0) {
|
||||
static_assert(Attribute::EndAttrKinds <= sizeof(AvailableAttrs) * CHAR_BIT,
|
||||
"Too many attributes for AvailableAttrs");
|
||||
// There's memory after the node where we can store the entries in.
|
||||
std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<Attribute>());
|
||||
|
||||
for (Attribute I : *this) {
|
||||
if (!I.isStringAttribute()) {
|
||||
AvailableAttrs |= ((uint64_t)1) << I.getKindAsEnum();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// AttributesSetNode is uniqued, these should not be publicly available.
|
||||
void operator=(const AttributeSetNode &) = delete;
|
||||
AttributeSetNode(const AttributeSetNode &) = delete;
|
||||
public:
|
||||
void operator delete(void *p) { ::operator delete(p); }
|
||||
|
||||
static AttributeSetNode *get(LLVMContext &C, ArrayRef<Attribute> Attrs);
|
||||
|
||||
static AttributeSetNode *get(AttributeSet AS, unsigned Index) {
|
||||
return AS.getAttributes(Index);
|
||||
}
|
||||
|
||||
/// \brief Return the number of attributes this AttributeSet contains.
|
||||
unsigned getNumAttributes() const { return NumAttrs; }
|
||||
|
||||
bool hasAttribute(Attribute::AttrKind Kind) const {
|
||||
return AvailableAttrs & ((uint64_t)1) << Kind;
|
||||
}
|
||||
bool hasAttribute(StringRef Kind) const;
|
||||
bool hasAttributes() const { return NumAttrs != 0; }
|
||||
|
||||
Attribute getAttribute(Attribute::AttrKind Kind) const;
|
||||
Attribute getAttribute(StringRef Kind) const;
|
||||
|
||||
unsigned getAlignment() const;
|
||||
unsigned getStackAlignment() const;
|
||||
uint64_t getDereferenceableBytes() const;
|
||||
uint64_t getDereferenceableOrNullBytes() const;
|
||||
std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const;
|
||||
std::string getAsString(bool InAttrGrp) const;
|
||||
|
||||
typedef const Attribute *iterator;
|
||||
iterator begin() const { return getTrailingObjects<Attribute>(); }
|
||||
iterator end() const { return begin() + NumAttrs; }
|
||||
|
||||
void Profile(FoldingSetNodeID &ID) const {
|
||||
Profile(ID, makeArrayRef(begin(), end()));
|
||||
}
|
||||
static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) {
|
||||
for (unsigned I = 0, E = AttrList.size(); I != E; ++I)
|
||||
AttrList[I].Profile(ID);
|
||||
}
|
||||
};
|
||||
|
||||
} // end llvm namespace
|
||||
|
||||
#endif
|
@ -251,8 +251,6 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
Name == "sse2.cvtps2pd" ||
Name == "avx.cvtdq2.pd.256" ||
Name == "avx.cvt.ps2.pd.256" ||
Name == "sse2.cvttps2dq" ||
Name.startswith("avx.cvtt.") ||
Name.startswith("avx.vinsertf128.") ||
Name == "avx2.vinserti128" ||
Name.startswith("avx.vextractf128.") ||
@ -712,12 +710,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
|
||||
Rep = Builder.CreateSIToFP(Rep, DstTy, "cvtdq2pd");
|
||||
else
|
||||
Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd");
|
||||
} else if (IsX86 && (Name == "sse2.cvttps2dq" ||
|
||||
Name.startswith("avx.cvtt."))) {
|
||||
// Truncation (round to zero) float/double to i32 vector conversion.
|
||||
Value *Src = CI->getArgOperand(0);
|
||||
VectorType *DstTy = cast<VectorType>(CI->getType());
|
||||
Rep = Builder.CreateFPToSI(Src, DstTy, "cvtt");
|
||||
} else if (IsX86 && Name.startswith("sse4a.movnt.")) {
|
||||
Module *M = F->getParent();
|
||||
SmallVector<Metadata *, 1> Elts;
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include "llvm/ADT/StringSwitch.h"
|
||||
#include "llvm/Bitcode/ReaderWriter.h"
|
||||
#include "llvm/IR/Attributes.h"
|
||||
#include "AttributeSetNode.h"
|
||||
#include "llvm/IR/CallSite.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/DerivedTypes.h"
|
||||
@ -1844,6 +1845,18 @@ void LLVMAddAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
|
||||
unwrap<Function>(F)->addAttribute(Idx, unwrap(A));
|
||||
}
|
||||
|
||||
unsigned LLVMGetAttributeCountAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx) {
|
||||
auto *ASN = AttributeSetNode::get(unwrap<Function>(F)->getAttributes(), Idx);
|
||||
return ASN->getNumAttributes();
|
||||
}
|
||||
|
||||
void LLVMGetAttributesAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
|
||||
LLVMAttributeRef *Attrs) {
|
||||
auto *ASN = AttributeSetNode::get(unwrap<Function>(F)->getAttributes(), Idx);
|
||||
for (auto A: make_range(ASN->begin(), ASN->end()))
|
||||
*Attrs++ = wrap(A);
|
||||
}
|
||||
|
||||
LLVMAttributeRef LLVMGetEnumAttributeAtIndex(LLVMValueRef F,
|
||||
LLVMAttributeIndex Idx,
|
||||
unsigned KindID) {
|
||||
@ -2216,6 +2229,21 @@ void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
|
||||
CallSite(unwrap<Instruction>(C)).addAttribute(Idx, unwrap(A));
|
||||
}
|
||||
|
||||
unsigned LLVMGetCallSiteAttributeCount(LLVMValueRef C,
|
||||
LLVMAttributeIndex Idx) {
|
||||
auto CS = CallSite(unwrap<Instruction>(C));
|
||||
auto *ASN = AttributeSetNode::get(CS.getAttributes(), Idx);
|
||||
return ASN->getNumAttributes();
|
||||
}
|
||||
|
||||
void LLVMGetCallSiteAttributes(LLVMValueRef C, LLVMAttributeIndex Idx,
|
||||
LLVMAttributeRef *Attrs) {
|
||||
auto CS = CallSite(unwrap<Instruction>(C));
|
||||
auto *ASN = AttributeSetNode::get(CS.getAttributes(), Idx);
|
||||
for (auto A: make_range(ASN->begin(), ASN->end()))
|
||||
*Attrs++ = wrap(A);
|
||||
}
|
||||
|
||||
LLVMAttributeRef LLVMGetCallSiteEnumAttribute(LLVMValueRef C,
|
||||
LLVMAttributeIndex Idx,
|
||||
unsigned KindID) {
|
||||
|
@ -675,8 +675,8 @@ void MDNode::handleChangedOperand(void *Ref, Metadata *New) {
Metadata *Old = getOperand(Op);
setOperand(Op, New);

// Drop uniquing for self-reference cycles.
if (New == this) {
// Drop uniquing for self-reference cycles and deleted constants.
if (New == this || (!New && Old && isa<ConstantAsMetadata>(Old))) {
if (!isResolved())
resolve();
storeDistinctInContext();
@ -201,6 +201,7 @@ const char *Triple::getEnvironmentTypeName(EnvironmentType Kind) {
switch (Kind) {
case UnknownEnvironment: return "unknown";
case GNU: return "gnu";
case GNUABI64: return "gnuabi64";
case GNUEABIHF: return "gnueabihf";
case GNUEABI: return "gnueabi";
case GNUX32: return "gnux32";
@ -468,6 +469,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) {
return StringSwitch<Triple::EnvironmentType>(EnvironmentName)
.StartsWith("eabihf", Triple::EABIHF)
.StartsWith("eabi", Triple::EABI)
.StartsWith("gnuabi64", Triple::GNUABI64)
.StartsWith("gnueabihf", Triple::GNUEABIHF)
.StartsWith("gnueabi", Triple::GNUEABI)
.StartsWith("gnux32", Triple::GNUX32)
@ -250,6 +250,7 @@ def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan",
FeatureMacroOpFusion,
FeatureNEON,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
HasV8_1aOps]>;

def : ProcessorModel<"generic", NoSchedModel, [
@ -7685,6 +7685,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
|
||||
/// Fold a floating-point multiply by power of two into floating-point to
|
||||
/// fixed-point conversion.
|
||||
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
|
||||
TargetLowering::DAGCombinerInfo &DCI,
|
||||
const AArch64Subtarget *Subtarget) {
|
||||
if (!Subtarget->hasNEON())
|
||||
return SDValue();
|
||||
@ -7728,10 +7729,16 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
|
||||
ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
|
||||
break;
|
||||
case 4:
|
||||
ResTy = MVT::v4i32;
|
||||
ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
|
||||
break;
|
||||
}
|
||||
|
||||
if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
|
||||
return SDValue();
|
||||
|
||||
assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
|
||||
"Illegal vector type after legalization");
|
||||
|
||||
SDLoc DL(N);
|
||||
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
|
||||
unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
|
||||
@ -9853,7 +9860,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
|
||||
return performIntToFpCombine(N, DAG, Subtarget);
|
||||
case ISD::FP_TO_SINT:
|
||||
case ISD::FP_TO_UINT:
|
||||
return performFpToIntCombine(N, DAG, Subtarget);
|
||||
return performFpToIntCombine(N, DAG, DCI, Subtarget);
|
||||
case ISD::FDIV:
|
||||
return performFDivCombine(N, DAG, Subtarget);
|
||||
case ISD::OR:
|
||||
|
@ -20,6 +20,7 @@ class AMDGPUInstrPrinter;
|
||||
class AMDGPUSubtarget;
|
||||
class AMDGPUTargetMachine;
|
||||
class FunctionPass;
|
||||
class GCNTargetMachine;
|
||||
struct MachineSchedContext;
|
||||
class MCAsmInfo;
|
||||
class raw_ostream;
|
||||
@ -50,7 +51,7 @@ FunctionPass *createSIFixSGPRCopiesPass();
|
||||
FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
|
||||
FunctionPass *createSIDebuggerInsertNopsPass();
|
||||
FunctionPass *createSIInsertWaitsPass();
|
||||
FunctionPass *createAMDGPUCodeGenPreparePass(const TargetMachine *TM = nullptr);
|
||||
FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
|
||||
|
||||
ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C);
|
||||
|
||||
|
@ -783,15 +783,19 @@ void AMDGPUAsmPrinter::emitStartOfRuntimeMetadata(const Module &M) {
|
||||
emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyMDVersion,
|
||||
RuntimeMD::MDVersion << 8 | RuntimeMD::MDRevision, 2);
|
||||
if (auto MD = M.getNamedMetadata("opencl.ocl.version")) {
|
||||
emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage,
|
||||
RuntimeMD::OpenCL_C, 1);
|
||||
auto Node = MD->getOperand(0);
|
||||
unsigned short Major = mdconst::extract<ConstantInt>(Node->getOperand(0))
|
||||
->getZExtValue();
|
||||
unsigned short Minor = mdconst::extract<ConstantInt>(Node->getOperand(1))
|
||||
->getZExtValue();
|
||||
emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion,
|
||||
Major * 100 + Minor * 10, 2);
|
||||
if (MD->getNumOperands()) {
|
||||
auto Node = MD->getOperand(0);
|
||||
if (Node->getNumOperands() > 1) {
|
||||
emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage,
|
||||
RuntimeMD::OpenCL_C, 1);
|
||||
uint16_t Major = mdconst::extract<ConstantInt>(Node->getOperand(0))
|
||||
->getZExtValue();
|
||||
uint16_t Minor = mdconst::extract<ConstantInt>(Node->getOperand(1))
|
||||
->getZExtValue();
|
||||
emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion,
|
||||
Major * 100 + Minor * 10, 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -14,7 +14,9 @@
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"

#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
@ -30,15 +32,28 @@ using namespace llvm;
|
||||
namespace {
|
||||
|
||||
class AMDGPUCodeGenPrepare : public FunctionPass,
|
||||
public InstVisitor<AMDGPUCodeGenPrepare> {
|
||||
public InstVisitor<AMDGPUCodeGenPrepare, bool> {
|
||||
const GCNTargetMachine *TM;
|
||||
const SISubtarget *ST;
|
||||
DivergenceAnalysis *DA;
|
||||
const TargetMachine *TM;
|
||||
Module *Mod;
|
||||
bool HasUnsafeFPMath;
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
|
||||
FunctionPass(ID),
|
||||
TM(TM) { }
|
||||
TM(static_cast<const GCNTargetMachine *>(TM)),
|
||||
ST(nullptr),
|
||||
DA(nullptr),
|
||||
Mod(nullptr),
|
||||
HasUnsafeFPMath(false) { }
|
||||
|
||||
bool visitFDiv(BinaryOperator &I);
|
||||
|
||||
bool visitInstruction(Instruction &I) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool doInitialization(Module &M) override;
|
||||
bool runOnFunction(Function &F) override;
|
||||
@ -55,7 +70,92 @@ public:
|
||||
|
||||
} // End anonymous namespace
|
||||
|
||||
static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
|
||||
const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
|
||||
if (!CNum)
|
||||
return false;
|
||||
|
||||
// Reciprocal f32 is handled separately without denormals.
|
||||
return UnsafeDiv || CNum->isExactlyValue(+1.0);
|
||||
}
|
||||
|
||||
// Insert an intrinsic for fast fdiv for safe math situations where we can
|
||||
// reduce precision. Leave fdiv for situations where the generic node is
|
||||
// expected to be optimized.
|
||||
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
|
||||
Type *Ty = FDiv.getType();
|
||||
|
||||
// TODO: Handle half
|
||||
if (!Ty->getScalarType()->isFloatTy())
|
||||
return false;
|
||||
|
||||
MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
|
||||
if (!FPMath)
|
||||
return false;
|
||||
|
||||
const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
|
||||
float ULP = FPOp->getFPAccuracy();
|
||||
if (ULP < 2.5f)
|
||||
return false;
|
||||
|
||||
FastMathFlags FMF = FPOp->getFastMathFlags();
|
||||
bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
|
||||
FMF.allowReciprocal();
|
||||
if (ST->hasFP32Denormals() && !UnsafeDiv)
|
||||
return false;
|
||||
|
||||
IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
|
||||
Builder.setFastMathFlags(FMF);
|
||||
Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
|
||||
|
||||
const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
|
||||
Function *Decl
|
||||
= II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});
|
||||
|
||||
Value *Num = FDiv.getOperand(0);
|
||||
Value *Den = FDiv.getOperand(1);
|
||||
|
||||
Value *NewFDiv = nullptr;
|
||||
|
||||
if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
|
||||
NewFDiv = UndefValue::get(VT);
|
||||
|
||||
// FIXME: Doesn't do the right thing for cases where the vector is partially
|
||||
// constant. This works when the scalarizer pass is run first.
|
||||
for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
|
||||
Value *NumEltI = Builder.CreateExtractElement(Num, I);
|
||||
Value *DenEltI = Builder.CreateExtractElement(Den, I);
|
||||
Value *NewElt;
|
||||
|
||||
if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
|
||||
NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
|
||||
} else {
|
||||
NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
|
||||
}
|
||||
|
||||
NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
|
||||
}
|
||||
} else {
|
||||
if (!shouldKeepFDivF32(Num, UnsafeDiv))
|
||||
NewFDiv = Builder.CreateCall(Decl, { Num, Den });
|
||||
}
|
||||
|
||||
if (NewFDiv) {
|
||||
FDiv.replaceAllUsesWith(NewFDiv);
|
||||
NewFDiv->takeName(&FDiv);
|
||||
FDiv.eraseFromParent();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool hasUnsafeFPMath(const Function &F) {
|
||||
Attribute Attr = F.getFnAttribute("unsafe-fp-math");
|
||||
return Attr.getValueAsString() == "true";
|
||||
}
|
||||
|
||||
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
|
||||
Mod = &M;
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -63,10 +163,21 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
|
||||
if (!TM || skipFunction(F))
|
||||
return false;
|
||||
|
||||
ST = &TM->getSubtarget<SISubtarget>(F);
|
||||
DA = &getAnalysis<DivergenceAnalysis>();
|
||||
visit(F);
|
||||
HasUnsafeFPMath = hasUnsafeFPMath(F);
|
||||
|
||||
return true;
|
||||
bool MadeChange = false;
|
||||
|
||||
for (BasicBlock &BB : F) {
|
||||
BasicBlock::iterator Next;
|
||||
for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
|
||||
Next = std::next(I);
|
||||
MadeChange |= visit(*I);
|
||||
}
|
||||
}
|
||||
|
||||
return MadeChange;
|
||||
}
|
||||
|
||||
INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
|
||||
@ -77,6 +188,6 @@ INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
|
||||
|
||||
char AMDGPUCodeGenPrepare::ID = 0;
|
||||
|
||||
FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const TargetMachine *TM) {
|
||||
FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
|
||||
return new AMDGPUCodeGenPrepare(TM);
|
||||
}
|
||||
|
@ -420,9 +420,10 @@ int TWO_PI = 0x40c90fdb;
|
||||
int PI = 0x40490fdb;
|
||||
int TWO_PI_INV = 0x3e22f983;
|
||||
int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
|
||||
int FP32_NEG_ONE = 0xbf800000;
|
||||
int FP32_ONE = 0x3f800000;
|
||||
int FP32_NEG_ONE = 0xbf800000;
|
||||
int FP64_ONE = 0x3ff0000000000000;
|
||||
int FP64_NEG_ONE = 0xbff0000000000000;
|
||||
}
|
||||
def CONST : Constants;
|
||||
|
||||
|
@ -29,16 +29,39 @@ static const char *const IntrinsicNameTable[] = {
|
||||
#undef GET_INTRINSIC_NAME_TABLE
|
||||
};
|
||||
|
||||
std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
|
||||
unsigned numTys) const {
|
||||
if (IntrID < Intrinsic::num_intrinsics) {
|
||||
return nullptr;
|
||||
}
|
||||
namespace {
|
||||
#define GET_INTRINSIC_ATTRIBUTES
|
||||
#include "AMDGPUGenIntrinsics.inc"
|
||||
#undef GET_INTRINSIC_ATTRIBUTES
|
||||
}
|
||||
|
||||
StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID,
|
||||
ArrayRef<Type *> Tys) const {
|
||||
if (IntrID < Intrinsic::num_intrinsics)
|
||||
return StringRef();
|
||||
|
||||
assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics &&
|
||||
"Invalid intrinsic ID");
|
||||
|
||||
std::string Result(IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]);
|
||||
return Result;
|
||||
return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics];
|
||||
}
|
||||
|
||||
std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
|
||||
unsigned NumTys) const {
|
||||
return getName(IntrID, makeArrayRef(Tys, NumTys)).str();
|
||||
}
|
||||
|
||||
FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID,
|
||||
ArrayRef<Type*> Tys) const {
|
||||
// FIXME: Re-use Intrinsic::getType machinery
|
||||
switch (ID) {
|
||||
case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
|
||||
Type *F32Ty = Type::getFloatTy(Context);
|
||||
return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false);
|
||||
}
|
||||
default:
|
||||
llvm_unreachable("unhandled intrinsic");
|
||||
}
|
||||
}
|
||||
|
||||
unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
|
||||
@ -69,7 +92,19 @@ bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
|
||||
}
|
||||
|
||||
Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
|
||||
Type **Tys,
|
||||
unsigned numTys) const {
|
||||
llvm_unreachable("Not implemented");
|
||||
ArrayRef<Type *> Tys) const {
|
||||
FunctionType *FTy = getType(M->getContext(), IntrID, Tys);
|
||||
Function *F
|
||||
= cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy));
|
||||
|
||||
AttributeSet AS = getAttributes(M->getContext(),
|
||||
static_cast<AMDGPUIntrinsic::ID>(IntrID));
|
||||
F->setAttributes(AS);
|
||||
return F;
|
||||
}
|
||||
|
||||
Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
|
||||
Type **Tys,
|
||||
unsigned NumTys) const {
|
||||
return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys));
|
||||
}
|
||||
|
@ -34,13 +34,23 @@ enum ID {
|
||||
class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo {
|
||||
public:
|
||||
AMDGPUIntrinsicInfo();
|
||||
|
||||
StringRef getName(unsigned IntrId, ArrayRef<Type *> Tys = None) const;
|
||||
|
||||
std::string getName(unsigned IntrId, Type **Tys = nullptr,
|
||||
unsigned numTys = 0) const override;
|
||||
unsigned NumTys = 0) const override;
|
||||
|
||||
unsigned lookupName(const char *Name, unsigned Len) const override;
|
||||
bool isOverloaded(unsigned IID) const override;
|
||||
Function *getDeclaration(Module *M, unsigned ID,
|
||||
Type **Tys = nullptr,
|
||||
unsigned numTys = 0) const override;
|
||||
unsigned NumTys = 0) const override;
|
||||
|
||||
Function *getDeclaration(Module *M, unsigned ID,
|
||||
ArrayRef<Type *> = None) const;
|
||||
|
||||
FunctionType *getType(LLVMContext &Context, unsigned ID,
|
||||
ArrayRef<Type*> Tys = None) const;
|
||||
};
|
||||
|
||||
} // end namespace llvm
|
||||
|
@ -348,9 +348,6 @@ static VectorType *arrayTypeToVecType(Type *ArrayTy) {
|
||||
static Value *
|
||||
calculateVectorIndex(Value *Ptr,
|
||||
const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
|
||||
if (isa<AllocaInst>(Ptr))
|
||||
return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
|
||||
|
||||
GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
|
||||
|
||||
auto I = GEPIdx.find(GEP);
|
||||
@ -360,11 +357,11 @@ calculateVectorIndex(Value *Ptr,
|
||||
static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
|
||||
// FIXME we only support simple cases
|
||||
if (GEP->getNumOperands() != 3)
|
||||
return NULL;
|
||||
return nullptr;
|
||||
|
||||
ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
|
||||
if (!I0 || !I0->isZero())
|
||||
return NULL;
|
||||
return nullptr;
|
||||
|
||||
return GEP->getOperand(2);
|
||||
}
|
||||
@ -398,7 +395,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
|
||||
// are just being conservative for now.
|
||||
if (!AllocaTy ||
|
||||
AllocaTy->getElementType()->isVectorTy() ||
|
||||
AllocaTy->getNumElements() > 4) {
|
||||
AllocaTy->getNumElements() > 4 ||
|
||||
AllocaTy->getNumElements() < 2) {
|
||||
DEBUG(dbgs() << " Cannot convert type to vector\n");
|
||||
return false;
|
||||
}
|
||||
@ -443,9 +441,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
|
||||
IRBuilder<> Builder(Inst);
|
||||
switch (Inst->getOpcode()) {
|
||||
case Instruction::Load: {
|
||||
Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
|
||||
Value *Ptr = Inst->getOperand(0);
|
||||
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
|
||||
Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
|
||||
|
||||
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
|
||||
Value *VecValue = Builder.CreateLoad(BitCast);
|
||||
Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
|
||||
Inst->replaceAllUsesWith(ExtractElement);
|
||||
@ -453,9 +453,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
|
||||
break;
|
||||
}
|
||||
case Instruction::Store: {
|
||||
Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
|
||||
|
||||
Value *Ptr = Inst->getOperand(1);
|
||||
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
|
||||
Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
|
||||
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
|
||||
Value *VecValue = Builder.CreateLoad(BitCast);
|
||||
Value *NewVecValue = Builder.CreateInsertElement(VecValue,
|
||||
Inst->getOperand(0),
|
||||
@ -469,7 +471,6 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
|
||||
break;
|
||||
|
||||
default:
|
||||
Inst->dump();
|
||||
llvm_unreachable("Inconsistency in instructions promotable to vector");
|
||||
}
|
||||
}
|
||||
@ -477,11 +478,6 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
|
||||
}
|
||||
|
||||
static bool isCallPromotable(CallInst *CI) {
|
||||
// TODO: We might be able to handle some cases where the callee is a
|
||||
// constantexpr bitcast of a function.
|
||||
if (!CI->getCalledFunction())
|
||||
return false;
|
||||
|
||||
IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
|
||||
if (!II)
|
||||
return false;
|
||||
@ -773,28 +769,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
|
||||
continue;
|
||||
}
|
||||
|
||||
IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
|
||||
if (!Intr) {
|
||||
// FIXME: What is this for? It doesn't make sense to promote arbitrary
|
||||
// function calls. If the call is to a defined function that can also be
|
||||
// promoted, we should be able to do this once that function is also
|
||||
// rewritten.
|
||||
|
||||
std::vector<Type*> ArgTypes;
|
||||
for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
|
||||
ArgIdx != ArgEnd; ++ArgIdx) {
|
||||
ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
|
||||
}
|
||||
Function *F = Call->getCalledFunction();
|
||||
FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
|
||||
F->isVarArg());
|
||||
Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(),
|
||||
NewType, F->getAttributes());
|
||||
Function *NewF = cast<Function>(C);
|
||||
Call->setCalledFunction(NewF);
|
||||
continue;
|
||||
}
|
||||
|
||||
IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
|
||||
Builder.SetInsertPoint(Intr);
|
||||
switch (Intr->getIntrinsicID()) {
|
||||
case Intrinsic::lifetime_start:
|
||||
|
@ -309,6 +309,7 @@ public:
ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext *C) const override;

void addIRPasses() override;
bool addPreISel() override;
void addMachineSSAOptimization() override;
bool addInstSelector() override;
@ -499,6 +500,13 @@ void GCNPassConfig::addMachineSSAOptimization() {
addPass(&DeadMachineInstructionElimID);
}

void GCNPassConfig::addIRPasses() {
// TODO: May want to move later or split into an early and late one.
addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine()));

AMDGPUPassConfig::addIRPasses();
}

bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
addPass(createSILowerI1CopiesPass());
@ -122,6 +122,7 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
|
||||
setOperationAction(ISD::SETCC, MVT::i32, Expand);
|
||||
setOperationAction(ISD::SETCC, MVT::f32, Expand);
|
||||
setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
|
||||
setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom);
|
||||
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
|
||||
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
|
||||
|
||||
@ -832,13 +833,18 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
|
||||
return;
|
||||
case ISD::FP_TO_UINT:
|
||||
if (N->getValueType(0) == MVT::i1) {
|
||||
Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
|
||||
Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG));
|
||||
return;
|
||||
}
|
||||
// Fall-through. Since we don't care about out of bounds values
|
||||
// we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
|
||||
// considers some extra cases which are not necessary here.
|
||||
case ISD::FP_TO_SINT: {
|
||||
if (N->getValueType(0) == MVT::i1) {
|
||||
Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG));
|
||||
return;
|
||||
}
|
||||
|
||||
SDValue Result;
|
||||
if (expandFP_TO_SINT(N, Result, DAG))
|
||||
Results.push_back(Result);
|
||||
@ -1052,15 +1058,24 @@ SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
|
||||
return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
|
||||
}
|
||||
|
||||
SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
|
||||
SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const {
|
||||
SDLoc DL(Op);
|
||||
return DAG.getNode(
|
||||
ISD::SETCC,
|
||||
DL,
|
||||
MVT::i1,
|
||||
Op, DAG.getConstantFP(0.0f, DL, MVT::f32),
|
||||
DAG.getCondCode(ISD::SETNE)
|
||||
);
|
||||
Op, DAG.getConstantFP(1.0f, DL, MVT::f32),
|
||||
DAG.getCondCode(ISD::SETEQ));
|
||||
}
|
||||
|
||||
SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const {
|
||||
SDLoc DL(Op);
|
||||
return DAG.getNode(
|
||||
ISD::SETCC,
|
||||
DL,
|
||||
MVT::i1,
|
||||
Op, DAG.getConstantFP(-1.0f, DL, MVT::f32),
|
||||
DAG.getCondCode(ISD::SETEQ));
|
||||
}
|
||||
|
||||
SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
|
||||
|
@ -72,7 +72,8 @@ private:

SDValue lowerPrivateTruncStore(StoreSDNode *Store, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;

SDValue lowerPrivateExtLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
@ -41,7 +41,8 @@ enum {
WQM = 1 << 22,
VGPRSpill = 1 << 23,
VOPAsmPrefer32Bit = 1 << 24,
Gather4 = 1 << 25
Gather4 = 1 << 25,
DisableWQM = 1 << 26
};
}

@ -1134,9 +1134,9 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MachineFunction *MF = BB->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
DebugLoc DL = MI.getDebugLoc();
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOVK_I32))
.addOperand(MI.getOperand(0))
.addImm(MFI->LDSSize);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
.addOperand(MI.getOperand(0))
.addImm(MFI->LDSSize);
MI.eraseFromParent();
return BB;
}
@ -1792,6 +1792,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
|
||||
return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
|
||||
Op->getVTList(), Ops, VT, MMO);
|
||||
}
|
||||
case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
|
||||
return lowerFDIV_FAST(Op, DAG);
|
||||
}
|
||||
case AMDGPUIntrinsic::SI_vs_load_input:
|
||||
return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
|
||||
Op.getOperand(1),
|
||||
@ -2098,7 +2101,8 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
|
||||
|
||||
// Catch division cases where we can use shortcuts with rcp and rsq
|
||||
// instructions.
|
||||
SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
|
||||
SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
|
||||
SelectionDAG &DAG) const {
|
||||
SDLoc SL(Op);
|
||||
SDValue LHS = Op.getOperand(0);
|
||||
SDValue RHS = Op.getOperand(1);
|
||||
@ -2139,47 +2143,48 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
// Faster 2.5 ULP division that does not support denormals.
|
||||
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
|
||||
SDLoc SL(Op);
|
||||
SDValue LHS = Op.getOperand(1);
|
||||
SDValue RHS = Op.getOperand(2);
|
||||
|
||||
SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
|
||||
|
||||
const APFloat K0Val(BitsToFloat(0x6f800000));
|
||||
const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
|
||||
|
||||
const APFloat K1Val(BitsToFloat(0x2f800000));
|
||||
const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
|
||||
|
||||
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
|
||||
|
||||
EVT SetCCVT =
|
||||
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
|
||||
|
||||
SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
|
||||
|
||||
SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
|
||||
|
||||
// TODO: Should this propagate fast-math-flags?
|
||||
r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
|
||||
|
||||
// rcp does not support denormals.
|
||||
SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
|
||||
|
||||
SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
|
||||
|
||||
return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
|
||||
}
|
||||
|
||||
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
|
||||
if (SDValue FastLowered = LowerFastFDIV(Op, DAG))
|
||||
if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
|
||||
return FastLowered;
|
||||
|
||||
SDLoc SL(Op);
|
||||
SDValue LHS = Op.getOperand(0);
|
||||
SDValue RHS = Op.getOperand(1);
|
||||
|
||||
// faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag
|
||||
if (EnableAMDGPUFastFDIV) {
|
||||
// This does not support denormals.
|
||||
SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
|
||||
|
||||
const APFloat K0Val(BitsToFloat(0x6f800000));
|
||||
const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
|
||||
|
||||
const APFloat K1Val(BitsToFloat(0x2f800000));
|
||||
const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
|
||||
|
||||
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
|
||||
|
||||
EVT SetCCVT =
|
||||
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
|
||||
|
||||
SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
|
||||
|
||||
SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
|
||||
|
||||
// TODO: Should this propagate fast-math-flags?
|
||||
|
||||
r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
|
||||
|
||||
// rcp does not support denormals.
|
||||
SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
|
||||
|
||||
SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
|
||||
|
||||
return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
|
||||
}
|
||||
|
||||
// Generates more precise fpdiv32.
|
||||
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
|
||||
|
||||
SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
|
||||
@ -2209,7 +2214,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
|
||||
|
||||
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
|
||||
if (DAG.getTarget().Options.UnsafeFPMath)
|
||||
return LowerFastFDIV(Op, DAG);
|
||||
return lowerFastUnsafeFDIV(Op, DAG);
|
||||
|
||||
SDLoc SL(Op);
|
||||
SDValue X = Op.getOperand(0);
|
||||
|
@ -36,7 +36,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
@ -41,6 +41,8 @@ class InstSI <dag outs, dag ins, string asm = "",
|
||||
field bits<1> DS = 0;
|
||||
field bits<1> MIMG = 0;
|
||||
field bits<1> FLAT = 0;
|
||||
|
||||
// Whether WQM _must_ be enabled for this instruction.
|
||||
field bits<1> WQM = 0;
|
||||
field bits<1> VGPRSpill = 0;
|
||||
|
||||
@ -50,6 +52,9 @@ class InstSI <dag outs, dag ins, string asm = "",
|
||||
|
||||
field bits<1> Gather4 = 0;
|
||||
|
||||
// Whether WQM _must_ be disabled for this instruction.
|
||||
field bits<1> DisableWQM = 0;
|
||||
|
||||
// These need to be kept in sync with the enum in SIInstrFlags.
|
||||
let TSFlags{0} = VM_CNT;
|
||||
let TSFlags{1} = EXP_CNT;
|
||||
@ -81,6 +86,7 @@ class InstSI <dag outs, dag ins, string asm = "",
|
||||
let TSFlags{23} = VGPRSpill;
|
||||
let TSFlags{24} = VOPAsmPrefer32Bit;
|
||||
let TSFlags{25} = Gather4;
|
||||
let TSFlags{26} = DisableWQM;
|
||||
|
||||
let SchedRW = [Write32Bit];
|
||||
|
||||
|
@ -738,7 +738,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
|
||||
MachineBasicBlock::iterator Insert = Entry.front();
|
||||
DebugLoc DL = Insert->getDebugLoc();
|
||||
|
||||
TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass);
|
||||
TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
|
||||
*MF);
|
||||
if (TIDReg == AMDGPU::NoRegister)
|
||||
return TIDReg;
|
||||
|
||||
|
@ -340,6 +340,14 @@ public:
|
||||
return get(Opcode).TSFlags & SIInstrFlags::WQM;
|
||||
}
|
||||
|
||||
static bool isDisableWQM(const MachineInstr &MI) {
|
||||
return MI.getDesc().TSFlags & SIInstrFlags::DisableWQM;
|
||||
}
|
||||
|
||||
bool isDisableWQM(uint16_t Opcode) const {
|
||||
return get(Opcode).TSFlags & SIInstrFlags::DisableWQM;
|
||||
}
|
||||
|
||||
static bool isVGPRSpill(const MachineInstr &MI) {
|
||||
return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill;
|
||||
}
|
||||
|
@ -2949,6 +2949,10 @@ multiclass MUBUF_m <mubuf op, string opName, dag outs, dag ins, string asm,
|
||||
def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
|
||||
MUBUFAddr64Table <0>;
|
||||
|
||||
let DisableWQM = 1 in {
|
||||
def "_exact" : MUBUF_Pseudo <opName, outs, ins, []>;
|
||||
}
|
||||
|
||||
let addr64 = 0, isCodeGenOnly = 0 in {
|
||||
def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
|
||||
}
|
||||
@ -3019,7 +3023,8 @@ multiclass MUBUFAtomicOther_m <mubuf op, string opName, dag outs, dag ins,
|
||||
multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
|
||||
ValueType vt, SDPatternOperator atomic> {
|
||||
|
||||
let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1 in {
|
||||
let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1,
|
||||
DisableWQM = 1 in {
|
||||
|
||||
// No return variants
|
||||
let glc = 0, AsmMatchConverter = "cvtMubufAtomic" in {
|
||||
@ -3423,6 +3428,7 @@ class MIMG_Store_Helper <bits<7> op, string asm,
|
||||
let mayStore = 1;
|
||||
let hasSideEffects = 1;
|
||||
let hasPostISelHook = 0;
|
||||
let DisableWQM = 1;
|
||||
}
|
||||
|
||||
multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm,
|
||||
@ -3454,6 +3460,7 @@ class MIMG_Atomic_Helper <string asm, RegisterClass data_rc,
|
||||
let mayStore = 1;
|
||||
let hasSideEffects = 1;
|
||||
let hasPostISelHook = 0;
|
||||
let DisableWQM = 1;
|
||||
let Constraints = "$vdst = $vdata";
|
||||
let AsmMatchConverter = "cvtMIMGAtomic";
|
||||
}
|
||||
|
@ -2200,7 +2200,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
|
||||
(name vt:$vdata, v4i32:$rsrc, 0,
|
||||
(MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
|
||||
imm:$glc, imm:$slc),
|
||||
(!cast<MUBUF>(opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm $offset),
|
||||
(!cast<MUBUF>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
|
||||
(as_i1imm $glc), (as_i1imm $slc), 0)
|
||||
>;
|
||||
|
||||
@ -2208,7 +2208,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
|
||||
(name vt:$vdata, v4i32:$rsrc, i32:$vindex,
|
||||
(MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
|
||||
imm:$glc, imm:$slc),
|
||||
(!cast<MUBUF>(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset,
|
||||
(!cast<MUBUF>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
|
||||
(as_i16imm $offset), (as_i1imm $glc),
|
||||
(as_i1imm $slc), 0)
|
||||
>;
|
||||
@ -2217,7 +2217,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
|
||||
(name vt:$vdata, v4i32:$rsrc, 0,
|
||||
(MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
|
||||
imm:$glc, imm:$slc),
|
||||
(!cast<MUBUF>(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset,
|
||||
(!cast<MUBUF>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
|
||||
(as_i16imm $offset), (as_i1imm $glc),
|
||||
(as_i1imm $slc), 0)
|
||||
>;
|
||||
@ -2226,7 +2226,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
|
||||
(name vt:$vdata, v4i32:$rsrc, i32:$vindex,
|
||||
(MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
|
||||
imm:$glc, imm:$slc),
|
||||
(!cast<MUBUF>(opcode # _BOTHEN)
|
||||
(!cast<MUBUF>(opcode # _BOTHEN_exact)
|
||||
$vdata,
|
||||
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
|
||||
$rsrc, $soffset, (as_i16imm $offset),
|
||||
@ -3391,6 +3391,16 @@ def : Pat <
|
||||
(V_CNDMASK_B32_e64 0, -1, $src), sub1)
|
||||
>;
|
||||
|
||||
class FPToI1Pat<Instruction Inst, int KOne, ValueType vt, SDPatternOperator fp_to_int> : Pat <
|
||||
(i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
|
||||
(i1 (Inst 0, KOne, $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE))
|
||||
>;
|
||||
|
||||
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, f32, fp_to_uint>;
|
||||
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, f32, fp_to_sint>;
|
||||
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, f64, fp_to_uint>;
|
||||
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, f64, fp_to_sint>;
|
||||
|
||||
// If we need to perform a logical operation on i1 values, we need to
|
||||
// use vector comparisons since there is only one SCC register. Vector
|
||||
// comparisions still write to a pair of SGPRs, so treat these as
|
||||
|
@ -7,7 +7,8 @@
//
//===----------------------------------------------------------------------===//
//
// SI Intrinsic Definitions
// Backend internal SI Intrinsic Definitions. User code should not
// directly use these.
//
//===----------------------------------------------------------------------===//

@ -177,6 +178,12 @@ let TargetPrefix = "SI", isTarget = 1 in {
} // End TargetPrefix = "SI", isTarget = 1

let TargetPrefix = "amdgcn", isTarget = 1 in {
// Emit 2.5 ulp, no denormal division. Should only be inserted by
// pass based on !fpmath metadata.
def int_amdgcn_fdiv_fast : Intrinsic<
[llvm_float_ty], [llvm_float_ty], [IntrNoMem]
>;

/* Control flow Intrinsics */

def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
@ -203,7 +203,8 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg (
Spill.Lane = Lane;

if (!LaneVGPRs.count(LaneVGPRIdx)) {
unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass,
*MF);

if (LaneVGPR == AMDGPU::NoRegister)
// We have no VGPRs left for spilling SGPRs.
@ -957,10 +957,13 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
/// \brief Returns a register that is not used at any point in the function.
/// If all registers are used, then this function will return
// AMDGPU::NoRegister.
unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC) const {
unsigned
SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC,
const MachineFunction &MF) const {

for (unsigned Reg : *RC)
if (!MRI.isPhysRegUsed(Reg))
if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
return Reg;
return AMDGPU::NoRegister;
}
@ -185,7 +185,8 @@ public:
unsigned getNumSGPRsAllowed(const SISubtarget &ST, unsigned WaveCount) const;

unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC) const;
const TargetRegisterClass *RC,
const MachineFunction &MF) const;

unsigned getSGPR32PressureSet() const { return SGPR32SetID; };
unsigned getVGPR32PressureSet() const { return VGPR32SetID; };
@ -94,12 +94,15 @@ private:
|
||||
const SIInstrInfo *TII;
|
||||
const SIRegisterInfo *TRI;
|
||||
MachineRegisterInfo *MRI;
|
||||
LiveIntervals *LIS;
|
||||
|
||||
DenseMap<const MachineInstr *, InstrInfo> Instructions;
|
||||
DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
|
||||
SmallVector<const MachineInstr *, 2> ExecExports;
|
||||
SmallVector<MachineInstr *, 1> LiveMaskQueries;
|
||||
|
||||
void markInstruction(MachineInstr &MI, char Flag,
|
||||
std::vector<WorkItem> &Worklist);
|
||||
char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
|
||||
void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
|
||||
void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
|
||||
@ -126,6 +129,7 @@ public:
|
||||
}
|
||||
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
AU.addRequired<LiveIntervals>();
|
||||
AU.setPreservesCFG();
|
||||
MachineFunctionPass::getAnalysisUsage(AU);
|
||||
}
|
||||
@ -135,8 +139,11 @@ public:
|
||||
|
||||
char SIWholeQuadMode::ID = 0;
|
||||
|
||||
INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE,
|
||||
"SI Whole Quad Mode", false, false)
|
||||
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
|
||||
false)
|
||||
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
|
||||
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
|
||||
false)
|
||||
|
||||
char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
|
||||
|
||||
@ -144,6 +151,23 @@ FunctionPass *llvm::createSIWholeQuadModePass() {
|
||||
return new SIWholeQuadMode;
|
||||
}
|
||||
|
||||
void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
|
||||
std::vector<WorkItem> &Worklist) {
|
||||
InstrInfo &II = Instructions[&MI];
|
||||
|
||||
assert(Flag == StateWQM || Flag == StateExact);
|
||||
|
||||
// Ignore if the instruction is already marked. The typical case is that we
|
||||
// mark an instruction WQM multiple times, but for atomics it can happen that
|
||||
// Flag is StateWQM, but Needs is already set to StateExact. In this case,
|
||||
// letting the atomic run in StateExact is correct as per the relevant specs.
|
||||
if (II.Needs)
|
||||
return;
|
||||
|
||||
II.Needs = Flag;
|
||||
Worklist.push_back(&MI);
|
||||
}
|
||||
|
||||
// Scan instructions to determine which ones require an Exact execmask and
|
||||
// which ones seed WQM requirements.
|
||||
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
|
||||
@ -161,7 +185,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
|
||||
|
||||
if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
|
||||
Flags = StateWQM;
|
||||
} else if (MI.mayStore() && TII->usesVM_CNT(MI)) {
|
||||
} else if (TII->isDisableWQM(MI)) {
|
||||
Flags = StateExact;
|
||||
} else {
|
||||
// Handle export instructions with the exec mask valid flag set
|
||||
@ -192,8 +216,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
|
||||
continue;
|
||||
}
|
||||
|
||||
Instructions[&MI].Needs = Flags;
|
||||
Worklist.push_back(&MI);
|
||||
markInstruction(MI, Flags, Worklist);
|
||||
GlobalFlags |= Flags;
|
||||
}
|
||||
|
||||
@ -214,9 +237,10 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
|
||||
InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
|
||||
BlockInfo &BI = Blocks[MBB];
|
||||
|
||||
// Control flow-type instructions that are followed by WQM computations
|
||||
// must themselves be in WQM.
|
||||
if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) {
|
||||
// Control flow-type instructions and stores to temporary memory that are
|
||||
// followed by WQM computations must themselves be in WQM.
|
||||
if ((II.OutNeeds & StateWQM) && !II.Needs &&
|
||||
(MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
|
||||
Instructions[&MI].Needs = StateWQM;
|
||||
II.Needs = StateWQM;
|
||||
}
|
||||
@ -249,32 +273,35 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
|
||||
if (!Use.isReg() || !Use.isUse())
|
||||
continue;
|
||||
|
||||
// At this point, physical registers appear as inputs or outputs
|
||||
// and following them makes no sense (and would in fact be incorrect
|
||||
// when the same VGPR is used as both an output and an input that leads
|
||||
// to a NeedsWQM instruction).
|
||||
//
|
||||
// Note: VCC appears e.g. in 64-bit addition with carry - theoretically we
|
||||
// have to trace this, in practice it happens for 64-bit computations like
|
||||
// pointers where both dwords are followed already anyway.
|
||||
if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
|
||||
continue;
|
||||
unsigned Reg = Use.getReg();
|
||||
|
||||
for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) {
|
||||
InstrInfo &DefII = Instructions[&DefMI];
|
||||
|
||||
// Obviously skip if DefMI is already flagged as NeedWQM.
|
||||
//
|
||||
// The instruction might also be flagged as NeedExact. This happens when
|
||||
// the result of an atomic is used in a WQM computation. In this case,
|
||||
// the atomic must not run for helper pixels and the WQM result is
|
||||
// undefined.
|
||||
if (DefII.Needs != 0)
|
||||
// Handle physical registers that we need to track; this is mostly relevant
|
||||
// for VCC, which can appear as the (implicit) input of a uniform branch,
|
||||
// e.g. when a loop counter is stored in a VGPR.
|
||||
if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
|
||||
if (Reg == AMDGPU::EXEC)
|
||||
continue;
|
||||
|
||||
DefII.Needs = StateWQM;
|
||||
Worklist.push_back(&DefMI);
|
||||
for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
|
||||
LiveRange &LR = LIS->getRegUnit(*RegUnit);
|
||||
const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
|
||||
if (!Value)
|
||||
continue;
|
||||
|
||||
// Since we're in machine SSA, we do not need to track physical
|
||||
// registers across basic blocks.
|
||||
if (Value->isPHIDef())
|
||||
continue;
|
||||
|
||||
markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
|
||||
Worklist);
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
|
||||
markInstruction(DefMI, StateWQM, Worklist);
|
||||
}
|
||||
}
|
||||
|
||||
@ -468,6 +495,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
|
||||
TII = ST.getInstrInfo();
|
||||
TRI = &TII->getRegisterInfo();
|
||||
MRI = &MF.getRegInfo();
|
||||
LIS = &getAnalysis<LiveIntervals>();
|
||||
|
||||
char GlobalFlags = analyzeFunction(MF);
|
||||
if (!(GlobalFlags & StateWQM)) {
|
||||
|
@ -3857,7 +3857,8 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
|
||||
// Try to convert two saturating conditional selects into a single SSAT
|
||||
SDValue SatValue;
|
||||
uint64_t SatConstant;
|
||||
if (isSaturatingConditional(Op, SatValue, SatConstant))
|
||||
if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) &&
|
||||
isSaturatingConditional(Op, SatValue, SatConstant))
|
||||
return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
|
||||
DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
|
||||
|
||||
|
@ -3650,7 +3650,8 @@ def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
|
||||
|
||||
def SSAT : AI<(outs GPRnopc:$Rd),
|
||||
(ins imm1_32:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
|
||||
SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> {
|
||||
SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>,
|
||||
Requires<[IsARM,HasV6]>{
|
||||
bits<4> Rd;
|
||||
bits<5> sat_imm;
|
||||
bits<4> Rn;
|
||||
@ -3666,7 +3667,8 @@ def SSAT : AI<(outs GPRnopc:$Rd),
|
||||
|
||||
def SSAT16 : AI<(outs GPRnopc:$Rd),
|
||||
(ins imm1_16:$sat_imm, GPRnopc:$Rn), SatFrm,
|
||||
NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []> {
|
||||
NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []>,
|
||||
Requires<[IsARM,HasV6]>{
|
||||
bits<4> Rd;
|
||||
bits<4> sat_imm;
|
||||
bits<4> Rn;
|
||||
@ -3679,7 +3681,8 @@ def SSAT16 : AI<(outs GPRnopc:$Rd),
|
||||
|
||||
def USAT : AI<(outs GPRnopc:$Rd),
|
||||
(ins imm0_31:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
|
||||
SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> {
|
||||
SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>,
|
||||
Requires<[IsARM,HasV6]> {
|
||||
bits<4> Rd;
|
||||
bits<5> sat_imm;
|
||||
bits<4> Rn;
|
||||
@ -3695,7 +3698,8 @@ def USAT : AI<(outs GPRnopc:$Rd),
|
||||
|
||||
def USAT16 : AI<(outs GPRnopc:$Rd),
|
||||
(ins imm0_15:$sat_imm, GPRnopc:$Rn), SatFrm,
|
||||
NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []> {
|
||||
NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []>,
|
||||
Requires<[IsARM,HasV6]>{
|
||||
bits<4> Rd;
|
||||
bits<4> sat_imm;
|
||||
bits<4> Rn;
|
||||
|
@ -2240,7 +2240,8 @@ class T2SatI<dag oops, dag iops, InstrItinClass itin,
def t2SSAT: T2SatI<
(outs rGPR:$Rd),
(ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> {
NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>,
Requires<[IsThumb2]> {
let Inst{31-27} = 0b11110;
let Inst{25-22} = 0b1100;
let Inst{20} = 0;
@ -2251,7 +2252,7 @@ def t2SSAT: T2SatI<
def t2SSAT16: T2SatI<
(outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary,
"ssat16", "\t$Rd, $sat_imm, $Rn", []>,
Requires<[IsThumb2, HasDSP]> {
Requires<[IsThumb2, HasDSP]> {
let Inst{31-27} = 0b11110;
let Inst{25-22} = 0b1100;
let Inst{20} = 0;
@ -2265,7 +2266,8 @@ def t2SSAT16: T2SatI<
def t2USAT: T2SatI<
(outs rGPR:$Rd),
(ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> {
NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>,
Requires<[IsThumb2]> {
let Inst{31-27} = 0b11110;
let Inst{25-22} = 0b1110;
let Inst{20} = 0;
@ -2275,7 +2277,7 @@ def t2USAT: T2SatI<
def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn),
NoItinerary,
"usat16", "\t$Rd, $sat_imm, $Rn", []>,
Requires<[IsThumb2, HasDSP]> {
Requires<[IsThumb2, HasDSP]> {
let Inst{31-22} = 0b1111001110;
let Inst{20} = 0;
let Inst{15} = 0;

@ -518,6 +518,10 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
return true;
return false;

case ELF::R_MIPS_GOT_PAGE:
case ELF::R_MICROMIPS_GOT_PAGE:
case ELF::R_MIPS_GOT_OFST:
case ELF::R_MICROMIPS_GOT_OFST:
case ELF::R_MIPS_16:
case ELF::R_MIPS_32:
case ELF::R_MIPS_GPREL32:
@ -539,8 +543,6 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
case ELF::R_MIPS_SHIFT5:
case ELF::R_MIPS_SHIFT6:
case ELF::R_MIPS_GOT_DISP:
case ELF::R_MIPS_GOT_PAGE:
case ELF::R_MIPS_GOT_OFST:
case ELF::R_MIPS_GOT_HI16:
case ELF::R_MIPS_GOT_LO16:
case ELF::R_MIPS_INSERT_A:
@ -589,8 +591,6 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
case ELF::R_MICROMIPS_PC16_S1:
case ELF::R_MICROMIPS_CALL16:
case ELF::R_MICROMIPS_GOT_DISP:
case ELF::R_MICROMIPS_GOT_PAGE:
case ELF::R_MICROMIPS_GOT_OFST:
case ELF::R_MICROMIPS_GOT_HI16:
case ELF::R_MICROMIPS_GOT_LO16:
case ELF::R_MICROMIPS_SUB:

@ -28,12 +28,19 @@ MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) {
PointerSize = CalleeSaveStackSlotSize = 8;
}

// FIXME: This condition isn't quite right but it's the best we can do until
// this object can identify the ABI. It will misbehave when using O32
// on a mips64*-* triple.
if ((TheTriple.getArch() == Triple::mipsel) ||
(TheTriple.getArch() == Triple::mips)) {
PrivateGlobalPrefix = "$";
PrivateLabelPrefix = "$";
}

AlignmentIsInBytes = false;
Data16bitsDirective = "\t.2byte\t";
Data32bitsDirective = "\t.4byte\t";
Data64bitsDirective = "\t.8byte\t";
PrivateGlobalPrefix = "$";
PrivateLabelPrefix = "$";
CommentString = "#";
ZeroDirective = "\t.space\t";
GPRel32Directive = "\t.gpword\t";

@ -57,7 +57,10 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU,
else
Ret += "E";

Ret += "-m:m";
if (ABI.IsO32())
Ret += "-m:m";
else
Ret += "-m:e";

// Pointers are 32 bit on some ABIs.
if (!ABI.IsN64())

@ -1187,6 +1187,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);

@ -13373,6 +13381,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);

const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT,
@ -13380,6 +13389,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
DAG.getUNDEF(SrcVT)));
}
if (SrcVT.getVectorElementType() == MVT::i1) {
if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
@ -13694,6 +13706,15 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
MVT SVT = N0.getSimpleValueType();
SDLoc dl(Op);

if (SVT.getVectorElementType() == MVT::i1) {
if (SVT == MVT::v2i1)
return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
MVT IntegerVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
}

switch (SVT.SimpleTy) {
default:
llvm_unreachable("Custom UINT_TO_FP is not supported!");

@ -2661,7 +2661,8 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
unsigned Opc, bool AllowSP, unsigned &NewSrc,
bool &isKill, bool &isUndef,
MachineOperand &ImplicitOp) const {
MachineOperand &ImplicitOp,
LiveVariables *LV) const {
MachineFunction &MF = *MI.getParent()->getParent();
const TargetRegisterClass *RC;
if (AllowSP) {
@ -2715,13 +2716,17 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
// Virtual register of the wrong class, we have to create a temporary 64-bit
// vreg to feed into the LEA.
NewSrc = MF.getRegInfo().createVirtualRegister(RC);
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
MachineInstr *Copy = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
get(TargetOpcode::COPY))
.addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
.addOperand(Src);

// Which is obviously going to be dead after we're done with it.
isKill = true;
isUndef = false;

if (LV)
LV->replaceKillInstruction(SrcReg, MI, *Copy);
}

// We've set all the parameters without issue.
@ -2900,7 +2905,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
SrcReg, isKill, isUndef, ImplicitOp))
SrcReg, isKill, isUndef, ImplicitOp, LV))
return nullptr;

MachineInstrBuilder MIB =
@ -2943,7 +2948,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
SrcReg, isKill, isUndef, ImplicitOp))
SrcReg, isKill, isUndef, ImplicitOp, LV))
return nullptr;

MachineInstrBuilder MIB =
@ -2977,7 +2982,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
SrcReg, isKill, isUndef, ImplicitOp))
SrcReg, isKill, isUndef, ImplicitOp, LV))
return nullptr;

MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
@ -3016,7 +3021,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
SrcReg, isKill, isUndef, ImplicitOp))
SrcReg, isKill, isUndef, ImplicitOp, LV))
return nullptr;

const MachineOperand &Src2 = MI.getOperand(2);
@ -3024,7 +3029,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned SrcReg2;
MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
SrcReg2, isKill2, isUndef2, ImplicitOp2))
SrcReg2, isKill2, isUndef2, ImplicitOp2, LV))
return nullptr;

MachineInstrBuilder MIB =
@ -3087,7 +3092,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
SrcReg, isKill, isUndef, ImplicitOp))
SrcReg, isKill, isUndef, ImplicitOp, LV))
return nullptr;

MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))

@ -230,7 +230,7 @@ public:
bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc,
bool &isKill, bool &isUndef,
MachineOperand &ImplicitOp) const;
MachineOperand &ImplicitOp, LiveVariables *LV) const;

/// convertToThreeAddress - This method must be implemented by targets that
/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target

@ -1820,7 +1820,7 @@ def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
Sched<[WriteCvtF2F]>;
def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg,
def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
@ -1836,7 +1836,7 @@ def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
Sched<[WriteCvtF2F]>;
def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
@ -2009,24 +2009,35 @@ def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
// SSE2 packed instructions with XS prefix
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[], IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
[(set VR128:$dst,
(int_x86_sse2_cvttps2dq VR128:$src))],
IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[], IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
[(set VR128:$dst, (int_x86_sse2_cvttps2dq
(loadv4f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[], IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
[(set VR256:$dst,
(int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[], IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
[(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
(loadv8f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
Sched<[WriteCvtF2ILd]>;

def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[], IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
[(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[], IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
[(set VR128:$dst,
(int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;

let Predicates = [HasAVX] in {
def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
@ -2096,10 +2107,14 @@ def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[], IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
[(set VR128:$dst,
(int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
[(set VR128:$dst,
(int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;

|
@ -332,6 +332,7 @@ struct ArgumentUsesTracker : public CaptureTracker {
|
||||
namespace llvm {
|
||||
template <> struct GraphTraits<ArgumentGraphNode *> {
|
||||
typedef ArgumentGraphNode NodeType;
|
||||
typedef ArgumentGraphNode *NodeRef;
|
||||
typedef SmallVectorImpl<ArgumentGraphNode *>::iterator ChildIteratorType;
|
||||
|
||||
static inline NodeType *getEntryNode(NodeType *A) { return A; }
|
||||
|
@ -44,6 +44,7 @@
#include "llvm/Transforms/Utils/CtorUtils.h"
#include "llvm/Transforms/Utils/Evaluator.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;

@ -779,7 +780,8 @@ static void ConstantPropUsersOf(Value *V, const DataLayout &DL,
// Instructions could multiply use V.
while (UI != E && *UI == I)
++UI;
I->eraseFromParent();
if (isInstructionTriviallyDead(I, TLI))
I->eraseFromParent();
}
}

@ -134,6 +134,10 @@ static cl::opt<int> PreInlineThreshold(
cl::desc("Control the amount of inlining in pre-instrumentation inliner "
"(default = 75)"));

static cl::opt<bool> EnableGVNHoist(
"enable-gvn-hoist", cl::init(false), cl::Hidden,
cl::desc("Enable the experimental GVN Hoisting pass"));

PassManagerBuilder::PassManagerBuilder() {
OptLevel = 2;
SizeLevel = 0;
@ -232,7 +236,8 @@ void PassManagerBuilder::populateFunctionPassManager(
FPM.add(createCFGSimplificationPass());
FPM.add(createSROAPass());
FPM.add(createEarlyCSEPass());
FPM.add(createGVNHoistPass());
if(EnableGVNHoist)
FPM.add(createGVNHoistPass());
FPM.add(createLowerExpectIntrinsicPass());
}

@ -553,8 +553,11 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
}
}

// FIXME: This code is nearly duplicated in InstSimplify. Using/refactoring
// decomposeBitTestICmp() might help.
{
unsigned BitWidth = DL.getTypeSizeInBits(TrueVal->getType());
unsigned BitWidth =
DL.getTypeSizeInBits(TrueVal->getType()->getScalarType());
APInt MinSignedValue = APInt::getSignBit(BitWidth);
Value *X;
const APInt *Y, *C;

@ -2830,7 +2830,8 @@ bool InstCombiner::run() {
// Add operands to the worklist.
replaceInstUsesWith(*I, C);
++NumConstProp;
eraseInstFromFunction(*I);
if (isInstructionTriviallyDead(I, TLI))
eraseInstFromFunction(*I);
MadeIRChange = true;
continue;
}
@ -2851,7 +2852,8 @@ bool InstCombiner::run() {
// Add operands to the worklist.
replaceInstUsesWith(*I, C);
++NumConstProp;
eraseInstFromFunction(*I);
if (isInstructionTriviallyDead(I, TLI))
eraseInstFromFunction(*I);
MadeIRChange = true;
continue;
}
@ -3007,7 +3009,8 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
<< *Inst << '\n');
Inst->replaceAllUsesWith(C);
++NumConstProp;
Inst->eraseFromParent();
if (isInstructionTriviallyDead(Inst, TLI))
Inst->eraseFromParent();
continue;
}
Some files were not shown because too many files have changed in this diff.