Vendor import of llvm trunk r291012:
https://llvm.org/svn/llvm-project/llvm/trunk@291012
parent 34e948a036
commit 7386cafc7d
@ -457,6 +457,13 @@ if( MSVC )
if(LLVM_ENABLE_DIA_SDK AND NOT HAVE_DIA_SDK)
message(FATAL_ERROR "DIA SDK not found. If you have both VS 2012 and 2013 installed, you may need to uninstall the former and re-install the latter afterwards.")
endif()

# Normalize to 0/1 for lit.site.cfg
if(LLVM_ENABLE_DIA_SDK)
set(LLVM_ENABLE_DIA_SDK 1)
else()
set(LLVM_ENABLE_DIA_SDK 0)
endif()
else()
set(LLVM_ENABLE_DIA_SDK 0)
endif( MSVC )
@ -43,8 +43,8 @@ int main() { return (float)x; }"
elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 19.0)
message(FATAL_ERROR "Host Visual Studio must be at least 2015")
elseif(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 19.00.24215.1)
message(WARNING "Host Visual Studio should at least be 2015 Update 3 (MSVC 19.00.24215.1)"
elseif(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 19.00.24213.1)
message(WARNING "Host Visual Studio should at least be 2015 Update 3 (MSVC 19.00.24213.1)"
" due to miscompiles from earlier versions")
endif()
endif()
@ -21,8 +21,8 @@
// class MyClass : public RefCountedBase<MyClass> {};
//
// void foo() {
// // Objects that inherit from RefCountedBase should always be instantiated
// // on the heap, never on the stack.
// // Constructing an IntrusiveRefCntPtr increases the pointee's refcount by
// // 1 (from 0 in this case).
// IntrusiveRefCntPtr<MyClass> Ptr1(new MyClass());
//
// // Copying an IntrusiveRefCntPtr increases the pointee's refcount by 1.
@ -68,9 +68,6 @@ namespace llvm {
/// calls to Release() and Retain(), which increment and decrement the object's
/// refcount, respectively. When a Release() call decrements the refcount to 0,
/// the object deletes itself.
///
/// Objects that inherit from RefCountedBase should always be allocated with
/// operator new.
template <class Derived> class RefCountedBase {
mutable unsigned RefCount = 0;
@ -18,6 +18,7 @@

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Compiler.h"
#include <algorithm>
@ -107,6 +108,39 @@ public:
return false;
}

/// Insert a sequence of new elements into the PriorityWorklist.
template <typename SequenceT>
typename std::enable_if<!std::is_convertible<SequenceT, T>::value>::type
insert(SequenceT &&Input) {
if (std::begin(Input) == std::end(Input))
// Nothing to do for an empty input sequence.
return;

// First pull the input sequence into the vector as a bulk append
// operation.
ptrdiff_t StartIndex = V.size();
V.insert(V.end(), std::begin(Input), std::end(Input));
// Now walk backwards fixing up the index map and deleting any duplicates.
for (ptrdiff_t i = V.size() - 1; i >= StartIndex; --i) {
auto InsertResult = M.insert({V[i], i});
if (InsertResult.second)
continue;

// If the existing index is before this insert's start, nuke that one and
// move it up.
ptrdiff_t &Index = InsertResult.first->second;
if (Index < StartIndex) {
V[Index] = T();
Index = i;
continue;
}

// Otherwise the existing one comes first so just clear out the value in
// this slot.
V[i] = T();
}
}

/// Remove the last element of the PriorityWorklist.
void pop_back() {
assert(!empty() && "Cannot remove an element when empty!");
@ -169,6 +203,11 @@ public:
return true;
}

/// Reverse the items in the PriorityWorklist.
///
/// This does an in-place reversal. Other kinds of reverse aren't easy to
/// support in the face of the worklist semantics.

/// Completely clear the PriorityWorklist
void clear() {
M.clear();
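A minimal usage sketch of the new bulk insert (the names and values below are illustrative, not from the patch): inserting a range re-prioritizes any element already in the worklist, so the most recent insertion of an element determines its position.

#include "llvm/ADT/PriorityWorklist.h"
#include <vector>

void drainExample() {
  llvm::PriorityWorklist<int> Worklist;
  Worklist.insert(1);
  Worklist.insert(2);
  std::vector<int> Batch = {2, 3, 4};
  Worklist.insert(Batch); // bulk insert; the existing 2 moves to its new slot
  while (!Worklist.empty()) {
    int Item = Worklist.back(); // visited in LIFO priority order: 4, 3, 2, 1
    Worklist.pop_back();
    (void)Item;
  }
}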
@ -23,10 +23,9 @@ namespace llvm {
class DataLayout;
class MDNode;

/// isDereferenceablePointer - Return true if this is always a dereferenceable
/// pointer. If the context instruction is specified perform context-sensitive
/// analysis and return true if the pointer is dereferenceable at the
/// specified instruction.
/// Return true if this is always a dereferenceable pointer. If the context
/// instruction is specified perform context-sensitive analysis and return true
/// if the pointer is dereferenceable at the specified instruction.
bool isDereferenceablePointer(const Value *V, const DataLayout &DL,
const Instruction *CtxI = nullptr,
const DominatorTree *DT = nullptr);
@ -40,8 +39,7 @@ bool isDereferenceableAndAlignedPointer(const Value *V, unsigned Align,
const Instruction *CtxI = nullptr,
const DominatorTree *DT = nullptr);

/// isSafeToLoadUnconditionally - Return true if we know that executing a load
/// from this value cannot trap.
/// Return true if we know that executing a load from this value cannot trap.
///
/// If DT and ScanFrom are specified this method performs context-sensitive
/// analysis and returns true if it is safe to load immediately before ScanFrom.
@ -54,12 +52,12 @@ bool isSafeToLoadUnconditionally(Value *V, unsigned Align,
Instruction *ScanFrom = nullptr,
const DominatorTree *DT = nullptr);

/// DefMaxInstsToScan - the default number of maximum instructions
/// to scan in the block, used by FindAvailableLoadedValue().
/// The default number of maximum instructions to scan in the block, used by
/// FindAvailableLoadedValue().
extern cl::opt<unsigned> DefMaxInstsToScan;

/// \brief Scan backwards to see if we have the value of the given load
/// available locally within a small number of instructions.
/// Scan backwards to see if we have the value of the given load available
/// locally within a small number of instructions.
///
/// You can use this function to scan across multiple blocks: after you call
/// this function, if ScanFrom points at the beginning of the block, it's safe
@ -208,6 +208,8 @@ public:
SledKind Kind;
bool AlwaysInstrument;
const class Function *Fn;

void emit(int, MCStreamer *, const MCSymbol *) const;
};

// All the sleds to be emitted.
@ -216,6 +218,9 @@ public:
// Helper function to record a given XRay sled.
void recordSled(MCSymbol *Sled, const MachineInstr &MI, SledKind Kind);

/// Emit a table with all XRay instrumentation points.
void emitXRayTable();

//===------------------------------------------------------------------===//
// MachineFunctionPass Implementation.
//===------------------------------------------------------------------===//
@ -59,6 +59,9 @@ class MachineDominatorTree : public MachineFunctionPass {
/// such as BB == elt.NewBB.
mutable SmallSet<MachineBasicBlock *, 32> NewBBs;

/// The DominatorTreeBase that is used to compute a normal dominator tree
DominatorTreeBase<MachineBasicBlock>* DT;

/// \brief Apply all the recorded critical edges to the DT.
/// This updates the underlying DT information in a way that uses
/// the fast query path of DT as much as possible.
@ -68,7 +71,6 @@ class MachineDominatorTree : public MachineFunctionPass {

public:
static char ID; // Pass ID, replacement for typeid
DominatorTreeBase<MachineBasicBlock>* DT;

MachineDominatorTree();
@ -116,12 +116,12 @@ public:
// An unsigned integer indicating the identity of the source file
// corresponding to a machine instruction.
uint16_t File;
// An unsigned integer whose value encodes the applicable instruction set
// architecture for the current instruction.
uint8_t Isa;
// An unsigned integer representing the DWARF path discriminator value
// for this location.
uint32_t Discriminator;
// An unsigned integer whose value encodes the applicable instruction set
// architecture for the current instruction.
uint8_t Isa;
// A boolean indicating that the current instruction is the beginning of a
// statement.
uint8_t IsStmt:1,
@ -104,6 +104,13 @@ def int_amdgcn_dispatch_id :
// Instruction Intrinsics
//===----------------------------------------------------------------------===//

// The first parameter is s_sendmsg immediate (i16),
// the second one is copied to m0
def int_amdgcn_s_sendmsg : GCCBuiltin<"__builtin_amdgcn_s_sendmsg">,
Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>;
def int_amdgcn_s_sendmsghalt : GCCBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>;

def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">,
Intrinsic<[], [], [IntrConvergent]>;
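A minimal sketch of how these intrinsics are reached from source when compiling for an amdgcn target (the immediate value 1 below is illustrative only; the real message encodings are hardware-defined):

// Each call lowers to llvm.amdgcn.s.sendmsg / llvm.amdgcn.s.sendmsghalt.
void send_and_halt() {
  __builtin_amdgcn_s_sendmsg(1, 0);     // first operand: sendmsg immediate
  __builtin_amdgcn_s_sendmsghalt(1, 0); // second operand is copied to m0
}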
@ -2063,130 +2063,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
||||
llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
|
||||
}
|
||||
|
||||
// Vector extract and insert
|
||||
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
||||
def int_x86_avx512_mask_vextractf32x4_512 :
|
||||
GCCBuiltin<"__builtin_ia32_extractf32x4_mask">,
|
||||
Intrinsic<[llvm_v4f32_ty], [llvm_v16f32_ty, llvm_i32_ty,
|
||||
llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
|
||||
def int_x86_avx512_mask_vextracti32x4_512 :
|
||||
GCCBuiltin<"__builtin_ia32_extracti32x4_mask">,
|
||||
Intrinsic<[llvm_v4i32_ty], [llvm_v16i32_ty, llvm_i32_ty,
|
||||
llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
|
||||
def int_x86_avx512_mask_vextractf32x4_256 :
|
||||
GCCBuiltin<"__builtin_ia32_extractf32x4_256_mask">,
|
||||
Intrinsic<[llvm_v4f32_ty], [llvm_v8f32_ty, llvm_i32_ty,
|
||||
llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
|
||||
def int_x86_avx512_mask_vextracti32x4_256 :
|
||||
GCCBuiltin<"__builtin_ia32_extracti32x4_256_mask">,
|
||||
Intrinsic<[llvm_v4i32_ty], [llvm_v8i32_ty, llvm_i32_ty,
|
||||
llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
|
||||
def int_x86_avx512_mask_vextractf64x2_256 :
|
||||
GCCBuiltin<"__builtin_ia32_extractf64x2_256_mask">,
|
||||
Intrinsic<[llvm_v2f64_ty], [llvm_v4f64_ty, llvm_i32_ty,
|
||||
llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
|
||||
def int_x86_avx512_mask_vextracti64x2_256 :
|
||||
GCCBuiltin<"__builtin_ia32_extracti64x2_256_mask">,
|
||||
Intrinsic<[llvm_v2i64_ty], [llvm_v4i64_ty, llvm_i32_ty,
|
||||
llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
|
||||
def int_x86_avx512_mask_vextractf64x2_512 :
|
||||
GCCBuiltin<"__builtin_ia32_extractf64x2_512_mask">,
|
||||
Intrinsic<[llvm_v2f64_ty], [llvm_v8f64_ty, llvm_i32_ty,
|
||||
llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
|
||||
def int_x86_avx512_mask_vextracti64x2_512 :
|
||||
GCCBuiltin<"__builtin_ia32_extracti64x2_512_mask">,
|
||||
Intrinsic<[llvm_v2i64_ty], [llvm_v8i64_ty, llvm_i32_ty,
|
||||
llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
|
||||
def int_x86_avx512_mask_vextractf32x8_512 :
|
||||
GCCBuiltin<"__builtin_ia32_extractf32x8_mask">,
|
||||
Intrinsic<[llvm_v8f32_ty], [llvm_v16f32_ty, llvm_i32_ty,
|
||||
llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
|
||||
def int_x86_avx512_mask_vextracti32x8_512 :
|
||||
GCCBuiltin<"__builtin_ia32_extracti32x8_mask">,
|
||||
Intrinsic<[llvm_v8i32_ty],[llvm_v16i32_ty, llvm_i32_ty,
|
||||
llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
|
||||
def int_x86_avx512_mask_vextractf64x4_512 :
|
||||
GCCBuiltin<"__builtin_ia32_extractf64x4_mask">,
|
||||
Intrinsic<[llvm_v4f64_ty], [llvm_v8f64_ty, llvm_i32_ty,
|
||||
llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
|
||||
def int_x86_avx512_mask_vextracti64x4_512 :
|
||||
GCCBuiltin<"__builtin_ia32_extracti64x4_mask">,
|
||||
Intrinsic<[llvm_v4i64_ty], [llvm_v8i64_ty, llvm_i32_ty,
|
||||
llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
|
||||
|
||||
def int_x86_avx512_mask_insertf32x4_256 :
|
||||
GCCBuiltin<"__builtin_ia32_insertf32x4_256_mask">,
|
||||
Intrinsic<[llvm_v8f32_ty],
|
||||
[llvm_v8f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_v8f32_ty, llvm_i8_ty],
|
||||
[IntrNoMem]>;
|
||||
|
||||
def int_x86_avx512_mask_insertf32x4_512 :
|
||||
GCCBuiltin<"__builtin_ia32_insertf32x4_mask">,
|
||||
Intrinsic<[llvm_v16f32_ty],
|
||||
[llvm_v16f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_v16f32_ty, llvm_i16_ty],
|
||||
[IntrNoMem]>;
|
||||
|
||||
def int_x86_avx512_mask_insertf32x8_512 :
|
||||
GCCBuiltin<"__builtin_ia32_insertf32x8_mask">,
|
||||
Intrinsic<[llvm_v16f32_ty],
|
||||
[llvm_v16f32_ty, llvm_v8f32_ty, llvm_i32_ty, llvm_v16f32_ty, llvm_i16_ty],
|
||||
[IntrNoMem]>;
|
||||
|
||||
def int_x86_avx512_mask_insertf64x2_256 :
|
||||
GCCBuiltin<"__builtin_ia32_insertf64x2_256_mask">,
|
||||
Intrinsic<[llvm_v4f64_ty],
|
||||
[llvm_v4f64_ty, llvm_v2f64_ty, llvm_i32_ty, llvm_v4f64_ty, llvm_i8_ty],
|
||||
[IntrNoMem]>;
|
||||
|
||||
def int_x86_avx512_mask_insertf64x2_512 :
|
||||
GCCBuiltin<"__builtin_ia32_insertf64x2_512_mask">,
|
||||
Intrinsic<[llvm_v8f64_ty],
|
||||
[llvm_v8f64_ty, llvm_v2f64_ty, llvm_i32_ty, llvm_v8f64_ty, llvm_i8_ty],
|
||||
[IntrNoMem]>;
|
||||
|
||||
def int_x86_avx512_mask_insertf64x4_512 :
|
||||
GCCBuiltin<"__builtin_ia32_insertf64x4_mask">,
|
||||
Intrinsic<[llvm_v8f64_ty],
|
||||
[llvm_v8f64_ty, llvm_v4f64_ty, llvm_i32_ty, llvm_v8f64_ty, llvm_i8_ty],
|
||||
[IntrNoMem]>;
|
||||
|
||||
def int_x86_avx512_mask_inserti32x4_256 :
|
||||
GCCBuiltin<"__builtin_ia32_inserti32x4_256_mask">,
|
||||
Intrinsic<[llvm_v8i32_ty],
|
||||
[llvm_v8i32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_v8i32_ty, llvm_i8_ty],
|
||||
[IntrNoMem]>;
|
||||
|
||||
def int_x86_avx512_mask_inserti32x4_512 :
|
||||
GCCBuiltin<"__builtin_ia32_inserti32x4_mask">,
|
||||
Intrinsic<[llvm_v16i32_ty],
|
||||
[llvm_v16i32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_v16i32_ty, llvm_i16_ty],
|
||||
[IntrNoMem]>;
|
||||
|
||||
def int_x86_avx512_mask_inserti32x8_512 :
|
||||
GCCBuiltin<"__builtin_ia32_inserti32x8_mask">,
|
||||
Intrinsic<[llvm_v16i32_ty],
|
||||
[llvm_v16i32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_v16i32_ty, llvm_i16_ty],
|
||||
[IntrNoMem]>;
|
||||
|
||||
def int_x86_avx512_mask_inserti64x2_256 :
|
||||
GCCBuiltin<"__builtin_ia32_inserti64x2_256_mask">,
|
||||
Intrinsic<[llvm_v4i64_ty],
|
||||
[llvm_v4i64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_v4i64_ty, llvm_i8_ty],
|
||||
[IntrNoMem]>;
|
||||
|
||||
def int_x86_avx512_mask_inserti64x2_512 :
|
||||
GCCBuiltin<"__builtin_ia32_inserti64x2_512_mask">,
|
||||
Intrinsic<[llvm_v8i64_ty],
|
||||
[llvm_v8i64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_v8i64_ty, llvm_i8_ty],
|
||||
[IntrNoMem]>;
|
||||
|
||||
def int_x86_avx512_mask_inserti64x4_512 :
|
||||
GCCBuiltin<"__builtin_ia32_inserti64x4_mask">,
|
||||
Intrinsic<[llvm_v8i64_ty],
|
||||
[llvm_v8i64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_v8i64_ty, llvm_i8_ty],
|
||||
[IntrNoMem]>;
|
||||
}
|
||||
|
||||
// Conditional load ops
|
||||
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
||||
def int_x86_avx2_maskload_d : GCCBuiltin<"__builtin_ia32_maskloadd">,
|
||||
|
@ -769,17 +769,13 @@ namespace detail {
std::error_code directory_iterator_increment(DirIterState &);
std::error_code directory_iterator_destruct(DirIterState &);

/// DirIterState - Keeps state for the directory_iterator. It is reference
/// counted in order to preserve InputIterator semantics on copy.
struct DirIterState : public RefCountedBase<DirIterState> {
DirIterState()
: IterationHandle(0) {}

/// Keeps state for the directory_iterator.
struct DirIterState {
~DirIterState() {
directory_iterator_destruct(*this);
}

intptr_t IterationHandle;
intptr_t IterationHandle = 0;
directory_entry CurrentEntry;
};
} // end namespace detail
@ -788,23 +784,23 @@ namespace detail {
/// operator++ because we need an error_code. If it's really needed we can make
/// it call report_fatal_error on error.
class directory_iterator {
IntrusiveRefCntPtr<detail::DirIterState> State;
std::shared_ptr<detail::DirIterState> State;

public:
explicit directory_iterator(const Twine &path, std::error_code &ec) {
State = new detail::DirIterState;
State = std::make_shared<detail::DirIterState>();
SmallString<128> path_storage;
ec = detail::directory_iterator_construct(*State,
path.toStringRef(path_storage));
}

explicit directory_iterator(const directory_entry &de, std::error_code &ec) {
State = new detail::DirIterState;
State = std::make_shared<detail::DirIterState>();
ec = detail::directory_iterator_construct(*State, de.path());
}

/// Construct end iterator.
directory_iterator() : State(nullptr) {}
directory_iterator() = default;

// No operator++ because we need error_code.
directory_iterator &increment(std::error_code &ec) {
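A minimal usage sketch of this iterator (the directory path is illustrative); increment() takes a std::error_code instead of providing operator++, so errors are reported without exceptions:

#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"

void listDir() {
  std::error_code EC;
  for (llvm::sys::fs::directory_iterator It("/tmp", EC), End;
       It != End && !EC; It.increment(EC))
    llvm::errs() << It->path() << "\n";
}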
@ -209,6 +209,15 @@ struct DocumentListTraits {
// static T::value_type& element(IO &io, T &seq, size_t index);
};

/// This class should be specialized by any type that needs to be converted
/// to/from a YAML mapping in the case where the names of the keys are not known
/// in advance, e.g. a string map.
template <typename T>
struct CustomMappingTraits {
// static void inputOne(IO &io, StringRef key, T &elem);
// static void output(IO &io, T &elem);
};

// Only used for better diagnostics of missing traits
template <typename T>
struct MissingTrait;
@ -358,6 +367,23 @@ public:
|
||||
static bool const value = (sizeof(test<SequenceTraits<T>>(nullptr)) == 1);
|
||||
};
|
||||
|
||||
// Test if CustomMappingTraits<T> is defined on type T.
|
||||
template <class T>
|
||||
struct has_CustomMappingTraits
|
||||
{
|
||||
typedef void (*Signature_input)(IO &io, StringRef key, T &v);
|
||||
|
||||
template <typename U>
|
||||
static char test(SameType<Signature_input, &U::inputOne>*);
|
||||
|
||||
template <typename U>
|
||||
static double test(...);
|
||||
|
||||
public:
|
||||
static bool const value =
|
||||
(sizeof(test<CustomMappingTraits<T>>(nullptr)) == 1);
|
||||
};
|
||||
|
||||
// has_FlowTraits<int> will cause an error with some compilers because
|
||||
// it subclasses int. Using this wrapper only instantiates the
|
||||
// real has_FlowTraits only if the template type is a class.
|
||||
@ -493,6 +519,7 @@ struct missingTraits
|
||||
!has_BlockScalarTraits<T>::value &&
|
||||
!has_MappingTraits<T, Context>::value &&
|
||||
!has_SequenceTraits<T>::value &&
|
||||
!has_CustomMappingTraits<T>::value &&
|
||||
!has_DocumentListTraits<T>::value> {};
|
||||
|
||||
template <typename T, typename Context>
|
||||
@ -531,6 +558,7 @@ public:
|
||||
virtual void endMapping() = 0;
|
||||
virtual bool preflightKey(const char*, bool, bool, bool &, void *&) = 0;
|
||||
virtual void postflightKey(void*) = 0;
|
||||
virtual std::vector<StringRef> keys() = 0;
|
||||
|
||||
virtual void beginFlowMapping() = 0;
|
||||
virtual void endFlowMapping() = 0;
|
||||
@ -818,6 +846,21 @@ yamlize(IO &io, T &Val, bool, Context &Ctx) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
typename std::enable_if<has_CustomMappingTraits<T>::value, void>::type
|
||||
yamlize(IO &io, T &Val, bool, EmptyContext &Ctx) {
|
||||
if ( io.outputting() ) {
|
||||
io.beginMapping();
|
||||
CustomMappingTraits<T>::output(io, Val);
|
||||
io.endMapping();
|
||||
} else {
|
||||
io.beginMapping();
|
||||
for (StringRef key : io.keys())
|
||||
CustomMappingTraits<T>::inputOne(io, key, Val);
|
||||
io.endMapping();
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
typename std::enable_if<missingTraits<T, EmptyContext>::value, void>::type
|
||||
yamlize(IO &io, T &Val, bool, EmptyContext &Ctx) {
|
||||
@ -1074,6 +1117,7 @@ private:
|
||||
void endMapping() override;
|
||||
bool preflightKey(const char *, bool, bool, bool &, void *&) override;
|
||||
void postflightKey(void *) override;
|
||||
std::vector<StringRef> keys() override;
|
||||
void beginFlowMapping() override;
|
||||
void endFlowMapping() override;
|
||||
unsigned beginSequence() override;
|
||||
@ -1154,10 +1198,8 @@ private:
|
||||
|
||||
typedef llvm::StringMap<std::unique_ptr<HNode>> NameToNode;
|
||||
|
||||
bool isValidKey(StringRef key);
|
||||
|
||||
NameToNode Mapping;
|
||||
llvm::SmallVector<const char*, 6> ValidKeys;
|
||||
llvm::SmallVector<std::string, 6> ValidKeys;
|
||||
};
|
||||
|
||||
class SequenceHNode : public HNode {
|
||||
@ -1215,6 +1257,7 @@ public:
|
||||
void endMapping() override;
|
||||
bool preflightKey(const char *key, bool, bool, bool &, void *&) override;
|
||||
void postflightKey(void *) override;
|
||||
std::vector<StringRef> keys() override;
|
||||
void beginFlowMapping() override;
|
||||
void endFlowMapping() override;
|
||||
unsigned beginSequence() override;
|
||||
@ -1384,6 +1427,17 @@ operator>>(Input &In, T &Val) {
|
||||
return In;
|
||||
}
|
||||
|
||||
// Define non-member operator>> so that Input can stream in a string map.
|
||||
template <typename T>
|
||||
inline
|
||||
typename std::enable_if<has_CustomMappingTraits<T>::value, Input &>::type
|
||||
operator>>(Input &In, T &Val) {
|
||||
EmptyContext Ctx;
|
||||
if (In.setCurrentDocument())
|
||||
yamlize(In, Val, true, Ctx);
|
||||
return In;
|
||||
}
|
||||
|
||||
// Provide better error message about types missing a trait specialization
|
||||
template <typename T>
|
||||
inline typename std::enable_if<missingTraits<T, EmptyContext>::value,
|
||||
@ -1457,6 +1511,21 @@ operator<<(Output &Out, T &Val) {
|
||||
return Out;
|
||||
}
|
||||
|
||||
// Define non-member operator<< so that Output can stream out a string map.
|
||||
template <typename T>
|
||||
inline
|
||||
typename std::enable_if<has_CustomMappingTraits<T>::value, Output &>::type
|
||||
operator<<(Output &Out, T &Val) {
|
||||
EmptyContext Ctx;
|
||||
Out.beginDocuments();
|
||||
if (Out.preflightDocument(0)) {
|
||||
yamlize(Out, Val, true, Ctx);
|
||||
Out.postflightDocument();
|
||||
}
|
||||
Out.endDocuments();
|
||||
return Out;
|
||||
}
|
||||
|
||||
// Provide better error message about types missing a trait specialization
|
||||
template <typename T>
|
||||
inline typename std::enable_if<missingTraits<T, EmptyContext>::value,
|
||||
@ -1476,6 +1545,18 @@ template <typename T> struct SequenceTraitsImpl {
}
};

/// Implementation of CustomMappingTraits for std::map<std::string, T>.
template <typename T> struct StdMapStringCustomMappingTraitsImpl {
typedef std::map<std::string, T> map_type;
static void inputOne(IO &io, StringRef key, map_type &v) {
io.mapRequired(key.str().c_str(), v[key]);
}
static void output(IO &io, map_type &v) {
for (auto &p : v)
io.mapRequired(p.first.c_str(), p.second);
}
};

} // end namespace yaml
} // end namespace llvm

@ -1530,4 +1611,15 @@ template <typename T> struct SequenceTraitsImpl {
} \
}

/// Utility for declaring that std::map<std::string, _type> should be considered
/// a YAML map.
#define LLVM_YAML_IS_STRING_MAP(_type) \
namespace llvm { \
namespace yaml { \
template <> \
struct CustomMappingTraits<std::map<std::string, _type>> \
: public StdMapStringCustomMappingTraitsImpl<_type> {}; \
} \
}

#endif // LLVM_SUPPORT_YAMLTRAITS_H
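A minimal sketch of how the new macro is meant to be used (the value type and YAML text are illustrative): declaring a string map once lets it stream through Input/Output like any other mapped type, with keys discovered at parse time.

#include "llvm/Support/YAMLTraits.h"
#include "llvm/Support/raw_ostream.h"
#include <map>
#include <string>

LLVM_YAML_IS_STRING_MAP(int)

void roundTrip() {
  std::map<std::string, int> Counts;
  llvm::yaml::Input Yin("a: 1\nb: 2\n");
  Yin >> Counts;                  // keys are not known in advance
  llvm::yaml::Output Yout(llvm::errs());
  Yout << Counts;                 // writes the map back out as YAML
}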
@ -2542,9 +2542,6 @@ bool llvm::CannotBeNegativeZero(const Value *V, const TargetLibraryInfo *TLI,
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V))
return !CFP->getValueAPF().isNegZero();

// FIXME: Magic number! At the least, this should be given a name because it's
// used similarly in CannotBeOrderedLessThanZero(). A better fix may be to
// expose it as a parameter, so it can be used for testing / experimenting.
if (Depth == MaxDepth)
return false; // Limit search depth.

@ -2589,9 +2586,6 @@ bool llvm::CannotBeOrderedLessThanZero(const Value *V,
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V))
return !CFP->getValueAPF().isNegative() || CFP->getValueAPF().isZero();

// FIXME: Magic number! At the least, this should be given a name because it's
// used similarly in CannotBeNegativeZero(). A better fix may be to
// expose it as a parameter, so it can be used for testing / experimenting.
if (Depth == MaxDepth)
return false; // Limit search depth.
@ -749,7 +749,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
// handles the case where this is type ODRed with a definition needed
// by the importing module, in which case the existing definition is
// used.
if (IsImporting && !ImportFullTypeDefinitions &&
if (IsImporting && !ImportFullTypeDefinitions && Identifier &&
(Tag == dwarf::DW_TAG_enumeration_type ||
Tag == dwarf::DW_TAG_class_type ||
Tag == dwarf::DW_TAG_structure_type ||
@ -272,28 +272,10 @@ static const Value *getNoopInput(const Value *V,
|
||||
TLI.allowTruncateForTailCall(Op->getType(), I->getType())) {
|
||||
DataBits = std::min(DataBits, I->getType()->getPrimitiveSizeInBits());
|
||||
NoopInput = Op;
|
||||
} else if (isa<CallInst>(I)) {
|
||||
// Look through call (skipping callee)
|
||||
for (User::const_op_iterator i = I->op_begin(), e = I->op_end() - 1;
|
||||
i != e; ++i) {
|
||||
unsigned attrInd = i - I->op_begin() + 1;
|
||||
if (cast<CallInst>(I)->paramHasAttr(attrInd, Attribute::Returned) &&
|
||||
isNoopBitcast((*i)->getType(), I->getType(), TLI)) {
|
||||
NoopInput = *i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if (isa<InvokeInst>(I)) {
|
||||
// Look through invoke (skipping BB, BB, Callee)
|
||||
for (User::const_op_iterator i = I->op_begin(), e = I->op_end() - 3;
|
||||
i != e; ++i) {
|
||||
unsigned attrInd = i - I->op_begin() + 1;
|
||||
if (cast<InvokeInst>(I)->paramHasAttr(attrInd, Attribute::Returned) &&
|
||||
isNoopBitcast((*i)->getType(), I->getType(), TLI)) {
|
||||
NoopInput = *i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if (auto CS = ImmutableCallSite(I)) {
|
||||
const Value *ReturnedOp = CS.getReturnedArgOperand();
|
||||
if (ReturnedOp && isNoopBitcast(ReturnedOp->getType(), I->getType(), TLI))
|
||||
NoopInput = ReturnedOp;
|
||||
} else if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(V)) {
|
||||
// Value may come from either the aggregate or the scalar
|
||||
ArrayRef<unsigned> InsertLoc = IVI->getIndices();
|
||||
|
@ -37,6 +37,8 @@
|
||||
#include "llvm/MC/MCExpr.h"
|
||||
#include "llvm/MC/MCInst.h"
|
||||
#include "llvm/MC/MCSection.h"
|
||||
#include "llvm/MC/MCSectionELF.h"
|
||||
#include "llvm/MC/MCSectionMachO.h"
|
||||
#include "llvm/MC/MCStreamer.h"
|
||||
#include "llvm/MC/MCSymbolELF.h"
|
||||
#include "llvm/MC/MCValue.h"
|
||||
@ -2610,6 +2612,61 @@ AsmPrinterHandler::~AsmPrinterHandler() {}
|
||||
|
||||
void AsmPrinterHandler::markFunctionEnd() {}
|
||||
|
||||
// In the binary's "xray_instr_map" section, an array of these function entries
|
||||
// describes each instrumentation point. When XRay patches your code, the index
|
||||
// into this table will be given to your handler as a patch point identifier.
|
||||
void AsmPrinter::XRayFunctionEntry::emit(int Bytes, MCStreamer *Out,
|
||||
const MCSymbol *CurrentFnSym) const {
|
||||
Out->EmitSymbolValue(Sled, Bytes);
|
||||
Out->EmitSymbolValue(CurrentFnSym, Bytes);
|
||||
auto Kind8 = static_cast<uint8_t>(Kind);
|
||||
Out->EmitBytes(StringRef(reinterpret_cast<const char *>(&Kind8), 1));
|
||||
Out->EmitBytes(
|
||||
StringRef(reinterpret_cast<const char *>(&AlwaysInstrument), 1));
|
||||
Out->EmitZeros(2 * Bytes - 2); // Pad the previous two entries
|
||||
}
|
||||
|
||||
void AsmPrinter::emitXRayTable() {
|
||||
if (Sleds.empty())
|
||||
return;
|
||||
|
||||
auto PrevSection = OutStreamer->getCurrentSectionOnly();
|
||||
auto Fn = MF->getFunction();
|
||||
MCSection *Section = nullptr;
|
||||
if (MF->getSubtarget().getTargetTriple().isOSBinFormatELF()) {
|
||||
if (Fn->hasComdat()) {
|
||||
Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
|
||||
ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
|
||||
Fn->getComdat()->getName());
|
||||
} else {
|
||||
Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
|
||||
ELF::SHF_ALLOC);
|
||||
}
|
||||
} else if (MF->getSubtarget().getTargetTriple().isOSBinFormatMachO()) {
|
||||
Section = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
|
||||
SectionKind::getReadOnlyWithRel());
|
||||
} else {
|
||||
llvm_unreachable("Unsupported target");
|
||||
}
|
||||
|
||||
// Before we switch over, we force a reference to a label inside the
|
||||
// xray_instr_map section. Since this function is always called just
|
||||
// before the function's end, we assume that this is happening after
|
||||
// the last return instruction.
|
||||
|
||||
auto WordSizeBytes = TM.getPointerSize();
|
||||
MCSymbol *Tmp = OutContext.createTempSymbol("xray_synthetic_", true);
|
||||
OutStreamer->EmitCodeAlignment(16);
|
||||
OutStreamer->EmitSymbolValue(Tmp, WordSizeBytes, false);
|
||||
OutStreamer->SwitchSection(Section);
|
||||
OutStreamer->EmitLabel(Tmp);
|
||||
for (const auto &Sled : Sleds)
|
||||
Sled.emit(WordSizeBytes, OutStreamer.get(), CurrentFnSym);
|
||||
|
||||
OutStreamer->SwitchSection(PrevSection);
|
||||
Sleds.clear();
|
||||
}
|
||||
|
||||
void AsmPrinter::recordSled(MCSymbol *Sled, const MachineInstr &MI,
|
||||
SledKind Kind) {
|
||||
auto Fn = MI.getParent()->getParent()->getFunction();
|
||||
|
@ -1124,7 +1124,7 @@ void HoistSpillHelper::rmRedundantSpills(
|
||||
// earlier spill with smaller SlotIndex.
|
||||
for (const auto CurrentSpill : Spills) {
|
||||
MachineBasicBlock *Block = CurrentSpill->getParent();
|
||||
MachineDomTreeNode *Node = MDT.DT->getNode(Block);
|
||||
MachineDomTreeNode *Node = MDT.getBase().getNode(Block);
|
||||
MachineInstr *PrevSpill = SpillBBToSpill[Node];
|
||||
if (PrevSpill) {
|
||||
SlotIndex PIdx = LIS.getInstructionIndex(*PrevSpill);
|
||||
@ -1132,9 +1132,9 @@ void HoistSpillHelper::rmRedundantSpills(
|
||||
MachineInstr *SpillToRm = (CIdx > PIdx) ? CurrentSpill : PrevSpill;
|
||||
MachineInstr *SpillToKeep = (CIdx > PIdx) ? PrevSpill : CurrentSpill;
|
||||
SpillsToRm.push_back(SpillToRm);
|
||||
SpillBBToSpill[MDT.DT->getNode(Block)] = SpillToKeep;
|
||||
SpillBBToSpill[MDT.getBase().getNode(Block)] = SpillToKeep;
|
||||
} else {
|
||||
SpillBBToSpill[MDT.DT->getNode(Block)] = CurrentSpill;
|
||||
SpillBBToSpill[MDT.getBase().getNode(Block)] = CurrentSpill;
|
||||
}
|
||||
}
|
||||
for (const auto SpillToRm : SpillsToRm)
|
||||
@ -1209,7 +1209,7 @@ void HoistSpillHelper::getVisitOrders(
|
||||
// Sort the nodes in WorkSet in top-down order and save the nodes
|
||||
// in Orders. Orders will be used for hoisting in runHoistSpills.
|
||||
unsigned idx = 0;
|
||||
Orders.push_back(MDT.DT->getNode(Root));
|
||||
Orders.push_back(MDT.getBase().getNode(Root));
|
||||
do {
|
||||
MachineDomTreeNode *Node = Orders[idx++];
|
||||
const std::vector<MachineDomTreeNode *> &Children = Node->getChildren();
|
||||
|
@ -4277,7 +4277,8 @@ struct BaseIndexOffset {
|
||||
}
|
||||
|
||||
/// Parses tree in Ptr for base, index, offset addresses.
|
||||
static BaseIndexOffset match(SDValue Ptr, SelectionDAG &DAG) {
|
||||
static BaseIndexOffset match(SDValue Ptr, SelectionDAG &DAG,
|
||||
int64_t PartialOffset = 0) {
|
||||
bool IsIndexSignExt = false;
|
||||
|
||||
// Split up a folded GlobalAddress+Offset into its component parts.
|
||||
@ -4286,7 +4287,7 @@ struct BaseIndexOffset {
|
||||
return BaseIndexOffset(DAG.getGlobalAddress(GA->getGlobal(),
|
||||
SDLoc(GA),
|
||||
GA->getValueType(0),
|
||||
/*Offset=*/0,
|
||||
/*Offset=*/PartialOffset,
|
||||
/*isTargetGA=*/false,
|
||||
GA->getTargetFlags()),
|
||||
SDValue(),
|
||||
@ -4298,14 +4299,13 @@ struct BaseIndexOffset {
|
||||
// instruction, then it could be just the BASE or everything else we don't
|
||||
// know how to handle. Just use Ptr as BASE and give up.
|
||||
if (Ptr->getOpcode() != ISD::ADD)
|
||||
return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt);
|
||||
return BaseIndexOffset(Ptr, SDValue(), PartialOffset, IsIndexSignExt);
|
||||
|
||||
// We know that we have at least an ADD instruction. Try to pattern match
|
||||
// the simple case of BASE + OFFSET.
|
||||
if (isa<ConstantSDNode>(Ptr->getOperand(1))) {
|
||||
int64_t Offset = cast<ConstantSDNode>(Ptr->getOperand(1))->getSExtValue();
|
||||
return BaseIndexOffset(Ptr->getOperand(0), SDValue(), Offset,
|
||||
IsIndexSignExt);
|
||||
return match(Ptr->getOperand(0), DAG, Offset + PartialOffset);
|
||||
}
|
||||
|
||||
// Inside a loop the current BASE pointer is calculated using an ADD and a
|
||||
@ -4314,7 +4314,7 @@ struct BaseIndexOffset {
|
||||
// (i64 mul (i64 %induction_var)
|
||||
// (i64 %element_size)))
|
||||
if (Ptr->getOperand(1)->getOpcode() == ISD::MUL)
|
||||
return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt);
|
||||
return BaseIndexOffset(Ptr, SDValue(), PartialOffset, IsIndexSignExt);
|
||||
|
||||
// Look at Base + Index + Offset cases.
|
||||
SDValue Base = Ptr->getOperand(0);
|
||||
@ -4328,14 +4328,14 @@ struct BaseIndexOffset {
|
||||
|
||||
// Either the case of Base + Index (no offset) or something else.
|
||||
if (IndexOffset->getOpcode() != ISD::ADD)
|
||||
return BaseIndexOffset(Base, IndexOffset, 0, IsIndexSignExt);
|
||||
return BaseIndexOffset(Base, IndexOffset, PartialOffset, IsIndexSignExt);
|
||||
|
||||
// Now we have the case of Base + Index + offset.
|
||||
SDValue Index = IndexOffset->getOperand(0);
|
||||
SDValue Offset = IndexOffset->getOperand(1);
|
||||
|
||||
if (!isa<ConstantSDNode>(Offset))
|
||||
return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt);
|
||||
return BaseIndexOffset(Ptr, SDValue(), PartialOffset, IsIndexSignExt);
|
||||
|
||||
// Ignore signextends.
|
||||
if (Index->getOpcode() == ISD::SIGN_EXTEND) {
|
||||
@ -4344,7 +4344,7 @@ struct BaseIndexOffset {
|
||||
} else IsIndexSignExt = false;
|
||||
|
||||
int64_t Off = cast<ConstantSDNode>(Offset)->getSExtValue();
|
||||
return BaseIndexOffset(Base, Index, Off, IsIndexSignExt);
|
||||
return BaseIndexOffset(Base, Index, Off + PartialOffset, IsIndexSignExt);
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
@ -88,15 +88,15 @@ void OProfileJITEventListener::NotifyObjectEmitted(
|
||||
// Use symbol info to iterate functions in the object.
|
||||
for (const std::pair<SymbolRef, uint64_t> &P : computeSymbolSizes(DebugObj)) {
|
||||
SymbolRef Sym = P.first;
|
||||
if (Sym.getType() != SymbolRef::ST_Function)
|
||||
if (!Sym.getType() || *Sym.getType() != SymbolRef::ST_Function)
|
||||
continue;
|
||||
|
||||
ErrorOr<StringRef> NameOrErr = Sym.getName();
|
||||
if (NameOrErr.getError())
|
||||
Expected<StringRef> NameOrErr = Sym.getName();
|
||||
if (!NameOrErr)
|
||||
continue;
|
||||
StringRef Name = *NameOrErr;
|
||||
ErrorOr<uint64_t> AddrOrErr = Sym.getAddress();
|
||||
if (AddrOrErr.getError())
|
||||
Expected<uint64_t> AddrOrErr = Sym.getAddress();
|
||||
if (!AddrOrErr)
|
||||
continue;
|
||||
uint64_t Addr = *AddrOrErr;
|
||||
uint64_t Size = P.second;
|
||||
@ -128,9 +128,9 @@ void OProfileJITEventListener::NotifyFreeingObject(const ObjectFile &Obj) {
|
||||
for (symbol_iterator I = DebugObj.symbol_begin(),
|
||||
E = DebugObj.symbol_end();
|
||||
I != E; ++I) {
|
||||
if (I->getType() == SymbolRef::ST_Function) {
|
||||
ErrorOr<uint64_t> AddrOrErr = I->getAddress();
|
||||
if (AddrOrErr.getError())
|
||||
if (I->getType() && *I->getType() == SymbolRef::ST_Function) {
|
||||
Expected<uint64_t> AddrOrErr = I->getAddress();
|
||||
if (!AddrOrErr)
|
||||
continue;
|
||||
uint64_t Addr = *AddrOrErr;
|
||||
|
||||
|
@ -80,6 +80,7 @@ static bool IsInterestingCoverageFile(std::string &File) {
|
||||
}
|
||||
|
||||
void TracePC::InitializePrintNewPCs() {
|
||||
if (!DoPrintNewPCs) return;
|
||||
assert(!PrintedPCs);
|
||||
PrintedPCs = new std::set<uintptr_t>;
|
||||
for (size_t i = 1; i < GetNumPCs(); i++)
|
||||
@ -88,6 +89,7 @@ void TracePC::InitializePrintNewPCs() {
|
||||
}
|
||||
|
||||
void TracePC::PrintNewPCs() {
|
||||
if (!DoPrintNewPCs) return;
|
||||
assert(PrintedPCs);
|
||||
for (size_t i = 1; i < GetNumPCs(); i++)
|
||||
if (PCs[i] && PrintedPCs->insert(PCs[i]).second)
|
||||
|
@ -342,8 +342,10 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
|
||||
Name == "avx.cvt.ps2.pd.256" || // Added in 3.9
|
||||
Name.startswith("avx.vinsertf128.") || // Added in 3.7
|
||||
Name == "avx2.vinserti128" || // Added in 3.7
|
||||
Name.startswith("avx512.mask.insert") || // Added in 4.0
|
||||
Name.startswith("avx.vextractf128.") || // Added in 3.7
|
||||
Name == "avx2.vextracti128" || // Added in 3.7
|
||||
Name.startswith("avx512.mask.vextract") || // Added in 4.0
|
||||
Name.startswith("sse4a.movnt.") || // Added in 3.9
|
||||
Name.startswith("avx.movnt.") || // Added in 3.2
|
||||
Name.startswith("avx512.storent.") || // Added in 3.9
|
||||
@ -1150,21 +1152,25 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
|
||||
|
||||
Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs);
|
||||
} else if (IsX86 && (Name.startswith("avx.vinsertf128.") ||
|
||||
Name == "avx2.vinserti128")) {
|
||||
Name == "avx2.vinserti128" ||
|
||||
Name.startswith("avx512.mask.insert"))) {
|
||||
Value *Op0 = CI->getArgOperand(0);
|
||||
Value *Op1 = CI->getArgOperand(1);
|
||||
unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
|
||||
VectorType *VecTy = cast<VectorType>(CI->getType());
|
||||
unsigned NumElts = VecTy->getNumElements();
|
||||
unsigned DstNumElts = CI->getType()->getVectorNumElements();
|
||||
unsigned SrcNumElts = Op1->getType()->getVectorNumElements();
|
||||
unsigned Scale = DstNumElts / SrcNumElts;
|
||||
|
||||
// Mask off the high bits of the immediate value; hardware ignores those.
|
||||
Imm = Imm & 1;
|
||||
Imm = Imm % Scale;
|
||||
|
||||
// Extend the second operand into a vector that is twice as big.
|
||||
// Extend the second operand into a vector the size of the destination.
|
||||
Value *UndefV = UndefValue::get(Op1->getType());
|
||||
SmallVector<uint32_t, 8> Idxs(NumElts);
|
||||
for (unsigned i = 0; i != NumElts; ++i)
|
||||
SmallVector<uint32_t, 8> Idxs(DstNumElts);
|
||||
for (unsigned i = 0; i != SrcNumElts; ++i)
|
||||
Idxs[i] = i;
|
||||
for (unsigned i = SrcNumElts; i != DstNumElts; ++i)
|
||||
Idxs[i] = SrcNumElts;
|
||||
Rep = Builder.CreateShuffleVector(Op1, UndefV, Idxs);
|
||||
|
||||
// Insert the second operand into the first operand.
|
||||
@ -1178,33 +1184,41 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
|
||||
// Imm = 1 <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
|
||||
// Imm = 0 <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7 >
|
||||
|
||||
// The low half of the result is either the low half of the 1st operand
|
||||
// or the low half of the 2nd operand (the inserted vector).
|
||||
for (unsigned i = 0; i != NumElts / 2; ++i)
|
||||
Idxs[i] = Imm ? i : (i + NumElts);
|
||||
// The high half of the result is either the low half of the 2nd operand
|
||||
// (the inserted vector) or the high half of the 1st operand.
|
||||
for (unsigned i = NumElts / 2; i != NumElts; ++i)
|
||||
Idxs[i] = Imm ? (i + NumElts / 2) : i;
|
||||
// First fill with identify mask.
|
||||
for (unsigned i = 0; i != DstNumElts; ++i)
|
||||
Idxs[i] = i;
|
||||
// Then replace the elements where we need to insert.
|
||||
for (unsigned i = 0; i != SrcNumElts; ++i)
|
||||
Idxs[i + Imm * SrcNumElts] = i + DstNumElts;
|
||||
Rep = Builder.CreateShuffleVector(Op0, Rep, Idxs);
|
||||
|
||||
// If the intrinsic has a mask operand, handle that.
|
||||
if (CI->getNumArgOperands() == 5)
|
||||
Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep,
|
||||
CI->getArgOperand(3));
|
||||
} else if (IsX86 && (Name.startswith("avx.vextractf128.") ||
|
||||
Name == "avx2.vextracti128")) {
|
||||
Name == "avx2.vextracti128" ||
|
||||
Name.startswith("avx512.mask.vextract"))) {
|
||||
Value *Op0 = CI->getArgOperand(0);
|
||||
unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
|
||||
VectorType *VecTy = cast<VectorType>(CI->getType());
|
||||
unsigned NumElts = VecTy->getNumElements();
|
||||
unsigned DstNumElts = CI->getType()->getVectorNumElements();
|
||||
unsigned SrcNumElts = Op0->getType()->getVectorNumElements();
|
||||
unsigned Scale = SrcNumElts / DstNumElts;
|
||||
|
||||
// Mask off the high bits of the immediate value; hardware ignores those.
|
||||
Imm = Imm & 1;
|
||||
Imm = Imm % Scale;
|
||||
|
||||
// Get indexes for either the high half or low half of the input vector.
|
||||
SmallVector<uint32_t, 4> Idxs(NumElts);
|
||||
for (unsigned i = 0; i != NumElts; ++i) {
|
||||
Idxs[i] = Imm ? (i + NumElts) : i;
|
||||
// Get indexes for the subvector of the input vector.
|
||||
SmallVector<uint32_t, 8> Idxs(DstNumElts);
|
||||
for (unsigned i = 0; i != DstNumElts; ++i) {
|
||||
Idxs[i] = i + (Imm * DstNumElts);
|
||||
}
|
||||
Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);
|
||||
|
||||
Value *UndefV = UndefValue::get(Op0->getType());
|
||||
Rep = Builder.CreateShuffleVector(Op0, UndefV, Idxs);
|
||||
// If the intrinsic has a mask operand, handle that.
|
||||
if (CI->getNumArgOperands() == 4)
|
||||
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
|
||||
CI->getArgOperand(2));
|
||||
} else if (!IsX86 && Name == "stackprotectorcheck") {
|
||||
Rep = nullptr;
|
||||
} else if (IsX86 && (Name.startswith("avx512.mask.perm.df.") ||
|
||||
|
@ -891,23 +891,17 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
|
||||
ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
|
||||
AddStream, Cache);
|
||||
|
||||
// Partition numbers for ThinLTO jobs start at 1 (see comments for
|
||||
// GlobalResolution in LTO.h). Task numbers, however, start at
|
||||
// ParallelCodeGenParallelismLevel if an LTO module is present, as tasks 0
|
||||
// through ParallelCodeGenParallelismLevel-1 are reserved for parallel code
|
||||
// generation partitions.
|
||||
// Task numbers start at ParallelCodeGenParallelismLevel if an LTO
|
||||
// module is present, as tasks 0 through ParallelCodeGenParallelismLevel-1
|
||||
// are reserved for parallel code generation partitions.
|
||||
unsigned Task =
|
||||
HasRegularLTO ? RegularLTO.ParallelCodeGenParallelismLevel : 0;
|
||||
unsigned Partition = 1;
|
||||
|
||||
for (auto &Mod : ThinLTO.ModuleMap) {
|
||||
if (Error E = BackendProc->start(Task, Mod.second, ImportLists[Mod.first],
|
||||
ExportLists[Mod.first],
|
||||
ResolvedODR[Mod.first], ThinLTO.ModuleMap))
|
||||
return E;
|
||||
|
||||
++Task;
|
||||
++Partition;
|
||||
}
|
||||
|
||||
return BackendProc->wait();
|
||||
|
@ -76,8 +76,12 @@ namespace llvm {
compile-time arithmetic on PPC double-double numbers, it is not able
to represent all possible values held by a PPC double-double number,
for example: (long double) 1.0 + (long double) 0x1p-106
Should this be replaced by a full emulation of PPC double-double? */
static const fltSemantics semPPCDoubleDouble = {0, 0, 0, 0};
Should this be replaced by a full emulation of PPC double-double?

Note: we need to make the value different from semBogus as otherwise
an unsafe optimization may collapse both values to a single address,
and we heavily rely on them having distinct addresses. */
static const fltSemantics semPPCDoubleDouble = {-1, 0, 0, 0};

/* There are temporary semantics for the real PPCDoubleDouble implementation.
Currently, APFloat of PPCDoubleDouble holds one PPCDoubleDoubleImpl as the
@ -1069,6 +1069,7 @@ StringRef sys::getHostCPUName() {
.Case("POWER7", "pwr7")
.Case("POWER8", "pwr8")
.Case("POWER8E", "pwr8")
.Case("POWER8NVL", "pwr8")
.Case("POWER9", "pwr9")
.Default(generic);
}
@ -239,10 +239,7 @@ void llvm::write_double(raw_ostream &S, double N, FloatStyle Style,
N *= 100.0;

char Buf[32];
unsigned Len;
Len = format(Spec.c_str(), N).snprint(Buf, sizeof(Buf));
if (Style == FloatStyle::Percent)
++Len;
format(Spec.c_str(), N).snprint(Buf, sizeof(Buf));
S << Buf;
if (Style == FloatStyle::Percent)
S << '%';
@ -118,6 +118,18 @@ void Input::beginMapping() {
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<StringRef> Input::keys() {
|
||||
MapHNode *MN = dyn_cast<MapHNode>(CurrentNode);
|
||||
std::vector<StringRef> Ret;
|
||||
if (!MN) {
|
||||
setError(CurrentNode, "not a mapping");
|
||||
return Ret;
|
||||
}
|
||||
for (auto &P : MN->Mapping)
|
||||
Ret.push_back(P.first());
|
||||
return Ret;
|
||||
}
|
||||
|
||||
bool Input::preflightKey(const char *Key, bool Required, bool, bool &UseDefault,
|
||||
void *&SaveInfo) {
|
||||
UseDefault = false;
|
||||
@ -163,7 +175,7 @@ void Input::endMapping() {
|
||||
if (!MN)
|
||||
return;
|
||||
for (const auto &NN : MN->Mapping) {
|
||||
if (!MN->isValidKey(NN.first())) {
|
||||
if (!is_contained(MN->ValidKeys, NN.first())) {
|
||||
setError(NN.second.get(), Twine("unknown key '") + NN.first() + "'");
|
||||
break;
|
||||
}
|
||||
@ -373,14 +385,6 @@ std::unique_ptr<Input::HNode> Input::createHNodes(Node *N) {
|
||||
}
|
||||
}
|
||||
|
||||
bool Input::MapHNode::isValidKey(StringRef Key) {
|
||||
for (const char *K : ValidKeys) {
|
||||
if (Key.equals(K))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void Input::setError(const Twine &Message) {
|
||||
this->setError(CurrentNode, Message);
|
||||
}
|
||||
@ -451,6 +455,10 @@ void Output::endMapping() {
|
||||
StateStack.pop_back();
|
||||
}
|
||||
|
||||
std::vector<StringRef> Output::keys() {
|
||||
report_fatal_error("invalid call");
|
||||
}
|
||||
|
||||
bool Output::preflightKey(const char *Key, bool Required, bool SameAsDefault,
|
||||
bool &UseDefault, void *&) {
|
||||
UseDefault = false;
|
||||
|
@ -11,9 +11,15 @@
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/TableGen/StringMatcher.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/TableGen/StringMatcher.h"
|
||||
#include <cassert>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
/// FindFirstNonCommonLetter - Find the first character in the keys of the
|
||||
@ -67,7 +73,7 @@ EmitStringMatcherForChar(const std::vector<const StringPair*> &Matches,
|
||||
}
|
||||
|
||||
// Bucket the matches by the character we are comparing.
|
||||
std::map<char, std::vector<const StringPair*> > MatchesByLetter;
|
||||
std::map<char, std::vector<const StringPair*>> MatchesByLetter;
|
||||
|
||||
for (unsigned i = 0, e = Matches.size(); i != e; ++i)
|
||||
MatchesByLetter[Matches[i]->first[CharNo]].push_back(Matches[i]);
|
||||
@ -91,7 +97,7 @@ EmitStringMatcherForChar(const std::vector<const StringPair*> &Matches,
|
||||
// FIXME: Need to escape general strings.
|
||||
OS << Indent << "if (memcmp(" << StrVariableName << ".data()+" << CharNo
|
||||
<< ", \"" << Matches[0]->first.substr(CharNo, NumChars) << "\", "
|
||||
<< NumChars << "))\n";
|
||||
<< NumChars << ") != 0)\n";
|
||||
OS << Indent << " break;\n";
|
||||
}
|
||||
|
||||
@ -103,7 +109,7 @@ EmitStringMatcherForChar(const std::vector<const StringPair*> &Matches,
|
||||
OS << Indent << "switch (" << StrVariableName << "[" << CharNo << "]) {\n";
|
||||
OS << Indent << "default: break;\n";
|
||||
|
||||
for (std::map<char, std::vector<const StringPair*> >::iterator LI =
|
||||
for (std::map<char, std::vector<const StringPair*>>::iterator LI =
|
||||
MatchesByLetter.begin(), E = MatchesByLetter.end(); LI != E; ++LI) {
|
||||
// TODO: escape hard stuff (like \n) if we ever care about it.
|
||||
OS << Indent << "case '" << LI->first << "':\t // "
|
||||
@ -118,7 +124,6 @@ EmitStringMatcherForChar(const std::vector<const StringPair*> &Matches,
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/// Emit - Top level entry point.
|
||||
///
|
||||
void StringMatcher::Emit(unsigned Indent) const {
|
||||
@ -126,7 +131,7 @@ void StringMatcher::Emit(unsigned Indent) const {
|
||||
if (Matches.empty()) return;
|
||||
|
||||
// First level categorization: group strings by length.
|
||||
std::map<unsigned, std::vector<const StringPair*> > MatchesByLength;
|
||||
std::map<unsigned, std::vector<const StringPair*>> MatchesByLength;
|
||||
|
||||
for (unsigned i = 0, e = Matches.size(); i != e; ++i)
|
||||
MatchesByLength[Matches[i].first.size()].push_back(&Matches[i]);
|
||||
@ -136,7 +141,7 @@ void StringMatcher::Emit(unsigned Indent) const {
|
||||
OS.indent(Indent*2+2) << "switch (" << StrVariableName << ".size()) {\n";
|
||||
OS.indent(Indent*2+2) << "default: break;\n";
|
||||
|
||||
for (std::map<unsigned, std::vector<const StringPair*> >::iterator LI =
|
||||
for (std::map<unsigned, std::vector<const StringPair*>>::iterator LI =
|
||||
MatchesByLength.begin(), E = MatchesByLength.end(); LI != E; ++LI) {
|
||||
OS.indent(Indent*2+2) << "case " << LI->first << ":\t // "
|
||||
<< LI->second.size()
|
||||
|
@ -264,9 +264,13 @@ def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
"Qualcomm Falkor processors", [
FeatureCRC,
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
FeatureNEON,
FeaturePerfMon
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing
]>;

def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan",
@ -76,7 +76,6 @@ public:
|
||||
void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
|
||||
void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);
|
||||
|
||||
void EmitXRayTable();
|
||||
void EmitSled(const MachineInstr &MI, SledKind Kind);
|
||||
|
||||
/// \brief tblgen'erated driver function for lowering simple MI->MC
|
||||
@ -95,7 +94,7 @@ public:
|
||||
AArch64FI = F.getInfo<AArch64FunctionInfo>();
|
||||
STI = static_cast<const AArch64Subtarget*>(&F.getSubtarget());
|
||||
bool Result = AsmPrinter::runOnMachineFunction(F);
|
||||
EmitXRayTable();
|
||||
emitXRayTable();
|
||||
return Result;
|
||||
}
|
||||
|
||||
@ -150,59 +149,6 @@ void AArch64AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI)
|
||||
EmitSled(MI, SledKind::TAIL_CALL);
|
||||
}
|
||||
|
||||
void AArch64AsmPrinter::EmitXRayTable()
|
||||
{
|
||||
//TODO: merge the logic for ELF XRay sleds at a higher level, so to avoid
|
||||
// code duplication as it is now for x86_64, ARM32 and AArch64.
|
||||
if (Sleds.empty())
|
||||
return;
|
||||
|
||||
auto PrevSection = OutStreamer->getCurrentSectionOnly();
|
||||
auto Fn = MF->getFunction();
|
||||
MCSection *Section;
|
||||
|
||||
if (STI->isTargetELF()) {
|
||||
if (Fn->hasComdat())
|
||||
Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
|
||||
ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
|
||||
Fn->getComdat()->getName());
|
||||
else
|
||||
Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
|
||||
ELF::SHF_ALLOC);
|
||||
} else if (STI->isTargetMachO()) {
|
||||
Section = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
|
||||
SectionKind::getReadOnlyWithRel());
|
||||
} else {
|
||||
llvm_unreachable("Unsupported target");
|
||||
}
|
||||
|
||||
// Before we switch over, we force a reference to a label inside the
|
||||
// xray_instr_map section. Since EmitXRayTable() is always called just
|
||||
// before the function's end, we assume that this is happening after the
|
||||
// last return instruction.
|
||||
//
|
||||
// We then align the reference to 16 byte boundaries, which we determined
|
||||
// experimentally to be beneficial to avoid causing decoder stalls.
|
||||
MCSymbol *Tmp = OutContext.createTempSymbol("xray_synthetic_", true);
|
||||
OutStreamer->EmitCodeAlignment(16);
|
||||
OutStreamer->EmitSymbolValue(Tmp, 8, false);
|
||||
OutStreamer->SwitchSection(Section);
|
||||
OutStreamer->EmitLabel(Tmp);
|
||||
for (const auto &Sled : Sleds) {
|
||||
OutStreamer->EmitSymbolValue(Sled.Sled, 8);
|
||||
OutStreamer->EmitSymbolValue(CurrentFnSym, 8);
|
||||
auto Kind = static_cast<uint8_t>(Sled.Kind);
|
||||
OutStreamer->EmitBytes(
|
||||
StringRef(reinterpret_cast<const char *>(&Kind), 1));
|
||||
OutStreamer->EmitBytes(
|
||||
StringRef(reinterpret_cast<const char *>(&Sled.AlwaysInstrument), 1));
|
||||
OutStreamer->EmitZeros(14);
|
||||
}
|
||||
OutStreamer->SwitchSection(PrevSection);
|
||||
|
||||
Sleds.clear();
|
||||
}
|
||||
|
||||
void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
|
||||
{
|
||||
static const int8_t NoopsInSledCount = 7;
|
||||
|
@ -1470,6 +1470,9 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
|
||||
bool IsUnscaled = TII->isUnscaledLdSt(MI);
|
||||
int Offset = getLdStOffsetOp(MI).getImm();
|
||||
int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
|
||||
// Allow one more for offset.
|
||||
if (Offset > 0)
|
||||
Offset -= OffsetStride;
|
||||
if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
|
||||
return false;
|
||||
|
||||
|
@ -3048,6 +3048,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
NODE_NAME_CASE(KILL)
|
||||
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
|
||||
NODE_NAME_CASE(SENDMSG)
|
||||
NODE_NAME_CASE(SENDMSGHALT)
|
||||
NODE_NAME_CASE(INTERP_MOV)
|
||||
NODE_NAME_CASE(INTERP_P1)
|
||||
NODE_NAME_CASE(INTERP_P2)
|
||||
|
@ -313,6 +313,7 @@ enum NodeType : unsigned {
|
||||
/// Pointer to the start of the shader's constant data.
|
||||
CONST_DATA_PTR,
|
||||
SENDMSG,
|
||||
SENDMSGHALT,
|
||||
INTERP_MOV,
|
||||
INTERP_P1,
|
||||
INTERP_P2,
|
||||
|
@ -266,6 +266,10 @@ def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG",
|
||||
SDTypeProfile<0, 1, [SDTCisInt<0>]>,
|
||||
[SDNPHasChain, SDNPInGlue]>;
|
||||
|
||||
def AMDGPUsendmsghalt : SDNode<"AMDGPUISD::SENDMSGHALT",
|
||||
SDTypeProfile<0, 1, [SDTCisInt<0>]>,
|
||||
[SDNPHasChain, SDNPInGlue]>;
|
||||
|
||||
def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV",
|
||||
SDTypeProfile<1, 3, [SDTCisFP<0>]>,
|
||||
[SDNPInGlue]>;
|
||||
|
@ -2706,12 +2706,19 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
|
||||
unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
|
||||
|
||||
switch (IntrinsicID) {
|
||||
case AMDGPUIntrinsic::SI_sendmsg: {
|
||||
case AMDGPUIntrinsic::SI_sendmsg:
|
||||
case Intrinsic::amdgcn_s_sendmsg: {
|
||||
Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
|
||||
SDValue Glue = Chain.getValue(1);
|
||||
return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain,
|
||||
Op.getOperand(2), Glue);
|
||||
}
|
||||
case Intrinsic::amdgcn_s_sendmsghalt: {
|
||||
Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
|
||||
SDValue Glue = Chain.getValue(1);
|
||||
return DAG.getNode(AMDGPUISD::SENDMSGHALT, DL, MVT::Other, Chain,
|
||||
Op.getOperand(2), Glue);
|
||||
}
|
||||
case AMDGPUIntrinsic::SI_tbuffer_store: {
|
||||
SDValue Ops[] = {
|
||||
Chain,
|
||||
|
@ -504,7 +504,7 @@ void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
return;

// There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) {
if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT)) {
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
LastInstWritesM0 = false;
return;
@ -619,7 +619,8 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
// signalling other hardware blocks
if ((I->getOpcode() == AMDGPU::S_BARRIER &&
ST->needWaitcntBeforeBarrier()) ||
I->getOpcode() == AMDGPU::S_SENDMSG)
I->getOpcode() == AMDGPU::S_SENDMSG ||
I->getOpcode() == AMDGPU::S_SENDMSGHALT)
Required = LastIssued;
else
Required = handleOperands(*I);

@ -828,9 +828,12 @@ let Uses = [EXEC, M0] in {
def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16",
[(AMDGPUsendmsg (i32 imm:$simm16))]
>;

def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16",
[(AMDGPUsendmsghalt (i32 imm:$simm16))]
>;
} // End Uses = [EXEC, M0]

def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16">;
def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">;
def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
let simm16 = 0;

@ -164,9 +164,6 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// Emit the rest of the function body.
EmitFunctionBody();

// Emit the XRay table for this function.
EmitXRayTable();

// If we need V4T thumb mode Register Indirect Jump pads, emit them.
// These are created per function, rather than per TU, since it's
// relatively easy to exceed the thumb branch range within a TU.

@ -113,9 +113,6 @@ public:
void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI);
void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);
// Helper function that emits the XRay sleds we've collected for a particular
// function.
void EmitXRayTable();

private:
void EmitSled(const MachineInstr &MI, SledKind Kind);

@ -22,9 +22,6 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCStreamer.h"
using namespace llvm;
@ -226,38 +223,3 @@ void ARMAsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI)
{
EmitSled(MI, SledKind::TAIL_CALL);
}

void ARMAsmPrinter::EmitXRayTable()
{
if (Sleds.empty())
return;

MCSection *Section = nullptr;
if (Subtarget->isTargetELF()) {
Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_GROUP |
ELF::SHF_MERGE,
0, CurrentFnSym->getName());
} else if (Subtarget->isTargetMachO()) {
Section = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
SectionKind::getReadOnlyWithRel());
} else {
llvm_unreachable("Unsupported target");
}

auto PrevSection = OutStreamer->getCurrentSectionOnly();
OutStreamer->SwitchSection(Section);
for (const auto &Sled : Sleds) {
OutStreamer->EmitSymbolValue(Sled.Sled, 4);
OutStreamer->EmitSymbolValue(CurrentFnSym, 4);
auto Kind = static_cast<uint8_t>(Sled.Kind);
OutStreamer->EmitBytes(
StringRef(reinterpret_cast<const char *>(&Kind), 1));
OutStreamer->EmitBytes(
StringRef(reinterpret_cast<const char *>(&Sled.AlwaysInstrument), 1));
OutStreamer->EmitZeros(6);
}
OutStreamer->SwitchSection(PrevSection);

Sleds.clear();
}

@ -53,28 +53,36 @@
//
// The code below is intended to be fully target-independent.

#include "BitTracker.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"

#include "BitTracker.h"
#include <iterator>
#include <cassert>
#include <cstdint>

using namespace llvm;

typedef BitTracker BT;

namespace {

// Local trickery to pretty print a register (without the whole "%vreg"
// business).
struct printv {
printv(unsigned r) : R(r) {}

unsigned R;
};

raw_ostream &operator<< (raw_ostream &OS, const printv &PV) {
if (PV.R)
OS << 'v' << TargetRegisterInfo::virtReg2Index(PV.R);
@ -82,9 +90,11 @@ namespace {
|
||||
OS << 's';
|
||||
return OS;
|
||||
}
|
||||
}
|
||||
|
||||
} // end anonymous namespace
|
||||
|
||||
namespace llvm {
|
||||
|
||||
raw_ostream &operator<<(raw_ostream &OS, const BT::BitValue &BV) {
|
||||
switch (BV.Type) {
|
||||
case BT::BitValue::Top:
|
||||
@ -167,14 +177,14 @@ namespace llvm {
|
||||
|
||||
return OS;
|
||||
}
|
||||
}
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
void BitTracker::print_cells(raw_ostream &OS) const {
|
||||
for (CellMapType::iterator I = Map.begin(), E = Map.end(); I != E; ++I)
|
||||
dbgs() << PrintReg(I->first, &ME.TRI) << " -> " << I->second << "\n";
|
||||
}
|
||||
|
||||
|
||||
BitTracker::BitTracker(const MachineEvaluator &E, MachineFunction &F)
|
||||
: Trace(false), ME(E), MF(F), MRI(F.getRegInfo()), Map(*new CellMapType) {}
|
||||
|
||||
@ -182,7 +192,6 @@ BitTracker::~BitTracker() {
|
||||
delete ⤅
|
||||
}
|
||||
|
||||
|
||||
// If we were allowed to update a cell for a part of a register, the meet
|
||||
// operation would need to be parametrized by the register number and the
|
||||
// exact part of the register, so that the computer BitRefs correspond to
|
||||
@ -201,7 +210,6 @@ bool BT::RegisterCell::meet(const RegisterCell &RC, unsigned SelfR) {
|
||||
return Changed;
|
||||
}
|
||||
|
||||
|
||||
// Insert the entire cell RC into the current cell at position given by M.
|
||||
BT::RegisterCell &BT::RegisterCell::insert(const BT::RegisterCell &RC,
|
||||
const BitMask &M) {
|
||||
@ -224,7 +232,6 @@ BT::RegisterCell &BT::RegisterCell::insert(const BT::RegisterCell &RC,
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::RegisterCell::extract(const BitMask &M) const {
|
||||
uint16_t B = M.first(), E = M.last(), W = width();
|
||||
assert(B < W && E < W);
|
||||
@ -243,7 +250,6 @@ BT::RegisterCell BT::RegisterCell::extract(const BitMask &M) const {
|
||||
return RC;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell &BT::RegisterCell::rol(uint16_t Sh) {
|
||||
// Rotate left (i.e. towards increasing bit indices).
|
||||
// Swap the two parts: [0..W-Sh-1] [W-Sh..W-1]
|
||||
@ -265,7 +271,6 @@ BT::RegisterCell &BT::RegisterCell::rol(uint16_t Sh) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell &BT::RegisterCell::fill(uint16_t B, uint16_t E,
|
||||
const BitValue &V) {
|
||||
assert(B <= E);
|
||||
@ -274,7 +279,6 @@ BT::RegisterCell &BT::RegisterCell::fill(uint16_t B, uint16_t E,
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell &BT::RegisterCell::cat(const RegisterCell &RC) {
|
||||
// Append the cell given as the argument to the "this" cell.
|
||||
// Bit 0 of RC becomes bit W of the result, where W is this->width().
|
||||
@ -285,7 +289,6 @@ BT::RegisterCell &BT::RegisterCell::cat(const RegisterCell &RC) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
uint16_t BT::RegisterCell::ct(bool B) const {
|
||||
uint16_t W = width();
|
||||
uint16_t C = 0;
|
||||
@ -295,7 +298,6 @@ uint16_t BT::RegisterCell::ct(bool B) const {
|
||||
return C;
|
||||
}
|
||||
|
||||
|
||||
uint16_t BT::RegisterCell::cl(bool B) const {
|
||||
uint16_t W = width();
|
||||
uint16_t C = 0;
|
||||
@ -305,7 +307,6 @@ uint16_t BT::RegisterCell::cl(bool B) const {
|
||||
return C;
|
||||
}
|
||||
|
||||
|
||||
bool BT::RegisterCell::operator== (const RegisterCell &RC) const {
|
||||
uint16_t W = Bits.size();
|
||||
if (RC.Bits.size() != W)
|
||||
@ -316,7 +317,6 @@ bool BT::RegisterCell::operator== (const RegisterCell &RC) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const {
|
||||
// The general problem is with finding a register class that corresponds
|
||||
// to a given reference reg:sub. There can be several such classes, and
|
||||
@ -342,7 +342,6 @@ uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const {
|
||||
return BW;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::getCell(const RegisterRef &RR,
|
||||
const CellMapType &M) const {
|
||||
uint16_t BW = getRegBitWidth(RR);
|
||||
@ -370,7 +369,6 @@ BT::RegisterCell BT::MachineEvaluator::getCell(const RegisterRef &RR,
|
||||
return RegisterCell::top(BW);
|
||||
}
|
||||
|
||||
|
||||
void BT::MachineEvaluator::putCell(const RegisterRef &RR, RegisterCell RC,
|
||||
CellMapType &M) const {
|
||||
// While updating the cell map can be done in a meaningful way for
|
||||
@ -388,7 +386,6 @@ void BT::MachineEvaluator::putCell(const RegisterRef &RR, RegisterCell RC,
|
||||
M[RR.Reg] = RC;
|
||||
}
|
||||
|
||||
|
||||
// Check if the cell represents a compile-time integer value.
|
||||
bool BT::MachineEvaluator::isInt(const RegisterCell &A) const {
|
||||
uint16_t W = A.width();
|
||||
@ -398,7 +395,6 @@ bool BT::MachineEvaluator::isInt(const RegisterCell &A) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Convert a cell to the integer value. The result must fit in uint64_t.
|
||||
uint64_t BT::MachineEvaluator::toInt(const RegisterCell &A) const {
|
||||
assert(isInt(A));
|
||||
@ -411,7 +407,6 @@ uint64_t BT::MachineEvaluator::toInt(const RegisterCell &A) const {
|
||||
return Val;
|
||||
}
|
||||
|
||||
|
||||
// Evaluator helper functions. These implement some common operation on
|
||||
// register cells that can be used to implement target-specific instructions
|
||||
// in a target-specific evaluator.
|
||||
@ -426,7 +421,6 @@ BT::RegisterCell BT::MachineEvaluator::eIMM(int64_t V, uint16_t W) const {
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eIMM(const ConstantInt *CI) const {
|
||||
const APInt &A = CI->getValue();
|
||||
uint16_t BW = A.getBitWidth();
|
||||
@ -437,7 +431,6 @@ BT::RegisterCell BT::MachineEvaluator::eIMM(const ConstantInt *CI) const {
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eADD(const RegisterCell &A1,
|
||||
const RegisterCell &A2) const {
|
||||
uint16_t W = A1.width();
|
||||
@ -471,7 +464,6 @@ BT::RegisterCell BT::MachineEvaluator::eADD(const RegisterCell &A1,
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eSUB(const RegisterCell &A1,
|
||||
const RegisterCell &A2) const {
|
||||
uint16_t W = A1.width();
|
||||
@ -505,29 +497,26 @@ BT::RegisterCell BT::MachineEvaluator::eSUB(const RegisterCell &A1,
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eMLS(const RegisterCell &A1,
|
||||
const RegisterCell &A2) const {
|
||||
uint16_t W = A1.width() + A2.width();
|
||||
uint16_t Z = A1.ct(0) + A2.ct(0);
|
||||
uint16_t Z = A1.ct(false) + A2.ct(false);
|
||||
RegisterCell Res(W);
|
||||
Res.fill(0, Z, BitValue::Zero);
|
||||
Res.fill(Z, W, BitValue::self());
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eMLU(const RegisterCell &A1,
|
||||
const RegisterCell &A2) const {
|
||||
uint16_t W = A1.width() + A2.width();
|
||||
uint16_t Z = A1.ct(0) + A2.ct(0);
|
||||
uint16_t Z = A1.ct(false) + A2.ct(false);
|
||||
RegisterCell Res(W);
|
||||
Res.fill(0, Z, BitValue::Zero);
|
||||
Res.fill(Z, W, BitValue::self());
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eASL(const RegisterCell &A1,
|
||||
uint16_t Sh) const {
|
||||
assert(Sh <= A1.width());
|
||||
@ -537,7 +526,6 @@ BT::RegisterCell BT::MachineEvaluator::eASL(const RegisterCell &A1,
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eLSR(const RegisterCell &A1,
|
||||
uint16_t Sh) const {
|
||||
uint16_t W = A1.width();
|
||||
@ -548,7 +536,6 @@ BT::RegisterCell BT::MachineEvaluator::eLSR(const RegisterCell &A1,
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eASR(const RegisterCell &A1,
|
||||
uint16_t Sh) const {
|
||||
uint16_t W = A1.width();
|
||||
@ -560,7 +547,6 @@ BT::RegisterCell BT::MachineEvaluator::eASR(const RegisterCell &A1,
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eAND(const RegisterCell &A1,
|
||||
const RegisterCell &A2) const {
|
||||
uint16_t W = A1.width();
|
||||
@ -583,7 +569,6 @@ BT::RegisterCell BT::MachineEvaluator::eAND(const RegisterCell &A1,
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eORL(const RegisterCell &A1,
|
||||
const RegisterCell &A2) const {
|
||||
uint16_t W = A1.width();
|
||||
@ -606,7 +591,6 @@ BT::RegisterCell BT::MachineEvaluator::eORL(const RegisterCell &A1,
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eXOR(const RegisterCell &A1,
|
||||
const RegisterCell &A2) const {
|
||||
uint16_t W = A1.width();
|
||||
@ -627,7 +611,6 @@ BT::RegisterCell BT::MachineEvaluator::eXOR(const RegisterCell &A1,
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eNOT(const RegisterCell &A1) const {
|
||||
uint16_t W = A1.width();
|
||||
RegisterCell Res(W);
|
||||
@ -643,7 +626,6 @@ BT::RegisterCell BT::MachineEvaluator::eNOT(const RegisterCell &A1) const {
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eSET(const RegisterCell &A1,
|
||||
uint16_t BitN) const {
|
||||
assert(BitN < A1.width());
|
||||
@ -652,7 +634,6 @@ BT::RegisterCell BT::MachineEvaluator::eSET(const RegisterCell &A1,
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eCLR(const RegisterCell &A1,
|
||||
uint16_t BitN) const {
|
||||
assert(BitN < A1.width());
|
||||
@ -661,7 +642,6 @@ BT::RegisterCell BT::MachineEvaluator::eCLR(const RegisterCell &A1,
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eCLB(const RegisterCell &A1, bool B,
|
||||
uint16_t W) const {
|
||||
uint16_t C = A1.cl(B), AW = A1.width();
|
||||
@ -672,7 +652,6 @@ BT::RegisterCell BT::MachineEvaluator::eCLB(const RegisterCell &A1, bool B,
|
||||
return RegisterCell::self(0, W);
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eCTB(const RegisterCell &A1, bool B,
|
||||
uint16_t W) const {
|
||||
uint16_t C = A1.ct(B), AW = A1.width();
|
||||
@ -683,7 +662,6 @@ BT::RegisterCell BT::MachineEvaluator::eCTB(const RegisterCell &A1, bool B,
|
||||
return RegisterCell::self(0, W);
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eSXT(const RegisterCell &A1,
|
||||
uint16_t FromN) const {
|
||||
uint16_t W = A1.width();
|
||||
@ -695,7 +673,6 @@ BT::RegisterCell BT::MachineEvaluator::eSXT(const RegisterCell &A1,
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eZXT(const RegisterCell &A1,
|
||||
uint16_t FromN) const {
|
||||
uint16_t W = A1.width();
|
||||
@ -705,7 +682,6 @@ BT::RegisterCell BT::MachineEvaluator::eZXT(const RegisterCell &A1,
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eXTR(const RegisterCell &A1,
|
||||
uint16_t B, uint16_t E) const {
|
||||
uint16_t W = A1.width();
|
||||
@ -718,7 +694,6 @@ BT::RegisterCell BT::MachineEvaluator::eXTR(const RegisterCell &A1,
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::MachineEvaluator::eINS(const RegisterCell &A1,
|
||||
const RegisterCell &A2, uint16_t AtN) const {
|
||||
uint16_t W1 = A1.width(), W2 = A2.width();
|
||||
@ -731,7 +706,6 @@ BT::RegisterCell BT::MachineEvaluator::eINS(const RegisterCell &A1,
|
||||
return Res;
|
||||
}
|
||||
|
||||
|
||||
BT::BitMask BT::MachineEvaluator::mask(unsigned Reg, unsigned Sub) const {
|
||||
assert(Sub == 0 && "Generic BitTracker::mask called for Sub != 0");
|
||||
uint16_t W = getRegBitWidth(Reg);
|
||||
@ -785,7 +759,6 @@ bool BT::MachineEvaluator::evaluate(const MachineInstr &MI,
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Main W-Z implementation.
|
||||
|
||||
void BT::visitPHI(const MachineInstr &PI) {
|
||||
@ -977,7 +950,6 @@ void BT::visitBranchesFrom(const MachineInstr &BI) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void BT::visitUsesOf(unsigned Reg) {
|
||||
if (Trace)
|
||||
dbgs() << "visiting uses of " << PrintReg(Reg, &ME.TRI) << "\n";
|
||||
@ -997,17 +969,14 @@ void BT::visitUsesOf(unsigned Reg) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
BT::RegisterCell BT::get(RegisterRef RR) const {
|
||||
return ME.getCell(RR, Map);
|
||||
}
|
||||
|
||||
|
||||
void BT::put(RegisterRef RR, const RegisterCell &RC) {
|
||||
ME.putCell(RR, RC, Map);
|
||||
}
|
||||
|
||||
|
||||
// Replace all references to bits from OldRR with the corresponding bits
|
||||
// in NewRR.
|
||||
void BT::subst(RegisterRef OldRR, RegisterRef NewRR) {
|
||||
@ -1033,7 +1002,6 @@ void BT::subst(RegisterRef OldRR, RegisterRef NewRR) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Check if the block has been "executed" during propagation. (If not, the
|
||||
// block is dead, but it may still appear to be reachable.)
|
||||
bool BT::reached(const MachineBasicBlock *B) const {
|
||||
@ -1047,7 +1015,6 @@ bool BT::reached(const MachineBasicBlock *B) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Visit an individual instruction. This could be a newly added instruction,
|
||||
// or one that has been modified by an optimization.
|
||||
void BT::visit(const MachineInstr &MI) {
|
||||
@ -1061,14 +1028,12 @@ void BT::visit(const MachineInstr &MI) {
|
||||
FlowQ.pop();
|
||||
}
|
||||
|
||||
|
||||
void BT::reset() {
|
||||
EdgeExec.clear();
|
||||
InstrExec.clear();
|
||||
Map.clear();
|
||||
}
|
||||
|
||||
|
||||
void BT::run() {
|
||||
reset();
|
||||
assert(FlowQ.empty());
|
||||
@ -1141,4 +1106,3 @@ void BT::run() {
|
||||
if (Trace)
|
||||
print_cells(dbgs() << "Cells after propagation:\n");
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
//===--- BitTracker.h -----------------------------------------------------===//
|
||||
//===--- BitTracker.h -------------------------------------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
@ -7,24 +7,27 @@
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef BITTRACKER_H
|
||||
#define BITTRACKER_H
|
||||
#ifndef LLVM_LIB_TARGET_HEXAGON_BITTRACKER_H
|
||||
#define LLVM_LIB_TARGET_HEXAGON_BITTRACKER_H
|
||||
|
||||
#include "llvm/ADT/SetVector.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
|
||||
#include "llvm/CodeGen/MachineOperand.h"
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
#include <queue>
|
||||
#include <set>
|
||||
#include <utility>
|
||||
|
||||
namespace llvm {
|
||||
class ConstantInt;
|
||||
class MachineRegisterInfo;
|
||||
class MachineBasicBlock;
|
||||
class MachineInstr;
|
||||
class MachineOperand;
|
||||
class raw_ostream;
|
||||
|
||||
class ConstantInt;
|
||||
class MachineRegisterInfo;
|
||||
class MachineBasicBlock;
|
||||
class MachineInstr;
|
||||
class raw_ostream;
|
||||
|
||||
struct BitTracker {
|
||||
struct BitRef;
|
||||
@ -76,19 +79,19 @@ private:
|
||||
CellMapType ⤅
|
||||
};
|
||||
|
||||
|
||||
// Abstraction of a reference to bit at position Pos from a register Reg.
|
||||
struct BitTracker::BitRef {
|
||||
BitRef(unsigned R = 0, uint16_t P = 0) : Reg(R), Pos(P) {}
|
||||
|
||||
bool operator== (const BitRef &BR) const {
|
||||
// If Reg is 0, disregard Pos.
|
||||
return Reg == BR.Reg && (Reg == 0 || Pos == BR.Pos);
|
||||
}
|
||||
|
||||
unsigned Reg;
|
||||
uint16_t Pos;
|
||||
};
|
||||
|
||||
|
||||
// Abstraction of a register reference in MachineOperand. It contains the
|
||||
// register number and the subregister index.
|
||||
struct BitTracker::RegisterRef {
|
||||
@ -96,10 +99,10 @@ struct BitTracker::RegisterRef {
|
||||
: Reg(R), Sub(S) {}
|
||||
RegisterRef(const MachineOperand &MO)
|
||||
: Reg(MO.getReg()), Sub(MO.getSubReg()) {}
|
||||
|
||||
unsigned Reg, Sub;
|
||||
};
|
||||
|
||||
|
||||
// Value that a single bit can take. This is outside of the context of
|
||||
// any register, it is more of an abstraction of the two-element set of
|
||||
// possible bit values. One extension here is the "Ref" type, which
|
||||
@ -158,6 +161,7 @@ struct BitTracker::BitValue {
|
||||
bool operator!= (const BitValue &V) const {
|
||||
return !operator==(V);
|
||||
}
|
||||
|
||||
bool is(unsigned T) const {
|
||||
assert(T == 0 || T == 1);
|
||||
return T == 0 ? Type == Zero
|
||||
@ -209,6 +213,7 @@ struct BitTracker::BitValue {
|
||||
bool num() const {
|
||||
return Type == Zero || Type == One;
|
||||
}
|
||||
|
||||
operator bool() const {
|
||||
assert(Type == Zero || Type == One);
|
||||
return Type == One;
|
||||
@ -217,7 +222,6 @@ struct BitTracker::BitValue {
|
||||
friend raw_ostream &operator<<(raw_ostream &OS, const BitValue &BV);
|
||||
};
|
||||
|
||||
|
||||
// This operation must be idempotent, i.e. ref(ref(V)) == ref(V).
|
||||
inline BitTracker::BitValue
|
||||
BitTracker::BitValue::ref(const BitValue &V) {
|
||||
@ -228,25 +232,25 @@ BitTracker::BitValue::ref(const BitValue &V) {
|
||||
return self();
|
||||
}
|
||||
|
||||
|
||||
inline BitTracker::BitValue
|
||||
BitTracker::BitValue::self(const BitRef &Self) {
|
||||
return BitValue(Self.Reg, Self.Pos);
|
||||
}
|
||||
|
||||
|
||||
// A sequence of bits starting from index B up to and including index E.
|
||||
// If E < B, the mask represents two sections: [0..E] and [B..W) where
|
||||
// W is the width of the register.
|
||||
struct BitTracker::BitMask {
|
||||
BitMask() : B(0), E(0) {}
|
||||
BitMask() = default;
|
||||
BitMask(uint16_t b, uint16_t e) : B(b), E(e) {}
|
||||
|
||||
uint16_t first() const { return B; }
|
||||
uint16_t last() const { return E; }
|
||||
private:
|
||||
uint16_t B, E;
|
||||
};
|
||||
|
||||
private:
|
||||
uint16_t B = 0;
|
||||
uint16_t E = 0;
|
||||
};
|
||||
|
||||
// Representation of a register: a list of BitValues.
|
||||
struct BitTracker::RegisterCell {
|
||||
@ -255,6 +259,7 @@ struct BitTracker::RegisterCell {
|
||||
uint16_t width() const {
|
||||
return Bits.size();
|
||||
}
|
||||
|
||||
const BitValue &operator[](uint16_t BitN) const {
|
||||
assert(BitN < Bits.size());
|
||||
return Bits[BitN];
|
||||
@ -297,12 +302,10 @@ private:
|
||||
friend raw_ostream &operator<<(raw_ostream &OS, const RegisterCell &RC);
|
||||
};
|
||||
|
||||
|
||||
inline bool BitTracker::has(unsigned Reg) const {
|
||||
return Map.find(Reg) != Map.end();
|
||||
}
|
||||
|
||||
|
||||
inline const BitTracker::RegisterCell&
|
||||
BitTracker::lookup(unsigned Reg) const {
|
||||
CellMapType::const_iterator F = Map.find(Reg);
|
||||
@ -310,7 +313,6 @@ BitTracker::lookup(unsigned Reg) const {
|
||||
return F->second;
|
||||
}
|
||||
|
||||
|
||||
inline BitTracker::RegisterCell
|
||||
BitTracker::RegisterCell::self(unsigned Reg, uint16_t Width) {
|
||||
RegisterCell RC(Width);
|
||||
@ -319,7 +321,6 @@ BitTracker::RegisterCell::self(unsigned Reg, uint16_t Width) {
|
||||
return RC;
|
||||
}
|
||||
|
||||
|
||||
inline BitTracker::RegisterCell
|
||||
BitTracker::RegisterCell::top(uint16_t Width) {
|
||||
RegisterCell RC(Width);
|
||||
@ -328,7 +329,6 @@ BitTracker::RegisterCell::top(uint16_t Width) {
|
||||
return RC;
|
||||
}
|
||||
|
||||
|
||||
inline BitTracker::RegisterCell
|
||||
BitTracker::RegisterCell::ref(const RegisterCell &C) {
|
||||
uint16_t W = C.width();
|
||||
@ -345,12 +345,13 @@ BitTracker::RegisterCell::ref(const RegisterCell &C) {
|
||||
struct BitTracker::MachineEvaluator {
|
||||
MachineEvaluator(const TargetRegisterInfo &T, MachineRegisterInfo &M)
|
||||
: TRI(T), MRI(M) {}
|
||||
virtual ~MachineEvaluator() {}
|
||||
virtual ~MachineEvaluator() = default;
|
||||
|
||||
uint16_t getRegBitWidth(const RegisterRef &RR) const;
|
||||
|
||||
RegisterCell getCell(const RegisterRef &RR, const CellMapType &M) const;
|
||||
void putCell(const RegisterRef &RR, RegisterCell RC, CellMapType &M) const;
|
||||
|
||||
// A result of any operation should use refs to the source cells, not
|
||||
// the cells directly. This function is a convenience wrapper to quickly
|
||||
// generate a ref for a cell corresponding to a register reference.
|
||||
@ -435,4 +436,4 @@ struct BitTracker::MachineEvaluator {
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
#endif
|
||||
#endif // LLVM_LIB_TARGET_HEXAGON_BITTRACKER_H
|
||||
|
@ -7,16 +7,30 @@
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
#include "Hexagon.h"
|
||||
#include "HexagonBitTracker.h"
|
||||
#include "HexagonInstrInfo.h"
|
||||
#include "HexagonRegisterInfo.h"
|
||||
#include "HexagonTargetMachine.h"
|
||||
#include "HexagonBitTracker.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineInstr.h"
|
||||
#include "llvm/CodeGen/MachineOperand.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
#include "llvm/IR/Argument.h"
|
||||
#include "llvm/IR/Attributes.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/Type.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/MathExtras.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Target/TargetRegisterInfo.h"
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
@ -76,11 +90,11 @@ HexagonEvaluator::HexagonEvaluator(const HexagonRegisterInfo &tri,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const {
|
||||
using namespace Hexagon;
|
||||
|
||||
if (Sub == 0)
|
||||
return MachineEvaluator::mask(Reg, 0);
|
||||
using namespace Hexagon;
|
||||
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
|
||||
unsigned ID = RC->getID();
|
||||
uint16_t RW = getRegBitWidth(RegisterRef(Reg, Sub));
|
||||
@ -102,6 +116,7 @@ BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const {
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
class RegisterRefs {
|
||||
std::vector<BT::RegisterRef> Vector;
|
||||
|
||||
@ -117,17 +132,21 @@ public:
|
||||
}
|
||||
|
||||
size_t size() const { return Vector.size(); }
|
||||
|
||||
const BT::RegisterRef &operator[](unsigned n) const {
|
||||
// The main purpose of this operator is to assert with bad argument.
|
||||
assert(n < Vector.size());
|
||||
return Vector[n];
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
} // end anonymous namespace
|
||||
|
||||
bool HexagonEvaluator::evaluate(const MachineInstr &MI,
|
||||
const CellMapType &Inputs,
|
||||
CellMapType &Outputs) const {
|
||||
using namespace Hexagon;
|
||||
|
||||
unsigned NumDefs = 0;
|
||||
|
||||
// Sanity verification: there should not be any defs with subregisters.
|
||||
@ -142,7 +161,6 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI,
|
||||
if (NumDefs == 0)
|
||||
return false;
|
||||
|
||||
using namespace Hexagon;
|
||||
unsigned Opc = MI.getOpcode();
|
||||
|
||||
if (MI.mayLoad()) {
|
||||
@ -779,10 +797,10 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI,
|
||||
case S2_cl0:
|
||||
case S2_cl0p:
|
||||
// Always produce a 32-bit result.
|
||||
return rr0(eCLB(rc(1), 0/*bit*/, 32), Outputs);
|
||||
return rr0(eCLB(rc(1), false/*bit*/, 32), Outputs);
|
||||
case S2_cl1:
|
||||
case S2_cl1p:
|
||||
return rr0(eCLB(rc(1), 1/*bit*/, 32), Outputs);
|
||||
return rr0(eCLB(rc(1), true/*bit*/, 32), Outputs);
|
||||
case S2_clb:
|
||||
case S2_clbp: {
|
||||
uint16_t W1 = getRegBitWidth(Reg[1]);
|
||||
@ -794,10 +812,10 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI,
|
||||
}
|
||||
case S2_ct0:
|
||||
case S2_ct0p:
|
||||
return rr0(eCTB(rc(1), 0/*bit*/, 32), Outputs);
|
||||
return rr0(eCTB(rc(1), false/*bit*/, 32), Outputs);
|
||||
case S2_ct1:
|
||||
case S2_ct1p:
|
||||
return rr0(eCTB(rc(1), 1/*bit*/, 32), Outputs);
|
||||
return rr0(eCTB(rc(1), true/*bit*/, 32), Outputs);
|
||||
case S5_popcountp:
|
||||
// TODO
|
||||
break;
|
||||
@ -953,6 +971,8 @@ bool HexagonEvaluator::evaluate(const MachineInstr &BI,
|
||||
bool HexagonEvaluator::evaluateLoad(const MachineInstr &MI,
|
||||
const CellMapType &Inputs,
|
||||
CellMapType &Outputs) const {
|
||||
using namespace Hexagon;
|
||||
|
||||
if (TII.isPredicated(MI))
|
||||
return false;
|
||||
assert(MI.mayLoad() && "A load that mayn't?");
|
||||
@ -960,7 +980,6 @@ bool HexagonEvaluator::evaluateLoad(const MachineInstr &MI,
|
||||
|
||||
uint16_t BitNum;
|
||||
bool SignEx;
|
||||
using namespace Hexagon;
|
||||
|
||||
switch (Opc) {
|
||||
default:
|
||||
@ -1141,9 +1160,9 @@ bool HexagonEvaluator::evaluateFormalCopy(const MachineInstr &MI,
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
unsigned HexagonEvaluator::getNextPhysReg(unsigned PReg, unsigned Width) const {
|
||||
using namespace Hexagon;
|
||||
|
||||
bool Is64 = DoubleRegsRegClass.contains(PReg);
|
||||
assert(PReg == 0 || Is64 || IntRegsRegClass.contains(PReg));
|
||||
|
||||
@ -1180,7 +1199,6 @@ unsigned HexagonEvaluator::getNextPhysReg(unsigned PReg, unsigned Width) const {
|
||||
return (Idx64+1 < Num64) ? Phys64[Idx64+1] : 0;
|
||||
}
|
||||
|
||||
|
||||
unsigned HexagonEvaluator::getVirtRegFor(unsigned PReg) const {
|
||||
typedef MachineRegisterInfo::livein_iterator iterator;
|
||||
for (iterator I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) {
|
||||
|
@ -1,4 +1,4 @@
|
||||
//===--- HexagonBitTracker.h ----------------------------------------------===//
|
||||
//===--- HexagonBitTracker.h ------------------------------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
@ -7,15 +7,17 @@
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef HEXAGONBITTRACKER_H
|
||||
#define HEXAGONBITTRACKER_H
|
||||
#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONBITTRACKER_H
|
||||
#define LLVM_LIB_TARGET_HEXAGON_HEXAGONBITTRACKER_H
|
||||
|
||||
#include "BitTracker.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include <cstdint>
|
||||
|
||||
namespace llvm {
|
||||
class HexagonInstrInfo;
|
||||
class HexagonRegisterInfo;
|
||||
|
||||
class HexagonInstrInfo;
|
||||
class HexagonRegisterInfo;
|
||||
|
||||
struct HexagonEvaluator : public BitTracker::MachineEvaluator {
|
||||
typedef BitTracker::CellMapType CellMapType;
|
||||
@ -49,10 +51,12 @@ private:
|
||||
// Type of formal parameter extension.
|
||||
struct ExtType {
|
||||
enum { SExt, ZExt };
|
||||
char Type;
|
||||
uint16_t Width;
|
||||
ExtType() : Type(0), Width(0) {}
|
||||
|
||||
ExtType() = default;
|
||||
ExtType(char t, uint16_t w) : Type(t), Width(w) {}
|
||||
|
||||
char Type = 0;
|
||||
uint16_t Width = 0;
|
||||
};
|
||||
// Map VR -> extension type.
|
||||
typedef DenseMap<unsigned, ExtType> RegExtMap;
|
||||
@ -61,4 +65,4 @@ private:
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
#endif
|
||||
#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONBITTRACKER_H
|
||||
|
File diff suppressed because it is too large
@ -16,9 +16,14 @@
|
||||
|
||||
#include "HexagonRegisterInfo.h"
|
||||
#include "MCTargetDesc/HexagonBaseInfo.h"
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/CodeGen/MachineBasicBlock.h"
|
||||
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
|
||||
#include "llvm/Target/TargetFrameLowering.h"
|
||||
#include "llvm/CodeGen/MachineValueType.h"
|
||||
#include "llvm/Target/TargetInstrInfo.h"
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
#define GET_INSTRINFO_HEADER
|
||||
#include "HexagonGenInstrInfo.inc"
|
||||
@ -29,9 +34,10 @@ struct EVT;
|
||||
class HexagonSubtarget;
|
||||
|
||||
class HexagonInstrInfo : public HexagonGenInstrInfo {
|
||||
virtual void anchor();
|
||||
const HexagonRegisterInfo RI;
|
||||
|
||||
virtual void anchor();
|
||||
|
||||
public:
|
||||
explicit HexagonInstrInfo(HexagonSubtarget &ST);
|
||||
|
||||
@ -260,7 +266,7 @@ public:
|
||||
/// PredCost.
|
||||
unsigned getInstrLatency(const InstrItineraryData *ItinData,
|
||||
const MachineInstr &MI,
|
||||
unsigned *PredCost = 0) const override;
|
||||
unsigned *PredCost = nullptr) const override;
|
||||
|
||||
/// Create machine specific model for scheduling.
|
||||
DFAPacketizer *
|
||||
@ -378,7 +384,6 @@ public:
|
||||
bool PredOpcodeHasJMP_c(unsigned Opcode) const;
|
||||
bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const;
|
||||
|
||||
|
||||
short getAbsoluteForm(const MachineInstr &MI) const;
|
||||
unsigned getAddrMode(const MachineInstr &MI) const;
|
||||
unsigned getBaseAndOffset(const MachineInstr &MI, int &Offset,
|
||||
@ -421,13 +426,11 @@ public:
|
||||
unsigned getUnits(const MachineInstr &MI) const;
|
||||
unsigned getValidSubTargets(const unsigned Opcode) const;
|
||||
|
||||
|
||||
/// getInstrTimingClassLatency - Compute the instruction latency of a given
|
||||
/// instruction using Timing Class information, if available.
|
||||
unsigned nonDbgBBSize(const MachineBasicBlock *BB) const;
|
||||
unsigned nonDbgBundleSize(MachineBasicBlock::const_iterator BundleHead) const;
|
||||
|
||||
|
||||
void immediateExtend(MachineInstr &MI) const;
|
||||
bool invertAndChangeJumpTarget(MachineInstr &MI,
|
||||
MachineBasicBlock* NewTarget) const;
|
||||
@ -438,6 +441,6 @@ public:
|
||||
short xformRegToImmOffset(const MachineInstr &MI) const;
|
||||
};
|
||||
|
||||
}
|
||||
} // end namespace llvm
|
||||
|
||||
#endif
|
||||
#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONINSTRINFO_H
|
||||
|
@ -15,33 +15,31 @@
|
||||
|
||||
namespace llvm {
|
||||
|
||||
namespace Hexagon {
|
||||
namespace Hexagon {
|
||||
|
||||
const unsigned int StartPacket = 0x1;
|
||||
const unsigned int EndPacket = 0x2;
|
||||
}
|
||||
|
||||
} // end namespace Hexagon
|
||||
|
||||
/// Hexagon target-specific information for each MachineFunction.
|
||||
class HexagonMachineFunctionInfo : public MachineFunctionInfo {
|
||||
// SRetReturnReg - Some subtargets require that sret lowering includes
|
||||
// returning the value of the returned struct in a register. This field
|
||||
// holds the virtual register into which the sret argument is passed.
|
||||
unsigned SRetReturnReg;
|
||||
unsigned StackAlignBaseVReg; // Aligned-stack base register (virtual)
|
||||
unsigned StackAlignBasePhysReg; // (physical)
|
||||
unsigned SRetReturnReg = 0;
|
||||
unsigned StackAlignBaseVReg = 0; // Aligned-stack base register (virtual)
|
||||
unsigned StackAlignBasePhysReg = 0; // (physical)
|
||||
int VarArgsFrameIndex;
|
||||
bool HasClobberLR;
|
||||
bool HasEHReturn;
|
||||
bool HasClobberLR = false;
|
||||
bool HasEHReturn = false;
|
||||
std::map<const MachineInstr*, unsigned> PacketInfo;
|
||||
virtual void anchor();
|
||||
|
||||
public:
|
||||
HexagonMachineFunctionInfo() : SRetReturnReg(0), StackAlignBaseVReg(0),
|
||||
StackAlignBasePhysReg(0), HasClobberLR(0), HasEHReturn(false) {}
|
||||
HexagonMachineFunctionInfo() = default;
|
||||
|
||||
HexagonMachineFunctionInfo(MachineFunction &MF) : SRetReturnReg(0),
|
||||
StackAlignBaseVReg(0), StackAlignBasePhysReg(0), HasClobberLR(0),
|
||||
HasEHReturn(false) {}
|
||||
HexagonMachineFunctionInfo(MachineFunction &MF) {}
|
||||
|
||||
unsigned getSRetReturnReg() const { return SRetReturnReg; }
|
||||
void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
|
||||
@ -75,6 +73,7 @@ public:
|
||||
void setStackAlignBasePhysReg(unsigned R) { StackAlignBasePhysReg = R; }
|
||||
unsigned getStackAlignBasePhysReg() const { return StackAlignBasePhysReg; }
|
||||
};
|
||||
} // End llvm namespace
|
||||
|
||||
#endif
|
||||
} // end namespace llvm
|
||||
|
||||
#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINEFUNCTIONINFO_H
|
||||
|
@ -10,17 +10,27 @@
|
||||
// This file contains the declarations of the HexagonTargetAsmInfo properties.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#define DEBUG_TYPE "hexagon-sdata"
|
||||
|
||||
#include "HexagonTargetMachine.h"
|
||||
#include "HexagonTargetObjectFile.h"
|
||||
#include "llvm/ADT/SmallString.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/ADT/Twine.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
#include "llvm/IR/DerivedTypes.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/GlobalObject.h"
|
||||
#include "llvm/IR/GlobalValue.h"
|
||||
#include "llvm/IR/GlobalVariable.h"
|
||||
#include "llvm/IR/Type.h"
|
||||
#include "llvm/MC/MCContext.h"
|
||||
#include "llvm/MC/SectionKind.h"
|
||||
#include "llvm/Support/Casting.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/ELF.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
@ -44,13 +54,21 @@ static cl::opt<bool> TraceGVPlacement("trace-gv-placement",
|
||||
// (e.g. -debug and -debug-only=globallayout)
|
||||
#define TRACE_TO(s, X) s << X
|
||||
#ifdef NDEBUG
|
||||
#define TRACE(X) do { if (TraceGVPlacement) { TRACE_TO(errs(), X); } } while (0)
|
||||
#define TRACE(X) \
|
||||
do { \
|
||||
if (TraceGVPlacement) { \
|
||||
TRACE_TO(errs(), X); \
|
||||
} \
|
||||
} while (false)
|
||||
#else
|
||||
#define TRACE(X) \
|
||||
do { \
|
||||
if (TraceGVPlacement) { TRACE_TO(errs(), X); } \
|
||||
else { DEBUG( TRACE_TO(dbgs(), X) ); } \
|
||||
} while (0)
|
||||
#define TRACE(X) \
|
||||
do { \
|
||||
if (TraceGVPlacement) { \
|
||||
TRACE_TO(errs(), X); \
|
||||
} else { \
|
||||
DEBUG(TRACE_TO(dbgs(), X)); \
|
||||
} \
|
||||
} while (false)
|
||||
#endif
|
||||
|
||||
// Returns true if the section name is such that the symbol will be put
|
||||
@ -69,7 +87,6 @@ static bool isSmallDataSection(StringRef Sec) {
|
||||
Sec.find(".scommon.") != StringRef::npos;
|
||||
}
|
||||
|
||||
|
||||
static const char *getSectionSuffixForSize(unsigned Size) {
|
||||
switch (Size) {
|
||||
default:
|
||||
@ -163,7 +180,6 @@ MCSection *HexagonTargetObjectFile::getExplicitSectionGlobal(
|
||||
return TargetLoweringObjectFileELF::getExplicitSectionGlobal(GO, Kind, TM);
|
||||
}
|
||||
|
||||
|
||||
/// Return true if this global value should be placed into small data/bss
|
||||
/// section.
|
||||
bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
|
||||
@ -232,17 +248,14 @@ bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool HexagonTargetObjectFile::isSmallDataEnabled() const {
|
||||
return SmallDataThreshold > 0;
|
||||
}
|
||||
|
||||
|
||||
unsigned HexagonTargetObjectFile::getSmallDataSize() const {
|
||||
return SmallDataThreshold;
|
||||
}
|
||||
|
||||
|
||||
/// Descends any type down to "elementary" components,
|
||||
/// discovering the smallest addressable one.
|
||||
/// If zero is returned, declaration will not be modified.
|
||||
|
@ -1,5 +1,4 @@
|
||||
|
||||
//=== HexagonMCCompound.cpp - Hexagon Compound checker -------===//
|
||||
//=== HexagonMCCompound.cpp - Hexagon Compound checker -------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
@ -11,18 +10,17 @@
|
||||
// This file is looks at a packet and tries to form compound insns
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "Hexagon.h"
|
||||
#include "MCTargetDesc/HexagonBaseInfo.h"
|
||||
#include "MCTargetDesc/HexagonMCShuffler.h"
|
||||
#include "llvm/ADT/StringExtras.h"
|
||||
#include "llvm/MC/MCAssembler.h"
|
||||
#include "MCTargetDesc/HexagonMCInstrInfo.h"
|
||||
#include "llvm/MC/MCContext.h"
|
||||
#include "llvm/MC/MCInst.h"
|
||||
#include "llvm/MC/MCSectionELF.h"
|
||||
#include "llvm/MC/MCStreamer.h"
|
||||
#include "llvm/MC/MCSymbol.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
|
||||
using namespace llvm;
|
||||
using namespace Hexagon;
|
||||
@ -79,8 +77,7 @@ static const unsigned cmpgtn1BitOpcode[8] = {
|
||||
};
|
||||
|
||||
// enum HexagonII::CompoundGroup
|
||||
namespace {
|
||||
unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) {
|
||||
static unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) {
|
||||
unsigned DstReg, SrcReg, Src1Reg, Src2Reg;
|
||||
|
||||
switch (MI.getOpcode()) {
|
||||
@ -173,11 +170,9 @@ unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) {
|
||||
|
||||
return HexagonII::HCG_None;
|
||||
}
|
||||
}
|
||||
|
||||
/// getCompoundOp - Return the index from 0-7 into the above opcode lists.
|
||||
namespace {
|
||||
unsigned getCompoundOp(MCInst const &HMCI) {
|
||||
static unsigned getCompoundOp(MCInst const &HMCI) {
|
||||
const MCOperand &Predicate = HMCI.getOperand(0);
|
||||
unsigned PredReg = Predicate.getReg();
|
||||
|
||||
@ -198,11 +193,10 @@ unsigned getCompoundOp(MCInst const &HMCI) {
|
||||
return (PredReg == Hexagon::P0) ? tp0_jump_t : tp1_jump_t;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) {
|
||||
MCInst *CompoundInsn = 0;
|
||||
static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
|
||||
MCInst const &R) {
|
||||
MCInst *CompoundInsn = nullptr;
|
||||
unsigned compoundOpcode;
|
||||
MCOperand Rs, Rt;
|
||||
int64_t Value;
|
||||
@ -336,12 +330,10 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) {
|
||||
|
||||
return CompoundInsn;
|
||||
}
|
||||
}
|
||||
|
||||
/// Non-Symmetrical. See if these two instructions are fit for compound pair.
|
||||
namespace {
|
||||
bool isOrderedCompoundPair(MCInst const &MIa, bool IsExtendedA,
|
||||
MCInst const &MIb, bool IsExtendedB) {
|
||||
static bool isOrderedCompoundPair(MCInst const &MIa, bool IsExtendedA,
|
||||
MCInst const &MIb, bool IsExtendedB) {
|
||||
unsigned MIaG = getCompoundCandidateGroup(MIa, IsExtendedA);
|
||||
unsigned MIbG = getCompoundCandidateGroup(MIb, IsExtendedB);
|
||||
// We have two candidates - check that this is the same register
|
||||
@ -353,10 +345,9 @@ bool isOrderedCompoundPair(MCInst const &MIa, bool IsExtendedA,
|
||||
return ((MIaG == HexagonII::HCG_A && MIbG == HexagonII::HCG_B) &&
|
||||
(MIa.getOperand(0).getReg() == MIb.getOperand(0).getReg()));
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI) {
|
||||
static bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context,
|
||||
MCInst &MCI) {
|
||||
assert(HexagonMCInstrInfo::isBundle(MCI));
|
||||
bool JExtended = false;
|
||||
for (MCInst::iterator J =
|
||||
@ -367,8 +358,7 @@ bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI) {
|
||||
JExtended = true;
|
||||
continue;
|
||||
}
|
||||
if (llvm::HexagonMCInstrInfo::getType(MCII, *JumpInst) ==
|
||||
HexagonII::TypeJ) {
|
||||
if (HexagonMCInstrInfo::getType(MCII, *JumpInst) == HexagonII::TypeJ) {
|
||||
// Try to pair with another insn (B)undled with jump.
|
||||
bool BExtended = false;
|
||||
for (MCInst::iterator B =
|
||||
@ -401,7 +391,6 @@ bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI) {
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/// tryCompound - Given a bundle check for compound insns when one
|
||||
/// is found update the contents fo the bundle with the compound insn.
|
||||
@ -420,6 +409,4 @@ void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII,
|
||||
// a compound is found.
|
||||
while (lookForCompound(MCII, Context, MCI))
|
||||
;
|
||||
|
||||
return;
|
||||
}
|
||||
|
@ -1,4 +1,4 @@
//===--- RDFCopy.h --------------------------------------------------------===//
//===--- RDFCopy.h ----------------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@ -7,23 +7,26 @@
//
//===----------------------------------------------------------------------===//

#ifndef RDF_COPY_H
#define RDF_COPY_H
#ifndef LLVM_LIB_TARGET_HEXAGON_RDFCOPY_H
#define LLVM_LIB_TARGET_HEXAGON_RDFCOPY_H

#include "RDFGraph.h"
#include <map>
#include <vector>

namespace llvm {

class MachineBasicBlock;
class MachineDominatorTree;
class MachineInstr;

namespace rdf {

struct CopyPropagation {
CopyPropagation(DataFlowGraph &dfg) : MDT(dfg.getDT()), DFG(dfg),
Trace(false) {}
virtual ~CopyPropagation() {}

virtual ~CopyPropagation() = default;

bool run();
void trace(bool On) { Trace = On; }
@ -49,7 +52,9 @@ namespace rdf {
void updateMap(NodeAddr<InstrNode*> IA);
bool scanBlock(MachineBasicBlock *B);
};
} // namespace rdf
} // namespace llvm

#endif
} // end namespace rdf

} // end namespace llvm

#endif // LLVM_LIB_TARGET_HEXAGON_RDFCOPY_H

@ -10,16 +10,31 @@
|
||||
// Target-independent, SSA-based data flow graph for register data flow (RDF).
|
||||
//
|
||||
#include "RDFGraph.h"
|
||||
|
||||
#include "llvm/ADT/SetVector.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/CodeGen/MachineBasicBlock.h"
|
||||
#include "llvm/CodeGen/MachineDominanceFrontier.h"
|
||||
#include "llvm/CodeGen/MachineDominators.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineInstr.h"
|
||||
#include "llvm/CodeGen/MachineOperand.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/MC/LaneBitmask.h"
|
||||
#include "llvm/MC/MCInstrDesc.h"
|
||||
#include "llvm/MC/MCRegisterInfo.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Target/TargetInstrInfo.h"
|
||||
#include "llvm/Target/TargetLowering.h"
|
||||
#include "llvm/Target/TargetRegisterInfo.h"
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <iterator>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
using namespace llvm;
|
||||
using namespace rdf;
|
||||
@ -88,14 +103,12 @@ raw_ostream &operator<< (raw_ostream &OS, const Print<NodeId> &P) {
|
||||
return OS;
|
||||
}
|
||||
|
||||
namespace {
|
||||
void printRefHeader(raw_ostream &OS, const NodeAddr<RefNode*> RA,
|
||||
const DataFlowGraph &G) {
|
||||
OS << Print<NodeId>(RA.Id, G) << '<'
|
||||
<< Print<RegisterRef>(RA.Addr->getRegRef(G), G) << '>';
|
||||
if (RA.Addr->getFlags() & NodeAttrs::Fixed)
|
||||
OS << '!';
|
||||
}
|
||||
static void printRefHeader(raw_ostream &OS, const NodeAddr<RefNode*> RA,
|
||||
const DataFlowGraph &G) {
|
||||
OS << Print<NodeId>(RA.Id, G) << '<'
|
||||
<< Print<RegisterRef>(RA.Addr->getRegRef(G), G) << '>';
|
||||
if (RA.Addr->getFlags() & NodeAttrs::Fixed)
|
||||
OS << '!';
|
||||
}
|
||||
|
||||
template<>
|
||||
@ -183,9 +196,11 @@ raw_ostream &operator<< (raw_ostream &OS, const Print<NodeSet> &P) {
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
struct PrintListV {
|
||||
PrintListV(const NodeList &L, const DataFlowGraph &G) : List(L), G(G) {}
|
||||
|
||||
typedef T Type;
|
||||
const NodeList &List;
|
||||
const DataFlowGraph &G;
|
||||
@ -201,7 +216,8 @@ namespace {
|
||||
}
|
||||
return OS;
|
||||
}
|
||||
}
|
||||
|
||||
} // end anonymous namespace
|
||||
|
||||
template<>
|
||||
raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<PhiNode*>> &P) {
|
||||
@ -219,10 +235,10 @@ raw_ostream &operator<< (raw_ostream &OS,
|
||||
// Print the target for calls and branches (for readability).
|
||||
if (MI.isCall() || MI.isBranch()) {
|
||||
MachineInstr::const_mop_iterator T =
|
||||
find_if(MI.operands(),
|
||||
[] (const MachineOperand &Op) -> bool {
|
||||
return Op.isMBB() || Op.isGlobal() || Op.isSymbol();
|
||||
});
|
||||
llvm::find_if(MI.operands(),
|
||||
[] (const MachineOperand &Op) -> bool {
|
||||
return Op.isMBB() || Op.isGlobal() || Op.isSymbol();
|
||||
});
|
||||
if (T != MI.operands_end()) {
|
||||
OS << ' ';
|
||||
if (T->isMBB())
|
||||
@ -327,8 +343,8 @@ raw_ostream &operator<< (raw_ostream &OS,
|
||||
return OS;
|
||||
}
|
||||
|
||||
} // namespace rdf
|
||||
} // namespace llvm
|
||||
} // end namespace rdf
|
||||
} // end namespace llvm
|
||||
|
||||
// Node allocation functions.
|
||||
//
|
||||
@ -390,7 +406,6 @@ void NodeAllocator::clear() {
|
||||
ActiveEnd = nullptr;
|
||||
}
|
||||
|
||||
|
||||
// Insert node NA after "this" in the circular chain.
|
||||
void NodeBase::append(NodeAddr<NodeBase*> NA) {
|
||||
NodeId Nx = Next;
|
||||
@ -401,7 +416,6 @@ void NodeBase::append(NodeAddr<NodeBase*> NA) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Fundamental node manipulator functions.
|
||||
|
||||
// Obtain the register reference from a reference node.
|
||||
@ -590,7 +604,6 @@ NodeAddr<BlockNode*> FuncNode::getEntryBlock(const DataFlowGraph &G) {
|
||||
return findBlock(EntryB, G);
|
||||
}
|
||||
|
||||
|
||||
// Target operand information.
|
||||
//
|
||||
|
||||
@ -641,7 +654,6 @@ bool TargetOperandInfo::isFixedReg(const MachineInstr &In, unsigned OpNum)
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
RegisterRef RegisterAggr::normalize(RegisterRef RR) const {
|
||||
RegisterId SuperReg = RR.Reg;
|
||||
while (true) {
|
||||
@ -745,7 +757,6 @@ void RegisterAggr::print(raw_ostream &OS) const {
|
||||
OS << " }";
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// The data flow graph construction.
|
||||
//
|
||||
@ -753,10 +764,9 @@ void RegisterAggr::print(raw_ostream &OS) const {
|
||||
DataFlowGraph::DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii,
|
||||
const TargetRegisterInfo &tri, const MachineDominatorTree &mdt,
|
||||
const MachineDominanceFrontier &mdf, const TargetOperandInfo &toi)
|
||||
: LMI(), MF(mf), TII(tii), TRI(tri), MDT(mdt), MDF(mdf), TOI(toi) {
|
||||
: MF(mf), TII(tii), TRI(tri), MDT(mdt), MDF(mdf), TOI(toi) {
|
||||
}
|
||||
|
||||
|
||||
// The implementation of the definition stack.
|
||||
// Each register reference has its own definition stack. In particular,
|
||||
// for a register references "Reg" and "Reg:subreg" will each have their
|
||||
@ -845,7 +855,6 @@ unsigned DataFlowGraph::DefStack::nextDown(unsigned P) const {
|
||||
return P;
|
||||
}
|
||||
|
||||
|
||||
// Register information.
|
||||
|
||||
// Get the list of references aliased to RR. Lane masks are ignored.
|
||||
@ -915,7 +924,6 @@ NodeAddr<NodeBase*> DataFlowGraph::cloneNode(const NodeAddr<NodeBase*> B) {
|
||||
return NA;
|
||||
}
|
||||
|
||||
|
||||
// Allocation routines for specific node types/kinds.
|
||||
|
||||
NodeAddr<UseNode*> DataFlowGraph::newUse(NodeAddr<InstrNode*> Owner,
|
||||
@ -1248,7 +1256,6 @@ bool DataFlowGraph::alias(RegisterRef RA, RegisterRef RB) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Clear all information in the graph.
|
||||
void DataFlowGraph::reset() {
|
||||
Memory.clear();
|
||||
@ -1256,7 +1263,6 @@ void DataFlowGraph::reset() {
|
||||
Func = NodeAddr<FuncNode*>();
|
||||
}
|
||||
|
||||
|
||||
// Return the next reference node in the instruction node IA that is related
|
||||
// to RA. Conceptually, two reference nodes are related if they refer to the
|
||||
// same instance of a register access, but differ in flags or other minor
|
||||
|
@ -1,4 +1,4 @@
|
||||
//===--- RDFGraph.h -------------------------------------------------------===//
|
||||
//===--- RDFGraph.h ---------------------------------------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
@ -221,20 +221,25 @@
|
||||
// The statement s5 has two use nodes for t0: u7" and u9". The quotation
|
||||
// mark " indicates that the node is a shadow.
|
||||
//
|
||||
#ifndef RDF_GRAPH_H
|
||||
#define RDF_GRAPH_H
|
||||
|
||||
#ifndef LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
|
||||
#define LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
|
||||
|
||||
#include "llvm/ADT/BitVector.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/MC/LaneBitmask.h"
|
||||
#include "llvm/Support/Allocator.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/MathExtras.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Support/Timer.h"
|
||||
#include "llvm/Target/TargetRegisterInfo.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
// RDF uses uint32_t to refer to registers. This is to ensure that the type
|
||||
@ -243,6 +248,7 @@
|
||||
static_assert(sizeof(uint32_t) == sizeof(unsigned), "Those should be equal");
|
||||
|
||||
namespace llvm {
|
||||
|
||||
class MachineBasicBlock;
|
||||
class MachineFunction;
|
||||
class MachineInstr;
|
||||
@ -252,6 +258,7 @@ namespace llvm {
|
||||
class TargetInstrInfo;
|
||||
|
||||
namespace rdf {
|
||||
|
||||
typedef uint32_t NodeId;
|
||||
typedef uint32_t RegisterId;
|
||||
|
||||
@ -293,9 +300,11 @@ namespace rdf {
|
||||
static uint16_t set_type(uint16_t A, uint16_t T) {
|
||||
return (A & ~TypeMask) | T;
|
||||
}
|
||||
|
||||
static uint16_t set_kind(uint16_t A, uint16_t K) {
|
||||
return (A & ~KindMask) | K;
|
||||
}
|
||||
|
||||
static uint16_t set_flags(uint16_t A, uint16_t F) {
|
||||
return (A & ~FlagMask) | F;
|
||||
}
|
||||
@ -326,9 +335,14 @@ namespace rdf {
|
||||
};
|
||||
|
||||
template <typename T> struct NodeAddr {
|
||||
NodeAddr() : Addr(nullptr), Id(0) {}
|
||||
NodeAddr() : Addr(nullptr) {}
|
||||
NodeAddr(T A, NodeId I) : Addr(A), Id(I) {}
|
||||
|
||||
// Type cast (casting constructor). The reason for having this class
|
||||
// instead of std::pair.
|
||||
template <typename S> NodeAddr(const NodeAddr<S> &NA)
|
||||
: Addr(static_cast<T>(NA.Addr)), Id(NA.Id) {}
|
||||
|
||||
bool operator== (const NodeAddr<T> &NA) const {
|
||||
assert((Addr == NA.Addr) == (Id == NA.Id));
|
||||
return Addr == NA.Addr;
|
||||
@ -336,13 +350,9 @@ namespace rdf {
|
||||
bool operator!= (const NodeAddr<T> &NA) const {
|
||||
return !operator==(NA);
|
||||
}
|
||||
// Type cast (casting constructor). The reason for having this class
|
||||
// instead of std::pair.
|
||||
template <typename S> NodeAddr(const NodeAddr<S> &NA)
|
||||
: Addr(static_cast<T>(NA.Addr)), Id(NA.Id) {}
|
||||
|
||||
T Addr;
|
||||
NodeId Id;
|
||||
NodeId Id = 0;
|
||||
};
|
||||
|
||||
struct NodeBase;
|
||||
@ -366,17 +376,20 @@ namespace rdf {
struct NodeAllocator {
// Amount of storage for a single node.
enum { NodeMemSize = 32 };

NodeAllocator(uint32_t NPB = 4096)
: NodesPerBlock(NPB), BitsPerIndex(Log2_32(NPB)),
IndexMask((1 << BitsPerIndex)-1), ActiveEnd(nullptr) {
IndexMask((1 << BitsPerIndex)-1) {
assert(isPowerOf2_32(NPB));
}

NodeBase *ptr(NodeId N) const {
uint32_t N1 = N-1;
uint32_t BlockN = N1 >> BitsPerIndex;
uint32_t Offset = (N1 & IndexMask) * NodeMemSize;
return reinterpret_cast<NodeBase*>(Blocks[BlockN]+Offset);
}

NodeId id(const NodeBase *P) const;
NodeAddr<NodeBase*> New();
void clear();
@ -384,6 +397,7 @@ namespace rdf {
private:
void startNewBlock();
bool needNewBlock();

uint32_t makeId(uint32_t Block, uint32_t Index) const {
// Add 1 to the id, to avoid the id of 0, which is treated as "null".
return ((Block << BitsPerIndex) | Index) + 1;
@ -392,7 +406,7 @@ namespace rdf {
const uint32_t NodesPerBlock;
const uint32_t BitsPerIndex;
const uint32_t IndexMask;
char *ActiveEnd;
char *ActiveEnd = nullptr;
std::vector<char*> Blocks;
typedef BumpPtrAllocatorImpl<MallocAllocator, 65536> AllocatorTy;
AllocatorTy MemPool;
@ -405,6 +419,7 @@ namespace rdf {
RegisterRef() : RegisterRef(0) {}
explicit RegisterRef(RegisterId R, LaneBitmask M = LaneBitmask::getAll())
: Reg(R), Mask(R != 0 ? M : LaneBitmask::getNone()) {}

operator bool() const { return Reg != 0 && Mask.any(); }
bool operator== (const RegisterRef &RR) const {
return Reg == RR.Reg && Mask == RR.Mask;
@ -420,7 +435,8 @@ namespace rdf {

struct TargetOperandInfo {
TargetOperandInfo(const TargetInstrInfo &tii) : TII(tii) {}
virtual ~TargetOperandInfo() {}
virtual ~TargetOperandInfo() = default;

virtual bool isPreserving(const MachineInstr &In, unsigned OpNum) const;
virtual bool isClobbering(const MachineInstr &In, unsigned OpNum) const;
virtual bool isFixedReg(const MachineInstr &In, unsigned OpNum) const;
@ -428,7 +444,6 @@ namespace rdf {
const TargetInstrInfo &TII;
};


// Packed register reference. Only used for storage.
struct PackedRegisterRef {
RegisterId Reg;
@ -442,11 +457,13 @@ namespace rdf {
template <typename T, unsigned N = 32>
struct IndexedSet {
IndexedSet() : Map() { Map.reserve(N); }

T get(uint32_t Idx) const {
// Index Idx corresponds to Map[Idx-1].
assert(Idx != 0 && !Map.empty() && Idx-1 < Map.size());
return Map[Idx-1];
}

uint32_t insert(T Val) {
// Linear search.
auto F = llvm::find(Map, Val);
@ -455,11 +472,13 @@ namespace rdf {
Map.push_back(Val);
return Map.size(); // Return actual_index + 1.
}

uint32_t find(T Val) const {
auto F = llvm::find(Map, Val);
assert(F != Map.end());
return F - Map.begin();
}

private:
std::vector<T> Map;
};
@ -478,12 +497,14 @@ namespace rdf {
assert(LM.any());
return LM.all() ? 0 : find(LM);
}

PackedRegisterRef pack(RegisterRef RR) {
return { RR.Reg, getIndexForLaneMask(RR.Mask) };
}
PackedRegisterRef pack(RegisterRef RR) const {
return { RR.Reg, getIndexForLaneMask(RR.Mask) };
}

RegisterRef unpack(PackedRegisterRef PR) const {
return RegisterRef(PR.Reg, getLaneMaskForIndex(PR.MaskId));
}
@ -491,11 +512,8 @@ namespace rdf {
|
||||
|
||||
struct RegisterAggr {
|
||||
RegisterAggr(const TargetRegisterInfo &tri)
|
||||
: Masks(), ExpAliasUnits(tri.getNumRegUnits()), CheckUnits(false),
|
||||
TRI(tri) {}
|
||||
RegisterAggr(const RegisterAggr &RG)
|
||||
: Masks(RG.Masks), ExpAliasUnits(RG.ExpAliasUnits),
|
||||
CheckUnits(RG.CheckUnits), TRI(RG.TRI) {}
|
||||
: ExpAliasUnits(tri.getNumRegUnits()), CheckUnits(false), TRI(tri) {}
|
||||
RegisterAggr(const RegisterAggr &RG) = default;
|
||||
|
||||
bool empty() const { return Masks.empty(); }
|
||||
bool hasAliasOf(RegisterRef RR) const;
|
||||
@ -530,11 +548,11 @@ namespace rdf {
|
||||
const TargetRegisterInfo &TRI;
|
||||
};
|
||||
|
||||
|
||||
struct NodeBase {
|
||||
public:
|
||||
// Make sure this is a POD.
|
||||
NodeBase() = default;
|
||||
|
||||
uint16_t getType() const { return NodeAttrs::type(Attrs); }
|
||||
uint16_t getKind() const { return NodeAttrs::kind(Attrs); }
|
||||
uint16_t getFlags() const { return NodeAttrs::flags(Attrs); }
|
||||
@ -596,29 +614,36 @@ namespace rdf {
|
||||
|
||||
struct RefNode : public NodeBase {
|
||||
RefNode() = default;
|
||||
|
||||
RegisterRef getRegRef(const DataFlowGraph &G) const;
|
||||
|
||||
MachineOperand &getOp() {
|
||||
assert(!(getFlags() & NodeAttrs::PhiRef));
|
||||
return *Ref.Op;
|
||||
}
|
||||
|
||||
void setRegRef(RegisterRef RR, DataFlowGraph &G);
|
||||
void setRegRef(MachineOperand *Op, DataFlowGraph &G);
|
||||
|
||||
NodeId getReachingDef() const {
|
||||
return Ref.RD;
|
||||
}
|
||||
void setReachingDef(NodeId RD) {
|
||||
Ref.RD = RD;
|
||||
}
|
||||
|
||||
NodeId getSibling() const {
|
||||
return Ref.Sib;
|
||||
}
|
||||
void setSibling(NodeId Sib) {
|
||||
Ref.Sib = Sib;
|
||||
}
|
||||
|
||||
bool isUse() const {
|
||||
assert(getType() == NodeAttrs::Ref);
|
||||
return getKind() == NodeAttrs::Use;
|
||||
}
|
||||
|
||||
bool isDef() const {
|
||||
assert(getType() == NodeAttrs::Ref);
|
||||
return getKind() == NodeAttrs::Def;
|
||||
@ -702,6 +727,7 @@ namespace rdf {
|
||||
MachineBasicBlock *getCode() const {
|
||||
return CodeNode::getCode<MachineBasicBlock*>();
|
||||
}
|
||||
|
||||
void addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G);
|
||||
};
|
||||
|
||||
@ -709,6 +735,7 @@ namespace rdf {
|
||||
MachineFunction *getCode() const {
|
||||
return CodeNode::getCode<MachineFunction*>();
|
||||
}
|
||||
|
||||
NodeAddr<BlockNode*> findBlock(const MachineBasicBlock *BB,
|
||||
const DataFlowGraph &G) const;
|
||||
NodeAddr<BlockNode*> getEntryBlock(const DataFlowGraph &G);
|
||||
@ -723,6 +750,7 @@ namespace rdf {
|
||||
template <typename T> T ptr(NodeId N) const {
|
||||
return static_cast<T>(ptr(N));
|
||||
}
|
||||
|
||||
NodeId id(const NodeBase *P) const;
|
||||
|
||||
template <typename T> NodeAddr<T> addr(NodeId N) const {
|
||||
@ -738,13 +766,17 @@ namespace rdf {
|
||||
|
||||
struct DefStack {
|
||||
DefStack() = default;
|
||||
|
||||
bool empty() const { return Stack.empty() || top() == bottom(); }
|
||||
|
||||
private:
|
||||
typedef NodeAddr<DefNode*> value_type;
|
||||
struct Iterator {
|
||||
typedef DefStack::value_type value_type;
|
||||
|
||||
Iterator &up() { Pos = DS.nextUp(Pos); return *this; }
|
||||
Iterator &down() { Pos = DS.nextDown(Pos); return *this; }
|
||||
|
||||
value_type operator*() const {
|
||||
assert(Pos >= 1);
|
||||
return DS.Stack[Pos-1];
|
||||
@ -755,14 +787,17 @@ namespace rdf {
|
||||
}
|
||||
bool operator==(const Iterator &It) const { return Pos == It.Pos; }
|
||||
bool operator!=(const Iterator &It) const { return Pos != It.Pos; }
|
||||
|
||||
private:
|
||||
Iterator(const DefStack &S, bool Top);
|
||||
|
||||
// Pos-1 is the index in the StorageType object that corresponds to
|
||||
// the top of the DefStack.
|
||||
const DefStack &DS;
|
||||
unsigned Pos;
|
||||
friend struct DefStack;
|
||||
};
|
||||
|
||||
public:
|
||||
typedef Iterator iterator;
|
||||
iterator top() const { return Iterator(*this, true); }
|
||||
@ -773,14 +808,18 @@ namespace rdf {
|
||||
void pop();
|
||||
void start_block(NodeId N);
|
||||
void clear_block(NodeId N);
|
||||
|
||||
private:
|
||||
friend struct Iterator;
|
||||
typedef std::vector<value_type> StorageType;
|
||||
|
||||
bool isDelimiter(const StorageType::value_type &P, NodeId N = 0) const {
|
||||
return (P.Addr == nullptr) && (N == 0 || P.Id == N);
|
||||
}
|
||||
|
||||
unsigned nextUp(unsigned P) const;
|
||||
unsigned nextDown(unsigned P) const;
|
||||
|
||||
StorageType Stack;
|
||||
};
|
||||
|
||||
@ -819,6 +858,7 @@ namespace rdf {
|
||||
if (RemoveFromOwner)
|
||||
removeFromOwner(UA);
|
||||
}
|
||||
|
||||
void unlinkDef(NodeAddr<DefNode*> DA, bool RemoveFromOwner) {
|
||||
unlinkDefDF(DA);
|
||||
if (RemoveFromOwner)
|
||||
@ -831,23 +871,28 @@ namespace rdf {
|
||||
return BA.Addr->getType() == NodeAttrs::Ref &&
|
||||
BA.Addr->getKind() == Kind;
|
||||
}
|
||||
|
||||
template <uint16_t Kind>
|
||||
static bool IsCode(const NodeAddr<NodeBase*> BA) {
|
||||
return BA.Addr->getType() == NodeAttrs::Code &&
|
||||
BA.Addr->getKind() == Kind;
|
||||
}
|
||||
|
||||
static bool IsDef(const NodeAddr<NodeBase*> BA) {
|
||||
return BA.Addr->getType() == NodeAttrs::Ref &&
|
||||
BA.Addr->getKind() == NodeAttrs::Def;
|
||||
}
|
||||
|
||||
static bool IsUse(const NodeAddr<NodeBase*> BA) {
|
||||
return BA.Addr->getType() == NodeAttrs::Ref &&
|
||||
BA.Addr->getKind() == NodeAttrs::Use;
|
||||
}
|
||||
|
||||
static bool IsPhi(const NodeAddr<NodeBase*> BA) {
|
||||
return BA.Addr->getType() == NodeAttrs::Code &&
|
||||
BA.Addr->getKind() == NodeAttrs::Phi;
|
||||
}
|
||||
|
||||
static bool IsPreservingDef(const NodeAddr<DefNode*> DA) {
|
||||
uint16_t Flags = DA.Addr->getFlags();
|
||||
return (Flags & NodeAttrs::Preserving) && !(Flags & NodeAttrs::Undef);
|
||||
@ -902,6 +947,7 @@ namespace rdf {
|
||||
|
||||
void unlinkUseDF(NodeAddr<UseNode*> UA);
|
||||
void unlinkDefDF(NodeAddr<DefNode*> DA);
|
||||
|
||||
void removeFromOwner(NodeAddr<RefNode*> RA) {
|
||||
NodeAddr<InstrNode*> IA = RA.Addr->getOwner(*this);
|
||||
IA.Addr->removeMember(RA, *this);
|
||||
@ -967,7 +1013,6 @@ namespace rdf {
|
||||
return MM;
|
||||
}
|
||||
|
||||
|
||||
// Optionally print the lane mask, if it is not ~0.
|
||||
struct PrintLaneMaskOpt {
|
||||
PrintLaneMaskOpt(LaneBitmask M) : Mask(M) {}
|
||||
@ -991,7 +1036,9 @@ namespace rdf {
|
||||
PrintNode(const NodeAddr<T> &x, const DataFlowGraph &g)
|
||||
: Print<NodeAddr<T>>(x, g) {}
|
||||
};
|
||||
} // namespace rdf
|
||||
} // namespace llvm
|
||||
|
||||
#endif // RDF_GRAPH_H
|
||||
} // end namespace rdf
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
#endif // LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
|
||||
|
@ -28,6 +28,7 @@
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/Intrinsics.h"
|
||||
#include "llvm/IR/Type.h"
|
||||
#include "llvm/IR/Dominators.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
@ -43,6 +44,11 @@ bool MipsSEDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
|
||||
return MipsDAGToDAGISel::runOnMachineFunction(MF);
|
||||
}
|
||||
|
||||
void MipsSEDAGToDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
|
||||
AU.addRequired<DominatorTreeWrapperPass>();
|
||||
SelectionDAGISel::getAnalysisUsage(AU);
|
||||
}
|
||||
|
||||
void MipsSEDAGToDAGISel::addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
|
||||
MachineFunction &MF) {
|
||||
MachineInstrBuilder MIB(MF, &MI);
|
||||
|
@ -28,6 +28,8 @@ private:
|
||||
|
||||
bool runOnMachineFunction(MachineFunction &MF) override;
|
||||
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const override;
|
||||
|
||||
void addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
|
||||
MachineFunction &MF);
|
||||
|
||||
|
@ -3981,40 +3981,46 @@ static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
|
||||
static bool isFunctionGlobalAddress(SDValue Callee);
|
||||
|
||||
static bool
|
||||
resideInSameModule(SDValue Callee, Reloc::Model RelMod) {
|
||||
resideInSameSection(const Function *Caller, SDValue Callee,
|
||||
const TargetMachine &TM) {
|
||||
// If !G, Callee can be an external symbol.
|
||||
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
|
||||
if (!G) return false;
|
||||
|
||||
const GlobalValue *GV = G->getGlobal();
|
||||
|
||||
if (GV->isDeclaration()) return false;
|
||||
|
||||
switch(GV->getLinkage()) {
|
||||
default: llvm_unreachable("unknow linkage type");
|
||||
case GlobalValue::AvailableExternallyLinkage:
|
||||
case GlobalValue::ExternalWeakLinkage:
|
||||
if (!G)
|
||||
return false;
|
||||
|
||||
// Callee with weak linkage is allowed if it has hidden or protected
|
||||
// visibility
|
||||
case GlobalValue::LinkOnceAnyLinkage:
|
||||
case GlobalValue::LinkOnceODRLinkage: // e.g. c++ inline functions
|
||||
case GlobalValue::WeakAnyLinkage:
|
||||
case GlobalValue::WeakODRLinkage: // e.g. c++ template instantiation
|
||||
if (GV->hasDefaultVisibility())
|
||||
return false;
|
||||
const GlobalValue *GV = G->getGlobal();
|
||||
if (!GV->isStrongDefinitionForLinker())
|
||||
return false;
|
||||
|
||||
case GlobalValue::ExternalLinkage:
|
||||
case GlobalValue::InternalLinkage:
|
||||
case GlobalValue::PrivateLinkage:
|
||||
break;
|
||||
// Any explicitly-specified sections and section prefixes must also match.
|
||||
// Also, if we're using -ffunction-sections, then each function is always in
|
||||
// a different section (the same is true for COMDAT functions).
|
||||
if (TM.getFunctionSections() || GV->hasComdat() || Caller->hasComdat() ||
|
||||
GV->getSection() != Caller->getSection())
|
||||
return false;
|
||||
if (const auto *F = dyn_cast<Function>(GV)) {
|
||||
if (F->getSectionPrefix() != Caller->getSectionPrefix())
|
||||
return false;
|
||||
}
|
||||
|
||||
// With '-fPIC', calling default visiblity function need insert 'nop' after
|
||||
// function call, no matter that function resides in same module or not, so
|
||||
// we treat it as in different module.
|
||||
if (RelMod == Reloc::PIC_ && GV->hasDefaultVisibility())
|
||||
// If the callee might be interposed, then we can't assume the ultimate call
// target will be in the same section. Even in cases where we can assume that
// interposition won't happen, in any case where the linker might insert a
// stub to allow for interposition, we must generate code as though
// interposition might occur. To understand why this matters, consider a
// situation where: a -> b -> c where the arrows indicate calls. b and c are
// in the same section, but a is in a different module (i.e. has a different
// TOC base pointer). If the linker allows for interposition between b and c,
// then it will generate a stub for the call edge between b and c which will
// save the TOC pointer into the designated stack slot allocated by b. If we
// return true here, and therefore allow a tail call between b and c, that
// stack slot won't exist and the b -> c stub will end up saving b's TOC base
// pointer into the stack slot allocated by a (where the a -> b stub saved
// a's TOC base pointer). If we're not considering a tail call, but rather,
// whether a nop is needed after the call instruction in b, because the linker
// will insert a stub, it might complain about a missing nop if we omit it
// (although many don't complain in this case).
if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV))
return false;
|
||||
return true;
|
||||
@ -4130,11 +4136,11 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
|
||||
!isa<ExternalSymbolSDNode>(Callee))
|
||||
return false;
|
||||
|
||||
// Check if Callee resides in the same module, because for now, PPC64 SVR4 ABI
|
||||
// (ELFv1/ELFv2) doesn't allow tail calls to a symbol resides in another
|
||||
// module.
|
||||
// Check if Callee resides in the same section, because for now, PPC64 SVR4
|
||||
// ABI (ELFv1/ELFv2) doesn't allow tail calls to a symbol that resides in another
|
||||
// section.
|
||||
// ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
|
||||
if (!resideInSameModule(Callee, getTargetMachine().getRelocationModel()))
|
||||
if (!resideInSameSection(MF.getFunction(), Callee, getTargetMachine()))
|
||||
return false;
|
||||
|
||||
// TCO allows altering callee ABI, so we don't have to check further.
|
||||
@ -4592,14 +4598,6 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
|
||||
return CallOpc;
|
||||
}
|
||||
|
||||
static
|
||||
bool isLocalCall(const SDValue &Callee)
|
||||
{
|
||||
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
|
||||
return G->getGlobal()->isStrongDefinitionForLinker();
|
||||
return false;
|
||||
}
|
||||
|
||||
SDValue PPCTargetLowering::LowerCallResult(
|
||||
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
|
||||
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
|
||||
@ -4701,6 +4699,7 @@ SDValue PPCTargetLowering::FinishCall(
|
||||
// stack frame. If caller and callee belong to the same module (and have the
|
||||
// same TOC), the NOP will remain unchanged.
|
||||
|
||||
MachineFunction &MF = DAG.getMachineFunction();
|
||||
if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() &&
|
||||
!isPatchPoint) {
|
||||
if (CallOpc == PPCISD::BCTRL) {
|
||||
@ -4724,11 +4723,11 @@ SDValue PPCTargetLowering::FinishCall(
|
||||
// The address needs to go after the chain input but before the flag (or
|
||||
// any other variadic arguments).
|
||||
Ops.insert(std::next(Ops.begin()), AddTOC);
|
||||
} else if ((CallOpc == PPCISD::CALL) &&
|
||||
(!isLocalCall(Callee) ||
|
||||
DAG.getTarget().getRelocationModel() == Reloc::PIC_))
|
||||
} else if (CallOpc == PPCISD::CALL &&
|
||||
!resideInSameSection(MF.getFunction(), Callee, DAG.getTarget())) {
|
||||
// Otherwise insert NOP for non-local calls.
|
||||
CallOpc = PPCISD::CALL_NOP;
|
||||
}
|
||||
}
|
||||
|
||||
Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
|
||||
|
@ -70,7 +70,7 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
|
||||
EmitFunctionBody();
|
||||
|
||||
// Emit the XRay table for this function.
|
||||
EmitXRayTable();
|
||||
emitXRayTable();
|
||||
|
||||
// We didn't modify anything.
|
||||
return false;
|
||||
|
@ -373,6 +373,10 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
|
||||
MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr
|
||||
: std::next(MBBI);
|
||||
PI = skipDebugInstructionsBackward(PI, MBB.begin());
|
||||
if (NI != nullptr)
|
||||
NI = skipDebugInstructionsForward(NI, MBB.end());
|
||||
|
||||
unsigned Opc = PI->getOpcode();
|
||||
int Offset = 0;
|
||||
|
||||
@ -2586,6 +2590,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
|
||||
uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
|
||||
uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0;
|
||||
I = MBB.erase(I);
|
||||
auto InsertPos = skipDebugInstructionsForward(I, MBB.end());
|
||||
|
||||
if (!reserveCallFrame) {
|
||||
// If the stack pointer can be changed after prologue, turn the
|
||||
@ -2615,7 +2620,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
|
||||
|
||||
if (HasDwarfEHHandlers && !isDestroy &&
|
||||
MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences())
|
||||
BuildCFI(MBB, I, DL,
|
||||
BuildCFI(MBB, InsertPos, DL,
|
||||
MCCFIInstruction::createGnuArgsSize(nullptr, Amount));
|
||||
|
||||
if (Amount == 0)
|
||||
@ -2629,7 +2634,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
|
||||
// If this is a callee-pop calling convention, emit a CFA adjust for
|
||||
// the amount the callee popped.
|
||||
if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF))
|
||||
BuildCFI(MBB, I, DL,
|
||||
BuildCFI(MBB, InsertPos, DL,
|
||||
MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt));
|
||||
|
||||
// Add Amount to SP to destroy a frame, or subtract to setup.
|
||||
@ -2640,13 +2645,13 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
|
||||
// Merge with any previous or following adjustment instruction. Note: the
|
||||
// instructions merged with here do not have CFI, so their stack
|
||||
// adjustments do not feed into CfaAdjustment.
|
||||
StackAdjustment += mergeSPUpdates(MBB, I, true);
|
||||
StackAdjustment += mergeSPUpdates(MBB, I, false);
|
||||
StackAdjustment += mergeSPUpdates(MBB, InsertPos, true);
|
||||
StackAdjustment += mergeSPUpdates(MBB, InsertPos, false);
|
||||
|
||||
if (StackAdjustment) {
|
||||
if (!(Fn->optForMinSize() &&
|
||||
adjustStackWithPops(MBB, I, DL, StackAdjustment)))
|
||||
BuildStackAdjustment(MBB, I, DL, StackAdjustment,
|
||||
adjustStackWithPops(MBB, InsertPos, DL, StackAdjustment)))
|
||||
BuildStackAdjustment(MBB, InsertPos, DL, StackAdjustment,
|
||||
/*InEpilogue=*/false);
|
||||
}
|
||||
}
|
||||
@ -2662,8 +2667,9 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
|
||||
// TODO: When not using precise CFA, we also need to adjust for the
|
||||
// InternalAmt here.
|
||||
if (CfaAdjustment) {
|
||||
BuildCFI(MBB, I, DL, MCCFIInstruction::createAdjustCfaOffset(
|
||||
nullptr, CfaAdjustment));
|
||||
BuildCFI(MBB, InsertPos, DL,
|
||||
MCCFIInstruction::createAdjustCfaOffset(nullptr,
|
||||
CfaAdjustment));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -11474,6 +11474,10 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
|
||||
const SmallBitVector &Zeroable,
|
||||
const X86Subtarget &Subtarget,
|
||||
SelectionDAG &DAG) {
|
||||
SmallVector<int, 4> WidenedMask;
|
||||
if (!canWidenShuffleElements(Mask, WidenedMask))
|
||||
return SDValue();
|
||||
|
||||
// TODO: If minimizing size and one of the inputs is a zero vector and
|
||||
// the zero vector has only one use, we could use a VPERM2X128 to save the
|
||||
// instruction bytes needed to explicitly generate the zero vector.
|
||||
@ -11521,15 +11525,10 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
|
||||
// [6] - ignore
|
||||
// [7] - zero high half of destination
|
||||
|
||||
int MaskLO = Mask[0];
|
||||
if (MaskLO == SM_SentinelUndef)
|
||||
MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
|
||||
int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
|
||||
int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];
|
||||
|
||||
int MaskHI = Mask[2];
|
||||
if (MaskHI == SM_SentinelUndef)
|
||||
MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
|
||||
|
||||
unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
|
||||
unsigned PermMask = MaskLO | (MaskHI << 4);
|
||||
|
||||
// If either input is a zero vector, replace it with an undef input.
|
||||
// Shuffle mask values < 4 are selecting elements of V1.
|
||||
@ -11538,16 +11537,16 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
|
||||
// selecting the zero vector and setting the zero mask bit.
|
||||
if (IsV1Zero) {
|
||||
V1 = DAG.getUNDEF(VT);
|
||||
if (MaskLO < 4)
|
||||
if (MaskLO < 2)
|
||||
PermMask = (PermMask & 0xf0) | 0x08;
|
||||
if (MaskHI < 4)
|
||||
if (MaskHI < 2)
|
||||
PermMask = (PermMask & 0x0f) | 0x80;
|
||||
}
|
||||
if (IsV2Zero) {
|
||||
V2 = DAG.getUNDEF(VT);
|
||||
if (MaskLO >= 4)
|
||||
if (MaskLO >= 2)
|
||||
PermMask = (PermMask & 0xf0) | 0x08;
|
||||
if (MaskHI >= 4)
|
||||
if (MaskHI >= 2)
|
||||
PermMask = (PermMask & 0x0f) | 0x80;
|
||||
}
|
||||
|
||||
@ -12012,11 +12011,9 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
|
||||
assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
|
||||
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
|
||||
|
||||
SmallVector<int, 4> WidenedMask;
|
||||
if (canWidenShuffleElements(Mask, WidenedMask))
|
||||
if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
|
||||
Zeroable, Subtarget, DAG))
|
||||
return V;
|
||||
if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
|
||||
Zeroable, Subtarget, DAG))
|
||||
return V;
|
||||
|
||||
if (V2.isUndef()) {
|
||||
// Check for being able to broadcast a single element.
|
||||
@ -12107,11 +12104,9 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
|
||||
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
|
||||
assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
|
||||
|
||||
SmallVector<int, 4> WidenedMask;
|
||||
if (canWidenShuffleElements(Mask, WidenedMask))
|
||||
if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
|
||||
Zeroable, Subtarget, DAG))
|
||||
return V;
|
||||
if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
|
||||
Zeroable, Subtarget, DAG))
|
||||
return V;
|
||||
|
||||
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
|
||||
Zeroable, Subtarget, DAG))
|
||||
@ -12605,33 +12600,72 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
|
||||
if (!canWidenShuffleElements(Mask, WidenedMask))
|
||||
return SDValue();
|
||||
|
||||
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
|
||||
// Insure elements came from the same Op.
|
||||
int MaxOp1Index = VT.getVectorNumElements()/2 - 1;
|
||||
for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
|
||||
if (WidenedMask[i] == SM_SentinelZero)
|
||||
return SDValue();
|
||||
if (WidenedMask[i] == SM_SentinelUndef)
|
||||
// Check for patterns which can be matched with a single insert of a 256-bit
|
||||
// subvector.
|
||||
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
|
||||
{0, 1, 2, 3, 0, 1, 2, 3});
|
||||
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
|
||||
{0, 1, 2, 3, 8, 9, 10, 11})) {
|
||||
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
|
||||
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
|
||||
DAG.getIntPtrConstant(0, DL));
|
||||
SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
|
||||
OnlyUsesV1 ? V1 : V2,
|
||||
DAG.getIntPtrConstant(0, DL));
|
||||
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
|
||||
}
|
||||
|
||||
assert(WidenedMask.size() == 4);
|
||||
|
||||
// See if this is an insertion of the lower 128-bits of V2 into V1.
|
||||
bool IsInsert = true;
|
||||
int V2Index = -1;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
assert(WidenedMask[i] >= -1);
|
||||
if (WidenedMask[i] < 0)
|
||||
continue;
|
||||
|
||||
SDValue Op = WidenedMask[i] > MaxOp1Index ? V2 : V1;
|
||||
unsigned OpIndex = (i < Size/2) ? 0 : 1;
|
||||
// Make sure all V1 subvectors are in place.
|
||||
if (WidenedMask[i] < 4) {
|
||||
if (WidenedMask[i] != i) {
|
||||
IsInsert = false;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Make sure we only have a single V2 index and it's the lowest 128-bits.
|
||||
if (V2Index >= 0 || WidenedMask[i] != 4) {
|
||||
IsInsert = false;
|
||||
break;
|
||||
}
|
||||
V2Index = i;
|
||||
}
|
||||
}
|
||||
if (IsInsert && V2Index >= 0) {
|
||||
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
|
||||
SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
|
||||
DAG.getIntPtrConstant(0, DL));
|
||||
return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
|
||||
}
|
||||
|
||||
// Try to lower to vshuf64x2/vshuf32x4.
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
|
||||
unsigned PermMask = 0;
|
||||
// Ensure elements came from the same Op.
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
assert(WidenedMask[i] >= -1);
|
||||
if (WidenedMask[i] < 0)
|
||||
continue;
|
||||
|
||||
SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
|
||||
unsigned OpIndex = i / 2;
|
||||
if (Ops[OpIndex].isUndef())
|
||||
Ops[OpIndex] = Op;
|
||||
else if (Ops[OpIndex] != Op)
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
// Form a 128-bit permutation.
|
||||
// Convert the 64-bit shuffle mask selection values into 128-bit selection
|
||||
// bits defined by a vshuf64x2 instruction's immediate control byte.
|
||||
unsigned PermMask = 0, Imm = 0;
|
||||
unsigned ControlBitsNum = WidenedMask.size() / 2;
|
||||
|
||||
for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
|
||||
// Use first element in place of undef mask.
|
||||
Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
|
||||
PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
|
||||
// Convert the 128-bit shuffle mask selection values into 128-bit selection
|
||||
// bits defined by a vshuf64x2 instruction's immediate control byte.
|
||||
PermMask |= (WidenedMask[i] % 4) << (i * 2);
|
||||
}
|
||||
|
||||
return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
|
||||
@ -13051,10 +13085,10 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
|
||||
static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
|
||||
int NumElements = Mask.size();
|
||||
|
||||
int NumV1Elements = 0, NumV2Elements = 0, NumSentinelElements = 0;
|
||||
int NumV1Elements = 0, NumV2Elements = 0;
|
||||
for (int M : Mask)
|
||||
if (M < 0)
|
||||
++NumSentinelElements;
|
||||
continue;
|
||||
else if (M < NumElements)
|
||||
++NumV1Elements;
|
||||
else
|
||||
@ -18660,8 +18694,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
|
||||
Mask, PassThru, Subtarget, DAG);
|
||||
}
|
||||
case INTR_TYPE_3OP_IMM8_MASK:
|
||||
case INTR_TYPE_3OP_MASK:
|
||||
case INSERT_SUBVEC: {
|
||||
case INTR_TYPE_3OP_MASK: {
|
||||
SDValue Src1 = Op.getOperand(1);
|
||||
SDValue Src2 = Op.getOperand(2);
|
||||
SDValue Src3 = Op.getOperand(3);
|
||||
@ -18670,13 +18703,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
|
||||
|
||||
if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
|
||||
Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
|
||||
else if (IntrData->Type == INSERT_SUBVEC) {
|
||||
// imm should be adapted to ISD::INSERT_SUBVECTOR behavior
|
||||
assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!");
|
||||
unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue();
|
||||
Imm *= Src2.getSimpleValueType().getVectorNumElements();
|
||||
Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32);
|
||||
}
|
||||
|
||||
// We specify 2 possible opcodes for intrinsics with rounding modes.
|
||||
// First, we check if the intrinsic may have non-default rounding mode,
|
||||
@ -28693,6 +28719,29 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
|
||||
return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
|
||||
Op.getOperand(2));
|
||||
}
|
||||
case ISD::INSERT_SUBVECTOR: {
|
||||
unsigned EltSize = EltVT.getSizeInBits();
|
||||
if (EltSize != 32 && EltSize != 64)
|
||||
return false;
|
||||
MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
|
||||
// Only change element size, not type.
|
||||
if (VT.isInteger() != OpEltVT.isInteger())
|
||||
return false;
|
||||
uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
|
||||
Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
|
||||
SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
|
||||
DCI.AddToWorklist(Op0.getNode());
|
||||
// Op1 needs to be bitcasted to a smaller vector with the same element type.
|
||||
SDValue Op1 = Op.getOperand(1);
|
||||
MVT Op1VT = MVT::getVectorVT(EltVT,
|
||||
Op1.getSimpleValueType().getSizeInBits() / EltSize);
|
||||
Op1 = DAG.getBitcast(Op1VT, Op1);
|
||||
DCI.AddToWorklist(Op1.getNode());
|
||||
DCI.CombineTo(OrigOp.getNode(),
|
||||
DAG.getNode(Opcode, DL, VT, Op0, Op1,
|
||||
DAG.getConstant(Imm, DL, MVT::i8)));
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
@ -31784,6 +31833,83 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
|
||||
/// the codegen.
|
||||
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
|
||||
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
|
||||
const X86Subtarget &Subtarget,
|
||||
SDLoc &DL) {
|
||||
assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
|
||||
SDValue Src = N->getOperand(0);
|
||||
unsigned Opcode = Src.getOpcode();
|
||||
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
|
||||
|
||||
EVT VT = N->getValueType(0);
|
||||
EVT SrcVT = Src.getValueType();
|
||||
|
||||
auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) {
|
||||
// TODO: Add extra cases where we can truncate both inputs for the
|
||||
// cost of one (or none).
|
||||
// e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
|
||||
if (Op0 == Op1)
|
||||
return true;
|
||||
|
||||
SDValue BC0 = peekThroughOneUseBitcasts(Op0);
|
||||
SDValue BC1 = peekThroughOneUseBitcasts(Op1);
|
||||
return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
|
||||
ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
|
||||
};
|
||||
|
||||
auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
|
||||
SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
|
||||
SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
|
||||
return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
|
||||
};
|
||||
|
||||
// Don't combine if the operation has other uses.
|
||||
if (!N->isOnlyUserOf(Src.getNode()))
|
||||
return SDValue();
|
||||
|
||||
// Only support vector truncation for now.
|
||||
// TODO: i64 scalar math would benefit as well.
|
||||
if (!VT.isVector())
|
||||
return SDValue();
|
||||
|
||||
// In most cases its only worth pre-truncating if we're only facing the cost
|
||||
// of one truncation.
|
||||
// i.e. if one of the inputs will constant fold or the input is repeated.
|
||||
switch (Opcode) {
|
||||
case ISD::AND:
|
||||
case ISD::XOR:
|
||||
case ISD::OR: {
|
||||
SDValue Op0 = Src.getOperand(0);
|
||||
SDValue Op1 = Src.getOperand(1);
|
||||
if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
|
||||
IsRepeatedOpOrOneUseConstant(Op0, Op1))
|
||||
return TruncateArithmetic(Op0, Op1);
|
||||
break;
|
||||
}
|
||||
|
||||
case ISD::MUL:
|
||||
// X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
|
||||
// better to truncate if we have the chance.
|
||||
if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
|
||||
!TLI.isOperationLegal(Opcode, SrcVT))
|
||||
return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
|
||||
LLVM_FALLTHROUGH;
|
||||
case ISD::ADD: {
|
||||
SDValue Op0 = Src.getOperand(0);
|
||||
SDValue Op1 = Src.getOperand(1);
|
||||
if (TLI.isOperationLegal(Opcode, VT) &&
|
||||
IsRepeatedOpOrOneUseConstant(Op0, Op1))
|
||||
return TruncateArithmetic(Op0, Op1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
|
||||
static SDValue
|
||||
combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
|
||||
@ -31970,6 +32096,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
|
||||
SDValue Src = N->getOperand(0);
|
||||
SDLoc DL(N);
|
||||
|
||||
// Attempt to pre-truncate inputs to arithmetic ops instead.
|
||||
if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
|
||||
return V;
|
||||
|
||||
// Try to detect AVG pattern first.
|
||||
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
|
||||
return Avg;
|
||||
|
@ -650,33 +650,6 @@ multiclass vextract_for_size<int Opcode,
|
||||
From.ZSuffix # "rrkz")
|
||||
To.KRCWM:$mask, From.RC:$src1,
|
||||
(EXTRACT_get_vextract_imm To.RC:$ext))>;
|
||||
|
||||
// Intrinsic call with masking.
|
||||
def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
|
||||
"x" # To.NumElts # "_" # From.Size)
|
||||
From.RC:$src1, (iPTR imm:$idx), To.RC:$src0, To.MRC:$mask),
|
||||
(!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
|
||||
From.ZSuffix # "rrk")
|
||||
To.RC:$src0,
|
||||
(COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM),
|
||||
From.RC:$src1, imm:$idx)>;
|
||||
|
||||
// Intrinsic call with zero-masking.
|
||||
def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
|
||||
"x" # To.NumElts # "_" # From.Size)
|
||||
From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, To.MRC:$mask),
|
||||
(!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
|
||||
From.ZSuffix # "rrkz")
|
||||
(COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM),
|
||||
From.RC:$src1, imm:$idx)>;
|
||||
|
||||
// Intrinsic call without masking.
|
||||
def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
|
||||
"x" # To.NumElts # "_" # From.Size)
|
||||
From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)),
|
||||
(!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
|
||||
From.ZSuffix # "rr")
|
||||
From.RC:$src1, imm:$idx)>;
|
||||
}
|
||||
|
||||
// Codegen pattern for the alternative types
|
||||
@ -6871,18 +6844,18 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in {
|
||||
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
|
||||
}
|
||||
let isCodeGenOnly = 1 in {
|
||||
defm Int_VUCOMISSZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v4f32, f128mem,
|
||||
load, "ucomiss">, PS, EVEX, VEX_LIG,
|
||||
defm Int_VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem,
|
||||
sse_load_f32, "ucomiss">, PS, EVEX, VEX_LIG,
|
||||
EVEX_CD8<32, CD8VT1>;
|
||||
defm Int_VUCOMISDZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v2f64, f128mem,
|
||||
load, "ucomisd">, PD, EVEX,
|
||||
defm Int_VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem,
|
||||
sse_load_f64, "ucomisd">, PD, EVEX,
|
||||
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
|
||||
|
||||
defm Int_VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v4f32, f128mem,
|
||||
load, "comiss">, PS, EVEX, VEX_LIG,
|
||||
defm Int_VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem,
|
||||
sse_load_f32, "comiss">, PS, EVEX, VEX_LIG,
|
||||
EVEX_CD8<32, CD8VT1>;
|
||||
defm Int_VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v2f64, f128mem,
|
||||
load, "comisd">, PD, EVEX,
|
||||
defm Int_VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem,
|
||||
sse_load_f64, "comisd">, PD, EVEX,
|
||||
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
|
||||
}
|
||||
}
|
||||
|
@ -2373,6 +2373,23 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
|
||||
Sched<[WriteFAddLd, ReadAfterLd]>;
|
||||
}
|
||||
|
||||
// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
|
||||
multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
|
||||
ValueType vt, Operand memop,
|
||||
ComplexPattern mem_cpat, string OpcodeStr> {
|
||||
def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
|
||||
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
|
||||
[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
|
||||
IIC_SSE_COMIS_RR>,
|
||||
Sched<[WriteFAdd]>;
|
||||
def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
|
||||
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
|
||||
[(set EFLAGS, (OpNode (vt RC:$src1),
|
||||
mem_cpat:$src2))],
|
||||
IIC_SSE_COMIS_RM>,
|
||||
Sched<[WriteFAddLd, ReadAfterLd]>;
|
||||
}
|
||||
|
||||
let Defs = [EFLAGS] in {
|
||||
defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
|
||||
"ucomiss">, PS, VEX, VEX_LIG;
|
||||
@ -2386,15 +2403,15 @@ let Defs = [EFLAGS] in {
|
||||
}
|
||||
|
||||
let isCodeGenOnly = 1 in {
|
||||
defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
|
||||
load, "ucomiss">, PS, VEX;
|
||||
defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
|
||||
load, "ucomisd">, PD, VEX;
|
||||
defm Int_VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
|
||||
sse_load_f32, "ucomiss">, PS, VEX;
|
||||
defm Int_VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
|
||||
sse_load_f64, "ucomisd">, PD, VEX;
|
||||
|
||||
defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
|
||||
load, "comiss">, PS, VEX;
|
||||
defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
|
||||
load, "comisd">, PD, VEX;
|
||||
defm Int_VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
|
||||
sse_load_f32, "comiss">, PS, VEX;
|
||||
defm Int_VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
|
||||
sse_load_f64, "comisd">, PD, VEX;
|
||||
}
|
||||
defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
|
||||
"ucomiss">, PS;
|
||||
@ -2409,15 +2426,15 @@ let Defs = [EFLAGS] in {
|
||||
}
|
||||
|
||||
let isCodeGenOnly = 1 in {
|
||||
defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
|
||||
load, "ucomiss">, PS;
|
||||
defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
|
||||
load, "ucomisd">, PD;
|
||||
defm Int_UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
|
||||
sse_load_f32, "ucomiss">, PS;
|
||||
defm Int_UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
|
||||
sse_load_f64, "ucomisd">, PD;
|
||||
|
||||
defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
|
||||
"comiss">, PS;
|
||||
defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
|
||||
"comisd">, PD;
|
||||
defm Int_COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
|
||||
sse_load_f32, "comiss">, PS;
|
||||
defm Int_COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
|
||||
sse_load_f64, "comisd">, PD;
|
||||
}
|
||||
} // Defs = [EFLAGS]
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
//===-- X86AVX512Info.h - X86 Instruction Tables Information ----*- C++ -*-===//
|
||||
//===-- X86InstrTablesInfo.h - X86 Instruction Tables -----------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
@ -25,8 +25,7 @@ struct X86EvexToVexCompressTableEntry {
|
||||
|
||||
// X86 EVEX encoded instructions that have a VEX 128 encoding
|
||||
// (table format: <EVEX opcode, VEX-128 opcode>).
|
||||
static const X86EvexToVexCompressTableEntry
|
||||
X86EvexToVex128CompressTable[] = {
|
||||
static const X86EvexToVexCompressTableEntry X86EvexToVex128CompressTable[] = {
|
||||
// EVEX scalar with corresponding VEX.
|
||||
{ X86::Int_VCOMISDZrm , X86::Int_VCOMISDrm },
|
||||
{ X86::Int_VCOMISDZrr , X86::Int_VCOMISDrr },
|
||||
@ -250,20 +249,20 @@ static const X86EvexToVexCompressTableEntry
|
||||
{ X86::VUCOMISDZrr , X86::VUCOMISDrr },
|
||||
{ X86::VUCOMISSZrm , X86::VUCOMISSrm },
|
||||
{ X86::VUCOMISSZrr , X86::VUCOMISSrr },
|
||||
|
||||
|
||||
{ X86::VMOV64toPQIZrr , X86::VMOV64toPQIrr },
|
||||
{ X86::VMOV64toSDZrr , X86::VMOV64toSDrr },
|
||||
{ X86::VMOVDI2PDIZrm , X86::VMOVDI2PDIrm },
|
||||
{ X86::VMOVDI2PDIZrr , X86::VMOVDI2PDIrr },
|
||||
{ X86::VMOVLHPSZrr , X86::VMOVLHPSrr },
|
||||
{ X86::VMOVHLPSZrr , X86::VMOVHLPSrr },
|
||||
{ X86::VMOVHLPSZrr , X86::VMOVHLPSrr },
|
||||
{ X86::VMOVPDI2DIZmr , X86::VMOVPDI2DImr },
|
||||
{ X86::VMOVPDI2DIZrr , X86::VMOVPDI2DIrr },
|
||||
{ X86::VMOVPQI2QIZmr , X86::VMOVPQI2QImr },
|
||||
{ X86::VMOVPQIto64Zrr , X86::VMOVPQIto64rr },
|
||||
{ X86::VMOVQI2PQIZrm , X86::VMOVQI2PQIrm },
|
||||
{ X86::VMOVZPQILo2PQIZrr , X86::VMOVZPQILo2PQIrr },
|
||||
|
||||
|
||||
{ X86::VPEXTRBZmr , X86::VPEXTRBmr },
|
||||
{ X86::VPEXTRBZrr , X86::VPEXTRBrr },
|
||||
{ X86::VPEXTRDZmr , X86::VPEXTRDmr },
|
||||
@ -272,7 +271,7 @@ static const X86EvexToVexCompressTableEntry
|
||||
{ X86::VPEXTRQZrr , X86::VPEXTRQrr },
|
||||
{ X86::VPEXTRWZmr , X86::VPEXTRWmr },
|
||||
{ X86::VPEXTRWZrr , X86::VPEXTRWri },
|
||||
|
||||
|
||||
{ X86::VPINSRBZrm , X86::VPINSRBrm },
|
||||
{ X86::VPINSRBZrr , X86::VPINSRBrr },
|
||||
{ X86::VPINSRDZrm , X86::VPINSRDrm },
|
||||
@ -294,7 +293,7 @@ static const X86EvexToVexCompressTableEntry
|
||||
{ X86::VANDPDZ128rm , X86::VANDPDrm },
|
||||
{ X86::VANDPDZ128rr , X86::VANDPDrr },
|
||||
{ X86::VANDPSZ128rm , X86::VANDPSrm },
|
||||
{ X86::VANDPSZ128rr , X86::VANDPSrr },
|
||||
{ X86::VANDPSZ128rr , X86::VANDPSrr },
|
||||
{ X86::VBROADCASTSSZ128m , X86::VBROADCASTSSrm },
|
||||
{ X86::VBROADCASTSSZ128r , X86::VBROADCASTSSrr },
|
||||
{ X86::VBROADCASTSSZ128r_s , X86::VBROADCASTSSrr },
|
||||
@ -414,8 +413,8 @@ static const X86EvexToVexCompressTableEntry
|
||||
{ X86::VMOVAPDZ128rm , X86::VMOVAPDrm },
|
||||
{ X86::VMOVAPDZ128rr , X86::VMOVAPDrr },
|
||||
{ X86::VMOVAPDZ128rr_REV , X86::VMOVAPDrr_REV },
|
||||
{ X86::VMOVAPSZ128mr , X86::VMOVAPSmr },
|
||||
{ X86::VMOVAPSZ128rm , X86::VMOVAPSrm },
|
||||
{ X86::VMOVAPSZ128mr , X86::VMOVAPSmr },
|
||||
{ X86::VMOVAPSZ128rm , X86::VMOVAPSrm },
|
||||
{ X86::VMOVAPSZ128rr , X86::VMOVAPSrr },
|
||||
{ X86::VMOVAPSZ128rr_REV , X86::VMOVAPSrr_REV },
|
||||
{ X86::VMOVDDUPZ128rm , X86::VMOVDDUPrm },
|
||||
@ -464,8 +463,8 @@ static const X86EvexToVexCompressTableEntry
|
||||
{ X86::VMOVUPDZ128rm , X86::VMOVUPDrm },
|
||||
{ X86::VMOVUPDZ128rr , X86::VMOVUPDrr },
|
||||
{ X86::VMOVUPDZ128rr_REV , X86::VMOVUPDrr_REV },
|
||||
{ X86::VMOVUPSZ128mr , X86::VMOVUPSmr },
|
||||
{ X86::VMOVUPSZ128rm , X86::VMOVUPSrm },
|
||||
{ X86::VMOVUPSZ128mr , X86::VMOVUPSmr },
|
||||
{ X86::VMOVUPSZ128rm , X86::VMOVUPSrm },
|
||||
{ X86::VMOVUPSZ128rr , X86::VMOVUPSrr },
|
||||
{ X86::VMOVUPSZ128rr_REV , X86::VMOVUPSrr_REV },
|
||||
{ X86::VMULPDZ128rm , X86::VMULPDrm },
|
||||
@ -520,9 +519,9 @@ static const X86EvexToVexCompressTableEntry
|
||||
{ X86::VPBROADCASTBZ128r , X86::VPBROADCASTBrr },
|
||||
{ X86::VPBROADCASTDZ128m , X86::VPBROADCASTDrm },
|
||||
{ X86::VPBROADCASTDZ128r , X86::VPBROADCASTDrr },
|
||||
{ X86::VPBROADCASTQZ128m , X86::VPBROADCASTQrm },
|
||||
{ X86::VPBROADCASTQZ128r , X86::VPBROADCASTQrr },
|
||||
{ X86::VPBROADCASTWZ128m , X86::VPBROADCASTWrm },
|
||||
{ X86::VPBROADCASTQZ128m , X86::VPBROADCASTQrm },
|
||||
{ X86::VPBROADCASTQZ128r , X86::VPBROADCASTQrr },
|
||||
{ X86::VPBROADCASTWZ128m , X86::VPBROADCASTWrm },
|
||||
{ X86::VPBROADCASTWZ128r , X86::VPBROADCASTWrr },
|
||||
{ X86::VPERMILPDZ128mi , X86::VPERMILPDmi },
|
||||
{ X86::VPERMILPDZ128ri , X86::VPERMILPDri },
|
||||
@ -583,7 +582,7 @@ static const X86EvexToVexCompressTableEntry
|
||||
{ X86::VPMOVZXWDZ128rm , X86::VPMOVZXWDrm },
|
||||
{ X86::VPMOVZXWDZ128rr , X86::VPMOVZXWDrr },
|
||||
{ X86::VPMOVZXWQZ128rm , X86::VPMOVZXWQrm },
|
||||
{ X86::VPMOVZXWQZ128rr , X86::VPMOVZXWQrr },
|
||||
{ X86::VPMOVZXWQZ128rr , X86::VPMOVZXWQrr },
|
||||
{ X86::VPMULDQZ128rm , X86::VPMULDQrm },
|
||||
{ X86::VPMULDQZ128rr , X86::VPMULDQrr },
|
||||
{ X86::VPMULHRSWZ128rm , X86::VPMULHRSWrm },
|
||||
@ -612,10 +611,10 @@ static const X86EvexToVexCompressTableEntry
|
||||
{ X86::VPSHUFHWZ128ri , X86::VPSHUFHWri },
|
||||
{ X86::VPSHUFLWZ128mi , X86::VPSHUFLWmi },
|
||||
{ X86::VPSHUFLWZ128ri , X86::VPSHUFLWri },
|
||||
{ X86::VPSLLDQZ128rr , X86::VPSLLDQri },
|
||||
{ X86::VPSLLDQZ128rr , X86::VPSLLDQri },
|
||||
{ X86::VPSLLDZ128ri , X86::VPSLLDri },
|
||||
{ X86::VPSLLDZ128rm , X86::VPSLLDrm },
|
||||
{ X86::VPSLLDZ128rr , X86::VPSLLDrr },
|
||||
{ X86::VPSLLDZ128rr , X86::VPSLLDrr },
|
||||
{ X86::VPSLLQZ128ri , X86::VPSLLQri },
|
||||
{ X86::VPSLLQZ128rm , X86::VPSLLQrm },
|
||||
{ X86::VPSLLQZ128rr , X86::VPSLLQrr },
|
||||
@ -713,8 +712,7 @@ static const X86EvexToVexCompressTableEntry
|
||||
|
||||
// X86 EVEX encoded instructions that have a VEX 256 encoding
|
||||
// (table format: <EVEX opcode, VEX-256 opcode>).
|
||||
static const X86EvexToVexCompressTableEntry
|
||||
X86EvexToVex256CompressTable[] = {
|
||||
static const X86EvexToVexCompressTableEntry X86EvexToVex256CompressTable[] = {
|
||||
{ X86::VADDPDZ256rm , X86::VADDPDYrm },
|
||||
{ X86::VADDPDZ256rr , X86::VADDPDYrr },
|
||||
{ X86::VADDPSZ256rm , X86::VADDPSYrm },
|
||||
@ -727,11 +725,11 @@ static const X86EvexToVexCompressTableEntry
|
||||
{ X86::VANDPDZ256rr , X86::VANDPDYrr },
|
||||
{ X86::VANDPSZ256rm , X86::VANDPSYrm },
|
||||
{ X86::VANDPSZ256rr , X86::VANDPSYrr },
|
||||
{ X86::VBROADCASTSDZ256m , X86::VBROADCASTSDYrm },
|
||||
{ X86::VBROADCASTSDZ256r , X86::VBROADCASTSDYrr },
|
||||
{ X86::VBROADCASTSDZ256r_s , X86::VBROADCASTSDYrr },
|
||||
{ X86::VBROADCASTSDZ256m , X86::VBROADCASTSDYrm },
|
||||
{ X86::VBROADCASTSDZ256r , X86::VBROADCASTSDYrr },
|
||||
{ X86::VBROADCASTSDZ256r_s , X86::VBROADCASTSDYrr },
|
||||
{ X86::VBROADCASTSSZ256m , X86::VBROADCASTSSYrm },
|
||||
{ X86::VBROADCASTSSZ256r , X86::VBROADCASTSSYrr },
|
||||
{ X86::VBROADCASTSSZ256r , X86::VBROADCASTSSYrr },
|
||||
{ X86::VBROADCASTSSZ256r_s , X86::VBROADCASTSSYrr },
|
||||
{ X86::VCVTDQ2PDZ256rm , X86::VCVTDQ2PDYrm },
|
||||
{ X86::VCVTDQ2PDZ256rr , X86::VCVTDQ2PDYrr },
|
||||
@ -757,6 +755,14 @@ static const X86EvexToVexCompressTableEntry
|
||||
{ X86::VDIVPDZ256rr , X86::VDIVPDYrr },
|
||||
{ X86::VDIVPSZ256rm , X86::VDIVPSYrm },
|
||||
{ X86::VDIVPSZ256rr , X86::VDIVPSYrr },
|
||||
{ X86::VEXTRACTF32x4Z256mr , X86::VEXTRACTF128mr },
|
||||
{ X86::VEXTRACTF64x2Z256mr , X86::VEXTRACTF128mr },
|
||||
{ X86::VEXTRACTF32x4Z256rr , X86::VEXTRACTF128rr },
|
||||
{ X86::VEXTRACTF64x2Z256rr , X86::VEXTRACTF128rr },
|
||||
{ X86::VEXTRACTI32x4Z256mr , X86::VEXTRACTI128mr },
|
||||
{ X86::VEXTRACTI64x2Z256mr , X86::VEXTRACTI128mr },
|
||||
{ X86::VEXTRACTI32x4Z256rr , X86::VEXTRACTI128rr },
|
||||
{ X86::VEXTRACTI64x2Z256rr , X86::VEXTRACTI128rr },
|
||||
{ X86::VFMADD132PDZ256m , X86::VFMADD132PDYm },
|
||||
{ X86::VFMADD132PDZ256r , X86::VFMADD132PDYr },
|
||||
{ X86::VFMADD132PSZ256m , X86::VFMADD132PSYm },
|
||||
@ -829,6 +835,14 @@ static const X86EvexToVexCompressTableEntry
|
||||
{ X86::VFNMSUB231PDZ256r , X86::VFNMSUB231PDYr },
|
||||
{ X86::VFNMSUB231PSZ256m , X86::VFNMSUB231PSYm },
|
||||
{ X86::VFNMSUB231PSZ256r , X86::VFNMSUB231PSYr },
|
||||
{ X86::VINSERTF32x4Z256rm , X86::VINSERTF128rm },
|
||||
{ X86::VINSERTF64x2Z256rm , X86::VINSERTF128rm },
|
||||
{ X86::VINSERTF32x4Z256rr , X86::VINSERTF128rr },
|
||||
{ X86::VINSERTF64x2Z256rr , X86::VINSERTF128rr },
|
||||
{ X86::VINSERTI32x4Z256rm , X86::VINSERTI128rm },
|
||||
{ X86::VINSERTI64x2Z256rm , X86::VINSERTI128rm },
|
||||
{ X86::VINSERTI32x4Z256rr , X86::VINSERTI128rr },
|
||||
{ X86::VINSERTI64x2Z256rr , X86::VINSERTI128rr },
|
||||
{ X86::VMAXCPDZ256rm , X86::VMAXCPDYrm },
|
||||
{ X86::VMAXCPDZ256rr , X86::VMAXCPDYrr },
|
||||
{ X86::VMAXCPSZ256rm , X86::VMAXCPSYrm },
|
||||
@ -849,8 +863,8 @@ static const X86EvexToVexCompressTableEntry
|
||||
{ X86::VMOVAPDZ256rm , X86::VMOVAPDYrm },
|
||||
{ X86::VMOVAPDZ256rr , X86::VMOVAPDYrr },
|
||||
{ X86::VMOVAPDZ256rr_REV , X86::VMOVAPDYrr_REV },
|
||||
{ X86::VMOVAPSZ256mr , X86::VMOVAPSYmr },
|
||||
{ X86::VMOVAPSZ256rm , X86::VMOVAPSYrm },
|
||||
{ X86::VMOVAPSZ256mr , X86::VMOVAPSYmr },
|
||||
{ X86::VMOVAPSZ256rm , X86::VMOVAPSYrm },
|
||||
{ X86::VMOVAPSZ256rr , X86::VMOVAPSYrr },
|
||||
{ X86::VMOVAPSZ256rr_REV , X86::VMOVAPSYrr_REV },
|
||||
{ X86::VMOVDDUPZ256rm , X86::VMOVDDUPYrm },
|
||||
@ -943,14 +957,14 @@ static const X86EvexToVexCompressTableEntry
|
||||
{ X86::VPAVGBZ256rr , X86::VPAVGBYrr },
|
||||
{ X86::VPAVGWZ256rm , X86::VPAVGWYrm },
|
||||
{ X86::VPAVGWZ256rr , X86::VPAVGWYrr },
|
||||
{ X86::VPBROADCASTBZ256m , X86::VPBROADCASTBYrm },
|
||||
{ X86::VPBROADCASTBZ256r , X86::VPBROADCASTBYrr },
|
||||
{ X86::VPBROADCASTDZ256m , X86::VPBROADCASTDYrm },
|
||||
{ X86::VPBROADCASTDZ256r , X86::VPBROADCASTDYrr },
|
||||
{ X86::VPBROADCASTQZ256m , X86::VPBROADCASTQYrm },
|
||||
{ X86::VPBROADCASTQZ256r , X86::VPBROADCASTQYrr },
|
||||
{ X86::VPBROADCASTWZ256m , X86::VPBROADCASTWYrm },
|
||||
{ X86::VPBROADCASTWZ256r , X86::VPBROADCASTWYrr },
|
||||
{ X86::VPBROADCASTBZ256m , X86::VPBROADCASTBYrm },
|
||||
{ X86::VPBROADCASTBZ256r , X86::VPBROADCASTBYrr },
|
||||
{ X86::VPBROADCASTDZ256m , X86::VPBROADCASTDYrm },
|
||||
{ X86::VPBROADCASTDZ256r , X86::VPBROADCASTDYrr },
|
||||
{ X86::VPBROADCASTQZ256m , X86::VPBROADCASTQYrm },
|
||||
{ X86::VPBROADCASTQZ256r , X86::VPBROADCASTQYrr },
|
||||
{ X86::VPBROADCASTWZ256m , X86::VPBROADCASTWYrm },
|
||||
{ X86::VPBROADCASTWZ256r , X86::VPBROADCASTWYrr },
|
||||
{ X86::VPERMDZ256rm , X86::VPERMDYrm },
|
||||
{ X86::VPERMDZ256rr , X86::VPERMDYrr },
|
||||
{ X86::VPERMILPDZ256mi , X86::VPERMILPDYmi },
|
||||
@ -1050,7 +1064,7 @@ static const X86EvexToVexCompressTableEntry
|
||||
{ X86::VPSLLDQZ256rr , X86::VPSLLDQYri },
|
||||
{ X86::VPSLLDZ256ri , X86::VPSLLDYri },
|
||||
{ X86::VPSLLDZ256rm , X86::VPSLLDYrm },
|
||||
{ X86::VPSLLDZ256rr , X86::VPSLLDYrr },
|
||||
{ X86::VPSLLDZ256rr , X86::VPSLLDYrr },
|
||||
{ X86::VPSLLQZ256ri , X86::VPSLLQYri },
|
||||
{ X86::VPSLLQZ256rm , X86::VPSLLQYrm },
|
||||
{ X86::VPSLLQZ256rr , X86::VPSLLQYrr },
|
||||
@ -1060,7 +1074,7 @@ static const X86EvexToVexCompressTableEntry
|
||||
{ X86::VPSLLVQZ256rr , X86::VPSLLVQYrr },
|
||||
{ X86::VPSLLWZ256ri , X86::VPSLLWYri },
|
||||
{ X86::VPSLLWZ256rm , X86::VPSLLWYrm },
|
||||
{ X86::VPSLLWZ256rr , X86::VPSLLWYrr },
|
||||
{ X86::VPSLLWZ256rr , X86::VPSLLWYrr },
|
||||
{ X86::VPSRADZ256ri , X86::VPSRADYri },
|
||||
{ X86::VPSRADZ256rm , X86::VPSRADYrm },
|
||||
{ X86::VPSRADZ256rr , X86::VPSRADYrr },
|
||||
@ -1072,7 +1086,7 @@ static const X86EvexToVexCompressTableEntry
|
||||
{ X86::VPSRLDQZ256rr , X86::VPSRLDQYri },
|
||||
{ X86::VPSRLDZ256ri , X86::VPSRLDYri },
|
||||
{ X86::VPSRLDZ256rm , X86::VPSRLDYrm },
|
||||
{ X86::VPSRLDZ256rr , X86::VPSRLDYrr },
|
||||
{ X86::VPSRLDZ256rr , X86::VPSRLDYrr },
|
||||
{ X86::VPSRLQZ256ri , X86::VPSRLQYri },
|
||||
{ X86::VPSRLQZ256rm , X86::VPSRLQYrm },
|
||||
{ X86::VPSRLQZ256rr , X86::VPSRLQYrr },
|
||||
@ -1145,4 +1159,4 @@ static const X86EvexToVexCompressTableEntry
|
||||
{ X86::VXORPSZ256rr , X86::VXORPSYrr },
|
||||
};
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
@ -34,7 +34,7 @@ enum IntrinsicType : uint16_t {
|
||||
INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM,
|
||||
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, BRCST32x2_TO_VEC,
|
||||
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
|
||||
EXPAND_FROM_MEM, INSERT_SUBVEC,
|
||||
EXPAND_FROM_MEM,
|
||||
TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
|
||||
FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK
|
||||
};
|
||||
@ -795,30 +795,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
|
||||
X86ISD::VGETMANTS, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM,
|
||||
X86ISD::VGETMANTS, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_insertf32x4_256, INSERT_SUBVEC,
|
||||
ISD::INSERT_SUBVECTOR, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_insertf32x4_512, INSERT_SUBVEC,
|
||||
ISD::INSERT_SUBVECTOR, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_insertf32x8_512, INSERT_SUBVEC,
|
||||
ISD::INSERT_SUBVECTOR, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_insertf64x2_256, INSERT_SUBVEC,
|
||||
ISD::INSERT_SUBVECTOR, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_insertf64x2_512, INSERT_SUBVEC,
|
||||
ISD::INSERT_SUBVECTOR, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_insertf64x4_512, INSERT_SUBVEC,
|
||||
ISD::INSERT_SUBVECTOR, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_inserti32x4_256, INSERT_SUBVEC,
|
||||
ISD::INSERT_SUBVECTOR, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_inserti32x4_512, INSERT_SUBVEC,
|
||||
ISD::INSERT_SUBVECTOR, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_inserti32x8_512, INSERT_SUBVEC,
|
||||
ISD::INSERT_SUBVECTOR, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_inserti64x2_256, INSERT_SUBVEC,
|
||||
ISD::INSERT_SUBVECTOR, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_inserti64x2_512, INSERT_SUBVEC,
|
||||
ISD::INSERT_SUBVECTOR, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_inserti64x4_512, INSERT_SUBVEC,
|
||||
ISD::INSERT_SUBVECTOR, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK,
|
||||
ISD::CTLZ, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK,
|
||||
|
@ -1115,56 +1115,6 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLo
|
||||
OutStreamer->EmitInstruction(TC, getSubtargetInfo());
|
||||
}
|
||||
|
||||
void X86AsmPrinter::EmitXRayTable() {
|
||||
if (Sleds.empty())
|
||||
return;
|
||||
|
||||
auto PrevSection = OutStreamer->getCurrentSectionOnly();
|
||||
auto Fn = MF->getFunction();
|
||||
MCSection *Section = nullptr;
|
||||
if (Subtarget->isTargetELF()) {
|
||||
if (Fn->hasComdat()) {
|
||||
Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
|
||||
ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
|
||||
Fn->getComdat()->getName());
|
||||
} else {
|
||||
Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
|
||||
ELF::SHF_ALLOC);
|
||||
}
|
||||
} else if (Subtarget->isTargetMachO()) {
|
||||
Section = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
|
||||
SectionKind::getReadOnlyWithRel());
|
||||
} else {
|
||||
llvm_unreachable("Unsupported target");
|
||||
}
|
||||
|
||||
// Before we switch over, we force a reference to a label inside the
|
||||
// xray_instr_map section. Since EmitXRayTable() is always called just
|
||||
// before the function's end, we assume that this is happening after the
|
||||
// last return instruction.
|
||||
//
|
||||
// We then align the reference to 16 byte boundaries, which we determined
|
||||
// experimentally to be beneficial to avoid causing decoder stalls.
|
||||
MCSymbol *Tmp = OutContext.createTempSymbol("xray_synthetic_", true);
|
||||
OutStreamer->EmitCodeAlignment(16);
|
||||
OutStreamer->EmitSymbolValue(Tmp, 8, false);
|
||||
OutStreamer->SwitchSection(Section);
|
||||
OutStreamer->EmitLabel(Tmp);
|
||||
for (const auto &Sled : Sleds) {
|
||||
OutStreamer->EmitSymbolValue(Sled.Sled, 8);
|
||||
OutStreamer->EmitSymbolValue(CurrentFnSym, 8);
|
||||
auto Kind = static_cast<uint8_t>(Sled.Kind);
|
||||
OutStreamer->EmitBytes(
|
||||
StringRef(reinterpret_cast<const char *>(&Kind), 1));
|
||||
OutStreamer->EmitBytes(
|
||||
StringRef(reinterpret_cast<const char *>(&Sled.AlwaysInstrument), 1));
|
||||
OutStreamer->EmitZeros(14);
|
||||
}
|
||||
OutStreamer->SwitchSection(PrevSection);
|
||||
|
||||
Sleds.clear();
|
||||
}
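For context, the table above is only emitted for functions that opt into XRay instrumentation; a minimal IR example (attribute value assumed from the in-tree XRay tests, not from this patch) that would cause sleds, and hence xray_instr_map entries, to be generated:

define void @instrumented() "function-instrument"="xray-always" {
  ret void
}

Each entry the loop writes is 32 bytes: an 8-byte sled address, an 8-byte function address, one kind byte, one always-instrument byte, and 14 bytes of zero padding.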
|
||||
|
||||
// Returns instruction preceding MBBI in MachineFunction.
|
||||
// If MBBI is the first instruction of the first basic block, returns null.
|
||||
static MachineBasicBlock::const_iterator
|
||||
|
@ -598,197 +598,135 @@ int X86TTIImpl::getArithmeticInstrCost(
|
||||
|
||||
int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
|
||||
Type *SubTp) {
|
||||
|
||||
if (Kind == TTI::SK_Reverse) {
|
||||
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
|
||||
|
||||
static const CostTblEntry AVX512VBMIShuffleTbl[] = {
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v64i8, 1 }, // vpermb
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v32i8, 1 } // vpermb
|
||||
};
|
||||
|
||||
if (ST->hasVBMI())
|
||||
if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
|
||||
ISD::VECTOR_SHUFFLE, LT.second))
|
||||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry AVX512BWShuffleTbl[] = {
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v32i16, 1 }, // vpermw
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v16i16, 1 }, // vpermw
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128
|
||||
// + 2*pshufb + vinserti64x4
|
||||
};
|
||||
|
||||
if (ST->hasBWI())
|
||||
if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
|
||||
ISD::VECTOR_SHUFFLE, LT.second))
|
||||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry AVX512ShuffleTbl[] = {
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v8f64, 1 }, // vpermpd
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v16f32, 1 }, // vpermps
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v8i64, 1 }, // vpermq
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v16i32, 1 }, // vpermd
|
||||
};
|
||||
|
||||
if (ST->hasAVX512())
|
||||
if (const auto *Entry =
|
||||
CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
|
||||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry AVX2ShuffleTbl[] = {
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v4f64, 1 }, // vpermpd
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v8f32, 1 }, // vpermps
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v4i64, 1 }, // vpermq
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v8i32, 1 }, // vpermd
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v16i16, 2 }, // vperm2i128 + pshufb
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v32i8, 2 } // vperm2i128 + pshufb
|
||||
};
|
||||
|
||||
if (ST->hasAVX2())
|
||||
if (const auto *Entry =
|
||||
CostTableLookup(AVX2ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
|
||||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry AVX1ShuffleTbl[] = {
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
|
||||
// + vinsertf128
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v32i8, 4 } // vextractf128 + 2*pshufb
|
||||
// + vinsertf128
|
||||
};
|
||||
|
||||
if (ST->hasAVX())
|
||||
if (const auto *Entry =
|
||||
CostTableLookup(AVX1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
|
||||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry SSSE3ShuffleTbl[] = {
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v8i16, 1 }, // pshufb
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v16i8, 1 } // pshufb
|
||||
};
|
||||
|
||||
if (ST->hasSSSE3())
|
||||
if (const auto *Entry =
|
||||
CostTableLookup(SSSE3ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
|
||||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry SSE2ShuffleTbl[] = {
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 }, // shufpd
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 }, // pshufd
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v4i32, 1 }, // pshufd
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v16i8, 9 } // 2*pshuflw + 2*pshufhw
|
||||
// + 2*pshufd + 2*unpck + packus
|
||||
};
|
||||
|
||||
if (ST->hasSSE2())
|
||||
if (const auto *Entry =
|
||||
CostTableLookup(SSE2ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
|
||||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry SSE1ShuffleTbl[] = {
|
||||
{ ISD::VECTOR_SHUFFLE, MVT::v4f32, 1 }, // shufps
|
||||
};
|
||||
|
||||
if (ST->hasSSE1())
|
||||
if (const auto *Entry =
|
||||
CostTableLookup(SSE1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
|
||||
return LT.first * Entry->Cost;
|
||||
|
||||
} else if (Kind == TTI::SK_Alternate) {
|
||||
if (Kind == TTI::SK_Reverse || Kind == TTI::SK_Alternate) {
|
||||
// 64-bit packed float vectors (v2f32) are widened to type v4f32.
|
||||
// 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
|
||||
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
|
||||
|
||||
// The backend knows how to generate a single VEX.256 version of
|
||||
// instruction VPBLENDW if the target supports AVX2.
|
||||
if (ST->hasAVX2() && LT.second == MVT::v16i16)
|
||||
return LT.first;
|
||||
static const CostTblEntry AVX512VBMIShuffleTbl[] = {
|
||||
{ TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb
|
||||
{ TTI::SK_Reverse, MVT::v32i8, 1 } // vpermb
|
||||
};
|
||||
|
||||
static const CostTblEntry AVXAltShuffleTbl[] = {
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vblendpd
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vblendpd
|
||||
if (ST->hasVBMI())
|
||||
if (const auto *Entry =
|
||||
CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
|
||||
return LT.first * Entry->Cost;
|
||||
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vblendps
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vblendps
|
||||
static const CostTblEntry AVX512BWShuffleTbl[] = {
|
||||
{ TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
|
||||
{ TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
|
||||
{ TTI::SK_Reverse, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128
|
||||
// + 2*pshufb + vinserti64x4
|
||||
};
|
||||
|
||||
// This shuffle is custom lowered into a sequence of:
|
||||
// 2x vextractf128 , 2x vpblendw , 1x vinsertf128
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v16i16, 5},
|
||||
if (ST->hasBWI())
|
||||
if (const auto *Entry =
|
||||
CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
|
||||
return LT.first * Entry->Cost;
|
||||
|
||||
// This shuffle is custom lowered into a long sequence of:
|
||||
// 2x vextractf128 , 4x vpshufb , 2x vpor , 1x vinsertf128
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v32i8, 9}
|
||||
static const CostTblEntry AVX512ShuffleTbl[] = {
|
||||
{ TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd
|
||||
{ TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps
|
||||
{ TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq
|
||||
{ TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd
|
||||
};
|
||||
|
||||
if (ST->hasAVX512())
|
||||
if (const auto *Entry =
|
||||
CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
|
||||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry AVX2ShuffleTbl[] = {
|
||||
{ TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd
|
||||
{ TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps
|
||||
{ TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq
|
||||
{ TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd
|
||||
{ TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb
|
||||
{ TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb
|
||||
|
||||
{ TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
|
||||
{ TTI::SK_Alternate, MVT::v32i8, 1 } // vpblendvb
|
||||
};
|
||||
|
||||
if (ST->hasAVX2())
|
||||
if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
|
||||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry AVX1ShuffleTbl[] = {
|
||||
{ TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
|
||||
{ TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
|
||||
{ TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
|
||||
{ TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
|
||||
{ TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
|
||||
// + vinsertf128
|
||||
{ TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb
|
||||
// + vinsertf128
|
||||
|
||||
{ TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd
|
||||
{ TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd
|
||||
{ TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps
|
||||
{ TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps
|
||||
{ TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
|
||||
{ TTI::SK_Alternate, MVT::v32i8, 3 } // vpand + vpandn + vpor
|
||||
};
|
||||
|
||||
if (ST->hasAVX())
|
||||
if (const auto *Entry = CostTableLookup(AVXAltShuffleTbl,
|
||||
ISD::VECTOR_SHUFFLE, LT.second))
|
||||
if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
|
||||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry SSE41AltShuffleTbl[] = {
|
||||
// These are lowered into movsd.
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
|
||||
|
||||
// packed float vectors with four elements are lowered into BLENDI dag
|
||||
// nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'.
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
|
||||
|
||||
// This shuffle generates a single pshufw.
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
|
||||
|
||||
// There is no instruction that matches a v16i8 alternate shuffle.
|
||||
// The backend will expand it into the sequence 'pshufb + pshufb + or'.
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}
|
||||
static const CostTblEntry SSE41ShuffleTbl[] = {
|
||||
{ TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw
|
||||
{ TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
|
||||
{ TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw
|
||||
{ TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps
|
||||
{ TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw
|
||||
{ TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb
|
||||
};
|
||||
|
||||
if (ST->hasSSE41())
|
||||
if (const auto *Entry = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE,
|
||||
LT.second))
|
||||
if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
|
||||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry SSSE3AltShuffleTbl[] = {
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd
|
||||
static const CostTblEntry SSSE3ShuffleTbl[] = {
|
||||
{ TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
|
||||
{ TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
|
||||
|
||||
// SSE3 doesn't have 'blendps'. The following shuffles are expanded into
|
||||
// the sequence 'shufps + pshufd'
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
|
||||
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v8i16, 3}, // pshufb + pshufb + or
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or
|
||||
{ TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por
|
||||
{ TTI::SK_Alternate, MVT::v16i8, 3 } // pshufb + pshufb + por
|
||||
};
|
||||
|
||||
if (ST->hasSSSE3())
|
||||
if (const auto *Entry = CostTableLookup(SSSE3AltShuffleTbl,
|
||||
ISD::VECTOR_SHUFFLE, LT.second))
|
||||
if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
|
||||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry SSEAltShuffleTbl[] = {
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd
|
||||
static const CostTblEntry SSE2ShuffleTbl[] = {
|
||||
{ TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd
|
||||
{ TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd
|
||||
{ TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd
|
||||
{ TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
|
||||
{ TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
|
||||
// + 2*pshufd + 2*unpck + packus
|
||||
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, // shufps + pshufd
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, // shufps + pshufd
|
||||
|
||||
// This is expanded into a long sequence of four extract + four insert.
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v8i16, 8}, // 4 x pextrw + 4 pinsrw.
|
||||
|
||||
// 8 x (pinsrw + pextrw + and + movb + movzb + or)
|
||||
{ISD::VECTOR_SHUFFLE, MVT::v16i8, 48}
|
||||
{ TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd
|
||||
{ TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
|
||||
{ TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps
|
||||
{ TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por
|
||||
{ TTI::SK_Alternate, MVT::v16i8, 3 } // pand + pandn + por
|
||||
};
|
||||
|
||||
// Fall-back (SSE3 and SSE2).
|
||||
if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl,
|
||||
ISD::VECTOR_SHUFFLE, LT.second))
|
||||
return LT.first * Entry->Cost;
|
||||
if (ST->hasSSE2())
|
||||
if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
|
||||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry SSE1ShuffleTbl[] = {
|
||||
{ TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
|
||||
{ TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps
|
||||
};
|
||||
|
||||
if (ST->hasSSE1())
|
||||
if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
|
||||
return LT.first * Entry->Cost;
|
||||
|
||||
} else if (Kind == TTI::SK_PermuteTwoSrc) {
|
||||
// We assume that source and destination have the same vector type.
|
||||
|
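To make the new tables concrete, here is a standalone reverse-shuffle example (illustrative only, not part of the patch); it is classified as TTI::SK_Reverse and, on an AVX2 target, matches the { SK_Reverse, MVT::v8i32, 1 } entry above, i.e. a single vpermd:

define <8 x i32> @reverse_v8i32(<8 x i32> %a) {
  ; single-source reverse mask -> SK_Reverse
  %rev = shufflevector <8 x i32> %a, <8 x i32> undef,
                       <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i32> %rev
}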
@ -1057,6 +1057,18 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
|
||||
// add(zext(xor i16 X, -32768), -32768) --> sext X
|
||||
return CastInst::Create(Instruction::SExt, X, LHS->getType());
|
||||
}
|
||||
|
||||
if (Val->isNegative() &&
|
||||
match(LHS, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C)))) &&
|
||||
Val->sge(-C->sext(Val->getBitWidth()))) {
|
||||
// (add (zext (add nuw X, C)), Val) -> (zext (add nuw X, C+Val))
|
||||
return CastInst::Create(
|
||||
Instruction::ZExt,
|
||||
Builder->CreateNUWAdd(
|
||||
X, Constant::getIntegerValue(X->getType(),
|
||||
*C + Val->trunc(C->getBitWidth()))),
|
||||
I.getType());
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: Use the match above instead of dyn_cast to allow these transforms
|
||||
|
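A hedged before/after sketch of the new (add (zext (add nuw X, C)), Val) fold, with made-up constants chosen so that the Val->sge(-C) guard holds:

define i32 @zext_add_fold(i8 %x) {
  %a = add nuw i8 %x, 100
  %z = zext i8 %a to i32
  %r = add i32 %z, -50        ; -50 >= -100, so the fold fires
  ret i32 %r
}
; expected result:
;   %a = add nuw i8 %x, 50
;   %r = zext i8 %a to i32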
@ -1581,6 +1581,62 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
|
||||
return replaceInstUsesWith(*II, V);
|
||||
break;
|
||||
}
|
||||
case Intrinsic::fma:
|
||||
case Intrinsic::fmuladd: {
|
||||
Value *Src0 = II->getArgOperand(0);
|
||||
Value *Src1 = II->getArgOperand(1);
|
||||
|
||||
// Canonicalize constants into the RHS.
|
||||
if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
|
||||
II->setArgOperand(0, Src1);
|
||||
II->setArgOperand(1, Src0);
|
||||
std::swap(Src0, Src1);
|
||||
}
|
||||
|
||||
Value *LHS = nullptr;
|
||||
Value *RHS = nullptr;
|
||||
|
||||
// fma fneg(x), fneg(y), z -> fma x, y, z
|
||||
if (match(Src0, m_FNeg(m_Value(LHS))) &&
|
||||
match(Src1, m_FNeg(m_Value(RHS)))) {
|
||||
CallInst *NewCall = Builder->CreateCall(II->getCalledFunction(),
|
||||
{LHS, RHS, II->getArgOperand(2)});
|
||||
NewCall->takeName(II);
|
||||
NewCall->copyFastMathFlags(II);
|
||||
return replaceInstUsesWith(*II, NewCall);
|
||||
}
|
||||
|
||||
// fma fabs(x), fabs(x), z -> fma x, x, z
|
||||
if (match(Src0, m_Intrinsic<Intrinsic::fabs>(m_Value(LHS))) &&
|
||||
match(Src1, m_Intrinsic<Intrinsic::fabs>(m_Value(RHS))) && LHS == RHS) {
|
||||
CallInst *NewCall = Builder->CreateCall(II->getCalledFunction(),
|
||||
{LHS, LHS, II->getArgOperand(2)});
|
||||
NewCall->takeName(II);
|
||||
NewCall->copyFastMathFlags(II);
|
||||
return replaceInstUsesWith(*II, NewCall);
|
||||
}
|
||||
|
||||
// fma x, 1, z -> fadd x, z
|
||||
if (match(Src1, m_FPOne())) {
|
||||
Instruction *RI = BinaryOperator::CreateFAdd(Src0, II->getArgOperand(2));
|
||||
RI->copyFastMathFlags(II);
|
||||
return RI;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case Intrinsic::fabs: {
|
||||
Value *Cond;
|
||||
Constant *LHS, *RHS;
|
||||
if (match(II->getArgOperand(0),
|
||||
m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) {
|
||||
CallInst *Call0 = Builder->CreateCall(II->getCalledFunction(), {LHS});
|
||||
CallInst *Call1 = Builder->CreateCall(II->getCalledFunction(), {RHS});
|
||||
return SelectInst::Create(Cond, Call0, Call1);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case Intrinsic::ppc_altivec_lvx:
|
||||
case Intrinsic::ppc_altivec_lvxl:
|
||||
// Turn PPC lvx -> load if the pointer is known aligned.
|
||||
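An illustrative IR example (not taken from the patch) of the new fma/fmuladd folds: the paired fnegs cancel, and a unit multiplicand degrades the call to an fadd:

declare float @llvm.fma.f32(float, float, float)

define float @fma_fneg_fneg(float %x, float %y, float %z) {
  %nx = fsub float -0.000000e+00, %x
  %ny = fsub float -0.000000e+00, %y
  ; folds to: call float @llvm.fma.f32(float %x, float %y, float %z)
  %r = call float @llvm.fma.f32(float %nx, float %ny, float %z)
  ret float %r
}

define float @fma_times_one(float %x, float %z) {
  ; folds to: fadd float %x, %z
  %r = call float @llvm.fma.f32(float %x, float 1.000000e+00, float %z)
  ret float %r
}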
@ -2669,24 +2725,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
|
||||
|
||||
// assume( (load addr) != null ) -> add 'nonnull' metadata to load
|
||||
// (if assume is valid at the load)
|
||||
if (ICmpInst* ICmp = dyn_cast<ICmpInst>(IIOperand)) {
|
||||
Value *LHS = ICmp->getOperand(0);
|
||||
Value *RHS = ICmp->getOperand(1);
|
||||
if (ICmpInst::ICMP_NE == ICmp->getPredicate() &&
|
||||
isa<LoadInst>(LHS) &&
|
||||
isa<Constant>(RHS) &&
|
||||
RHS->getType()->isPointerTy() &&
|
||||
cast<Constant>(RHS)->isNullValue()) {
|
||||
LoadInst* LI = cast<LoadInst>(LHS);
|
||||
if (isValidAssumeForContext(II, LI, &DT)) {
|
||||
MDNode *MD = MDNode::get(II->getContext(), None);
|
||||
LI->setMetadata(LLVMContext::MD_nonnull, MD);
|
||||
return eraseInstFromFunction(*II);
|
||||
}
|
||||
}
|
||||
CmpInst::Predicate Pred;
|
||||
Instruction *LHS;
|
||||
if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) &&
|
||||
Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load &&
|
||||
LHS->getType()->isPointerTy() &&
|
||||
isValidAssumeForContext(II, LHS, &DT)) {
|
||||
MDNode *MD = MDNode::get(II->getContext(), None);
|
||||
LHS->setMetadata(LLVMContext::MD_nonnull, MD);
|
||||
return eraseInstFromFunction(*II);
|
||||
|
||||
// TODO: apply nonnull return attributes to calls and invokes
|
||||
// TODO: apply range metadata for range check patterns?
|
||||
}
|
||||
|
||||
// If there is a dominating assume with the same condition as this one,
|
||||
// then this one is redundant, and should be removed.
|
||||
APInt KnownZero(1, 0), KnownOne(1, 0);
|
||||
|
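The rewritten matcher recognizes the same assume(load != null) idiom as before; a minimal example (names illustrative):

declare void @llvm.assume(i1)

define i32* @load_assumed_nonnull(i32** %p) {
  %v = load i32*, i32** %p
  %c = icmp ne i32* %v, null
  ; after the transform, %v carries !nonnull metadata and the assume is erased
  call void @llvm.assume(i1 %c)
  ret i32* %v
}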
@ -850,20 +850,10 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
|
||||
// separated by a few arithmetic operations.
|
||||
BasicBlock::iterator BBI(LI);
|
||||
bool IsLoadCSE = false;
|
||||
if (Value *AvailableVal =
|
||||
FindAvailableLoadedValue(&LI, LI.getParent(), BBI,
|
||||
DefMaxInstsToScan, AA, &IsLoadCSE)) {
|
||||
if (IsLoadCSE) {
|
||||
LoadInst *NLI = cast<LoadInst>(AvailableVal);
|
||||
unsigned KnownIDs[] = {
|
||||
LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
|
||||
LLVMContext::MD_noalias, LLVMContext::MD_range,
|
||||
LLVMContext::MD_invariant_load, LLVMContext::MD_nonnull,
|
||||
LLVMContext::MD_invariant_group, LLVMContext::MD_align,
|
||||
LLVMContext::MD_dereferenceable,
|
||||
LLVMContext::MD_dereferenceable_or_null};
|
||||
combineMetadata(NLI, &LI, KnownIDs);
|
||||
};
|
||||
if (Value *AvailableVal = FindAvailableLoadedValue(
|
||||
&LI, LI.getParent(), BBI, DefMaxInstsToScan, AA, &IsLoadCSE)) {
|
||||
if (IsLoadCSE)
|
||||
combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI);
|
||||
|
||||
return replaceInstUsesWith(
|
||||
LI, Builder->CreateBitOrPointerCast(AvailableVal, LI.getType(),
|
||||
|
@ -731,6 +731,25 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
|
||||
if (ConstantInt *Op1C = dyn_cast<ConstantInt>(I.getOperand(1))) {
|
||||
unsigned ShAmt = Op1C->getZExtValue();
|
||||
|
||||
// Turn:
|
||||
// %zext = zext i32 %V to i64
|
||||
// %res = shl i64 %V, 8
|
||||
//
|
||||
// Into:
|
||||
// %shl = shl i32 %V, 8
|
||||
// %res = zext i32 %shl to i64
|
||||
//
|
||||
// This is only valid if %V would have zeros shifted out.
|
||||
if (auto *ZI = dyn_cast<ZExtInst>(I.getOperand(0))) {
|
||||
unsigned SrcBitWidth = ZI->getSrcTy()->getScalarSizeInBits();
|
||||
if (ShAmt < SrcBitWidth &&
|
||||
MaskedValueIsZero(ZI->getOperand(0),
|
||||
APInt::getHighBitsSet(SrcBitWidth, ShAmt), 0, &I)) {
|
||||
auto *Shl = Builder->CreateShl(ZI->getOperand(0), ShAmt);
|
||||
return new ZExtInst(Shl, I.getType());
|
||||
}
|
||||
}
|
||||
|
||||
// If the shifted-out value is known-zero, then this is a NUW shift.
|
||||
if (!I.hasNoUnsignedWrap() &&
|
||||
MaskedValueIsZero(I.getOperand(0),
|
||||
|
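A minimal example of the shl-of-zext narrowing added above; the and guarantees the bits that would be shifted out of the narrow value are already zero:

define i64 @narrow_shl(i32 %v) {
  %m = and i32 %v, 16777215    ; clears the top 8 bits
  %z = zext i32 %m to i64
  %r = shl i64 %z, 8           ; becomes: shl i32 %m, 8 followed by zext to i64
  ret i64 %r
}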
@ -481,9 +481,9 @@ private:
|
||||
bool processNode(DomTreeNode *Node);
|
||||
|
||||
Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
|
||||
if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
|
||||
if (auto *LI = dyn_cast<LoadInst>(Inst))
|
||||
return LI;
|
||||
else if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
|
||||
if (auto *SI = dyn_cast<StoreInst>(Inst))
|
||||
return SI->getValueOperand();
|
||||
assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
|
||||
return TTI.getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst),
|
||||
|
@ -79,6 +79,7 @@ STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted");
|
||||
STATISTIC(NumGVNBlocksDeleted, "Number of blocks deleted");
|
||||
STATISTIC(NumGVNOpsSimplified, "Number of Expressions simplified");
|
||||
STATISTIC(NumGVNPhisAllSame, "Number of PHIs whos arguments are all the same");
|
||||
STATISTIC(NumGVNMaxIterations, "Maximum Number of iterations it took to converge GVN");
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// GVN Pass
|
||||
@ -714,16 +715,15 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I,
|
||||
// Unlike loads, we never try to eliminate stores, so we do not check if they
|
||||
// are simple and avoid value numbering them.
|
||||
auto *SI = cast<StoreInst>(I);
|
||||
// If this store's memorydef stores the same value as the last store, the
|
||||
// memory accesses are equivalent.
|
||||
// Get the expression, if any, for the RHS of the MemoryDef.
|
||||
MemoryAccess *StoreAccess = MSSA->getMemoryAccess(SI);
|
||||
MemoryAccess *StoreRHS = lookupMemoryAccessEquiv(
|
||||
cast<MemoryDef>(StoreAccess)->getDefiningAccess());
|
||||
const Expression *OldStore = createStoreExpression(SI, StoreRHS, B);
|
||||
// See if this store expression already has a value, and it's the same as our
|
||||
// current store. FIXME: Right now, we only do this for simple stores.
|
||||
// See if we are defined by a previous store expression, it already has a
|
||||
// value, and it's the same value as our current store. FIXME: Right now, we
|
||||
// only do this for simple stores, we should expand to cover memcpys, etc.
|
||||
if (SI->isSimple()) {
|
||||
// Get the expression, if any, for the RHS of the MemoryDef.
|
||||
MemoryAccess *StoreRHS = lookupMemoryAccessEquiv(
|
||||
cast<MemoryDef>(StoreAccess)->getDefiningAccess());
|
||||
const Expression *OldStore = createStoreExpression(SI, StoreRHS, B);
|
||||
CongruenceClass *CC = ExpressionToClass.lookup(OldStore);
|
||||
if (CC && CC->DefiningExpr && isa<StoreExpression>(CC->DefiningExpr) &&
|
||||
CC->RepLeader == lookupOperandLeader(SI->getValueOperand(), SI, B))
|
||||
@ -1092,23 +1092,16 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) {
|
||||
if (auto *I = dyn_cast<Instruction>(V)) {
|
||||
if (MemoryAccess *MA = MSSA->getMemoryAccess(I)) {
|
||||
// If this is a MemoryDef, we need to update the equivalence table. If
|
||||
// we
|
||||
// determined the expression is congruent to a different memory state,
|
||||
// use that different memory state. If we determined it didn't, we
|
||||
// update
|
||||
// that as well. Note that currently, we do not guarantee the
|
||||
// "different" memory state dominates us. The goal is to make things
|
||||
// that are congruent look congruent, not ensure we can eliminate one in
|
||||
// favor of the other.
|
||||
// Right now, the only way they can be equivalent is for store
|
||||
// expresions.
|
||||
if (!isa<MemoryUse>(MA)) {
|
||||
if (E && isa<StoreExpression>(E) && EClass->Members.size() != 1) {
|
||||
auto *DefAccess = cast<StoreExpression>(E)->getDefiningAccess();
|
||||
setMemoryAccessEquivTo(MA, DefAccess != MA ? DefAccess : nullptr);
|
||||
} else {
|
||||
setMemoryAccessEquivTo(MA, nullptr);
|
||||
}
|
||||
// we determined the expression is congruent to a different memory
|
||||
// state, use that different memory state. If we determined it didn't,
|
||||
// we update that as well. Right now, we only support store
|
||||
// expressions.
|
||||
if (!isa<MemoryUse>(MA) && isa<StoreExpression>(E) &&
|
||||
EClass->Members.size() != 1) {
|
||||
auto *DefAccess = cast<StoreExpression>(E)->getDefiningAccess();
|
||||
setMemoryAccessEquivTo(MA, DefAccess != MA ? DefAccess : nullptr);
|
||||
} else {
|
||||
setMemoryAccessEquivTo(MA, nullptr);
|
||||
}
|
||||
markMemoryUsersTouched(MA);
|
||||
}
|
||||
@ -1391,7 +1384,7 @@ void NewGVN::valueNumberInstruction(Instruction *I) {
|
||||
} else {
|
||||
// Handle terminators that return values. All of them produce values we
|
||||
// don't currently understand.
|
||||
if (!I->getType()->isVoidTy()){
|
||||
if (!I->getType()->isVoidTy()) {
|
||||
auto *Symbolized = createUnknownExpression(I);
|
||||
performCongruenceFinding(I, Symbolized);
|
||||
}
|
||||
@ -1427,14 +1420,12 @@ void NewGVN::verifyMemoryCongruency() {
|
||||
continue;
|
||||
if (auto *FirstMUD = dyn_cast<MemoryUseOrDef>(KV.first)) {
|
||||
auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second);
|
||||
if (FirstMUD && SecondMUD) {
|
||||
auto *FirstInst = FirstMUD->getMemoryInst();
|
||||
auto *SecondInst = SecondMUD->getMemoryInst();
|
||||
if (FirstMUD && SecondMUD)
|
||||
assert(
|
||||
ValueToClass.lookup(FirstInst) == ValueToClass.lookup(SecondInst) &&
|
||||
ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
|
||||
ValueToClass.lookup(SecondMUD->getMemoryInst()) &&
|
||||
"The instructions for these memory operations should have been in "
|
||||
"the same congruence class");
|
||||
}
|
||||
} else if (auto *FirstMP = dyn_cast<MemoryPhi>(KV.first)) {
|
||||
|
||||
// We can only sanely verify that MemoryDefs in the operand list all have
|
||||
@ -1538,9 +1529,11 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
|
||||
|
||||
initializeCongruenceClasses(F);
|
||||
|
||||
unsigned int Iterations = 0;
|
||||
// We start out in the entry block.
|
||||
BasicBlock *LastBlock = &F.getEntryBlock();
|
||||
while (TouchedInstructions.any()) {
|
||||
++Iterations;
|
||||
// Walk through all the instructions in all the blocks in RPO.
|
||||
for (int InstrNum = TouchedInstructions.find_first(); InstrNum != -1;
|
||||
InstrNum = TouchedInstructions.find_next(InstrNum)) {
|
||||
@ -1587,8 +1580,7 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
|
||||
TouchedInstructions.reset(InstrNum);
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: Move this to expensive checks when we are satisfied with NewGVN
|
||||
NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
|
||||
#ifndef NDEBUG
|
||||
verifyMemoryCongruency();
|
||||
#endif
|
||||
@ -2070,7 +2062,7 @@ bool NewGVN::eliminateInstructions(Function &F) {
|
||||
|
||||
// Cleanup the congruence class.
|
||||
SmallPtrSet<Value *, 4> MembersLeft;
|
||||
for (Value * Member : CC->Members) {
|
||||
for (Value *Member : CC->Members) {
|
||||
if (Member->getType()->isVoidTy()) {
|
||||
MembersLeft.insert(Member);
|
||||
continue;
|
||||
|
@ -760,7 +760,7 @@ static void PropagateParallelLoopAccessMetadata(CallSite CS,
|
||||
|
||||
/// When inlining a function that contains noalias scope metadata,
|
||||
/// this metadata needs to be cloned so that the inlined blocks
|
||||
/// have different "unqiue scopes" at every call site. Were this not done, then
|
||||
/// have different "unique scopes" at every call site. Were this not done, then
|
||||
/// aliasing scopes from a function inlined into a caller multiple times could
|
||||
/// not be differentiated (and this would lead to miscompiles because the
|
||||
/// non-aliasing property communicated by the metadata could have
|
||||
|
@ -335,10 +335,12 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
|
||||
unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1);
|
||||
|
||||
uint64_t TrueWeight, FalseWeight;
|
||||
uint64_t ExitWeight = 0, BackEdgeWeight = 0;
|
||||
uint64_t ExitWeight = 0, CurHeaderWeight = 0;
|
||||
if (LatchBR->extractProfMetadata(TrueWeight, FalseWeight)) {
|
||||
ExitWeight = HeaderIdx ? TrueWeight : FalseWeight;
|
||||
BackEdgeWeight = HeaderIdx ? FalseWeight : TrueWeight;
|
||||
// The # of times the loop body executes is the sum of the exit block
|
||||
// weight and the # of times the backedges are taken.
|
||||
CurHeaderWeight = TrueWeight + FalseWeight;
|
||||
}
|
||||
|
||||
// For each peeled-off iteration, make a copy of the loop.
|
||||
@ -346,15 +348,14 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
|
||||
SmallVector<BasicBlock *, 8> NewBlocks;
|
||||
ValueToValueMapTy VMap;
|
||||
|
||||
// The exit weight of the previous iteration is the header entry weight
|
||||
// of the current iteration. So this is exactly how many dynamic iterations
|
||||
// the current peeled-off static iteration uses up.
|
||||
// Subtract the exit weight from the current header weight -- the exit
|
||||
// weight is exactly the weight of the previous iteration's header.
|
||||
// FIXME: due to the way the distribution is constructed, we need a
|
||||
// guard here to make sure we don't end up with non-positive weights.
|
||||
if (ExitWeight < BackEdgeWeight)
|
||||
BackEdgeWeight -= ExitWeight;
|
||||
if (ExitWeight < CurHeaderWeight)
|
||||
CurHeaderWeight -= ExitWeight;
|
||||
else
|
||||
BackEdgeWeight = 1;
|
||||
CurHeaderWeight = 1;
|
||||
|
||||
cloneLoopBlocks(L, Iter, InsertTop, InsertBot, Exit,
|
||||
NewBlocks, LoopBlocks, VMap, LVMap, LI);
|
||||
@ -388,6 +389,14 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
|
||||
|
||||
// Adjust the branch weights on the loop exit.
|
||||
if (ExitWeight) {
|
||||
// The backedge count is the difference of current header weight and
|
||||
// current loop exit weight. If the current header weight is smaller than
|
||||
// the current loop exit weight, we mark the loop backedge weight as 1.
|
||||
uint64_t BackEdgeWeight = 0;
|
||||
if (ExitWeight < CurHeaderWeight)
|
||||
BackEdgeWeight = CurHeaderWeight - ExitWeight;
|
||||
else
|
||||
BackEdgeWeight = 1;
|
||||
MDBuilder MDB(LatchBR->getContext());
|
||||
MDNode *WeightNode =
|
||||
HeaderIdx ? MDB.createBranchWeights(ExitWeight, BackEdgeWeight)
|
||||
|
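For example (assumed numbers): if the latch profile yields ExitWeight = 10 and a combined header weight of 100, peeling two iterations leaves CurHeaderWeight = 80, and the remaining loop's latch is re-annotated with an exit weight of 10 and a backedge weight of 80 - 10 = 70.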
@ -1574,12 +1574,20 @@ static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
|
||||
I0->getOperandUse(O).set(NewOperands[O]);
|
||||
I0->moveBefore(&*BBEnd->getFirstInsertionPt());
|
||||
|
||||
// Update metadata and IR flags.
|
||||
// The debug location for the "common" instruction is the merged locations of
|
||||
// all the commoned instructions. We start with the original location of the
|
||||
// "common" instruction and iteratively merge each location in the loop below.
|
||||
DILocation *Loc = I0->getDebugLoc();
|
||||
|
||||
// Update metadata and IR flags, and merge debug locations.
|
||||
for (auto *I : Insts)
|
||||
if (I != I0) {
|
||||
Loc = DILocation::getMergedLocation(Loc, I->getDebugLoc());
|
||||
combineMetadataForCSE(I0, I);
|
||||
I0->andIRFlags(I);
|
||||
}
|
||||
if (!isa<CallInst>(I0))
|
||||
I0->setDebugLoc(Loc);
|
||||
|
||||
if (!isa<StoreInst>(I0)) {
|
||||
// canSinkLastInstruction checked that all instructions were used by
|
||||
|
@ -73,7 +73,13 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
# Setting a variable to let sub-projects detect which other projects
|
||||
# will be included under here.
|
||||
set(HAVE_${canon_name} On)
|
||||
endforeach()
|
||||
|
||||
# We do this in two loops so that HAVE_* is set for each runtime before the
|
||||
# other runtimes are added.
|
||||
foreach(entry ${runtimes})
|
||||
get_filename_component(projName ${entry} NAME)
|
||||
|
||||
# Between each sub-project we want to cache and clear the LIT properties
|
||||
set_property(GLOBAL PROPERTY LLVM_LIT_TESTSUITES)
|
||||
set_property(GLOBAL PROPERTY LLVM_LIT_PARAMS)
|
||||
|
@ -207,7 +207,7 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) {
|
||||
ret <8 x i16> %1
|
||||
}
|
||||
; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v8i16':
|
||||
; SSE2: Cost Model: {{.*}} 8 for instruction: %1 = shufflevector
|
||||
; SSE2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
|
||||
; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
|
||||
; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
|
||||
; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
|
||||
@ -219,7 +219,7 @@ define <8 x i16> @test_v8i16_2(<8 x i16> %a, <8 x i16> %b) {
|
||||
ret <8 x i16> %1
|
||||
}
|
||||
; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v8i16_2':
|
||||
; SSE2: Cost Model: {{.*}} 8 for instruction: %1 = shufflevector
|
||||
; SSE2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
|
||||
; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
|
||||
; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
|
||||
; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
|
||||
@ -280,11 +280,11 @@ define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) {
|
||||
ret <16 x i8> %1
|
||||
}
|
||||
; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i8':
|
||||
; SSE2: Cost Model: {{.*}} 48 for instruction: %1 = shufflevector
|
||||
; SSE2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
|
||||
; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
|
||||
; SSE41: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
|
||||
; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
|
||||
; AVX2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
|
||||
; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
|
||||
; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
|
||||
; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
|
||||
|
||||
|
||||
define <16 x i8> @test_v16i8_2(<16 x i8> %a, <16 x i8> %b) {
|
||||
@ -292,11 +292,11 @@ define <16 x i8> @test_v16i8_2(<16 x i8> %a, <16 x i8> %b) {
|
||||
ret <16 x i8> %1
|
||||
}
|
||||
; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i8_2':
|
||||
; SSE2: Cost Model: {{.*}} 48 for instruction: %1 = shufflevector
|
||||
; SSE2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
|
||||
; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
|
||||
; SSE41: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
|
||||
; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
|
||||
; AVX2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
|
||||
; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
|
||||
; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
|
||||
; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
|
||||
|
||||
|
||||
define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
|
||||
@ -304,10 +304,10 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
|
||||
ret <16 x i16> %1
|
||||
}
|
||||
; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i16':
|
||||
; SSE2: Cost Model: {{.*}} 16 for instruction: %1 = shufflevector
|
||||
; SSE2: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
|
||||
; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
|
||||
; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
|
||||
; AVX: Cost Model: {{.*}} 5 for instruction: %1 = shufflevector
|
||||
; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
|
||||
; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
|
||||
|
||||
|
||||
@ -316,10 +316,10 @@ define <16 x i16> @test_v16i16_2(<16 x i16> %a, <16 x i16> %b) {
|
||||
ret <16 x i16> %1
|
||||
}
|
||||
; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i16_2':
|
||||
; SSE2: Cost Model: {{.*}} 16 for instruction: %1 = shufflevector
|
||||
; SSE2: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
|
||||
; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
|
||||
; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
|
||||
; AVX: Cost Model: {{.*}} 5 for instruction: %1 = shufflevector
|
||||
; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
|
||||
; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
|
||||
|
||||
define <32 x i8> @test_v32i8(<32 x i8> %a, <32 x i8> %b) {
|
||||
@ -327,11 +327,11 @@ define <32 x i8> @test_v32i8(<32 x i8> %a, <32 x i8> %b) {
|
||||
ret <32 x i8> %1
|
||||
}
|
||||
; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v32i8':
|
||||
; SSE2: Cost Model: {{.*}} 96 for instruction: %1 = shufflevector
|
||||
; SSE2: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
|
||||
; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
|
||||
; SSE41: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
|
||||
; AVX: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector
|
||||
; AVX2: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector
|
||||
; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
|
||||
; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
|
||||
; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
|
||||
|
||||
|
||||
define <32 x i8> @test_v32i8_2(<32 x i8> %a, <32 x i8> %b) {
|
||||
@ -339,9 +339,9 @@ define <32 x i8> @test_v32i8_2(<32 x i8> %a, <32 x i8> %b) {
|
||||
ret <32 x i8> %1
|
||||
}
|
||||
; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v32i8_2':
|
||||
; SSE2: Cost Model: {{.*}} 96 for instruction: %1 = shufflevector
|
||||
; SSE2: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
|
||||
; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
|
||||
; SSE41: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
|
||||
; AVX: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector
|
||||
; AVX2: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector
|
||||
; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
|
||||
; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
|
||||
; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
|
||||
|
||||
|
43
test/Analysis/RegionInfo/bad_node_traversal.ll
Normal file
@ -0,0 +1,43 @@
|
||||
; REQUIRES: asserts
|
||||
; RUN: opt -regions -analyze < %s | FileCheck %s
|
||||
|
||||
; While working on improvements to the region info analysis, this test
|
||||
; case caused an incorrect region 3 => 8 to be detected.
|
||||
|
||||
define internal i8 @wibble() {
|
||||
bb:
|
||||
br i1 true, label %bb1, label %bb8
|
||||
|
||||
bb1: ; preds = %bb
|
||||
switch i32 0, label %bb2 [
|
||||
i32 0, label %bb3
|
||||
i32 1, label %bb7
|
||||
]
|
||||
|
||||
bb2: ; preds = %bb1
|
||||
br label %bb4
|
||||
|
||||
bb3: ; preds = %bb1
|
||||
br label %bb5
|
||||
|
||||
bb4: ; preds = %bb2
|
||||
br label %bb6
|
||||
|
||||
bb5: ; preds = %bb3
|
||||
br label %bb6
|
||||
|
||||
bb6: ; preds = %bb5, %bb4
|
||||
br label %bb7
|
||||
|
||||
bb7: ; preds = %bb6, %bb1
|
||||
br label %bb8
|
||||
|
||||
bb8: ; preds = %bb7, %bb
|
||||
ret i8 1
|
||||
}
|
||||
|
||||
; CHECK: [0] bb => <Function Return>
|
||||
; CHECK-NEXT: [1] bb => bb8
|
||||
; CHECK-NEXT: [2] bb1 => bb7
|
||||
; CHECK-NEXT: End region tree
|
||||
|
@ -1,5 +1,8 @@
|
||||
; RUN: llvm-dis -o - %s.bc | FileCheck %s
|
||||
; RUN: llvm-dis -o - %s.bc | llvm-as - | llvm-bcanalyzer -dump - | FileCheck %s --check-prefix=BC
|
||||
|
||||
; BC: GLOBAL_VAR_EXPR
|
||||
; BC: GLOBAL_DECL_ATTACHMENT
|
||||
; CHECK: @g = common global i32 0, align 4, !dbg ![[G:[0-9]+]]
|
||||
; CHECK: @h = common global i32 0, align 4, !dbg ![[H:[0-9]+]]
|
||||
; CHECK: ![[G]] = {{.*}}!DIGlobalVariableExpression(var: ![[GVAR:[0-9]+]], expr: ![[GEXPR:[0-9]+]])
|
||||
|
@ -1,5 +1,6 @@
|
||||
; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone < %s | FileCheck %s -check-prefix=CYCLONE --check-prefix=ALL
|
||||
; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=kryo < %s | FileCheck %s -check-prefix=KRYO --check-prefix=ALL
|
||||
; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=falkor < %s | FileCheck %s -check-prefix=FALKOR --check-prefix=ALL
|
||||
|
||||
; rdar://11481771
|
||||
; rdar://13713797
|
||||
@ -16,6 +17,10 @@ entry:
|
||||
; KRYO: movi v1.2d, #0000000000000000
|
||||
; KRYO: movi v2.2d, #0000000000000000
|
||||
; KRYO: movi v3.2d, #0000000000000000
|
||||
; FALKOR: movi v0.2d, #0000000000000000
|
||||
; FALKOR: movi v1.2d, #0000000000000000
|
||||
; FALKOR: movi v2.2d, #0000000000000000
|
||||
; FALKOR: movi v3.2d, #0000000000000000
|
||||
tail call void @bar(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00) nounwind
|
||||
ret void
|
||||
}
|
||||
@ -47,6 +52,8 @@ define void @t4() nounwind ssp {
|
||||
; CYCLONE: movi.2d v1, #0000000000000000
|
||||
; KRYO: movi v0.2d, #0000000000000000
|
||||
; KRYO: movi v1.2d, #0000000000000000
|
||||
; FALKOR: movi v0.2d, #0000000000000000
|
||||
; FALKOR: movi v1.2d, #0000000000000000
|
||||
tail call void @barf(float 0.000000e+00, float 0.000000e+00) nounwind
|
||||
ret void
|
||||
}
|
||||
|
12
test/CodeGen/AArch64/store_merge_pair_offset.ll
Normal file
@ -0,0 +1,12 @@
|
||||
; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -enable-misched=false -enable-post-misched=false -o - %s | FileCheck %s
|
||||
|
||||
define i64 @test(i64* %a) nounwind {
|
||||
; CHECK: ldp x{{[0-9]+}}, x{{[0-9]+}}
|
||||
; CHECK-NOT: ldr
|
||||
%p1 = getelementptr inbounds i64, i64* %a, i32 64
|
||||
%tmp1 = load i64, i64* %p1, align 2
|
||||
%p2 = getelementptr inbounds i64, i64* %a, i32 63
|
||||
%tmp2 = load i64, i64* %p2, align 2
|
||||
%tmp3 = add i64 %tmp1, %tmp2
|
||||
ret i64 %tmp3
|
||||
}
|
41
test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll
Normal file
@ -0,0 +1,41 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
|
||||
|
||||
; GCN-LABEL: {{^}}main:
|
||||
; GCN: s_mov_b32 m0, s0
|
||||
; VI-NEXT: s_nop 0
|
||||
; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP)
|
||||
; GCN-NEXT: s_endpgm
|
||||
|
||||
define amdgpu_gs void @main(i32 inreg %a) #0 {
|
||||
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 %a)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}main_halt:
|
||||
; GCN: s_mov_b32 m0, s0
|
||||
; VI-NEXT: s_nop 0
|
||||
; GCN-NEXT: s_sendmsghalt sendmsg(MSG_INTERRUPT)
|
||||
; GCN-NEXT: s_endpgm
|
||||
|
||||
define void @main_halt(i32 inreg %a) #0 {
|
||||
call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 %a)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}legacy:
|
||||
; GCN: s_mov_b32 m0, s0
|
||||
; VI-NEXT: s_nop 0
|
||||
; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP)
|
||||
; GCN-NEXT: s_endpgm
|
||||
|
||||
define amdgpu_gs void @legacy(i32 inreg %a) #0 {
|
||||
call void @llvm.SI.sendmsg(i32 3, i32 %a)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0
|
||||
declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0
|
||||
declare void @llvm.SI.sendmsg(i32, i32) #0
|
||||
|
||||
attributes #0 = { nounwind }
|
161
test/CodeGen/AMDGPU/amdgcn.sendmsg.ll
Normal file
@ -0,0 +1,161 @@
|
||||
;RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
|
||||
;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: {{^}}test_interrupt:
|
||||
; CHECK: s_mov_b32 m0, 0
|
||||
; CHECK-NOT: s_mov_b32 m0
|
||||
; CHECK: s_sendmsg sendmsg(MSG_INTERRUPT)
|
||||
define void @test_interrupt() {
|
||||
body:
|
||||
call void @llvm.amdgcn.s.sendmsg(i32 1, i32 0);
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}test_gs_emit:
|
||||
; CHECK: s_mov_b32 m0, 0
|
||||
; CHECK-NOT: s_mov_b32 m0
|
||||
; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0)
|
||||
define void @test_gs_emit() {
|
||||
body:
|
||||
call void @llvm.amdgcn.s.sendmsg(i32 34, i32 0);
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}test_gs_cut:
|
||||
; CHECK: s_mov_b32 m0, 0
|
||||
; CHECK-NOT: s_mov_b32 m0
|
||||
; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1)
|
||||
define void @test_gs_cut() {
|
||||
body:
|
||||
call void @llvm.amdgcn.s.sendmsg(i32 274, i32 0);
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}test_gs_emit_cut:
|
||||
; CHECK: s_mov_b32 m0, 0
|
||||
; CHECK-NOT: s_mov_b32 m0
|
||||
; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
|
||||
define void @test_gs_emit_cut() {
|
||||
body:
|
||||
call void @llvm.amdgcn.s.sendmsg(i32 562, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}test_gs_done:
|
||||
; CHECK: s_mov_b32 m0, 0
|
||||
; CHECK-NOT: s_mov_b32 m0
|
||||
; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
|
||||
define void @test_gs_done() {
|
||||
body:
|
||||
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; CHECK-LABEL: {{^}}test_interrupt_halt:
|
||||
; CHECK: s_mov_b32 m0, 0
|
||||
; CHECK-NOT: s_mov_b32 m0
|
||||
; CHECK: s_sendmsghalt sendmsg(MSG_INTERRUPT)
|
||||
define void @test_interrupt_halt() {
|
||||
body:
|
||||
call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}test_gs_emit_halt:
|
||||
; CHECK: s_mov_b32 m0, 0
|
||||
; CHECK-NOT: s_mov_b32 m0
|
||||
; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT, 0)
|
||||
define void @test_gs_emit_halt() {
|
||||
body:
|
||||
call void @llvm.amdgcn.s.sendmsghalt(i32 34, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}test_gs_cut_halt:
|
||||
; CHECK: s_mov_b32 m0, 0
|
||||
; CHECK-NOT: s_mov_b32 m0
|
||||
; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_CUT, 1)
|
||||
define void @test_gs_cut_halt() {
|
||||
body:
|
||||
call void @llvm.amdgcn.s.sendmsghalt(i32 274, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}test_gs_emit_cut_halt:
|
||||
; CHECK: s_mov_b32 m0, 0
|
||||
; CHECK-NOT: s_mov_b32 m0
|
||||
; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
|
||||
define void @test_gs_emit_cut_halt() {
|
||||
body:
|
||||
call void @llvm.amdgcn.s.sendmsghalt(i32 562, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}test_gs_done_halt:
|
||||
; CHECK: s_mov_b32 m0, 0
|
||||
; CHECK-NOT: s_mov_b32 m0
|
||||
; CHECK: s_sendmsghalt sendmsg(MSG_GS_DONE, GS_OP_NOP)
|
||||
define void @test_gs_done_halt() {
|
||||
body:
|
||||
call void @llvm.amdgcn.s.sendmsghalt(i32 3, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Legacy
|
||||
; CHECK-LABEL: {{^}}test_legacy_interrupt:
|
||||
; CHECK: s_mov_b32 m0, 0
|
||||
; CHECK-NOT: s_mov_b32 m0
|
||||
; CHECK: s_sendmsg sendmsg(MSG_INTERRUPT)
|
||||
define void @test_legacy_interrupt() {
|
||||
body:
|
||||
call void @llvm.SI.sendmsg(i32 1, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}test_legacy_gs_emit:
|
||||
; CHECK: s_mov_b32 m0, 0
|
||||
; CHECK-NOT: s_mov_b32 m0
|
||||
; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0)
|
||||
define void @test_legacy_gs_emit() {
|
||||
body:
|
||||
call void @llvm.SI.sendmsg(i32 34, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}test_legacy_gs_cut:
|
||||
; CHECK: s_mov_b32 m0, 0
|
||||
; CHECK-NOT: s_mov_b32 m0
|
||||
; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1)
|
||||
define void @test_legacy_gs_cut() {
|
||||
body:
|
||||
call void @llvm.SI.sendmsg(i32 274, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}test_legacy_gs_emit_cut:
|
||||
; CHECK: s_mov_b32 m0, 0
|
||||
; CHECK-NOT: s_mov_b32 m0
|
||||
; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
|
||||
define void @test_legacy_gs_emit_cut() {
|
||||
body:
|
||||
call void @llvm.SI.sendmsg(i32 562, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}test_legacy_gs_done:
|
||||
; CHECK: s_mov_b32 m0, 0
|
||||
; CHECK-NOT: s_mov_b32 m0
|
||||
; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
|
||||
define void @test_legacy_gs_done() {
|
||||
body:
|
||||
call void @llvm.SI.sendmsg(i32 3, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0
|
||||
declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0
|
||||
declare void @llvm.SI.sendmsg(i32, i32) #0
|
||||
|
||||
attributes #0 = { nounwind }
|
@ -1,17 +0,0 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
|
||||
|
||||
; GCN-LABEL: {{^}}main:
|
||||
; GCN: s_mov_b32 m0, s0
|
||||
; VI-NEXT: s_nop 0
|
||||
; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP)
|
||||
; GCN-NEXT: s_endpgm
|
||||
|
||||
define amdgpu_gs void @main(i32 inreg %a) #0 {
|
||||
call void @llvm.SI.sendmsg(i32 3, i32 %a)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.SI.sendmsg(i32, i32) #0
|
||||
|
||||
attributes #0 = { nounwind }
|
@ -1,24 +0,0 @@
|
||||
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
|
||||
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: {{^}}main:
|
||||
; CHECK: s_mov_b32 m0, 0
|
||||
; CHECK-NOT: s_mov_b32 m0
|
||||
; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0)
|
||||
; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1)
|
||||
; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
|
||||
; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
|
||||
|
||||
define void @main() {
|
||||
main_body:
|
||||
call void @llvm.SI.sendmsg(i32 34, i32 0);
|
||||
call void @llvm.SI.sendmsg(i32 274, i32 0);
|
||||
call void @llvm.SI.sendmsg(i32 562, i32 0);
|
||||
call void @llvm.SI.sendmsg(i32 3, i32 0);
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
declare void @llvm.SI.sendmsg(i32, i32) #0
|
||||
|
||||
attributes #0 = { nounwind }
|
129
test/CodeGen/PowerPC/ppc64-blnop.ll
Normal file
@ -0,0 +1,129 @@
|
||||
; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s
|
||||
; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s
|
||||
; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s
|
||||
; RUN: llc < %s -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s
|
||||
; RUN: llc < %s -function-sections -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-FS
|
||||
; RUN: llc < %s -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s
|
||||
; RUN: llc < %s -function-sections -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-FS
|
||||
|
||||
%class.T = type { [2 x i8] }
|
||||
|
||||
define void @e_callee(%class.T* %this, i8* %c) { ret void }
|
||||
define void @e_caller(%class.T* %this, i8* %c) {
|
||||
call void @e_callee(%class.T* %this, i8* %c)
|
||||
ret void
|
||||
|
||||
; CHECK-LABEL: e_caller:
|
||||
; CHECK: bl e_callee
|
||||
; CHECK-NEXT: nop
|
||||
|
||||
; CHECK-FS-LABEL: e_caller:
|
||||
; CHECK-FS: bl e_callee
|
||||
; CHECK-FS-NEXT: nop
|
||||
}
|
||||
|
||||
define void @e_scallee(%class.T* %this, i8* %c) section "different" { ret void }
|
||||
define void @e_scaller(%class.T* %this, i8* %c) {
|
||||
call void @e_scallee(%class.T* %this, i8* %c)
|
||||
ret void
|
||||
|
||||
; CHECK-LABEL: e_scaller:
|
||||
; CHECK: bl e_scallee
|
||||
; CHECK-NEXT: nop
|
||||
}
|
||||
|
||||
define void @e_s2callee(%class.T* %this, i8* %c) { ret void }
|
||||
define void @e_s2caller(%class.T* %this, i8* %c) section "different" {
|
||||
call void @e_s2callee(%class.T* %this, i8* %c)
|
||||
ret void
|
||||
|
||||
; CHECK-LABEL: e_s2caller:
|
||||
; CHECK: bl e_s2callee
|
||||
; CHECK-NEXT: nop
|
||||
}
|
||||
|
||||
$cd1 = comdat any
|
||||
$cd2 = comdat any
|
||||
|
||||
define void @e_ccallee(%class.T* %this, i8* %c) comdat($cd1) { ret void }
|
||||
define void @e_ccaller(%class.T* %this, i8* %c) comdat($cd2) {
|
||||
call void @e_ccallee(%class.T* %this, i8* %c)
|
||||
ret void
|
||||
|
||||
; CHECK-LABEL: e_ccaller:
|
||||
; CHECK: bl e_ccallee
|
||||
; CHECK-NEXT: nop
|
||||
}
|
||||
|
||||
$cd = comdat any
|
||||
|
||||
define void @e_c1callee(%class.T* %this, i8* %c) comdat($cd) { ret void }
|
||||
define void @e_c1caller(%class.T* %this, i8* %c) comdat($cd) {
|
||||
call void @e_c1callee(%class.T* %this, i8* %c)
|
||||
ret void
|
||||
|
||||
; CHECK-LABEL: e_c1caller:
|
||||
; CHECK: bl e_c1callee
|
||||
; CHECK-NEXT: nop
|
||||
}
|
||||
|
||||
define weak_odr hidden void @wo_hcallee(%class.T* %this, i8* %c) { ret void }
|
||||
define void @wo_hcaller(%class.T* %this, i8* %c) {
|
||||
call void @wo_hcallee(%class.T* %this, i8* %c)
|
||||
ret void
|
||||
|
||||
; CHECK-LABEL: wo_hcaller:
|
||||
; CHECK: bl wo_hcallee
|
||||
; CHECK-NEXT: nop
|
||||
}
|
||||
|
||||
define weak_odr protected void @wo_pcallee(%class.T* %this, i8* %c) { ret void }
|
||||
define void @wo_pcaller(%class.T* %this, i8* %c) {
|
||||
call void @wo_pcallee(%class.T* %this, i8* %c)
|
||||
ret void
|
||||
|
||||
; CHECK-LABEL: wo_pcaller:
|
||||
; CHECK: bl wo_pcallee
|
||||
; CHECK-NEXT: nop
|
||||
}
|
||||
|
||||
define weak_odr void @wo_callee(%class.T* %this, i8* %c) { ret void }
|
||||
define void @wo_caller(%class.T* %this, i8* %c) {
|
||||
call void @wo_callee(%class.T* %this, i8* %c)
|
||||
ret void
|
||||
|
||||
; CHECK-LABEL: wo_caller:
|
||||
; CHECK: bl wo_callee
|
||||
; CHECK-NEXT: nop
|
||||
}
|
||||
|
||||
define weak protected void @w_pcallee(i8* %ptr) { ret void }
|
||||
define void @w_pcaller(i8* %ptr) {
|
||||
call void @w_pcallee(i8* %ptr)
|
||||
ret void
|
||||
|
||||
; CHECK-LABEL: w_pcaller:
|
||||
; CHECK: bl w_pcallee
|
||||
; CHECK-NEXT: nop
|
||||
}
|
||||
|
||||
define weak hidden void @w_hcallee(i8* %ptr) { ret void }
|
||||
define void @w_hcaller(i8* %ptr) {
|
||||
call void @w_hcallee(i8* %ptr)
|
||||
ret void
|
||||
|
||||
; CHECK-LABEL: w_hcaller:
|
||||
; CHECK: bl w_hcallee
|
||||
; CHECK-NEXT: nop
|
||||
}
|
||||
|
||||
define weak void @w_callee(i8* %ptr) { ret void }
|
||||
define void @w_caller(i8* %ptr) {
|
||||
call void @w_callee(i8* %ptr)
|
||||
ret void
|
||||
|
||||
; CHECK-LABEL: w_caller:
|
||||
; CHECK: bl w_callee
|
||||
; CHECK-NEXT: nop
|
||||
}
|
||||
|
@ -142,7 +142,7 @@ define void @wo_hcaller(%class.T* %this, i8* %c) {
|
||||
ret void
|
||||
|
||||
; CHECK-SCO-LABEL: wo_hcaller:
|
||||
; CHECK-SCO: b wo_hcallee
|
||||
; CHECK-SCO: bl wo_hcallee
|
||||
}
|
||||
|
||||
define weak_odr protected void @wo_pcallee(%class.T* %this, i8* %c) { ret void }
|
||||
@ -151,7 +151,7 @@ define void @wo_pcaller(%class.T* %this, i8* %c) {
|
||||
ret void
|
||||
|
||||
; CHECK-SCO-LABEL: wo_pcaller:
|
||||
; CHECK-SCO: b wo_pcallee
|
||||
; CHECK-SCO: bl wo_pcallee
|
||||
}
|
||||
|
||||
define weak_odr void @wo_callee(%class.T* %this, i8* %c) { ret void }
|
||||
@ -169,7 +169,7 @@ define void @w_pcaller(i8* %ptr) {
|
||||
ret void
|
||||
|
||||
; CHECK-SCO-LABEL: w_pcaller:
|
||||
; CHECK-SCO: b w_pcallee
|
||||
; CHECK-SCO: bl w_pcallee
|
||||
}
|
||||
|
||||
define weak hidden void @w_hcallee(i8* %ptr) { ret void }
|
||||
@ -178,7 +178,7 @@ define void @w_hcaller(i8* %ptr) {
|
||||
ret void
|
||||
|
||||
; CHECK-SCO-LABEL: w_hcaller:
|
||||
; CHECK-SCO: b w_hcallee
|
||||
; CHECK-SCO: bl w_hcallee
|
||||
}
|
||||
|
||||
define weak void @w_callee(i8* %ptr) { ret void }
|
||||
|
@ -45,21 +45,21 @@ define fp128 @test_multf3(fp128 %a, fp128 %b) #0 {
}

define float @test_subsf3(float %a, float %b) #0 {
; CHCEK-LABEL: test_subsf3:
; CHECK-LABEL: test_subsf3:
; CHECK: call __subsf3
%sub = fsub float %a, %b
ret float %sub
}

define double @test_subdf3(double %a, double %b) #0 {
; CHCEK-LABEL: test_subdf3:
; CHECK-LABEL: test_subdf3:
; CHECK: call __subdf3
%sub = fsub double %a, %b
ret double %sub
}

define fp128 @test_subtf3(fp128 %a, fp128 %b) #0 {
; CHCEK-LABEL: test_subtf3:
; CHECK-LABEL: test_subtf3:
; CHECK: call __subtf3
%sub = fsub fp128 %a, %b
ret fp128 %sub

@ -370,6 +370,40 @@ define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
ret void
}

; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy for complicated address calculation.
; .
; CHECK-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
; BWON: movzwl (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]]
; BWOFF: movw (%{{.*}},%{{.*}}), %[[REG:[a-z]+]]
; CHECK: movw %[[REG]], (%{{.*}})
define void @MergeLoadStoreBaseIndexOffsetComplicated(i8* %a, i8* %b, i8* %c, i64 %n) {
br label %1

; <label>:1
%.09 = phi i64 [ 0, %0 ], [ %13, %1 ]
%.08 = phi i8* [ %b, %0 ], [ %12, %1 ]
%2 = load i8, i8* %.08, align 1
%3 = sext i8 %2 to i64
%4 = getelementptr inbounds i8, i8* %c, i64 %3
%5 = load i8, i8* %4, align 1
%6 = add nsw i64 %3, 1
%7 = getelementptr inbounds i8, i8* %c, i64 %6
%8 = load i8, i8* %7, align 1
%9 = getelementptr inbounds i8, i8* %a, i64 %.09
store i8 %5, i8* %9, align 1
%10 = or i64 %.09, 1
%11 = getelementptr inbounds i8, i8* %a, i64 %10
store i8 %8, i8* %11, align 1
%12 = getelementptr inbounds i8, i8* %.08, i64 1
%13 = add nuw nsw i64 %.09, 2
%14 = icmp slt i64 %13, %n
br i1 %14, label %1, label %15

; <label>:15
ret void
}

; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy even if there are intermediate sign
; extensions.

@ -209,34 +209,22 @@ entry:
|
||||
}
|
||||
|
||||
define <4 x i64> @QQ64(i64* %ptr) nounwind uwtable readnone ssp {
|
||||
; X32-AVX2-LABEL: QQ64:
|
||||
; X32-AVX2: ## BB#0: ## %entry
|
||||
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-AVX2-NEXT: movl (%eax), %ecx
|
||||
; X32-AVX2-NEXT: movl 4(%eax), %eax
|
||||
; X32-AVX2-NEXT: vmovd %ecx, %xmm0
|
||||
; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
|
||||
; X32-AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
|
||||
; X32-AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
|
||||
; X32-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
|
||||
; X32-AVX2-NEXT: retl
|
||||
; X32-LABEL: QQ64:
|
||||
; X32: ## BB#0: ## %entry
|
||||
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-NEXT: movl (%eax), %ecx
|
||||
; X32-NEXT: movl 4(%eax), %eax
|
||||
; X32-NEXT: vmovd %ecx, %xmm0
|
||||
; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
|
||||
; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
|
||||
; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
|
||||
; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: QQ64:
|
||||
; X64: ## BB#0: ## %entry
|
||||
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
|
||||
; X64-NEXT: retq
|
||||
;
|
||||
; X32-AVX512VL-LABEL: QQ64:
|
||||
; X32-AVX512VL: ## BB#0: ## %entry
|
||||
; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-AVX512VL-NEXT: movl (%eax), %ecx
|
||||
; X32-AVX512VL-NEXT: movl 4(%eax), %eax
|
||||
; X32-AVX512VL-NEXT: vmovd %ecx, %xmm0
|
||||
; X32-AVX512VL-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
|
||||
; X32-AVX512VL-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
|
||||
; X32-AVX512VL-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
|
||||
; X32-AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
|
||||
; X32-AVX512VL-NEXT: retl
|
||||
entry:
|
||||
%q = load i64, i64* %ptr, align 4
|
||||
%q0 = insertelement <4 x i64> undef, i64 %q, i32 0
|
||||
@ -1105,55 +1093,30 @@ define <4 x double> @splat_concat4(double %d) {
|
||||
; Those test cases exerce the latter.
|
||||
|
||||
define void @isel_crash_16b(i8* %cV_R.addr) {
|
||||
; X32-AVX2-LABEL: isel_crash_16b:
|
||||
; X32-AVX2: ## BB#0: ## %eintry
|
||||
; X32-AVX2-NEXT: subl $60, %esp
|
||||
; X32-AVX2-NEXT: Lcfi0:
|
||||
; X32-AVX2-NEXT: .cfi_def_cfa_offset 64
|
||||
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; X32-AVX2-NEXT: vmovaps %xmm0, (%esp)
|
||||
; X32-AVX2-NEXT: vpbroadcastb (%eax), %xmm1
|
||||
; X32-AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
|
||||
; X32-AVX2-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
|
||||
; X32-AVX2-NEXT: addl $60, %esp
|
||||
; X32-AVX2-NEXT: retl
|
||||
; X32-LABEL: isel_crash_16b:
|
||||
; X32: ## BB#0: ## %eintry
|
||||
; X32-NEXT: subl $60, %esp
|
||||
; X32-NEXT: Lcfi0:
|
||||
; X32-NEXT: .cfi_def_cfa_offset 64
|
||||
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; X32-NEXT: vmovaps %xmm0, (%esp)
|
||||
; X32-NEXT: vpbroadcastb (%eax), %xmm1
|
||||
; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
|
||||
; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
|
||||
; X32-NEXT: addl $60, %esp
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-AVX2-LABEL: isel_crash_16b:
|
||||
; X64-AVX2: ## BB#0: ## %eintry
|
||||
; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
|
||||
; X64-AVX2-NEXT: movb (%rdi), %al
|
||||
; X64-AVX2-NEXT: vmovd %eax, %xmm1
|
||||
; X64-AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
|
||||
; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
|
||||
; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
|
||||
; X64-AVX2-NEXT: retq
|
||||
;
|
||||
; X32-AVX512VL-LABEL: isel_crash_16b:
|
||||
; X32-AVX512VL: ## BB#0: ## %eintry
|
||||
; X32-AVX512VL-NEXT: subl $60, %esp
|
||||
; X32-AVX512VL-NEXT: Lcfi0:
|
||||
; X32-AVX512VL-NEXT: .cfi_def_cfa_offset 64
|
||||
; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; X32-AVX512VL-NEXT: vmovaps %xmm0, (%esp)
|
||||
; X32-AVX512VL-NEXT: vpbroadcastb (%eax), %xmm1
|
||||
; X32-AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
|
||||
; X32-AVX512VL-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
|
||||
; X32-AVX512VL-NEXT: addl $60, %esp
|
||||
; X32-AVX512VL-NEXT: retl
|
||||
;
|
||||
; X64-AVX512VL-LABEL: isel_crash_16b:
|
||||
; X64-AVX512VL: ## BB#0: ## %eintry
|
||||
; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
|
||||
; X64-AVX512VL-NEXT: movb (%rdi), %al
|
||||
; X64-AVX512VL-NEXT: vmovd %eax, %xmm1
|
||||
; X64-AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
|
||||
; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
|
||||
; X64-AVX512VL-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
|
||||
; X64-AVX512VL-NEXT: retq
|
||||
; X64-LABEL: isel_crash_16b:
|
||||
; X64: ## BB#0: ## %eintry
|
||||
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
|
||||
; X64-NEXT: movb (%rdi), %al
|
||||
; X64-NEXT: vmovd %eax, %xmm1
|
||||
; X64-NEXT: vpbroadcastb %xmm1, %xmm1
|
||||
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
|
||||
; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
|
||||
; X64-NEXT: retq
|
||||
eintry:
|
||||
%__a.addr.i = alloca <2 x i64>, align 16
|
||||
%__b.addr.i = alloca <2 x i64>, align 16
|
||||
@ -1277,55 +1240,30 @@ eintry:
|
||||
}
|
||||
|
||||
define void @isel_crash_8w(i16* %cV_R.addr) {
|
||||
; X32-AVX2-LABEL: isel_crash_8w:
|
||||
; X32-AVX2: ## BB#0: ## %entry
|
||||
; X32-AVX2-NEXT: subl $60, %esp
|
||||
; X32-AVX2-NEXT: Lcfi4:
|
||||
; X32-AVX2-NEXT: .cfi_def_cfa_offset 64
|
||||
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; X32-AVX2-NEXT: vmovaps %xmm0, (%esp)
|
||||
; X32-AVX2-NEXT: vpbroadcastw (%eax), %xmm1
|
||||
; X32-AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
|
||||
; X32-AVX2-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
|
||||
; X32-AVX2-NEXT: addl $60, %esp
|
||||
; X32-AVX2-NEXT: retl
|
||||
; X32-LABEL: isel_crash_8w:
|
||||
; X32: ## BB#0: ## %entry
|
||||
; X32-NEXT: subl $60, %esp
|
||||
; X32-NEXT: Lcfi4:
|
||||
; X32-NEXT: .cfi_def_cfa_offset 64
|
||||
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; X32-NEXT: vmovaps %xmm0, (%esp)
|
||||
; X32-NEXT: vpbroadcastw (%eax), %xmm1
|
||||
; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
|
||||
; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
|
||||
; X32-NEXT: addl $60, %esp
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-AVX2-LABEL: isel_crash_8w:
|
||||
; X64-AVX2: ## BB#0: ## %entry
|
||||
; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
|
||||
; X64-AVX2-NEXT: movw (%rdi), %ax
|
||||
; X64-AVX2-NEXT: vmovd %eax, %xmm1
|
||||
; X64-AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
|
||||
; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
|
||||
; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
|
||||
; X64-AVX2-NEXT: retq
|
||||
;
|
||||
; X32-AVX512VL-LABEL: isel_crash_8w:
|
||||
; X32-AVX512VL: ## BB#0: ## %entry
|
||||
; X32-AVX512VL-NEXT: subl $60, %esp
|
||||
; X32-AVX512VL-NEXT: Lcfi4:
|
||||
; X32-AVX512VL-NEXT: .cfi_def_cfa_offset 64
|
||||
; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; X32-AVX512VL-NEXT: vmovaps %xmm0, (%esp)
|
||||
; X32-AVX512VL-NEXT: vpbroadcastw (%eax), %xmm1
|
||||
; X32-AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
|
||||
; X32-AVX512VL-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
|
||||
; X32-AVX512VL-NEXT: addl $60, %esp
|
||||
; X32-AVX512VL-NEXT: retl
|
||||
;
|
||||
; X64-AVX512VL-LABEL: isel_crash_8w:
|
||||
; X64-AVX512VL: ## BB#0: ## %entry
|
||||
; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
|
||||
; X64-AVX512VL-NEXT: movw (%rdi), %ax
|
||||
; X64-AVX512VL-NEXT: vmovd %eax, %xmm1
|
||||
; X64-AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
|
||||
; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
|
||||
; X64-AVX512VL-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
|
||||
; X64-AVX512VL-NEXT: retq
|
||||
; X64-LABEL: isel_crash_8w:
|
||||
; X64: ## BB#0: ## %entry
|
||||
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
|
||||
; X64-NEXT: movw (%rdi), %ax
|
||||
; X64-NEXT: vmovd %eax, %xmm1
|
||||
; X64-NEXT: vpbroadcastw %xmm1, %xmm1
|
||||
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
|
||||
; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
|
||||
; X64-NEXT: retq
|
||||
entry:
|
||||
%__a.addr.i = alloca <2 x i64>, align 16
|
||||
%__b.addr.i = alloca <2 x i64>, align 16
|
||||
@ -1605,24 +1543,24 @@ eintry:
|
||||
}
|
||||
|
||||
define void @isel_crash_2q(i64* %cV_R.addr) {
|
||||
; X32-AVX2-LABEL: isel_crash_2q:
|
||||
; X32-AVX2: ## BB#0: ## %entry
|
||||
; X32-AVX2-NEXT: subl $60, %esp
|
||||
; X32-AVX2-NEXT: Lcfi12:
|
||||
; X32-AVX2-NEXT: .cfi_def_cfa_offset 64
|
||||
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; X32-AVX2-NEXT: vmovaps %xmm0, (%esp)
|
||||
; X32-AVX2-NEXT: movl (%eax), %ecx
|
||||
; X32-AVX2-NEXT: movl 4(%eax), %eax
|
||||
; X32-AVX2-NEXT: vmovd %ecx, %xmm1
|
||||
; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
|
||||
; X32-AVX2-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
|
||||
; X32-AVX2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
|
||||
; X32-AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
|
||||
; X32-AVX2-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
|
||||
; X32-AVX2-NEXT: addl $60, %esp
|
||||
; X32-AVX2-NEXT: retl
|
||||
; X32-LABEL: isel_crash_2q:
|
||||
; X32: ## BB#0: ## %entry
|
||||
; X32-NEXT: subl $60, %esp
|
||||
; X32-NEXT: Lcfi12:
|
||||
; X32-NEXT: .cfi_def_cfa_offset 64
|
||||
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; X32-NEXT: vmovaps %xmm0, (%esp)
|
||||
; X32-NEXT: movl (%eax), %ecx
|
||||
; X32-NEXT: movl 4(%eax), %eax
|
||||
; X32-NEXT: vmovd %ecx, %xmm1
|
||||
; X32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
|
||||
; X32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
|
||||
; X32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
|
||||
; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
|
||||
; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
|
||||
; X32-NEXT: addl $60, %esp
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-AVX2-LABEL: isel_crash_2q:
|
||||
; X64-AVX2: ## BB#0: ## %entry
|
||||
@ -1635,25 +1573,6 @@ define void @isel_crash_2q(i64* %cV_R.addr) {
|
||||
; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
|
||||
; X64-AVX2-NEXT: retq
|
||||
;
|
||||
; X32-AVX512VL-LABEL: isel_crash_2q:
|
||||
; X32-AVX512VL: ## BB#0: ## %entry
|
||||
; X32-AVX512VL-NEXT: subl $60, %esp
|
||||
; X32-AVX512VL-NEXT: Lcfi12:
|
||||
; X32-AVX512VL-NEXT: .cfi_def_cfa_offset 64
|
||||
; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
; X32-AVX512VL-NEXT: vmovaps %xmm0, (%esp)
|
||||
; X32-AVX512VL-NEXT: movl (%eax), %ecx
|
||||
; X32-AVX512VL-NEXT: movl 4(%eax), %eax
|
||||
; X32-AVX512VL-NEXT: vmovd %ecx, %xmm1
|
||||
; X32-AVX512VL-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
|
||||
; X32-AVX512VL-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
|
||||
; X32-AVX512VL-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
|
||||
; X32-AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
|
||||
; X32-AVX512VL-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
|
||||
; X32-AVX512VL-NEXT: addl $60, %esp
|
||||
; X32-AVX512VL-NEXT: retl
|
||||
;
|
||||
; X64-AVX512VL-LABEL: isel_crash_2q:
|
||||
; X64-AVX512VL: ## BB#0: ## %entry
|
||||
; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
|
||||
@ -1752,7 +1671,7 @@ define void @isel_crash_4q(i64* %cV_R.addr) {
|
||||
; X32-AVX512VL-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
|
||||
; X32-AVX512VL-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
|
||||
; X32-AVX512VL-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
|
||||
; X32-AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm1, %ymm1
|
||||
; X32-AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
|
||||
; X32-AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
|
||||
; X32-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp)
|
||||
; X32-AVX512VL-NEXT: movl %ebp, %esp
|
||||
|
@ -22,10 +22,8 @@ define void @any_extend_load_v8i64(<8 x i8> * %ptr) {
|
||||
define void @any_extend_load_v8i32(<8 x i8> * %ptr) {
|
||||
; KNL-LABEL: any_extend_load_v8i32:
|
||||
; KNL: # BB#0:
|
||||
; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
|
||||
; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
|
||||
; KNL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
|
||||
; KNL-NEXT: vpmovdw %zmm0, %ymm0
|
||||
; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
|
||||
; KNL-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
|
||||
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
|
||||
; KNL-NEXT: vmovq %xmm0, (%rdi)
|
||||
; KNL-NEXT: retq
|
||||
|
@ -60,7 +60,7 @@ define <32 x i8> @extract_subvector256_v64i8(<64 x i8> %x) nounwind {
|
||||
define void @extract_subvector256_v8f64_store(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
|
||||
; SKX-LABEL: extract_subvector256_v8f64_store:
|
||||
; SKX: ## BB#0: ## %entry
|
||||
; SKX-NEXT: vextractf64x2 $1, %ymm0, (%rdi)
|
||||
; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi)
|
||||
; SKX-NEXT: retq
|
||||
entry:
|
||||
%0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 2, i32 3>
|
||||
@ -72,7 +72,7 @@ entry:
|
||||
define void @extract_subvector256_v8f32_store(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
|
||||
; SKX-LABEL: extract_subvector256_v8f32_store:
|
||||
; SKX: ## BB#0: ## %entry
|
||||
; SKX-NEXT: vextractf32x4 $1, %ymm0, (%rdi)
|
||||
; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi)
|
||||
; SKX-NEXT: retq
|
||||
entry:
|
||||
%0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
@ -84,7 +84,7 @@ entry:
|
||||
define void @extract_subvector256_v4i64_store(i64* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
|
||||
; SKX-LABEL: extract_subvector256_v4i64_store:
|
||||
; SKX: ## BB#0: ## %entry
|
||||
; SKX-NEXT: vextracti64x2 $1, %ymm0, (%rdi)
|
||||
; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
|
||||
; SKX-NEXT: retq
|
||||
entry:
|
||||
%0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
|
||||
@ -96,7 +96,7 @@ entry:
|
||||
define void @extract_subvector256_v8i32_store(i32* nocapture %addr, <8 x i32> %a) nounwind uwtable ssp {
|
||||
; SKX-LABEL: extract_subvector256_v8i32_store:
|
||||
; SKX: ## BB#0: ## %entry
|
||||
; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi)
|
||||
; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
|
||||
; SKX-NEXT: retq
|
||||
entry:
|
||||
%0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
@ -108,7 +108,7 @@ entry:
|
||||
define void @extract_subvector256_v16i16_store(i16* nocapture %addr, <16 x i16> %a) nounwind uwtable ssp {
|
||||
; SKX-LABEL: extract_subvector256_v16i16_store:
|
||||
; SKX: ## BB#0: ## %entry
|
||||
; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi)
|
||||
; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
|
||||
; SKX-NEXT: retq
|
||||
entry:
|
||||
%0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
@ -120,7 +120,7 @@ entry:
|
||||
define void @extract_subvector256_v32i8_store(i8* nocapture %addr, <32 x i8> %a) nounwind uwtable ssp {
|
||||
; SKX-LABEL: extract_subvector256_v32i8_store:
|
||||
; SKX: ## BB#0: ## %entry
|
||||
; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi)
|
||||
; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
|
||||
; SKX-NEXT: retq
|
||||
entry:
|
||||
%0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
|
||||
|
@ -463,7 +463,7 @@ define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) {
|
||||
; SKX-LABEL: extract_v4i64:
|
||||
; SKX: ## BB#0:
|
||||
; SKX-NEXT: vpextrq $1, %xmm0, %rax
|
||||
; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm0
|
||||
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
|
||||
; SKX-NEXT: vpextrq $1, %xmm0, (%rdi)
|
||||
; SKX-NEXT: retq
|
||||
%r1 = extractelement <4 x i64> %x, i32 1
|
||||
@ -521,7 +521,7 @@ define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) {
|
||||
; SKX-LABEL: extract_v8i32:
|
||||
; SKX: ## BB#0:
|
||||
; SKX-NEXT: vpextrd $1, %xmm0, %eax
|
||||
; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
|
||||
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
|
||||
; SKX-NEXT: vpextrd $1, %xmm0, (%rdi)
|
||||
; SKX-NEXT: retq
|
||||
%r1 = extractelement <8 x i32> %x, i32 1
|
||||
@ -582,7 +582,7 @@ define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) {
|
||||
; SKX-LABEL: extract_v16i16:
|
||||
; SKX: ## BB#0:
|
||||
; SKX-NEXT: vpextrw $1, %xmm0, %eax
|
||||
; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
|
||||
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
|
||||
; SKX-NEXT: vpextrw $1, %xmm0, (%rdi)
|
||||
; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
|
||||
; SKX-NEXT: retq
|
||||
@ -646,7 +646,7 @@ define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) {
|
||||
; SKX-LABEL: extract_v32i8:
|
||||
; SKX: ## BB#0:
|
||||
; SKX-NEXT: vpextrb $1, %xmm0, %eax
|
||||
; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
|
||||
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
|
||||
; SKX-NEXT: vpextrb $1, %xmm0, (%rdi)
|
||||
; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
|
||||
; SKX-NEXT: retq
|
||||
@ -714,9 +714,9 @@ define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) {
|
||||
; SKX: ## BB#0:
|
||||
; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
|
||||
; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
|
||||
; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm1
|
||||
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
|
||||
; SKX-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1
|
||||
; SKX-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0
|
||||
; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
||||
; SKX-NEXT: retq
|
||||
%val = load i64, i64* %ptr
|
||||
%r1 = insertelement <4 x i64> %x, i64 %val, i32 1
|
||||
@ -780,9 +780,9 @@ define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) {
|
||||
; SKX: ## BB#0:
|
||||
; SKX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
|
||||
; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
|
||||
; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
|
||||
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
|
||||
; SKX-NEXT: vpinsrd $1, %edi, %xmm1, %xmm1
|
||||
; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
|
||||
; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
||||
; SKX-NEXT: retq
|
||||
%val = load i32, i32* %ptr
|
||||
%r1 = insertelement <8 x i32> %x, i32 %val, i32 1
|
||||
@ -846,9 +846,9 @@ define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) {
|
||||
; SKX: ## BB#0:
|
||||
; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
|
||||
; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
|
||||
; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
|
||||
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
|
||||
; SKX-NEXT: vpinsrw $1, %edi, %xmm1, %xmm1
|
||||
; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
|
||||
; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
||||
; SKX-NEXT: retq
|
||||
%val = load i16, i16* %ptr
|
||||
%r1 = insertelement <16 x i16> %x, i16 %val, i32 1
|
||||
@ -912,9 +912,9 @@ define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) {
|
||||
; SKX: ## BB#0:
|
||||
; SKX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
|
||||
; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
|
||||
; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
|
||||
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
|
||||
; SKX-NEXT: vpinsrb $1, %edi, %xmm1, %xmm1
|
||||
; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
|
||||
; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
||||
; SKX-NEXT: retq
|
||||
%val = load i8, i8* %ptr
|
||||
%r1 = insertelement <32 x i8> %x, i8 %val, i32 1
|
||||
@ -1014,9 +1014,9 @@ define <16 x i16> @test_insert_128_v16i16(<16 x i16> %x, i16 %y) {
|
||||
;
|
||||
; SKX-LABEL: test_insert_128_v16i16:
|
||||
; SKX: ## BB#0:
|
||||
; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
|
||||
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
|
||||
; SKX-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1
|
||||
; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
|
||||
; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
||||
; SKX-NEXT: retq
|
||||
%r = insertelement <16 x i16> %x, i16 %y, i32 10
|
||||
ret <16 x i16> %r
|
||||
@ -1032,9 +1032,9 @@ define <32 x i8> @test_insert_128_v32i8(<32 x i8> %x, i8 %y) {
|
||||
;
|
||||
; SKX-LABEL: test_insert_128_v32i8:
|
||||
; SKX: ## BB#0:
|
||||
; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
|
||||
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
|
||||
; SKX-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
|
||||
; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
|
||||
; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
||||
; SKX-NEXT: retq
|
||||
%r = insertelement <32 x i8> %x, i8 %y, i32 20
|
||||
ret <32 x i8> %r
|
||||
|
@ -2868,3 +2868,187 @@ define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask
|
||||
}
|
||||
|
||||
declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
|
||||
|
||||
define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
|
||||
; CHECK-LABEL: test_mask_vextractf32x4:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm1
|
||||
; CHECK-NEXT: kmovw %edi, %k1
|
||||
; CHECK-NEXT: kshiftlw $12, %k1, %k0
|
||||
; CHECK-NEXT: kshiftrw $15, %k0, %k0
|
||||
; CHECK-NEXT: kshiftlw $13, %k1, %k2
|
||||
; CHECK-NEXT: kshiftrw $15, %k2, %k2
|
||||
; CHECK-NEXT: kshiftlw $15, %k1, %k3
|
||||
; CHECK-NEXT: kshiftrw $15, %k3, %k3
|
||||
; CHECK-NEXT: kshiftlw $14, %k1, %k1
|
||||
; CHECK-NEXT: kshiftrw $15, %k1, %k1
|
||||
; CHECK-NEXT: kmovw %k1, %eax
|
||||
; CHECK-NEXT: kmovw %k3, %ecx
|
||||
; CHECK-NEXT: vmovd %ecx, %xmm2
|
||||
; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
|
||||
; CHECK-NEXT: kmovw %k2, %eax
|
||||
; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
|
||||
; CHECK-NEXT: kmovw %k0, %eax
|
||||
; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask)
|
||||
ret <4 x float> %res
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8)
|
||||
|
||||
define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) {
|
||||
; CHECK-LABEL: test_mask_vextracti64x4:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kmovw %edi, %k1
|
||||
; CHECK-NEXT: kshiftlw $12, %k1, %k0
|
||||
; CHECK-NEXT: kshiftrw $15, %k0, %k0
|
||||
; CHECK-NEXT: kshiftlw $13, %k1, %k2
|
||||
; CHECK-NEXT: kshiftrw $15, %k2, %k2
|
||||
; CHECK-NEXT: kshiftlw $15, %k1, %k3
|
||||
; CHECK-NEXT: kshiftrw $15, %k3, %k3
|
||||
; CHECK-NEXT: kshiftlw $14, %k1, %k1
|
||||
; CHECK-NEXT: kshiftrw $15, %k1, %k1
|
||||
; CHECK-NEXT: kmovw %k1, %eax
|
||||
; CHECK-NEXT: kmovw %k3, %ecx
|
||||
; CHECK-NEXT: vmovd %ecx, %xmm2
|
||||
; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
|
||||
; CHECK-NEXT: kmovw %k2, %eax
|
||||
; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
|
||||
; CHECK-NEXT: kmovw %k0, %eax
|
||||
; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vpmovsxdq %xmm2, %ymm2
|
||||
; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 2, <4 x i64> %b, i8 %mask)
|
||||
ret <4 x i64> %res
|
||||
}
|
||||
|
||||
declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8)
|
||||
|
||||
define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
|
||||
; CHECK-LABEL: test_maskz_vextracti32x4:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0
|
||||
; CHECK-NEXT: kmovw %edi, %k1
|
||||
; CHECK-NEXT: kshiftlw $12, %k1, %k0
|
||||
; CHECK-NEXT: kshiftrw $15, %k0, %k0
|
||||
; CHECK-NEXT: kshiftlw $13, %k1, %k2
|
||||
; CHECK-NEXT: kshiftrw $15, %k2, %k2
|
||||
; CHECK-NEXT: kshiftlw $15, %k1, %k3
|
||||
; CHECK-NEXT: kshiftrw $15, %k3, %k3
|
||||
; CHECK-NEXT: kshiftlw $14, %k1, %k1
|
||||
; CHECK-NEXT: kshiftrw $15, %k1, %k1
|
||||
; CHECK-NEXT: kmovw %k1, %eax
|
||||
; CHECK-NEXT: kmovw %k3, %ecx
|
||||
; CHECK-NEXT: vmovd %ecx, %xmm1
|
||||
; CHECK-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
|
||||
; CHECK-NEXT: kmovw %k2, %eax
|
||||
; CHECK-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
|
||||
; CHECK-NEXT: kmovw %k0, %eax
|
||||
; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
|
||||
; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
|
||||
; CHECK-NEXT: vpsrad $31, %xmm1, %xmm1
|
||||
; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask)
|
||||
ret <4 x i32> %res
|
||||
}
|
||||
|
||||
declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8)
|
||||
|
||||
define <4 x double> @test_vextractf64x4(<8 x double> %a) {
|
||||
; CHECK-LABEL: test_vextractf64x4:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 2, <4 x double> zeroinitializer, i8 -1)
|
||||
ret <4 x double> %res
|
||||
}
|
||||
|
||||
declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8)
|
||||
|
||||
declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16)
|
||||
|
||||
define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) {
|
||||
; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
|
||||
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3
|
||||
; CHECK-NEXT: kmovw %edi, %k1
|
||||
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
|
||||
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
|
||||
; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1
|
||||
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
|
||||
%res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
|
||||
%res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4)
|
||||
%res3 = fadd <16 x float> %res, %res1
|
||||
%res4 = fadd <16 x float> %res2, %res3
|
||||
ret <16 x float> %res4
|
||||
}
|
||||
|
||||
declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16)
|
||||
|
||||
define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) {
|
||||
; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
|
||||
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3
|
||||
; CHECK-NEXT: kmovw %edi, %k1
|
||||
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
|
||||
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
|
||||
; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1
|
||||
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
|
||||
%res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
|
||||
%res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4)
|
||||
%res3 = add <16 x i32> %res, %res1
|
||||
%res4 = add <16 x i32> %res2, %res3
|
||||
ret <16 x i32> %res4
|
||||
}
|
||||
|
||||
declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8)
|
||||
|
||||
define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) {
|
||||
; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3
|
||||
; CHECK-NEXT: kmovw %edi, %k1
|
||||
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
|
||||
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
|
||||
; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
|
||||
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
|
||||
%res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
|
||||
%res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4)
|
||||
%res3 = fadd <8 x double> %res, %res1
|
||||
%res4 = fadd <8 x double> %res2, %res3
|
||||
ret <8 x double> %res4
|
||||
}
|
||||
|
||||
declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8)
|
||||
|
||||
define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) {
|
||||
; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3
|
||||
; CHECK-NEXT: kmovw %edi, %k1
|
||||
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
|
||||
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
|
||||
; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
|
||||
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
|
||||
%res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
|
||||
%res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4)
|
||||
%res3 = add <8 x i64> %res, %res1
|
||||
%res4 = add <8 x i64> %res2, %res3
|
||||
ret <8 x i64> %res4
|
||||
}
|
||||
|
@ -1243,53 +1243,6 @@ define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
|
||||
|
||||
declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
|
||||
|
||||
define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
|
||||
; CHECK-LABEL: test_mask_vextractf32x4:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kmovw %edi, %k1
|
||||
; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm0 {%k1}
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask)
|
||||
ret <4 x float> %res
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8)
|
||||
|
||||
define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) {
|
||||
; CHECK-LABEL: test_mask_vextracti64x4:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kmovw %edi, %k1
|
||||
; CHECK-NEXT: vextracti64x4 $2, %zmm1, %ymm0 {%k1}
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 2, <4 x i64> %b, i8 %mask)
|
||||
ret <4 x i64> %res
|
||||
}
|
||||
|
||||
declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8)
|
||||
|
||||
define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
|
||||
; CHECK-LABEL: test_maskz_vextracti32x4:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kmovw %edi, %k1
|
||||
; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z}
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask)
|
||||
ret <4 x i32> %res
|
||||
}
|
||||
|
||||
declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8)
|
||||
|
||||
define <4 x double> @test_vextractf64x4(<8 x double> %a) {
|
||||
; CHECK-LABEL: test_vextractf64x4:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vextractf64x4 $2, %zmm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 2, <4 x double> zeroinitializer, i8 -1)
|
||||
ret <4 x double> %res
|
||||
}
|
||||
|
||||
declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8)
|
||||
|
||||
declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
|
||||
declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
|
||||
declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
|
||||
@ -3984,86 +3937,6 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(<1
|
||||
ret <16 x float> %res2
|
||||
}
|
||||
|
||||
declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16)
|
||||
|
||||
define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) {
|
||||
; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kmovw %edi, %k1
|
||||
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
|
||||
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 {%k1} {z}
|
||||
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
|
||||
; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
|
||||
; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
|
||||
%res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
|
||||
%res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4)
|
||||
%res3 = fadd <16 x float> %res, %res1
|
||||
%res4 = fadd <16 x float> %res2, %res3
|
||||
ret <16 x float> %res4
|
||||
}
|
||||
|
||||
declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16)
|
||||
|
||||
define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) {
|
||||
; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kmovw %edi, %k1
|
||||
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
|
||||
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 {%k1} {z}
|
||||
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
|
||||
; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
|
||||
; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
|
||||
%res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
|
||||
%res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4)
|
||||
%res3 = add <16 x i32> %res, %res1
|
||||
%res4 = add <16 x i32> %res2, %res3
|
||||
ret <16 x i32> %res4
|
||||
}
|
||||
|
||||
declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8)
|
||||
|
||||
define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) {
|
||||
; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kmovw %edi, %k1
|
||||
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
|
||||
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
|
||||
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
|
||||
; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
|
||||
; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
|
||||
%res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
|
||||
%res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4)
|
||||
%res3 = fadd <8 x double> %res, %res1
|
||||
%res4 = fadd <8 x double> %res2, %res3
|
||||
ret <8 x double> %res4
|
||||
}
|
||||
|
||||
declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8)
|
||||
|
||||
define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) {
|
||||
; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kmovw %edi, %k1
|
||||
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
|
||||
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
|
||||
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
|
||||
; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
|
||||
; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
|
||||
%res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
|
||||
%res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4)
|
||||
%res3 = add <8 x i64> %res, %res1
|
||||
%res4 = add <8 x i64> %res2, %res3
|
||||
ret <8 x i64> %res4
|
||||
}
|
||||
|
||||
declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x float>, <2 x double>, i8, i32)
|
||||
|
||||
define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) {
|
||||

@ -30,9 +30,9 @@ define <8 x i1> @test2(<2 x i1> %a) {
; CHECK: # BB#0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
; CHECK-NEXT: vpmovm2q %k0, %zmm0
; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vpmovm2q %k0, %zmm1
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; CHECK-NEXT: vpmovq2m %zmm0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq

@ -237,7 +237,7 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) {
|
||||
; X64-AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; X64-AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
|
||||
; X64-AVX512VL-NEXT: vmovdqa %ymm1, (%rsi)
|
||||
; X64-AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
|
||||
; X64-AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
|
||||
; X64-AVX512VL-NEXT: retq
|
||||
;
|
||||
; X64-AVX512BWVL-LABEL: PR29088:
|
||||
@ -245,7 +245,7 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) {
|
||||
; X64-AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; X64-AVX512BWVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
|
||||
; X64-AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
|
||||
; X64-AVX512BWVL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
|
||||
; X64-AVX512BWVL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
|
||||
; X64-AVX512BWVL-NEXT: retq
|
||||
;
|
||||
; X64-AVX512DQVL-LABEL: PR29088:
|
||||
@ -253,7 +253,7 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) {
|
||||
; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; X64-AVX512DQVL-NEXT: vxorps %ymm1, %ymm1, %ymm1
|
||||
; X64-AVX512DQVL-NEXT: vmovaps %ymm1, (%rsi)
|
||||
; X64-AVX512DQVL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
|
||||
; X64-AVX512DQVL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
|
||||
; X64-AVX512DQVL-NEXT: retq
|
||||
%ld = load <4 x i32>, <4 x i32>* %p0
|
||||
store <8 x float> zeroinitializer, <8 x float>* %p1
|
||||
|
@ -30,7 +30,7 @@ define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
|
||||
; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
|
||||
; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
|
||||
; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
|
||||
; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
|
||||
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
|
||||
; CHECK-NEXT: retq ## encoding: [0xc3]
|
||||
%res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
|
||||
%vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
|
||||
@ -79,7 +79,7 @@ define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
|
||||
; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
|
||||
; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
|
||||
; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
|
||||
; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
|
||||
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
|
||||
; CHECK-NEXT: retq ## encoding: [0xc3]
|
||||
%res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
|
||||
%vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
|
||||
@ -129,7 +129,7 @@ define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
|
||||
; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
|
||||
; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
|
||||
; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
|
||||
; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
|
||||
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
|
||||
; CHECK-NEXT: retq ## encoding: [0xc3]
|
||||
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
|
||||
%vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
|
||||
@ -178,7 +178,7 @@ define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask)
|
||||
; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
|
||||
; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
|
||||
; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
|
||||
; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
|
||||
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
|
||||
; CHECK-NEXT: retq ## encoding: [0xc3]
|
||||
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
|
||||
%vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
|
||||
|
test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll (new file, 136 lines)
@ -0,0 +1,136 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s
|
||||
|
||||
declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double>, i32, <2 x double>, i8)
|
||||
|
||||
define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) {
|
||||
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm0
|
||||
; CHECK-NEXT: kmovb %edi, %k0
|
||||
; CHECK-NEXT: kshiftlb $7, %k0, %k1
|
||||
; CHECK-NEXT: kshiftrb $7, %k1, %k1
|
||||
; CHECK-NEXT: kshiftlb $6, %k0, %k0
|
||||
; CHECK-NEXT: kshiftrb $7, %k0, %k0
|
||||
; CHECK-NEXT: kmovw %k0, %eax
|
||||
; CHECK-NEXT: vmovq %rax, %xmm2
|
||||
; CHECK-NEXT: kmovw %k1, %eax
|
||||
; CHECK-NEXT: vmovq %rax, %xmm3
|
||||
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
|
||||
; CHECK-NEXT: vpsllq $63, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vpsrad $31, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
|
||||
; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
|
||||
; CHECK-NEXT: vandpd %xmm0, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
|
||||
; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
|
||||
%res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3)
|
||||
%res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1)
|
||||
%res3 = fadd <2 x double> %res, %res1
|
||||
%res4 = fadd <2 x double> %res2, %res3
|
||||
ret <2 x double> %res4
|
||||
}
|
||||
|
||||
declare <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float>, i32, <8 x float>, i8)
|
||||
|
||||
define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x float> %x2, i8 %x3) {
|
||||
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x8:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2
|
||||
; CHECK-NEXT: kmovb %edi, %k1
|
||||
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1}
|
||||
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
|
||||
; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
|
||||
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3)
|
||||
%res2 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 %x3)
|
||||
%res1 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 -1)
|
||||
%res3 = fadd <8 x float> %res, %res1
|
||||
%res4 = fadd <8 x float> %res2, %res3
|
||||
ret <8 x float> %res4
|
||||
}
|
||||
|
||||
declare <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float>, <8 x float>, i32, <16 x float>, i16)
|
||||
|
||||
define <16 x float>@test_int_x86_avx512_mask_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x float> %x3, i16 %x4) {
|
||||
; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x8_512:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm3
|
||||
; CHECK-NEXT: kmovw %edi, %k1
|
||||
; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
|
||||
; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
|
||||
; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
|
||||
; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
|
||||
%res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4)
|
||||
%res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
|
||||
%res3 = fadd <16 x float> %res, %res1
|
||||
%res4 = fadd <16 x float> %res2, %res3
|
||||
ret <16 x float> %res4
|
||||
}
|
||||
|
||||
declare <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double>, <2 x double>, i32, <8 x double>, i8)
|
||||
|
||||
define <8 x double>@test_int_x86_avx512_mask_insertf64x2_512(<8 x double> %x0, <2 x double> %x1,<8 x double> %x3, i8 %x4) {
|
||||
; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_512:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
|
||||
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm3
|
||||
; CHECK-NEXT: kmovb %edi, %k1
|
||||
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
|
||||
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
|
||||
; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
|
||||
; CHECK-NEXT: vaddpd %zmm3, %zmm0, %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
|
||||
%res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4)
|
||||
%res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
|
||||
%res3 = fadd <8 x double> %res, %res1
|
||||
%res4 = fadd <8 x double> %res3, %res2
|
||||
ret <8 x double> %res4
|
||||
}
|
||||
|
||||
declare <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32>, <8 x i32>, i32, <16 x i32>, i16)
|
||||
|
||||
define <16 x i32>@test_int_x86_avx512_mask_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3, i16 %x4) {
|
||||
; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x8_512:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3
|
||||
; CHECK-NEXT: kmovw %edi, %k1
|
||||
; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
|
||||
; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
|
||||
; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
|
||||
; CHECK-NEXT: vpaddd %zmm3, %zmm0, %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
|
||||
%res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4)
|
||||
%res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
|
||||
%res3 = add <16 x i32> %res, %res1
|
||||
%res4 = add <16 x i32> %res3, %res2
|
||||
ret <16 x i32> %res4
|
||||
}
|
||||
|
||||
declare <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64>, <2 x i64>, i32, <8 x i64>, i8)
|
||||
|
||||
define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, <8 x i64> %x3, i8 %x4) {
|
||||
; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_512:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
|
||||
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm3
|
||||
; CHECK-NEXT: kmovb %edi, %k1
|
||||
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
|
||||
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
|
||||
; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
|
||||
; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
|
||||
%res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4)
|
||||
%res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
|
||||
%res3 = add <8 x i64> %res, %res1
|
||||
%res4 = add <8 x i64> %res2, %res3
|
||||
ret <8 x i64> %res4
|
||||
}
|
@ -325,127 +325,6 @@ define <2 x double>@test_int_x86_avx512_mask_range_sd(<2 x double> %x0, <2 x dou
ret <2 x double> %res2
}


declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double>, i32, <2 x double>, i8)

define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm0
; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
%res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1)
%res3 = fadd <2 x double> %res, %res1
%res4 = fadd <2 x double> %res2, %res3
ret <2 x double> %res4
}

declare <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float>, i32, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x8:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0
; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3)
%res2 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 -1)
%res3 = fadd <8 x float> %res, %res1
%res4 = fadd <8 x float> %res2, %res3
ret <8 x float> %res4
}

declare <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float>, <8 x float>, i32, <16 x float>, i16)

define <16 x float>@test_int_x86_avx512_mask_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x float> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x8_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0
; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
%res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4)
%res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
%res3 = fadd <16 x float> %res, %res1
%res4 = fadd <16 x float> %res2, %res3
ret <16 x float> %res4
}

declare <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double>, <2 x double>, i32, <8 x double>, i8)

define <8 x double>@test_int_x86_avx512_mask_insertf64x2_512(<8 x double> %x0, <2 x double> %x1,<8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
%res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4)
%res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
%res3 = fadd <8 x double> %res, %res1
%res4 = fadd <8 x double> %res3, %res2
ret <8 x double> %res4
}

declare <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32>, <8 x i32>, i32, <16 x i32>, i16)

define <16 x i32>@test_int_x86_avx512_mask_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x8_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
%res3 = add <16 x i32> %res, %res1
%res4 = add <16 x i32> %res3, %res2
ret <16 x i32> %res4
}

declare <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64>, <2 x i64>, i32, <8 x i64>, i8)

define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, <8 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
%res3 = add <8 x i64> %res, %res1
%res4 = add <8 x i64> %res2, %res3
ret <8 x i64> %res4
}

declare i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double>, i32, i8)

define i8 @test_int_x86_avx512_mask_fpclass_pd_512(<8 x double> %x0, i8 %x1) {
@ -1560,3 +1560,62 @@ define <2 x i64> @test_mask_mullo_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8

declare <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)

declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double>, i32, <2 x double>, i8)

define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01]
; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01]
; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
%res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1)
%res3 = fadd <2 x double> %res, %res1
%res4 = fadd <2 x double> %res3, %res2
ret <2 x double> %res4
}

declare <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double>, <2 x double>, i32, <4 x double>, i8)

define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, <4 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01]
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01]
; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xcb]
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1)
%res2 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> zeroinitializer, i8 %x4)
%res3 = fadd <4 x double> %res, %res1
%res4 = fadd <4 x double> %res2, %res3
ret <4 x double> %res4
}

declare <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64>, <2 x i64>, i32, <4 x i64>, i8)

define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01]
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01]
; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> zeroinitializer, i8 %x4)
%res3 = add <4 x i64> %res, %res1
%res4 = add <4 x i64> %res3, %res2
ret <4 x i64> %res4
}
@ -549,66 +549,6 @@ define <8 x float>@test_int_x86_avx512_mask_range_ps_256(<8 x float> %x0, <8 x f
ret <8 x float> %res2
}

declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double>, i32, <2 x double>, i8)

define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01]
; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc2,0x01]
; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x19,0xc0,0x01]
; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
%res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1)
%res3 = fadd <2 x double> %res, %res1
%res4 = fadd <2 x double> %res3, %res2
ret <2 x double> %res4
}

declare <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double>, <2 x double>, i32, <4 x double>, i8)

define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, <4 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01]
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xd9,0x01]
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x18,0xc1,0x01]
; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0]
; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1)
%res2 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> zeroinitializer, i8 %x4)
%res3 = fadd <4 x double> %res, %res1
%res4 = fadd <4 x double> %res2, %res3
ret <4 x double> %res4
}

declare <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64>, <2 x i64>, i32, <4 x i64>, i8)

define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01]
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xd9,0x01]
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x38,0xc1,0x01]
; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> zeroinitializer, i8 %x4)
%res3 = add <4 x i64> %res, %res1
%res4 = add <4 x i64> %res3, %res2
ret <4 x i64> %res4
}

declare i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float>, i32, i8)

define i8 @test_int_x86_avx512_mask_fpclass_ps_128(<4 x float> %x0, i8 %x1) {
@ -4773,3 +4773,63 @@ define <4 x float>@test_int_x86_avx512_mask_vpermilvar_ps_128(<4 x float> %x0, <
ret <4 x float> %res4
}

declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float>, i32, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x4_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01]
; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x19,0xc0,0x01]
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0]
; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 %x3)
%res2 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 -1)
%res3 = fadd <4 x float> %res, %res1
%res4 = fadd <4 x float> %res2, %res3
ret <4 x float> %res4
}

declare <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float>, <4 x float>, i32, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01]
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01]
; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb]
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4)
%res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1)
%res2 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> zeroinitializer, i8 %x4)
%res3 = fadd <8 x float> %res, %res1
%res4 = fadd <8 x float> %res2, %res3
ret <8 x float> %res4
}

declare <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32>, <4 x i32>, i32, <8 x i32>, i8)

define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01]
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01]
; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]

%res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 -1)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> zeroinitializer, i8 %x4)
%res3 = add <8 x i32> %res, %res1
%res4 = add <8 x i32> %res2, %res3
ret <8 x i32> %res4
}