From 3c315f3a8e8f326948fc789f146794ecd33cc540 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Fri, 16 Feb 2018 19:10:15 +0000 Subject: [PATCH] Vendor import of llvm release_60 branch r325330: https://llvm.org/svn/llvm-project/llvm/branches/release_60@325330 --- docs/ReleaseNotes.rst | 38 ++ include/llvm/IR/IntrinsicsAMDGPU.td | 20 + include/llvm/IR/IntrinsicsX86.td | 9 + include/llvm/MC/MCAsmMacro.h | 38 ++ include/llvm/MC/MCContext.h | 15 + .../llvm/Support/GenericDomTreeConstruction.h | 32 +- lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 3 +- lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 3 +- lib/CodeGen/LivePhysRegs.cpp | 31 +- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 +- lib/CodeGen/SplitKit.cpp | 53 ++- lib/CodeGen/SplitKit.h | 6 +- lib/IR/AutoUpgrade.cpp | 7 - lib/MC/MCParser/AsmParser.cpp | 56 +-- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 + lib/Target/AMDGPU/AMDGPUISelLowering.h | 4 + lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 18 + lib/Target/AMDGPU/AMDGPUInstrInfo.h | 2 + lib/Target/AMDGPU/AMDGPUInstrInfo.td | 8 + lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 +- lib/Target/AMDGPU/SIISelLowering.cpp | 52 ++- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 18 - lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 1 - lib/Target/AMDGPU/VOP2Instructions.td | 8 +- .../X86/MCTargetDesc/X86MCCodeEmitter.cpp | 10 +- lib/Target/X86/X86AsmPrinter.cpp | 11 +- lib/Target/X86/X86DomainReassignment.cpp | 12 +- lib/Target/X86/X86ISelLowering.cpp | 217 ++++----- lib/Target/X86/X86IntrinsicsInfo.h | 5 +- lib/Target/X86/X86RetpolineThunks.cpp | 68 +-- .../InstCombine/InstCombineCalls.cpp | 12 + test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll | 84 ++++ test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll | 84 ++++ .../AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll | 164 +++++++ .../AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll | 164 +++++++ test/CodeGen/ARM/pr25838.ll | 2 +- test/CodeGen/ARM/splitkit.ll | 245 ++++++++++ test/CodeGen/Thumb/stm-scavenging.ll | 46 ++ .../X86/avx512-intrinsics-fast-isel.ll | 53 --- test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 14 - test/CodeGen/X86/avx512-intrinsics.ll | 15 + test/CodeGen/X86/avx512-mask-op.ll | 96 ++++ .../X86/avx512bw-intrinsics-fast-isel.ll | 371 ++++++--------- .../X86/avx512bw-intrinsics-upgrade.ll | 40 -- test/CodeGen/X86/avx512bw-intrinsics.ll | 49 ++ test/CodeGen/X86/domain-reassignment.mir | 439 +++++++++++------- test/CodeGen/X86/inline-asm-modifier-V.ll | 14 + test/CodeGen/X86/pr36199.ll | 22 + test/CodeGen/X86/retpoline-external.ll | 48 +- test/CodeGen/X86/retpoline-regparm.ll | 42 ++ test/CodeGen/X86/retpoline.ll | 14 +- test/DebugInfo/X86/void-typedef.ll | 88 ++++ test/MC/AsmParser/inline_macro_duplication.ll | 8 + test/MC/X86/x86-64.s | 5 + .../InstCombine/AMDGPU/amdgcn-intrinsics.ll | 108 +++++ 55 files changed, 2116 insertions(+), 866 deletions(-) create mode 100644 include/llvm/MC/MCAsmMacro.h create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll create mode 100644 test/CodeGen/ARM/splitkit.ll create mode 100644 test/CodeGen/Thumb/stm-scavenging.ll create mode 100644 test/CodeGen/X86/inline-asm-modifier-V.ll create mode 100644 test/CodeGen/X86/pr36199.ll create mode 100644 test/CodeGen/X86/retpoline-regparm.ll create mode 100644 test/DebugInfo/X86/void-typedef.ll create mode 100644 test/MC/AsmParser/inline_macro_duplication.ll diff --git a/docs/ReleaseNotes.rst 
b/docs/ReleaseNotes.rst
index c28a3829bfee..949ec85c270b 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -71,6 +71,13 @@ Non-comprehensive list of changes in this release
 Changes to the LLVM IR
 ----------------------
 
+Changes to the AArch64 Target
+-----------------------------
+
+During this release:
+
+ * Enabled the new GlobalISel instruction selection framework by default at ``-O0``.
+
 Changes to the ARM Target
 -------------------------
 
@@ -80,6 +87,28 @@ During this release the ARM target has:
   isn't the default.
 
+Changes to the Hexagon Target
+-----------------------------
+
+* The Hexagon backend now supports the V65 ISA.
+
+* The ``-mhvx`` option now takes an optional value that specifies the ISA
+  version of the HVX coprocessor. The available values are v60, v62 and v65.
+  By default, the value is set to be the same as the CPU version.
+
+* The compiler option ``-mhvx-double`` is deprecated and will be removed in
+  the next release of the compiler. Programmers should use the ``-mhvx-length``
+  option to specify the desired vector length: ``-mhvx-length=64b`` for
+  64-byte vectors and ``-mhvx-length=128b`` for 128-byte vectors. While the
+  current default vector length is 64 bytes, users should always specify the
+  length explicitly, since the default value may change in the future.
+
+* The target feature ``hvx-double`` is deprecated and will be removed in the
+  next release. LLVM IR generators should use target features ``hvx-length64b``
+  and ``hvx-length128b`` to indicate the vector length. The length should
+  always be specified when HVX code generation is enabled.
+
+
 Changes to the MIPS Target
 --------------------------
 
@@ -91,6 +120,15 @@ Changes to the PowerPC Target
 
 During this release ...
 
+Changes to the SystemZ Target
+-----------------------------
+
+During this release the SystemZ target has:
+
+* Added support for 128-bit atomic operations.
+
+* Added support for the "o" constraint for inline asm statements.
+
 Changes to the X86 Target
 -------------------------
 
diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td
index 22a3a0fe618f..3397fa41db1b 100644
--- a/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -238,6 +238,26 @@ def int_amdgcn_cvt_pkrtz : Intrinsic<
   [IntrNoMem, IntrSpeculatable]
 >;
 
+def int_amdgcn_cvt_pknorm_i16 : Intrinsic<
+  [llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
+  [IntrNoMem, IntrSpeculatable]
+>;
+
+def int_amdgcn_cvt_pknorm_u16 : Intrinsic<
+  [llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
+  [IntrNoMem, IntrSpeculatable]
+>;
+
+def int_amdgcn_cvt_pk_i16 : Intrinsic<
+  [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
+  [IntrNoMem, IntrSpeculatable]
+>;
+
+def int_amdgcn_cvt_pk_u16 : Intrinsic<
+  [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
+  [IntrNoMem, IntrSpeculatable]
+>;
+
 def int_amdgcn_class : Intrinsic<
   [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
   [IntrNoMem, IntrSpeculatable]
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index bd6177c5b3d9..7c000e2b1dc7 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -3738,6 +3738,15 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx512_kxnor_w : // TODO: remove this intrinsic
               Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
                         [IntrNoMem]>;
+  def int_x86_avx512_kunpck_bw : GCCBuiltin<"__builtin_ia32_kunpckhi">,
+              Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
+              [IntrNoMem]>;
+  def int_x86_avx512_kunpck_wd : GCCBuiltin<"__builtin_ia32_kunpcksi">,
+              Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+              [IntrNoMem]>;
+  def int_x86_avx512_kunpck_dq : GCCBuiltin<"__builtin_ia32_kunpckdi">,
+              Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
+              [IntrNoMem]>;
   def int_x86_avx512_kortestz_w : GCCBuiltin<"__builtin_ia32_kortestzhi">,
               Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty],
                         [IntrNoMem]>;
diff --git a/include/llvm/MC/MCAsmMacro.h b/include/llvm/MC/MCAsmMacro.h
new file mode 100644
index 000000000000..34d14abc9645
--- /dev/null
+++ b/include/llvm/MC/MCAsmMacro.h
@@ -0,0 +1,38 @@
+//===- MCAsmMacro.h - Assembly Macros ---------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MC_MCASMMACRO_H
+#define LLVM_MC_MCASMMACRO_H
+
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+
+namespace llvm {
+
+struct MCAsmMacroParameter {
+  StringRef Name;
+  std::vector<AsmToken> Value;
+  bool Required = false;
+  bool Vararg = false;
+
+  MCAsmMacroParameter() = default;
+};
+
+typedef std::vector<MCAsmMacroParameter> MCAsmMacroParameters;
+struct MCAsmMacro {
+  StringRef Name;
+  StringRef Body;
+  MCAsmMacroParameters Parameters;
+
+public:
+  MCAsmMacro(StringRef N, StringRef B, MCAsmMacroParameters P)
+      : Name(N), Body(B), Parameters(std::move(P)) {}
+};
+}; // namespace llvm
+
+#endif
diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h
index 432fc0ede072..358f67c4db6d 100644
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/MC/MCAsmMacro.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/SectionKind.h"
@@ -268,6 +269,9 @@
                                        unsigned UniqueID,
                                        const MCSymbolELF *Associated);
 
+    /// \brief Map of currently defined macros.
+    StringMap<MCAsmMacro> MacroMap;
+
   public:
     explicit MCContext(const MCAsmInfo *MAI, const MCRegisterInfo *MRI,
                        const MCObjectFileInfo *MOFI,
@@ -618,6 +622,17 @@ namespace llvm {
     // FIXME: We should really do something about that.
     LLVM_ATTRIBUTE_NORETURN void reportFatalError(SMLoc L, const Twine &Msg);
+
+    const MCAsmMacro *lookupMacro(StringRef Name) {
+      StringMap<MCAsmMacro>::iterator I = MacroMap.find(Name);
+      return (I == MacroMap.end()) ? nullptr : &I->getValue();
+    }
+
+    void defineMacro(StringRef Name, MCAsmMacro Macro) {
+      MacroMap.insert(std::make_pair(Name, std::move(Macro)));
+    }
+
+    void undefineMacro(StringRef Name) { MacroMap.erase(Name); }
   };
 
 } // end namespace llvm
 
diff --git a/include/llvm/Support/GenericDomTreeConstruction.h b/include/llvm/Support/GenericDomTreeConstruction.h
index 25175fe66aa8..9438c9e08850 100644
--- a/include/llvm/Support/GenericDomTreeConstruction.h
+++ b/include/llvm/Support/GenericDomTreeConstruction.h
@@ -698,24 +698,20 @@ struct SemiNCAInfo {
       return;
 
     // Recalculate the set of roots.
- DT.Roots = FindRoots(DT, BUI); - for (const NodePtr R : DT.Roots) { - const TreeNodePtr TN = DT.getNode(R); - // A CFG node was selected as a tree root, but the corresponding tree node - // is not connected to the virtual root. This is because the incremental - // algorithm does not really know or use the set of roots and can make a - // different (implicit) decision about which nodes within an infinite loop - // becomes a root. - if (TN && !DT.isVirtualRoot(TN->getIDom())) { - DEBUG(dbgs() << "Root " << BlockNamePrinter(R) - << " is not virtual root's child\n" - << "The entire tree needs to be rebuilt\n"); - // It should be possible to rotate the subtree instead of recalculating - // the whole tree, but this situation happens extremely rarely in - // practice. - CalculateFromScratch(DT, BUI); - return; - } + auto Roots = FindRoots(DT, BUI); + if (DT.Roots.size() != Roots.size() || + !std::is_permutation(DT.Roots.begin(), DT.Roots.end(), Roots.begin())) { + // The roots chosen in the CFG have changed. This is because the + // incremental algorithm does not really know or use the set of roots and + // can make a different (implicit) decision about which node within an + // infinite loop becomes a root. + + DEBUG(dbgs() << "Roots are different in updated trees\n" + << "The entire tree needs to be rebuilt\n"); + // It may be possible to update the tree without recalculating it, but + // we do not know yet how to do it, and it happens rarely in practise. + CalculateFromScratch(DT, BUI); + return; } } diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp index d94b0e5c2118..2e5c22447936 100644 --- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp +++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp @@ -163,7 +163,8 @@ uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) { DIType *BaseType = DDTy->getBaseType().resolve(); - assert(BaseType && "Unexpected invalid base type"); + if (!BaseType) + return 0; // If this is a derived type, go ahead and get the base type, unless it's a // reference then it's just the size of the field. Pointer types have no need diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 911e46235781..4ea59f504bd4 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -1391,7 +1391,8 @@ void DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) { if (!Name.empty()) addString(MemberDie, dwarf::DW_AT_name, Name); - addType(MemberDie, resolve(DT->getBaseType())); + if (DIType *Resolved = resolve(DT->getBaseType())) + addType(MemberDie, Resolved); addSourceLine(MemberDie, DT); diff --git a/lib/CodeGen/LivePhysRegs.cpp b/lib/CodeGen/LivePhysRegs.cpp index f4b43a9b8ead..277212cf7dac 100644 --- a/lib/CodeGen/LivePhysRegs.cpp +++ b/lib/CodeGen/LivePhysRegs.cpp @@ -205,14 +205,18 @@ void LivePhysRegs::addPristines(const MachineFunction &MF) { } void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) { - if (!MBB.succ_empty()) { - // To get the live-outs we simply merge the live-ins of all successors. - for (const MachineBasicBlock *Succ : MBB.successors()) - addBlockLiveIns(*Succ); - } else if (MBB.isReturnBlock()) { - // For the return block: Add all callee saved registers that are saved and - // restored (somewhere); This does not include callee saved registers that - // are unused and hence not saved and restored; they are called pristine. + // To get the live-outs we simply merge the live-ins of all successors. 
+ for (const MachineBasicBlock *Succ : MBB.successors()) + addBlockLiveIns(*Succ); + if (MBB.isReturnBlock()) { + // Return blocks are a special case because we currently don't mark up + // return instructions completely: specifically, there is no explicit + // use for callee-saved registers. So we add all callee saved registers + // that are saved and restored (somewhere). This does not include + // callee saved registers that are unused and hence not saved and + // restored; they are called pristine. + // FIXME: PEI should add explicit markings to return instructions + // instead of implicitly handling them here. const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); if (MFI.isCalleeSavedInfoValid()) { @@ -225,15 +229,8 @@ void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) { void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) { const MachineFunction &MF = *MBB.getParent(); - if (!MBB.succ_empty()) { - addPristines(MF); - addLiveOutsNoPristines(MBB); - } else if (MBB.isReturnBlock()) { - // For the return block: Add all callee saved registers. - const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (MFI.isCalleeSavedInfoValid()) - addCalleeSavedRegs(*this, MF); - } + addPristines(MF); + addLiveOutsNoPristines(MBB); } void LivePhysRegs::addLiveIns(const MachineBasicBlock &MBB) { diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 2c6b724c02df..03cb2e310c7e 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16409,7 +16409,9 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR && N1.getOperand(0).getOperand(1) == N2 && N1.getOperand(0).getOperand(0).getValueType().getVectorNumElements() == - VT.getVectorNumElements()) { + VT.getVectorNumElements() && + N1.getOperand(0).getOperand(0).getValueType().getSizeInBits() == + VT.getSizeInBits()) { return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0)); } diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp index c99c3b09d88a..1628ee28b8a3 100644 --- a/lib/CodeGen/SplitKit.cpp +++ b/lib/CodeGen/SplitKit.cpp @@ -491,9 +491,8 @@ VNInfo *SplitEditor::defValue(unsigned RegIdx, return VNI; } -void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) { - assert(ParentVNI && "Mapping NULL value"); - ValueForcePair &VFP = Values[std::make_pair(RegIdx, ParentVNI->id)]; +void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI) { + ValueForcePair &VFP = Values[std::make_pair(RegIdx, ParentVNI.id)]; VNInfo *VNI = VFP.getPointer(); // ParentVNI was either unmapped or already complex mapped. Either way, just @@ -777,7 +776,7 @@ SlotIndex SplitEditor::leaveIntvAfter(SlotIndex Idx) { // the source live range. The spiller also won't try to hoist this copy. if (SpillMode && !SlotIndex::isSameInstr(ParentVNI->def, Idx) && MI->readsVirtualRegister(Edit->getReg())) { - forceRecompute(0, ParentVNI); + forceRecompute(0, *ParentVNI); defFromParent(0, ParentVNI, Idx, *MI->getParent(), MI); return Idx; } @@ -835,7 +834,7 @@ void SplitEditor::overlapIntv(SlotIndex Start, SlotIndex End) { // The complement interval will be extended as needed by LRCalc.extend(). 
   if (ParentVNI)
-    forceRecompute(0, ParentVNI);
+    forceRecompute(0, *ParentVNI);
   DEBUG(dbgs() << " overlapIntv [" << Start << ';' << End << "):");
   RegAssign.insert(Start, End, OpenIdx);
   DEBUG(dump());
@@ -878,7 +877,7 @@ void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
     unsigned RegIdx = AssignI.value();
     if (AtBegin || !MBBI->readsVirtualRegister(Edit->getReg())) {
       DEBUG(dbgs() << " cannot find simple kill of RegIdx " << RegIdx << '\n');
-      forceRecompute(RegIdx, Edit->getParent().getVNInfoAt(Def));
+      forceRecompute(RegIdx, *Edit->getParent().getVNInfoAt(Def));
     } else {
       SlotIndex Kill = LIS.getInstructionIndex(*MBBI).getRegSlot();
       DEBUG(dbgs() << " move kill to " << Kill << '\t' << *MBBI);
@@ -982,7 +981,7 @@ void SplitEditor::computeRedundantBackCopies(
       }
     }
     if (!DominatedVNIs.empty()) {
-      forceRecompute(0, ParentVNI);
+      forceRecompute(0, *ParentVNI);
       for (auto VNI : DominatedVNIs) {
         BackCopies.push_back(VNI);
       }
@@ -1102,7 +1101,7 @@ void SplitEditor::hoistCopies() {
         NotToHoistSet.count(ParentVNI->id))
       continue;
     BackCopies.push_back(VNI);
-    forceRecompute(0, ParentVNI);
+    forceRecompute(0, *ParentVNI);
   }
 
   // If it is not beneficial to hoist all the BackCopies, simply remove
@@ -1428,6 +1427,41 @@ void SplitEditor::deleteRematVictims() {
   Edit->eliminateDeadDefs(Dead, None, &AA);
 }
 
+void SplitEditor::forceRecomputeVNI(const VNInfo &ParentVNI) {
+  // Fast-path for common case.
+  if (!ParentVNI.isPHIDef()) {
+    for (unsigned I = 0, E = Edit->size(); I != E; ++I)
+      forceRecompute(I, ParentVNI);
+    return;
+  }
+
+  // Trace value through phis.
+  SmallPtrSet<const VNInfo *, 4> Visited; ///< whether VNI was/is in worklist.
+  SmallVector<const VNInfo *, 4> WorkList;
+  Visited.insert(&ParentVNI);
+  WorkList.push_back(&ParentVNI);
+
+  const LiveInterval &ParentLI = Edit->getParent();
+  const SlotIndexes &Indexes = *LIS.getSlotIndexes();
+  do {
+    const VNInfo &VNI = *WorkList.back();
+    WorkList.pop_back();
+    for (unsigned I = 0, E = Edit->size(); I != E; ++I)
+      forceRecompute(I, VNI);
+    if (!VNI.isPHIDef())
+      continue;
+
+    MachineBasicBlock &MBB = *Indexes.getMBBFromIndex(VNI.def);
+    for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+      SlotIndex PredEnd = Indexes.getMBBEndIdx(Pred);
+      VNInfo *PredVNI = ParentLI.getVNInfoBefore(PredEnd);
+      assert(PredVNI && "Value available in PhiVNI predecessor");
+      if (Visited.insert(PredVNI).second)
+        WorkList.push_back(PredVNI);
+    }
+  } while(!WorkList.empty());
+}
+
 void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
   ++NumFinished;
 
@@ -1444,8 +1478,7 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
     // Force rematted values to be recomputed everywhere.
     // The new live ranges may be truncated.
     if (Edit->didRematerialize(ParentVNI))
-      for (unsigned i = 0, e = Edit->size(); i != e; ++i)
-        forceRecompute(i, ParentVNI);
+      forceRecomputeVNI(*ParentVNI);
   }
 
   // Hoist back-copies to the complement interval when in spill mode.
diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h
index c0608893d4e5..2dafaf587801 100644
--- a/lib/CodeGen/SplitKit.h
+++ b/lib/CodeGen/SplitKit.h
@@ -357,7 +357,11 @@ class LLVM_LIBRARY_VISIBILITY SplitEditor {
   /// recomputed by LiveRangeCalc::extend regardless of the number of defs.
   /// This is used for values whose live range doesn't match RegAssign exactly.
   /// They could have rematerialized, or back-copies may have been moved.
- void forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI); + void forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI); + + /// Calls forceRecompute() on any affected regidx and on ParentVNI + /// predecessors in case of a phi definition. + void forceRecomputeVNI(const VNInfo &ParentVNI); /// defFromParent - Define Reg from ParentVNI at UseIdx using either /// rematerialization or a COPY from parent. Return the new value. diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index c258d1a4e3ad..c56a022c6705 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -75,7 +75,6 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name=="ssse3.pabs.d.128" || // Added in 6.0 Name.startswith("avx512.mask.shuf.i") || // Added in 6.0 Name.startswith("avx512.mask.shuf.f") || // Added in 6.0 - Name.startswith("avx512.kunpck") || //added in 6.0 Name.startswith("avx2.pabs.") || // Added in 6.0 Name.startswith("avx512.mask.pabs.") || // Added in 6.0 Name.startswith("avx512.broadcastm") || // Added in 6.0 @@ -1063,12 +1062,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0)); Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); - } else if (IsX86 && (Name.startswith("avx512.kunpck"))) { - uint64_t Shift = CI->getType()->getScalarSizeInBits() / 2; - uint64_t And = (1ULL << Shift) - 1; - Value* LowBits = Builder.CreateAnd(CI->getArgOperand(0), And); - Value* HighBits = Builder.CreateShl(CI->getArgOperand(1), Shift); - Rep = Builder.CreateOr(LowBits, HighBits); } else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) { Type *I32Ty = Type::getInt32Ty(C); Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0), diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 2259136c6ec4..ce3b70bed740 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -83,27 +83,6 @@ namespace { typedef std::vector MCAsmMacroArgument; typedef std::vector MCAsmMacroArguments; -struct MCAsmMacroParameter { - StringRef Name; - MCAsmMacroArgument Value; - bool Required = false; - bool Vararg = false; - - MCAsmMacroParameter() = default; -}; - -typedef std::vector MCAsmMacroParameters; - -struct MCAsmMacro { - StringRef Name; - StringRef Body; - MCAsmMacroParameters Parameters; - -public: - MCAsmMacro(StringRef N, StringRef B, MCAsmMacroParameters P) - : Name(N), Body(B), Parameters(std::move(P)) {} -}; - /// \brief Helper class for storing information about an active macro /// instantiation. struct MacroInstantiation { @@ -164,9 +143,6 @@ class AsmParser : public MCAsmParser { /// addDirectiveHandler. StringMap ExtensionDirectiveMap; - /// \brief Map of currently defined macros. - StringMap MacroMap; - /// \brief Stack of active macro instantiations. std::vector ActiveMacros; @@ -308,17 +284,6 @@ class AsmParser : public MCAsmParser { /// \brief Control a flag in the parser that enables or disables macros. void setMacrosEnabled(bool Flag) {MacrosEnabledFlag = Flag;} - /// \brief Lookup a previously defined macro. - /// \param Name Macro name. - /// \returns Pointer to macro. NULL if no such macro was defined. - const MCAsmMacro* lookupMacro(StringRef Name); - - /// \brief Define a new macro with the given name and information. - void defineMacro(StringRef Name, MCAsmMacro Macro); - - /// \brief Undefine a macro. If no such macro was defined, it's a no-op. 
- void undefineMacro(StringRef Name); - /// \brief Are we inside a macro instantiation? bool isInsideMacroInstantiation() {return !ActiveMacros.empty();} @@ -1841,7 +1806,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, // If macros are enabled, check to see if this is a macro instantiation. if (areMacrosEnabled()) - if (const MCAsmMacro *M = lookupMacro(IDVal)) { + if (const MCAsmMacro *M = getContext().lookupMacro(IDVal)) { return handleMacroEntry(M, IDLoc); } @@ -2720,17 +2685,6 @@ bool AsmParser::parseMacroArguments(const MCAsmMacro *M, return TokError("too many positional arguments"); } -const MCAsmMacro *AsmParser::lookupMacro(StringRef Name) { - StringMap::iterator I = MacroMap.find(Name); - return (I == MacroMap.end()) ? nullptr : &I->getValue(); -} - -void AsmParser::defineMacro(StringRef Name, MCAsmMacro Macro) { - MacroMap.insert(std::make_pair(Name, std::move(Macro))); -} - -void AsmParser::undefineMacro(StringRef Name) { MacroMap.erase(Name); } - bool AsmParser::handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) { // Arbitrarily limit macro nesting depth (default matches 'as'). We can // eliminate this, although we should protect against infinite loops. @@ -4249,7 +4203,7 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) { eatToEndOfStatement(); } - if (lookupMacro(Name)) { + if (getContext().lookupMacro(Name)) { return Error(DirectiveLoc, "macro '" + Name + "' is already defined"); } @@ -4257,7 +4211,7 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) { const char *BodyEnd = EndToken.getLoc().getPointer(); StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart); checkForBadMacro(DirectiveLoc, Name, Body, Parameters); - defineMacro(Name, MCAsmMacro(Name, Body, std::move(Parameters))); + getContext().defineMacro(Name, MCAsmMacro(Name, Body, std::move(Parameters))); return false; } @@ -4416,10 +4370,10 @@ bool AsmParser::parseDirectivePurgeMacro(SMLoc DirectiveLoc) { "unexpected token in '.purgem' directive")) return true; - if (!lookupMacro(Name)) + if (!getContext().lookupMacro(Name)) return Error(DirectiveLoc, "macro '" + Name + "' is not defined"); - undefineMacro(Name); + getContext().undefineMacro(Name); return false; } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 49929441ef21..21192a2c1cc8 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3957,6 +3957,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CVT_F32_UBYTE2) NODE_NAME_CASE(CVT_F32_UBYTE3) NODE_NAME_CASE(CVT_PKRTZ_F16_F32) + NODE_NAME_CASE(CVT_PKNORM_I16_F32) + NODE_NAME_CASE(CVT_PKNORM_U16_F32) + NODE_NAME_CASE(CVT_PK_I16_I32) + NODE_NAME_CASE(CVT_PK_U16_U32) NODE_NAME_CASE(FP_TO_FP16) NODE_NAME_CASE(FP16_ZEXT) NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 5c31bddd9b1a..039ee174e5b7 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -417,6 +417,10 @@ enum NodeType : unsigned { // Convert two float 32 numbers into a single register holding two packed f16 // with round to zero. CVT_PKRTZ_F16_F32, + CVT_PKNORM_I16_F32, + CVT_PKNORM_U16_F32, + CVT_PK_I16_I32, + CVT_PK_U16_U32, // Same as the standard node, except the high bits of the resulting integer // are known 0. 
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 8156599528c2..61892efe39e0 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -108,3 +108,21 @@ int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
 
   return MCOp;
 }
+
+// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
+bool AMDGPUInstrInfo::isUniformMMO(const MachineMemOperand *MMO) {
+  const Value *Ptr = MMO->getValue();
+  // UndefValue means this is a load of a kernel input. These are uniform.
+  // Sometimes LDS instructions have constant pointers.
+  // If Ptr is null, then that means this mem operand contains a
+  // PseudoSourceValue like GOT.
+  if (!Ptr || isa<UndefValue>(Ptr) ||
+      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
+    return true;
+
+  if (const Argument *Arg = dyn_cast<Argument>(Ptr))
+    return AMDGPU::isArgPassedInSGPR(Arg);
+
+  const Instruction *I = dyn_cast<Instruction>(Ptr);
+  return I && I->getMetadata("amdgpu.uniform");
+}
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index a9fcd4834638..74e14ef8fbd8 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -50,6 +50,8 @@ class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
   /// Return -1 if the target-specific opcode for the pseudo instruction does
   /// not exist. If Opcode is not a pseudo instruction, this is identity.
   int pseudoToMCOpcode(int Opcode) const;
+
+  static bool isUniformMMO(const MachineMemOperand *MMO);
 };
 
 } // End llvm namespace
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index c024010f3e96..65c483d85c5a 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -35,6 +35,10 @@ def AMDGPUFPPackOp : SDTypeProfile<1, 2,
   [SDTCisFP<1>, SDTCisSameAs<1, 2>]
 >;
 
+def AMDGPUIntPackOp : SDTypeProfile<1, 2,
+  [SDTCisInt<1>, SDTCisSameAs<1, 2>]
+>;
+
 def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
   [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
 >;
@@ -142,6 +146,10 @@ def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
 def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
 
 def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
+def AMDGPUpknorm_i16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>;
+def AMDGPUpknorm_u16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>;
+def AMDGPUpk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
+def AMDGPUpk_u16_u32 : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
 def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
 def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
 
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 1ed02fae085a..e3df6d9bee88 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -120,7 +120,7 @@ static bool isInstrUniform(const MachineInstr &MI) {
     return false;
 
   const MachineMemOperand *MMO = *MI.memoperands_begin();
-  return AMDGPU::isUniformMMO(MMO);
+  return AMDGPUInstrInfo::isUniformMMO(MMO);
 }
 
 const RegisterBankInfo::InstructionMapping &
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 415d8a512aa8..6d89aa6968e9 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -205,6 +205,7 @@ SITargetLowering::SITargetLowering(const
TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); @@ -1085,7 +1086,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS, bool SITargetLowering::isMemOpUniform(const SDNode *N) const { const MemSDNode *MemNode = cast(N); - return AMDGPU::isUniformMMO(MemNode->getMemOperand()); + return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand()); } TargetLoweringBase::LegalizeTypeAction @@ -3517,7 +3518,8 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::INTRINSIC_WO_CHAIN: { unsigned IID = cast(N->getOperand(0))->getZExtValue(); - if (IID == Intrinsic::amdgcn_cvt_pkrtz) { + switch (IID) { + case Intrinsic::amdgcn_cvt_pkrtz: { SDValue Src0 = N->getOperand(1); SDValue Src1 = N->getOperand(2); SDLoc SL(N); @@ -3526,6 +3528,29 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt)); return; } + case Intrinsic::amdgcn_cvt_pknorm_i16: + case Intrinsic::amdgcn_cvt_pknorm_u16: + case Intrinsic::amdgcn_cvt_pk_i16: + case Intrinsic::amdgcn_cvt_pk_u16: { + SDValue Src0 = N->getOperand(1); + SDValue Src1 = N->getOperand(2); + SDLoc SL(N); + unsigned Opcode; + + if (IID == Intrinsic::amdgcn_cvt_pknorm_i16) + Opcode = AMDGPUISD::CVT_PKNORM_I16_F32; + else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16) + Opcode = AMDGPUISD::CVT_PKNORM_U16_F32; + else if (IID == Intrinsic::amdgcn_cvt_pk_i16) + Opcode = AMDGPUISD::CVT_PK_I16_I32; + else + Opcode = AMDGPUISD::CVT_PK_U16_U32; + + SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1); + Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt)); + return; + } + } break; } case ISD::SELECT: { @@ -4424,10 +4449,27 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_ubfe: return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case Intrinsic::amdgcn_cvt_pkrtz: { - // FIXME: Stop adding cast if v2f16 legal. + case Intrinsic::amdgcn_cvt_pkrtz: + case Intrinsic::amdgcn_cvt_pknorm_i16: + case Intrinsic::amdgcn_cvt_pknorm_u16: + case Intrinsic::amdgcn_cvt_pk_i16: + case Intrinsic::amdgcn_cvt_pk_u16: { + // FIXME: Stop adding cast if v2f16/v2i16 are legal. 
EVT VT = Op.getValueType(); - SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32, + unsigned Opcode; + + if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz) + Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32; + else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16) + Opcode = AMDGPUISD::CVT_PKNORM_I16_F32; + else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16) + Opcode = AMDGPUISD::CVT_PKNORM_U16_F32; + else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16) + Opcode = AMDGPUISD::CVT_PK_I16_I32; + else + Opcode = AMDGPUISD::CVT_PK_U16_U32; + + SDValue Node = DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2)); return DAG.getNode(ISD::BITCAST, DL, VT, Node); } diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 125a3b22d0cf..bf9d5bc6ebdc 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -871,24 +871,6 @@ bool isArgPassedInSGPR(const Argument *A) { } } -// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence. -bool isUniformMMO(const MachineMemOperand *MMO) { - const Value *Ptr = MMO->getValue(); - // UndefValue means this is a load of a kernel input. These are uniform. - // Sometimes LDS instructions have constant pointers. - // If Ptr is null, then that means this mem operand contains a - // PseudoSourceValue like GOT. - if (!Ptr || isa(Ptr) || - isa(Ptr) || isa(Ptr)) - return true; - - if (const Argument *Arg = dyn_cast(Ptr)) - return isArgPassedInSGPR(Arg); - - const Instruction *I = dyn_cast(Ptr); - return I && I->getMetadata("amdgpu.uniform"); -} - int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { if (isGCN3Encoding(ST)) return ByteOffset; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index a215b445378e..9515001b63d2 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -363,7 +363,6 @@ LLVM_READNONE bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi); bool isArgPassedInSGPR(const Argument *Arg); -bool isUniformMMO(const MachineMemOperand *MMO); /// \returns The encoding that will be used for \p ByteOffset in the SMRD /// offset field. 
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index ef90b68db1a8..56b934f92f61 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -407,11 +407,11 @@ defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT, int_amdgcn_mbcnt_hi>; defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT, AMDGPUldexp>; defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT>; // TODO: set "Uses = dst" -defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT>; -defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT>; +defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT, AMDGPUpknorm_i16_f32>; +defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT, AMDGPUpknorm_u16_f32>; defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT, AMDGPUpkrtz_f16_f32>; -defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT>; -defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT>; +defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT, AMDGPUpk_u16_u32>; +defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT, AMDGPUpk_i16_i32>; } // End SubtargetPredicate = isGCN diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index a7059c6914df..4ddc1f0ba429 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -396,10 +396,14 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // rip-relative addressing is actually relative to the *next* instruction. // Since an immediate can follow the mod/rm byte for an instruction, this - // means that we need to bias the immediate field of the instruction with - // the size of the immediate field. If we have this case, add it into the + // means that we need to bias the displacement field of the instruction with + // the size of the immediate field. If we have this case, add it into the // expression to emit. - int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0; + // Note: rip-relative addressing using immediate displacement values should + // not be adjusted, assuming it was the user's intent. + int ImmSize = !Disp.isImm() && X86II::hasImm(TSFlags) + ? X86II::getSizeOfImm(TSFlags) + : 0; EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS, Fixups, -ImmSize); diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 71526dd77f11..2a501efbc1bf 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -370,6 +370,8 @@ static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI, static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, char Mode, raw_ostream &O) { unsigned Reg = MO.getReg(); + bool EmitPercent = true; + switch (Mode) { default: return true; // Unknown mode. case 'b': // Print QImode register @@ -384,6 +386,9 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, case 'k': // Print SImode register Reg = getX86SubSuperRegister(Reg, 32); break; + case 'V': + EmitPercent = false; + LLVM_FALLTHROUGH; case 'q': // Print 64-bit register names if 64-bit integer registers are available. // Otherwise, print 32-bit register names. 
@@ -391,7 +396,10 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, break; } - O << '%' << X86ATTInstPrinter::getRegisterName(Reg); + if (EmitPercent) + O << '%'; + + O << X86ATTInstPrinter::getRegisterName(Reg); return false; } @@ -464,6 +472,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, case 'w': // Print HImode register case 'k': // Print SImode register case 'q': // Print DImode register + case 'V': // Print native register without '%' if (MO.isReg()) return printAsmMRegister(*this, MO, ExtraCode[0], O); printOperand(*this, MI, OpNo, O); diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp index ba7280c29cc9..bc0f55f581ff 100644 --- a/lib/Target/X86/X86DomainReassignment.cpp +++ b/lib/Target/X86/X86DomainReassignment.cpp @@ -663,8 +663,10 @@ void X86DomainReassignment::initConverters() { createReplacer(X86::XOR32rr, X86::KXORDrr); createReplacer(X86::XOR64rr, X86::KXORQrr); - createReplacer(X86::TEST32rr, X86::KTESTDrr); - createReplacer(X86::TEST64rr, X86::KTESTQrr); + // TODO: KTEST is not a replacement for TEST due to flag differences. Need + // to prove only Z flag is used. + //createReplacer(X86::TEST32rr, X86::KTESTDrr); + //createReplacer(X86::TEST64rr, X86::KTESTQrr); } if (STI->hasDQI()) { @@ -684,8 +686,10 @@ void X86DomainReassignment::initConverters() { createReplacer(X86::SHR8ri, X86::KSHIFTRBri); createReplacer(X86::SHL8ri, X86::KSHIFTLBri); - createReplacer(X86::TEST8rr, X86::KTESTBrr); - createReplacer(X86::TEST16rr, X86::KTESTWrr); + // TODO: KTEST is not a replacement for TEST due to flag differences. Need + // to prove only Z flag is used. + //createReplacer(X86::TEST8rr, X86::KTESTBrr); + //createReplacer(X86::TEST16rr, X86::KTESTWrr); createReplacer(X86::XOR8rr, X86::KXORBrr); } diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 38885c42b529..9237833a2cd0 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -17017,24 +17017,6 @@ static bool hasNonFlagsUse(SDValue Op) { return false; } -// Emit KTEST instruction for bit vectors on AVX-512 -static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - if (Op.getOpcode() == ISD::BITCAST) { - auto hasKTEST = [&](MVT VT) { - unsigned SizeInBits = VT.getSizeInBits(); - return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) || - (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64)); - }; - SDValue Op0 = Op.getOperand(0); - MVT Op0VT = Op0.getValueType().getSimpleVT(); - if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 && - hasKTEST(Op0VT)) - return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0); - } - return SDValue(); -} - /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, @@ -17079,9 +17061,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, // doing a separate TEST. TEST always sets OF and CF to 0, so unless // we prove that the arithmetic won't overflow, we can't use OF or CF. if (Op.getResNo() != 0 || NeedOF || NeedCF) { - // Emit KTEST for bit vectors - if (auto Node = EmitKTEST(Op, DAG, Subtarget)) - return Node; // Emit a CMP with 0, which is the TEST pattern. 
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); @@ -17310,10 +17289,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, } if (Opcode == 0) { - // Emit KTEST for bit vectors - if (auto Node = EmitKTEST(Op, DAG, Subtarget)) - return Node; - // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); @@ -18093,6 +18068,34 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, return Result; } +// Try to select this as a KTEST+SETCC if possible. +static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC, + const SDLoc &dl, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // Only support equality comparisons. + if (CC != ISD::SETEQ && CC != ISD::SETNE) + return SDValue(); + + // Must be a bitcast from vXi1. + if (Op0.getOpcode() != ISD::BITCAST) + return SDValue(); + + Op0 = Op0.getOperand(0); + MVT VT = Op0.getSimpleValueType(); + if (!(Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1)) && + !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))) + return SDValue(); + + X86::CondCode X86CC; + if (isNullConstant(Op1)) { + X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE; + } else + return SDValue(); + + SDValue KTEST = DAG.getNode(X86ISD::KTEST, dl, MVT::i32, Op0, Op0); + return getSETCC(X86CC, KTEST, dl, DAG); +} + SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); @@ -18115,6 +18118,10 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return NewSetCC; } + // Try to lower using KTEST. + if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget)) + return NewSetCC; + // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. if ((isOneConstant(Op1) || isNullConstant(Op1)) && @@ -20525,6 +20532,18 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Mask = DAG.getBitcast(MaskVT, Mask); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); } + case KUNPCK: { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); + + SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); + SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); + // Arguments should be swapped. + SDValue Res = DAG.getNode(IntrData->Opc0, dl, + MVT::getVectorVT(MVT::i1, VT.getSizeInBits()), + Src2, Src1); + return DAG.getBitcast(VT, Res); + } case MASK_BINOP: { MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); @@ -27094,28 +27113,57 @@ static unsigned getOpcodeForRetpoline(unsigned RPOpc) { static const char *getRetpolineSymbol(const X86Subtarget &Subtarget, unsigned Reg) { + if (Subtarget.useRetpolineExternalThunk()) { + // When using an external thunk for retpolines, we pick names that match the + // names GCC happens to use as well. This helps simplify the implementation + // of the thunks for kernels where they have no easy ability to create + // aliases and are doing non-trivial configuration of the thunk's body. For + // example, the Linux kernel will do boot-time hot patching of the thunk + // bodies and cannot easily export aliases of these to loaded modules. 
+ // + // Note that at any point in the future, we may need to change the semantics + // of how we implement retpolines and at that time will likely change the + // name of the called thunk. Essentially, there is no hard guarantee that + // LLVM will generate calls to specific thunks, we merely make a best-effort + // attempt to help out kernels and other systems where duplicating the + // thunks is costly. + switch (Reg) { + case X86::EAX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_eax"; + case X86::ECX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_ecx"; + case X86::EDX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_edx"; + case X86::EDI: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_edi"; + case X86::R11: + assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); + return "__x86_indirect_thunk_r11"; + } + llvm_unreachable("unexpected reg for retpoline"); + } + + // When targeting an internal COMDAT thunk use an LLVM-specific name. switch (Reg) { - case 0: - assert(!Subtarget.is64Bit() && "R11 should always be available on x64"); - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_push" - : "__llvm_retpoline_push"; case X86::EAX: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_eax" - : "__llvm_retpoline_eax"; + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_eax"; case X86::ECX: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_ecx" - : "__llvm_retpoline_ecx"; + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_ecx"; case X86::EDX: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_edx" - : "__llvm_retpoline_edx"; + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_edx"; + case X86::EDI: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_edi"; case X86::R11: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_r11" - : "__llvm_retpoline_r11"; + assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); + return "__llvm_retpoline_r11"; } llvm_unreachable("unexpected reg for retpoline"); } @@ -27134,15 +27182,13 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, // just use R11, but we scan for uses anyway to ensure we don't generate // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't // already a register use operand to the call to hold the callee. If none - // are available, push the callee instead. This is less efficient, but is - // necessary for functions using 3 regparms. Such function calls are - // (currently) not eligible for tail call optimization, because there is no - // scratch register available to hold the address of the callee. + // are available, use EDI instead. EDI is chosen because EBX is the PIC base + // register and ESI is the base pointer to realigned stack frames with VLAs. SmallVector AvailableRegs; if (Subtarget.is64Bit()) AvailableRegs.push_back(X86::R11); else - AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX}); + AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI}); // Zero out any registers that are already used. 
for (const auto &MO : MI.operands()) { @@ -27160,30 +27206,18 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, break; } } + if (!AvailableReg) + report_fatal_error("calling convention incompatible with retpoline, no " + "available registers"); const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg); - if (AvailableReg == 0) { - // No register available. Use PUSH. This must not be a tailcall, and this - // must not be x64. - if (Subtarget.is64Bit()) - report_fatal_error( - "Cannot make an indirect call on x86-64 using both retpoline and a " - "calling convention that preservers r11"); - if (Opc != X86::CALLpcrel32) - report_fatal_error("Cannot make an indirect tail call on x86 using " - "retpoline without a preserved register"); - BuildMI(*BB, MI, DL, TII->get(X86::PUSH32r)).addReg(CalleeVReg); - MI.getOperand(0).ChangeToES(Symbol); - MI.setDesc(TII->get(Opc)); - } else { - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg) - .addReg(CalleeVReg); - MI.getOperand(0).ChangeToES(Symbol); - MI.setDesc(TII->get(Opc)); - MachineInstrBuilder(*BB->getParent(), &MI) - .addReg(AvailableReg, RegState::Implicit | RegState::Kill); - } + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg) + .addReg(CalleeVReg); + MI.getOperand(0).ChangeToES(Symbol); + MI.setDesc(TII->get(Opc)); + MachineInstrBuilder(*BB->getParent(), &MI) + .addReg(AvailableReg, RegState::Implicit | RegState::Kill); return BB; } @@ -30432,53 +30466,6 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, SDValue N0 = BitCast.getOperand(0); EVT VecVT = N0->getValueType(0); - if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() && - N0->getOpcode() == ISD::OR) { - SDValue Op0 = N0->getOperand(0); - SDValue Op1 = N0->getOperand(1); - MVT TrunckVT; - MVT BitcastVT; - switch (VT.getSimpleVT().SimpleTy) { - default: - return SDValue(); - case MVT::v16i1: - TrunckVT = MVT::i8; - BitcastVT = MVT::v8i1; - break; - case MVT::v32i1: - TrunckVT = MVT::i16; - BitcastVT = MVT::v16i1; - break; - case MVT::v64i1: - TrunckVT = MVT::i32; - BitcastVT = MVT::v32i1; - break; - } - bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL; - bool isArg0UndefLeft = - Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND; - bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL; - bool isArg1UndefLeft = - Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND; - SDValue OpLeft; - SDValue OpRight; - if (isArg0UndefRight && isArg1UndefLeft) { - OpLeft = Op0; - OpRight = Op1; - } else if (isArg1UndefRight && isArg0UndefLeft) { - OpLeft = Op1; - OpRight = Op0; - } else - return SDValue(); - SDLoc DL(BitCast); - SDValue Shr = OpLeft->getOperand(0); - SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr); - SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1); - SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight); - SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2); - } - if (!VT.isScalarInteger() || !VecVT.isSimple()) return SDValue(); diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 0782d5598746..fae0889950b2 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t { COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, EXPAND_FROM_MEM, - TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, FIXUPIMM, 
FIXUPIMM_MASKZ, FIXUPIMMS, + TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS, FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP, ROUNDP, ROUNDS }; @@ -479,6 +479,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0), X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0), X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD, X86ISD::FADD_RND), diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp index 223fa5771498..d03826bbe992 100644 --- a/lib/Target/X86/X86RetpolineThunks.cpp +++ b/lib/Target/X86/X86RetpolineThunks.cpp @@ -43,7 +43,7 @@ static const char R11ThunkName[] = "__llvm_retpoline_r11"; static const char EAXThunkName[] = "__llvm_retpoline_eax"; static const char ECXThunkName[] = "__llvm_retpoline_ecx"; static const char EDXThunkName[] = "__llvm_retpoline_edx"; -static const char PushThunkName[] = "__llvm_retpoline_push"; +static const char EDIThunkName[] = "__llvm_retpoline_edi"; namespace { class X86RetpolineThunks : public MachineFunctionPass { @@ -74,7 +74,6 @@ class X86RetpolineThunks : public MachineFunctionPass { void createThunkFunction(Module &M, StringRef Name); void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg); - void insert32BitPushReturnAddrClobber(MachineBasicBlock &MBB); void populateThunk(MachineFunction &MF, Optional Reg = None); }; @@ -127,7 +126,7 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { createThunkFunction(M, R11ThunkName); else for (StringRef Name : - {EAXThunkName, ECXThunkName, EDXThunkName, PushThunkName}) + {EAXThunkName, ECXThunkName, EDXThunkName, EDIThunkName}) createThunkFunction(M, Name); InsertedThunks = true; return true; @@ -151,9 +150,8 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { populateThunk(MF, X86::R11); } else { // For 32-bit targets we need to emit a collection of thunks for various - // possible scratch registers as well as a fallback that is used when - // there are no scratch registers and assumes the retpoline target has - // been pushed. + // possible scratch registers as well as a fallback that uses EDI, which is + // normally callee saved. // __llvm_retpoline_eax: // calll .Leax_call_target // .Leax_capture_spec: @@ -174,32 +172,18 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { // movl %edx, (%esp) // retl // - // This last one is a bit more special and so needs a little extra - // handling. - // __llvm_retpoline_push: - // calll .Lpush_call_target - // .Lpush_capture_spec: - // pause - // lfence - // jmp .Lpush_capture_spec - // .align 16 - // .Lpush_call_target: - // # Clear pause_loop return address. - // addl $4, %esp - // # Top of stack words are: Callee, RA. Exchange Callee and RA. - // pushl 4(%esp) # Push callee - // pushl 4(%esp) # Push RA - // popl 8(%esp) # Pop RA to final RA - // popl (%esp) # Pop callee to next top of stack - // retl # Ret to callee + // __llvm_retpoline_edi: + // ... 
# Same setup + // movl %edi, (%esp) + // retl if (MF.getName() == EAXThunkName) populateThunk(MF, X86::EAX); else if (MF.getName() == ECXThunkName) populateThunk(MF, X86::ECX); else if (MF.getName() == EDXThunkName) populateThunk(MF, X86::EDX); - else if (MF.getName() == PushThunkName) - populateThunk(MF); + else if (MF.getName() == EDIThunkName) + populateThunk(MF, X86::EDI); else llvm_unreachable("Invalid thunk name on x86-32!"); } @@ -240,31 +224,6 @@ void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB, .addReg(Reg); } -void X86RetpolineThunks::insert32BitPushReturnAddrClobber( - MachineBasicBlock &MBB) { - // The instruction sequence we use to replace the return address without - // a scratch register is somewhat complicated: - // # Clear capture_spec from return address. - // addl $4, %esp - // # Top of stack words are: Callee, RA. Exchange Callee and RA. - // pushl 4(%esp) # Push callee - // pushl 4(%esp) # Push RA - // popl 8(%esp) # Pop RA to final RA - // popl (%esp) # Pop callee to next top of stack - // retl # Ret to callee - BuildMI(&MBB, DebugLoc(), TII->get(X86::ADD32ri), X86::ESP) - .addReg(X86::ESP) - .addImm(4); - addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP, - false, 4); - addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP, - false, 4); - addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP, - false, 8); - addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP, - false, 0); -} - void X86RetpolineThunks::populateThunk(MachineFunction &MF, Optional Reg) { // Set MF properties. We never use vregs... @@ -301,11 +260,6 @@ void X86RetpolineThunks::populateThunk(MachineFunction &MF, CaptureSpec->addSuccessor(CaptureSpec); CallTarget->setAlignment(4); - if (Reg) { - insertRegReturnAddrClobber(*CallTarget, *Reg); - } else { - assert(!Is64Bit && "We only support non-reg thunks on 32-bit x86!"); - insert32BitPushReturnAddrClobber(*CallTarget); - } + insertRegReturnAddrClobber(*CallTarget, *Reg); BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc)); } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 40e52ee755e5..2f2f0696366c 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3264,6 +3264,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::amdgcn_cvt_pknorm_i16: + case Intrinsic::amdgcn_cvt_pknorm_u16: + case Intrinsic::amdgcn_cvt_pk_i16: + case Intrinsic::amdgcn_cvt_pk_u16: { + Value *Src0 = II->getArgOperand(0); + Value *Src1 = II->getArgOperand(1); + + if (isa(Src0) && isa(Src1)) + return replaceInstUsesWith(*II, UndefValue::get(II->getType())); + + break; + } case Intrinsic::amdgcn_ubfe: case Intrinsic::amdgcn_sbfe: { // Decompose simple cases into standard shifts. 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll new file mode 100644 index 000000000000..241b708e7baf --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll @@ -0,0 +1,84 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}s_cvt_pk_i16_i32: +; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} +; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] +; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[X]], [[VY]] +; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[X]], [[VY]] +define amdgpu_kernel void @s_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { + %result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %y) + %r = bitcast <2 x i16> %result to i32 + store i32 %r, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_cvt_pk_i16_samereg_i32: +; GCN: s_load_dword [[X:s[0-9]+]] +; GCN: v_cvt_pk_i16_i32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]] +define amdgpu_kernel void @s_cvt_pk_i16_samereg_i32(i32 addrspace(1)* %out, i32 %x) #0 { + %result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %x) + %r = bitcast <2 x i16> %result to i32 + store i32 %r, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pk_i16_i32: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[A]], [[B]] +; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @v_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.gep + %b = load volatile i32, i32 addrspace(1)* %b.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %a, i32 %b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pk_i16_i32_reg_imm: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: v_cvt_pk_i16_i32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1 +define amdgpu_kernel void @v_cvt_pk_i16_i32_reg_imm(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %a, i32 1) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pk_i16_i32_imm_reg: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, 1, [[A]] 
+; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, 1, [[A]] +define amdgpu_kernel void @v_cvt_pk_i16_i32_imm_reg(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 1, i32 %a) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 + + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll new file mode 100644 index 000000000000..8d5c9aa95219 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll @@ -0,0 +1,84 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}s_cvt_pk_u16_u32: +; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} +; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] +; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[X]], [[VY]] +; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[X]], [[VY]] +define amdgpu_kernel void @s_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { + %result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %y) + %r = bitcast <2 x i16> %result to i32 + store i32 %r, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_cvt_pk_u16_samereg_i32: +; GCN: s_load_dword [[X:s[0-9]+]] +; GCN: v_cvt_pk_u16_u32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]] +define amdgpu_kernel void @s_cvt_pk_u16_samereg_i32(i32 addrspace(1)* %out, i32 %x) #0 { + %result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %x) + %r = bitcast <2 x i16> %result to i32 + store i32 %r, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pk_u16_u32: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[A]], [[B]] +; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @v_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.gep + %b = load volatile i32, i32 addrspace(1)* %b.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %a, i32 %b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pk_u16_u32_reg_imm: +; GCN: {{buffer|flat|global}}_load_dword 
[[A:v[0-9]+]] +; GCN: v_cvt_pk_u16_u32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1 +define amdgpu_kernel void @v_cvt_pk_u16_u32_reg_imm(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %a, i32 1) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pk_u16_u32_imm_reg: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, 1, [[A]] +; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, 1, [[A]] +define amdgpu_kernel void @v_cvt_pk_u16_u32_imm_reg(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 1, i32 %a) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 + + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll new file mode 100644 index 000000000000..822e8c2886ba --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll @@ -0,0 +1,164 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32: +; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} +; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] +; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]] +; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[X]], [[VY]] +define amdgpu_kernel void @s_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 { + %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %y) + %r = bitcast <2 x i16> %result to i32 + store i32 %r, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_cvt_pknorm_i16_samereg_f32: +; GCN: s_load_dword [[X:s[0-9]+]] +; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]] +define amdgpu_kernel void @s_cvt_pknorm_i16_samereg_f32(i32 addrspace(1)* %out, float %x) #0 { + %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %x) + %r = bitcast <2 x i16> %result to i32 + store i32 %r, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[A]], 
[[B]] +; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @v_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_reg_imm: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0 +define amdgpu_kernel void @v_cvt_pknorm_i16_f32_reg_imm(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float 1.0) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_imm_reg: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, 1.0, [[A]] +; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, 1.0, [[A]] +define amdgpu_kernel void @v_cvt_pknorm_i16_f32_imm_reg(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float 1.0, float %a) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_lo: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]] +define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %neg.a = fsub float -0.0, %a + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.a, float %b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_hi: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, 
[[A]], -[[B]] +define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %neg.b = fsub float -0.0, %b + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %neg.b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_lo_hi: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]] +define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %neg.a = fsub float -0.0, %a + %neg.b = fsub float -0.0, %b + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.a, float %neg.b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_fabs_lo_fneg_hi: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]] +define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_fabs_lo_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fabs.a = call float @llvm.fabs.f32(float %a) + %neg.fabs.a = fsub float -0.0, %fabs.a + %neg.b = fsub float -0.0, %b + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.fabs.a, float %neg.b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) #1 +declare float @llvm.fabs.f32(float) #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 + + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll new file mode 100644 index 000000000000..c2b8f3cb28ca --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll @@ -0,0 +1,164 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope 
-check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32: +; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} +; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] +; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]] +; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[X]], [[VY]] +define amdgpu_kernel void @s_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 { + %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %y) + %r = bitcast <2 x i16> %result to i32 + store i32 %r, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_cvt_pknorm_u16_samereg_f32: +; GCN: s_load_dword [[X:s[0-9]+]] +; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]] +define amdgpu_kernel void @s_cvt_pknorm_u16_samereg_f32(i32 addrspace(1)* %out, float %x) #0 { + %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %x) + %r = bitcast <2 x i16> %result to i32 + store i32 %r, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[A]], [[B]] +; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @v_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float %b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_reg_imm: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0 +define amdgpu_kernel void @v_cvt_pknorm_u16_f32_reg_imm(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float 1.0) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_imm_reg: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, 1.0, [[A]] +; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, 1.0, [[A]] +define amdgpu_kernel void @v_cvt_pknorm_u16_f32_imm_reg(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 
@llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float 1.0, float %a) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_lo: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]] +define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %neg.a = fsub float -0.0, %a + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.a, float %b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_hi: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]] +define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %neg.b = fsub float -0.0, %b + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float %neg.b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_lo_hi: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]] +define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %neg.a = fsub float -0.0, %a + %neg.b = fsub float -0.0, %b + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.a, float %neg.b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: 
{{^}}v_cvt_pknorm_u16_f32_fneg_fabs_lo_fneg_hi: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]] +define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_fabs_lo_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fabs.a = call float @llvm.fabs.f32(float %a) + %neg.fabs.a = fsub float -0.0, %fabs.a + %neg.b = fsub float -0.0, %b + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.fabs.a, float %neg.b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) #1 +declare float @llvm.fabs.f32(float) #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 + + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/ARM/pr25838.ll b/test/CodeGen/ARM/pr25838.ll index 0aa95fd2d720..f3bb98f4260c 100644 --- a/test/CodeGen/ARM/pr25838.ll +++ b/test/CodeGen/ARM/pr25838.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s +; RUN: llc -verify-machineinstrs < %s ; PR25838 target triple = "armv7--linux-android" diff --git a/test/CodeGen/ARM/splitkit.ll b/test/CodeGen/ARM/splitkit.ll new file mode 100644 index 000000000000..d51d35174450 --- /dev/null +++ b/test/CodeGen/ARM/splitkit.ll @@ -0,0 +1,245 @@ +; RUN: llc -o - %s | FileCheck %s +; Make sure RegAllocGreedy/SplitKit do not produce invalid liveness information +; and crash when splitting a liverange twice and rematerializing each time. +; (Sorry for the testcase; this was ran through bugpoint and then manually +; reduced for several hours but is still big...) 
+target triple = "thumbv7-apple-ios" + +%struct.ham = type { %struct.wombat.0 } +%struct.wombat.0 = type { %struct.barney } +%struct.barney = type { %struct.snork.1 } +%struct.snork.1 = type { %struct.wobble.2 } +%struct.wobble.2 = type { %struct.blam } +%struct.blam = type { i32, i32, i8* } +%struct.ham.3 = type { %struct.pluto } +%struct.pluto = type { %struct.zot*, %struct.snork.5, %struct.wibble } +%struct.zot = type { %struct.blam.4* } +%struct.blam.4 = type <{ %struct.zot, %struct.blam.4*, %struct.zot*, i8, [3 x i8] }> +%struct.snork.5 = type { %struct.quux } +%struct.quux = type { %struct.zot } +%struct.wibble = type { %struct.widget } +%struct.widget = type { i32 } +%struct.bar = type { %struct.spam } +%struct.spam = type { %struct.zot*, %struct.wobble, %struct.zot.7 } +%struct.wobble = type { %struct.wibble.6 } +%struct.wibble.6 = type { %struct.zot } +%struct.zot.7 = type { %struct.ham.8 } +%struct.ham.8 = type { i32 } +%struct.hoge = type { %struct.ham, %struct.foo } +%struct.foo = type { float, float } +%struct.wombat = type { %struct.ham, float } +%struct.snork = type { %struct.ham.9, [11 x i8] } +%struct.ham.9 = type { i8 } + +@global = external global i8 +@global.1 = private constant [20 x i8] c"aaaaaaaaaaaaaaaaaa0\00" +@global.2 = external constant [27 x i8] +@global.3 = external global %struct.ham +@global.4 = external constant [47 x i8] +@global.5 = external constant [61 x i8] +@global.6 = external constant [40 x i8] +@global.7 = external constant [24 x i8] +@global.8 = external constant [20 x i8] +@global.9 = external global %struct.ham +@global.10 = external global %struct.ham +@global.11 = external global %struct.ham +@global.12 = external global %struct.ham +@global.13 = external global %struct.ham +@global.14 = external global %struct.ham +@global.15 = external global %struct.ham +@global.16 = external global %struct.ham +@global.17 = external global %struct.ham +@global.18 = external constant [35 x i8] +@global.19 = external global %struct.ham +@global.20 = external constant [53 x i8] +@global.21 = external global %struct.ham +@global.22 = external global %struct.ham +@global.23 = external global %struct.ham +@global.24 = external constant [32 x i8] +@global.25 = external global %struct.ham +@global.26 = external constant [47 x i8] +@global.27 = external global %struct.ham +@global.28 = external constant [45 x i8] +@global.29 = external global %struct.ham +@global.30 = external global %struct.ham +@global.31 = external constant [24 x i8] +@global.32 = external global %struct.ham +@global.33 = external global %struct.ham +@global.34 = external global %struct.ham +@global.35 = external global %struct.ham +@global.36 = external constant [27 x i8] +@global.37 = external global %struct.ham +@global.38 = external constant [10 x i8] +@global.39 = external global %struct.ham +@global.40 = external global %struct.ham +@global.41 = external global %struct.ham +@global.42 = external global %struct.ham +@global.43 = external global %struct.ham +@global.44 = external constant [41 x i8] +@global.45 = external global %struct.ham +@global.46 = external global %struct.ham +@global.47 = external global %struct.ham +@global.48 = external global %struct.ham +@global.49 = external constant [52 x i8] +@global.50 = external constant [47 x i8] +@global.51 = external global %struct.ham +@global.52 = external global %struct.ham +@global.53 = external global %struct.ham +@global.54 = external global %struct.ham +@global.55 = external global %struct.ham.3 +@global.56 = external global 
%struct.bar +@global.57 = external global i8 + +declare %struct.ham* @bar(%struct.ham* returned) + +declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) + +declare %struct.ham* @wobble(%struct.ham* returned, %struct.ham* ) + +declare i32 @quux(...) + +declare i8* @_Znwm(i32) + +declare i32 @wobble.58(%struct.pluto*, [1 x i32], %struct.ham* , %struct.hoge* ) + +declare i32 @widget(%struct.spam*, [1 x i32], %struct.ham* , %struct.wombat* ) + +; Just check we didn't crash and did output something... +; CHECK-LABEL: func: +; CHECK: trap +define internal void @func() section "__TEXT,__StaticInit,regular,pure_instructions" personality i32 (...)* @quux { + %tmp = tail call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.3 to i8*), i8* @global) #0 + %tmp2 = invoke %struct.ham* @wobble(%struct.ham* undef, %struct.ham* @global.9) + to label %bb14 unwind label %bbunwind + +bb14: + %tmp15 = getelementptr i8, i8* undef, i32 12 + store i8 0, i8* %tmp15 + %tmp16 = icmp eq i8 undef, 0 + br i1 %tmp16, label %bb28, label %bb18 + +bb18: + br i1 undef, label %bb21, label %bb29 + +bb21: + %tmp22 = call i8* @_Znwm(i32 16) + store i32 17, i32* getelementptr (%struct.ham, %struct.ham* @global.10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp23 = call i8* @_Znwm(i32 32) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([27 x i8], [27 x i8]* @global.2, i32 0, i32 0), i32 26, i32 1, i1 false) + store i32 33, i32* getelementptr (%struct.ham, %struct.ham* @global.11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + store i32 23, i32* getelementptr (%struct.ham, %struct.ham* @global.11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([24 x i8], [24 x i8]* @global.7, i32 0, i32 0), i32 23, i32 1, i1 false) + %tmp24 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.11 to i8*), i8* @global) #0 + store i32 49, i32* getelementptr (%struct.ham, %struct.ham* @global.12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + store i32 37, i32* getelementptr (%struct.ham, %struct.ham* @global.13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.14 to i8*), i8 0, i32 12, i32 1, i1 false) + %tmp25 = call i8* @_Znwm(i32 48) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp25, i8* align 1 getelementptr ([40 x i8], [40 x i8]* @global.6, i32 0, i32 0), i32 39, i32 1, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([47 x i8], [47 x i8]* @global.4, i32 0, i32 0), i32 46, i32 1, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([61 x i8], [61 x i8]* @global.5, i32 0, i32 0), i32 60, i32 1, i1 false) + %tmp26 = call i8* @_Znwm(i32 48) + store i32 65, i32* getelementptr (%struct.ham, %struct.ham* @global.15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp27 = icmp eq i8 undef, 0 + br i1 %tmp27, label %bb30, label %bb33 + +bb28: + call void @llvm.trap() + unreachable + +bb29: + call void @llvm.trap() + unreachable + +bb30: + %tmp31 = icmp eq i32 undef, 37 + br i1 %tmp31, label %bb32, label %bb30 + +bb32: + store i8 1, i8* @global.57 + br label %bb33 + +bb33: + %tmp34 = call i8* @_Znwm(i32 32) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([20 x 
i8], [20 x i8]* @global.1, i32 0, i32 0), i32 19, i32 1, i1 false) + store i32 17, i32* getelementptr (%struct.ham, %struct.ham* @global.16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + store i32 65, i32* getelementptr (%struct.ham, %struct.ham* @global.17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([35 x i8], [35 x i8]* @global.18, i32 0, i32 0), i32 34, i32 1, i1 false) + store i32 65, i32* getelementptr (%struct.ham, %struct.ham* @global.19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([53 x i8], [53 x i8]* @global.20, i32 0, i32 0), i32 52, i32 1, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([20 x i8], [20 x i8]* @global.8, i32 0, i32 0), i32 19, i32 1, i1 false) + store i32 37, i32* getelementptr (%struct.ham, %struct.ham* @global.21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + %tmp35 = call i8* @_Znwm(i32 32) + store i8 16, i8* bitcast (%struct.ham* @global.22 to i8*) + %tmp36 = call i8* @_Znwm(i32 32) + store i32 31, i32* getelementptr (%struct.ham, %struct.ham* @global.23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp36, i8* align 1 getelementptr ([32 x i8], [32 x i8]* @global.24, i32 0, i32 0), i32 31, i32 1, i1 false) + %tmp37 = getelementptr i8, i8* %tmp36, i32 31 + store i8 0, i8* %tmp37 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([47 x i8], [47 x i8]* @global.26, i32 0, i32 0), i32 46, i32 1, i1 false) + %tmp38 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.25 to i8*), i8* @global) #0 + %tmp39 = call i8* @_Znwm(i32 48) + store i32 44, i32* getelementptr (%struct.ham, %struct.ham* @global.27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp39, i8* align 1 getelementptr ([45 x i8], [45 x i8]* @global.28, i32 0, i32 0), i32 44, i32 1, i1 false) + %tmp40 = getelementptr i8, i8* %tmp39, i32 44 + store i8 0, i8* %tmp40 + call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.29 to i8*), i8 0, i32 12, i32 1, i1 false) + %tmp41 = call i8* @_Znwm(i32 32) + store i32 23, i32* getelementptr (%struct.ham, %struct.ham* @global.30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp41, i8* align 1 getelementptr ([24 x i8], [24 x i8]* @global.31, i32 0, i32 0), i32 23, i32 1, i1 false) + %tmp42 = getelementptr i8, i8* %tmp41, i32 23 + store i8 0, i8* %tmp42 + call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.32 to i8*), i8 0, i32 12, i32 1, i1 false) + store i8 16, i8* bitcast (%struct.ham* @global.32 to i8*) + %tmp43 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.33 to i8*), i8* @global) #0 + %tmp44 = call i8* @_Znwm(i32 16) + call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.34 to i8*), i8 0, i32 12, i32 1, i1 false) + call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.9 to i8*), i8 0, i32 12, i32 1, i1 false) + %tmp45 = call i8* @_Znwm(i32 32) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp45, i8* align 1 getelementptr ([27 x i8], [27 x i8]* @global.36, i32 0, i32 0), i32 26, i32 1, 
i1 false) + call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.37 to i8*), i8 0, i32 12, i32 1, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 getelementptr (%struct.snork, %struct.snork* bitcast (%struct.ham* @global.37 to %struct.snork*), i32 0, i32 1, i32 0), i8* align 1 getelementptr ([10 x i8], [10 x i8]* @global.38, i32 0, i32 0), i32 9, i32 1, i1 false) + store i32 17, i32* getelementptr (%struct.ham, %struct.ham* @global.39, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp46 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.40 to i8*), i8* @global) #0 + %tmp47 = call i8* @_Znwm(i32 32) + %tmp48 = getelementptr i8, i8* %tmp47, i32 21 + store i8 0, i8* %tmp48 + store i32 33, i32* getelementptr (%struct.ham, %struct.ham* @global.41, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + store i32 15, i32* getelementptr (%struct.ham, %struct.ham* @global.42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + %tmp49 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.43 to i8*), i8* @global) #0 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([41 x i8], [41 x i8]* @global.44, i32 0, i32 0), i32 40, i32 1, i1 false) + %tmp50 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.45 to i8*), i8* @global) #0 + %tmp51 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.46 to i8*), i8* @global) #0 + %tmp52 = call i8* @_Znwm(i32 32) + store i8* %tmp52, i8** getelementptr (%struct.ham, %struct.ham* @global.47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([52 x i8], [52 x i8]* @global.49, i32 0, i32 0), i32 51, i32 1, i1 false) + %tmp53 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.48 to i8*), i8* @global) #0 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([47 x i8], [47 x i8]* @global.50, i32 0, i32 0), i32 46, i32 1, i1 false) + store i32 33, i32* getelementptr (%struct.ham, %struct.ham* @global.51, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + store i32 37, i32* getelementptr (%struct.ham, %struct.ham* @global.52, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + %tmp54 = invoke %struct.ham* @wobble(%struct.ham* undef, %struct.ham* @global.54) + to label %bb58 unwind label %bbunwind + +bb58: + %tmp59 = invoke i32 @wobble.58(%struct.pluto* getelementptr (%struct.ham.3, %struct.ham.3* @global.55, i32 0, i32 0), [1 x i32] [i32 ptrtoint (%struct.zot* getelementptr (%struct.ham.3, %struct.ham.3* @global.55, i32 0, i32 0, i32 1, i32 0, i32 0) to i32)], %struct.ham* undef, %struct.hoge* undef) + to label %bb71 unwind label %bbunwind + +bb71: + %tmp72 = invoke i32 @widget(%struct.spam* getelementptr (%struct.bar, %struct.bar* @global.56, i32 0, i32 0), [1 x i32] [i32 ptrtoint (%struct.zot* getelementptr (%struct.bar, %struct.bar* @global.56, i32 0, i32 0, i32 1, i32 0, i32 0) to i32)], %struct.ham* undef, %struct.wombat* undef) + to label %bb73 unwind label %bbunwind + +bb73: + ret void + +bbunwind: + %tmp75 = landingpad { i8*, i32 } + cleanup + resume { i8*, i32 } undef +} + +declare void @llvm.trap() 
+ +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* , i8* , i32, i32, i1) + +declare void @llvm.memset.p0i8.i32(i8* , i8, i32, i32, i1) + +attributes #0 = { nounwind } diff --git a/test/CodeGen/Thumb/stm-scavenging.ll b/test/CodeGen/Thumb/stm-scavenging.ll new file mode 100644 index 000000000000..3ed5763f2955 --- /dev/null +++ b/test/CodeGen/Thumb/stm-scavenging.ll @@ -0,0 +1,46 @@ +; RUN: llc < %s | FileCheck %s +target triple = "thumbv6---gnueabi" + +; Use STM to save the three registers +; CHECK-LABEL: use_stm: +; CHECK: .save {r7, lr} +; CHECK: .setfp r7, sp +; CHECK: stm r3!, {r0, r1, r2} +; CHECK: bl throws_1 +define void @use_stm(i32 %a, i32 %b, i32 %c, i32* %d) local_unnamed_addr noreturn "no-frame-pointer-elim"="true" { +entry: + %arrayidx = getelementptr inbounds i32, i32* %d, i32 2 + store i32 %a, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %d, i32 3 + store i32 %b, i32* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %d, i32 4 + store i32 %c, i32* %arrayidx2, align 4 + tail call void @throws_1(i32 %a, i32 %b, i32 %c) noreturn + unreachable +} + +; Don't use STM: there is no available register to store +; the address. We could transform this with some extra math, but +; that currently isn't implemented. +; CHECK-LABEL: no_stm: +; CHECK: .save {r7, lr} +; CHECK: .setfp r7, sp +; CHECK: str r0, +; CHECK: str r1, +; CHECK: str r2, +; CHECK: bl throws_2 +define void @no_stm(i32 %a, i32 %b, i32 %c, i32* %d) local_unnamed_addr noreturn "no-frame-pointer-elim"="true" { +entry: + %arrayidx = getelementptr inbounds i32, i32* %d, i32 2 + store i32 %a, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %d, i32 3 + store i32 %b, i32* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %d, i32 4 + store i32 %c, i32* %arrayidx2, align 4 + tail call void @throws_2(i32 %a, i32 %b, i32 %c, i32* %d) noreturn + unreachable +} + + +declare void @throws_1(i32, i32, i32) noreturn +declare void @throws_2(i32, i32, i32, i32*) noreturn diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index 50de773af001..80127f66bdfe 100644 --- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -5,59 +5,6 @@ ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c -define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 { -; X32-LABEL: test_mm512_kunpackb: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: andl $-64, %esp -; X32-NEXT: subl $64, %esp -; X32-NEXT: vmovdqa64 136(%ebp), %zmm3 -; X32-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; X32-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1 -; X32-NEXT: kunpckbw %k0, %k1, %k1 -; X32-NEXT: vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1} -; X32-NEXT: kmovw %k0, %eax -; X32-NEXT: movzwl %ax, %eax -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm512_kunpackb: -; X64: # %bb.0: # %entry -; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 -; X64-NEXT: kunpckbw %k0, %k1, %k1 -; X64-NEXT: vpcmpneqd %zmm5, %zmm4, %k0 {%k1} -; X64-NEXT: kmovw %k0, %eax -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: vzeroupper -; X64-NEXT: retq -entry: 
- %0 = bitcast <8 x i64> %__A to <16 x i32> - %1 = bitcast <8 x i64> %__B to <16 x i32> - %2 = icmp ne <16 x i32> %0, %1 - %3 = bitcast <16 x i1> %2 to i16 - %4 = bitcast <8 x i64> %__C to <16 x i32> - %5 = bitcast <8 x i64> %__D to <16 x i32> - %6 = icmp ne <16 x i32> %4, %5 - %7 = bitcast <16 x i1> %6 to i16 - %8 = and i16 %7, 255 - %shl.i = shl i16 %3, 8 - %or.i = or i16 %8, %shl.i - %9 = bitcast <8 x i64> %__E to <16 x i32> - %10 = bitcast <8 x i64> %__F to <16 x i32> - %11 = icmp ne <16 x i32> %9, %10 - %12 = bitcast i16 %or.i to <16 x i1> - %13 = and <16 x i1> %11, %12 - %14 = bitcast <16 x i1> %13 to i16 - ret i16 %14 -} - define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) { ; X32-LABEL: test_mm512_shuffle_f32x4: ; X32: # %bb.0: # %entry diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index f3ca0644e463..378dbda2dc0a 100644 --- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -1,20 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone - -define i16 @unpckbw_test(i16 %a0, i16 %a1) { -; CHECK-LABEL: unpckbw_test: -; CHECK: ## %bb.0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: shll $8, %esi -; CHECK-NEXT: orl %esi, %eax -; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax -; CHECK-NEXT: retq - %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1) - ret i16 %res -} - define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512: ; CHECK: ## %bb.0: diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index 5faa202c30f3..628199d4ac9e 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -96,6 +96,21 @@ define i16 @test_kor(i16 %a0, i16 %a1) { ret i16 %t2 } +declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone + +define i16 @unpckbw_test(i16 %a0, i16 %a1) { +; CHECK-LABEL: unpckbw_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovw %edi, %k0 +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: kunpckbw %k1, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax +; CHECK-NEXT: retq + %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1) + ret i16 %res +} + declare i16 @llvm.x86.avx512.kxnor.w(i16, i16) nounwind readnone ; TODO: the two kxnor instructions here a no op and should be elimintaed, ; probably by FoldConstantArithmetic in SelectionDAG. diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index 4877157d911d..d112577a6104 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -2775,3 +2775,99 @@ define i8 @test_v8i1_mul(i8 %x, i8 %y) { %ret = bitcast <8 x i1> %m2 to i8 ret i8 %ret } + +; Make sure we don't emit a ktest for signed comparisons. 
+define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) { +; KNL-LABEL: ktest_signed: +; KNL: ## %bb.0: +; KNL-NEXT: pushq %rax +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testw %ax, %ax +; KNL-NEXT: jle LBB63_1 +; KNL-NEXT: ## %bb.2: ## %bb.2 +; KNL-NEXT: popq %rax +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; KNL-NEXT: LBB63_1: ## %bb.1 +; KNL-NEXT: vzeroupper +; KNL-NEXT: callq _foo +; KNL-NEXT: popq %rax +; KNL-NEXT: retq +; +; SKX-LABEL: ktest_signed: +; SKX: ## %bb.0: +; SKX-NEXT: pushq %rax +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: testw %ax, %ax +; SKX-NEXT: jle LBB63_1 +; SKX-NEXT: ## %bb.2: ## %bb.2 +; SKX-NEXT: popq %rax +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq +; SKX-NEXT: LBB63_1: ## %bb.1 +; SKX-NEXT: vzeroupper +; SKX-NEXT: callq _foo +; SKX-NEXT: popq %rax +; SKX-NEXT: retq +; +; AVX512BW-LABEL: ktest_signed: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: pushq %rax +; AVX512BW-NEXT: .cfi_def_cfa_offset 16 +; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testw %ax, %ax +; AVX512BW-NEXT: jle LBB63_1 +; AVX512BW-NEXT: ## %bb.2: ## %bb.2 +; AVX512BW-NEXT: popq %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; AVX512BW-NEXT: LBB63_1: ## %bb.1 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: callq _foo +; AVX512BW-NEXT: popq %rax +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: ktest_signed: +; AVX512DQ: ## %bb.0: +; AVX512DQ-NEXT: pushq %rax +; AVX512DQ-NEXT: .cfi_def_cfa_offset 16 +; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: testw %ax, %ax +; AVX512DQ-NEXT: jle LBB63_1 +; AVX512DQ-NEXT: ## %bb.2: ## %bb.2 +; AVX512DQ-NEXT: popq %rax +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; AVX512DQ-NEXT: LBB63_1: ## %bb.1 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: callq _foo +; AVX512DQ-NEXT: popq %rax +; AVX512DQ-NEXT: retq + %a = icmp eq <16 x i32> %x, zeroinitializer + %b = icmp eq <16 x i32> %y, zeroinitializer + %c = and <16 x i1> %a, %b + %d = bitcast <16 x i1> %c to i16 + %e = icmp sgt i16 %d, 0 + br i1 %e, label %bb.2, label %bb.1 +bb.1: + call void @foo() + br label %bb.2 +bb.2: + ret void +} +declare void @foo() + diff --git a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll index 1e754be6fe49..a56111f3453e 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll @@ -4,117 +4,6 @@ ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c -define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) { -; X32-LABEL: test_mm512_kunpackd: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: andl $-64, %esp -; X32-NEXT: subl $64, %esp -; X32-NEXT: vmovdqa64 136(%ebp), %zmm3 -; X32-NEXT: vmovdqa64 72(%ebp), %zmm4 -; X32-NEXT: vmovdqa64 
8(%ebp), %zmm5 -; X32-NEXT: vpcmpneqb %zmm0, %zmm1, %k0 -; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; X32-NEXT: vpcmpneqb %zmm5, %zmm2, %k0 -; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 -; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; X32-NEXT: kunpckdq %k0, %k1, %k1 -; X32-NEXT: vpcmpneqb %zmm3, %zmm4, %k0 {%k1} -; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm512_kunpackd: -; X64: # %bb.0: # %entry -; X64-NEXT: vpcmpneqb %zmm0, %zmm1, %k0 -; X64-NEXT: vpcmpneqb %zmm3, %zmm2, %k1 -; X64-NEXT: kunpckdq %k0, %k1, %k1 -; X64-NEXT: vpcmpneqb %zmm5, %zmm4, %k0 {%k1} -; X64-NEXT: kmovq %k0, %rax -; X64-NEXT: vzeroupper -; X64-NEXT: retq -entry: - %0 = bitcast <8 x i64> %__B to <64 x i8> - %1 = bitcast <8 x i64> %__A to <64 x i8> - %2 = icmp ne <64 x i8> %0, %1 - %3 = bitcast <64 x i1> %2 to i64 - %4 = bitcast <8 x i64> %__C to <64 x i8> - %5 = bitcast <8 x i64> %__D to <64 x i8> - %6 = icmp ne <64 x i8> %4, %5 - %7 = bitcast <64 x i1> %6 to i64 - %and.i = and i64 %7, 4294967295 - %shl.i = shl i64 %3, 32 - %or.i = or i64 %and.i, %shl.i - %8 = bitcast <8 x i64> %__E to <64 x i8> - %9 = bitcast <8 x i64> %__F to <64 x i8> - %10 = icmp ne <64 x i8> %8, %9 - %11 = bitcast i64 %or.i to <64 x i1> - %12 = and <64 x i1> %10, %11 - %13 = bitcast <64 x i1> %12 to i64 - ret i64 %13 -} - -define i32 @test_mm512_kunpackw(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) { -; X32-LABEL: test_mm512_kunpackw: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: andl $-64, %esp -; X32-NEXT: subl $64, %esp -; X32-NEXT: vmovdqa64 136(%ebp), %zmm3 -; X32-NEXT: vpcmpneqw %zmm0, %zmm1, %k0 -; X32-NEXT: vpcmpneqw 8(%ebp), %zmm2, %k1 -; X32-NEXT: kunpckwd %k0, %k1, %k1 -; X32-NEXT: vpcmpneqw 72(%ebp), %zmm3, %k0 {%k1} -; X32-NEXT: kmovd %k0, %eax -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm512_kunpackw: -; X64: # %bb.0: # %entry -; X64-NEXT: vpcmpneqw %zmm0, %zmm1, %k0 -; X64-NEXT: vpcmpneqw %zmm3, %zmm2, %k1 -; X64-NEXT: kunpckwd %k0, %k1, %k1 -; X64-NEXT: vpcmpneqw %zmm5, %zmm4, %k0 {%k1} -; X64-NEXT: kmovd %k0, %eax -; X64-NEXT: vzeroupper -; X64-NEXT: retq -entry: - %0 = bitcast <8 x i64> %__B to <32 x i16> - %1 = bitcast <8 x i64> %__A to <32 x i16> - %2 = icmp ne <32 x i16> %0, %1 - %3 = bitcast <32 x i1> %2 to i32 - %4 = bitcast <8 x i64> %__C to <32 x i16> - %5 = bitcast <8 x i64> %__D to <32 x i16> - %6 = icmp ne <32 x i16> %4, %5 - %7 = bitcast <32 x i1> %6 to i32 - %and.i = and i32 %7, 65535 - %shl.i = shl i32 %3, 16 - %or.i = or i32 %and.i, %shl.i - %8 = bitcast <8 x i64> %__E to <32 x i16> - %9 = bitcast <8 x i64> %__F to <32 x i16> - %10 = icmp ne <32 x i16> %8, %9 - %11 = bitcast i32 %or.i to <32 x i1> - %12 = and <32 x i1> %10, %11 - %13 = bitcast <32 x i1> %12 to i32 - ret i32 %13 -} - - define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A) { ; X32-LABEL: test_mm512_mask_set1_epi8: ; X32: # %bb.0: # %entry @@ -189,46 +78,19 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: movb %ch, %al ; X32-NEXT: kmovd %eax, %k2 ; X32-NEXT: kxorq %k2, %k1, %k1 
-; X32-NEXT: kshiftlq $63, %k1, %k1 -; X32-NEXT: kshiftrq $55, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k0 -; X32-NEXT: kshiftrq $9, %k0, %k1 ; X32-NEXT: andb $2, %al ; X32-NEXT: shrb %al ; X32-NEXT: kmovd %eax, %k2 -; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: kshiftlq $63, %k1, %k1 -; X32-NEXT: kshiftrq $54, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k0 -; X32-NEXT: kshiftrq $10, %k0, %k1 ; X32-NEXT: movb %ch, %al ; X32-NEXT: andb $15, %al ; X32-NEXT: movl %eax, %edx ; X32-NEXT: shrb $2, %dl -; X32-NEXT: kmovd %edx, %k2 -; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: kshiftlq $63, %k1, %k1 -; X32-NEXT: kshiftrq $53, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k0 -; X32-NEXT: kshiftrq $11, %k0, %k1 +; X32-NEXT: kmovd %edx, %k3 ; X32-NEXT: shrb $3, %al -; X32-NEXT: kmovd %eax, %k2 -; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrl $12, %eax -; X32-NEXT: andl $15, %eax -; X32-NEXT: kmovd %eax, %k2 +; X32-NEXT: kmovd %eax, %k4 ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: shrl $13, %eax ; X32-NEXT: andb $1, %al -; X32-NEXT: kmovd %eax, %k3 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrl $14, %eax -; X32-NEXT: andl $3, %eax -; X32-NEXT: kmovd %eax, %k4 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrl $15, %eax -; X32-NEXT: andl $1, %eax ; X32-NEXT: kmovd %eax, %k5 ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: shrl $16, %edx @@ -243,25 +105,52 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kmovd %eax, %k7 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: kshiftrq $55, %k1, %k1 +; X32-NEXT: kxorq %k0, %k1, %k0 +; X32-NEXT: kshiftrq $9, %k0, %k1 +; X32-NEXT: kxorq %k2, %k1, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: kshiftrq $54, %k1, %k1 +; X32-NEXT: kxorq %k0, %k1, %k0 +; X32-NEXT: kshiftrq $10, %k0, %k1 +; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: kshiftrq $53, %k1, %k1 +; X32-NEXT: kxorq %k0, %k1, %k0 +; X32-NEXT: kshiftrq $11, %k0, %k1 +; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $52, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $12, %k0, %k1 +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shrl $12, %esi +; X32-NEXT: andl $15, %esi +; X32-NEXT: kmovd %esi, %k2 ; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $51, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $13, %k0, %k1 -; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: kxorq %k5, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $50, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $14, %k0, %k1 -; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shrl $14, %esi +; X32-NEXT: andl $3, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $49, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $15, %k0, %k1 -; X32-NEXT: kxorq %k5, %k1, %k1 +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shrl $15, %esi +; X32-NEXT: andl $1, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $48, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -494,22 +383,14 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $43, %k0, %k1 ; X32-NEXT: kxorq %k4, %k1, %k1 -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $12, %esi -; X32-NEXT: andl $15, %esi -; 
X32-NEXT: kmovd %esi, %k2 -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $14, %esi -; X32-NEXT: andl $3, %esi -; X32-NEXT: kmovd %esi, %k3 -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $15, %esi -; X32-NEXT: andl $1, %esi -; X32-NEXT: kmovd %esi, %k4 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $20, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $44, %k0, %k1 +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $12, %esi +; X32-NEXT: andl $15, %esi +; X32-NEXT: kmovd %esi, %k2 ; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $19, %k1, %k1 @@ -520,12 +401,20 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kshiftrq $18, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $46, %k0, %k1 -; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $14, %esi +; X32-NEXT: andl $3, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $17, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $47, %k0, %k1 -; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $15, %esi +; X32-NEXT: andl $1, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $16, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -551,8 +440,8 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $12, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k4 -; X32-NEXT: kshiftrq $52, %k4, %k0 +; X32-NEXT: kxorq %k0, %k1, %k3 +; X32-NEXT: kshiftrq $52, %k3, %k0 ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: shrb $4, %dl ; X32-NEXT: kmovd %edx, %k1 @@ -576,19 +465,19 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: andb $15, %cl ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: shrb $2, %dl -; X32-NEXT: kmovd %edx, %k3 +; X32-NEXT: kmovd %edx, %k4 ; X32-NEXT: kshiftlq $63, %k5, %k5 ; X32-NEXT: kshiftrq $11, %k5, %k5 -; X32-NEXT: kxorq %k4, %k5, %k4 -; X32-NEXT: kshiftrq $53, %k4, %k5 +; X32-NEXT: kxorq %k3, %k5, %k3 +; X32-NEXT: kshiftrq $53, %k3, %k5 ; X32-NEXT: kxorq %k6, %k5, %k5 ; X32-NEXT: kshiftlq $63, %k5, %k5 ; X32-NEXT: kshiftrq $10, %k5, %k5 -; X32-NEXT: kxorq %k4, %k5, %k5 -; X32-NEXT: kshiftrq $54, %k5, %k4 -; X32-NEXT: kxorq %k7, %k4, %k6 +; X32-NEXT: kxorq %k3, %k5, %k5 +; X32-NEXT: kshiftrq $54, %k5, %k3 +; X32-NEXT: kxorq %k7, %k3, %k6 ; X32-NEXT: shrb $3, %cl -; X32-NEXT: kmovd %ecx, %k4 +; X32-NEXT: kmovd %ecx, %k3 ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: shrl $29, %ecx ; X32-NEXT: andb $1, %cl @@ -603,12 +492,6 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kxorq %k5, %k0, %k0 ; X32-NEXT: kshiftrq $56, %k0, %k5 ; X32-NEXT: kxorq %k1, %k5, %k1 -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl $28, %ecx -; X32-NEXT: kmovd %ecx, %k5 -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl $30, %ecx -; X32-NEXT: kmovd %ecx, %k6 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $7, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -618,17 +501,20 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kshiftrq $6, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $58, %k0, %k1 -; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: kxorq %k4, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $5, %k1, 
%k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $59, %k0, %k1 -; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: kxorq %k3, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $4, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $60, %k0, %k1 -; X32-NEXT: kxorq %k5, %k1, %k1 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $28, %ecx +; X32-NEXT: kmovd %ecx, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $3, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -638,7 +524,10 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kshiftrq $2, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $62, %k0, %k1 -; X32-NEXT: kxorq %k6, %k1, %k1 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $30, %ecx +; X32-NEXT: kmovd %ecx, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: shrl $31, %eax ; X32-NEXT: kmovd %eax, %k2 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al @@ -743,46 +632,19 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: movb %ch, %al ; X32-NEXT: kmovd %eax, %k2 ; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: kshiftlq $63, %k1, %k1 -; X32-NEXT: kshiftrq $55, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k0 -; X32-NEXT: kshiftrq $9, %k0, %k1 ; X32-NEXT: andb $2, %al ; X32-NEXT: shrb %al ; X32-NEXT: kmovd %eax, %k2 -; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: kshiftlq $63, %k1, %k1 -; X32-NEXT: kshiftrq $54, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k0 -; X32-NEXT: kshiftrq $10, %k0, %k1 ; X32-NEXT: movb %ch, %al ; X32-NEXT: andb $15, %al ; X32-NEXT: movl %eax, %edx ; X32-NEXT: shrb $2, %dl -; X32-NEXT: kmovd %edx, %k2 -; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: kshiftlq $63, %k1, %k1 -; X32-NEXT: kshiftrq $53, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k0 -; X32-NEXT: kshiftrq $11, %k0, %k1 +; X32-NEXT: kmovd %edx, %k3 ; X32-NEXT: shrb $3, %al -; X32-NEXT: kmovd %eax, %k2 -; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrl $12, %eax -; X32-NEXT: andl $15, %eax -; X32-NEXT: kmovd %eax, %k2 +; X32-NEXT: kmovd %eax, %k4 ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: shrl $13, %eax ; X32-NEXT: andb $1, %al -; X32-NEXT: kmovd %eax, %k3 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrl $14, %eax -; X32-NEXT: andl $3, %eax -; X32-NEXT: kmovd %eax, %k4 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrl $15, %eax -; X32-NEXT: andl $1, %eax ; X32-NEXT: kmovd %eax, %k5 ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: shrl $16, %edx @@ -797,25 +659,52 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kmovd %eax, %k7 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: kshiftrq $55, %k1, %k1 +; X32-NEXT: kxorq %k0, %k1, %k0 +; X32-NEXT: kshiftrq $9, %k0, %k1 +; X32-NEXT: kxorq %k2, %k1, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: kshiftrq $54, %k1, %k1 +; X32-NEXT: kxorq %k0, %k1, %k0 +; X32-NEXT: kshiftrq $10, %k0, %k1 +; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: kshiftrq $53, %k1, %k1 +; X32-NEXT: kxorq %k0, %k1, %k0 +; X32-NEXT: kshiftrq $11, %k0, %k1 +; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $52, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $12, %k0, %k1 +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shrl $12, %esi +; X32-NEXT: andl $15, %esi +; X32-NEXT: kmovd %esi, %k2 ; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $51, %k1, %k1 ; X32-NEXT: 
kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $13, %k0, %k1 -; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: kxorq %k5, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $50, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $14, %k0, %k1 -; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shrl $14, %esi +; X32-NEXT: andl $3, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $49, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $15, %k0, %k1 -; X32-NEXT: kxorq %k5, %k1, %k1 +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shrl $15, %esi +; X32-NEXT: andl $1, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $48, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -1048,22 +937,14 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $43, %k0, %k1 ; X32-NEXT: kxorq %k4, %k1, %k1 -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $12, %esi -; X32-NEXT: andl $15, %esi -; X32-NEXT: kmovd %esi, %k2 -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $14, %esi -; X32-NEXT: andl $3, %esi -; X32-NEXT: kmovd %esi, %k3 -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $15, %esi -; X32-NEXT: andl $1, %esi -; X32-NEXT: kmovd %esi, %k4 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $20, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $44, %k0, %k1 +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $12, %esi +; X32-NEXT: andl $15, %esi +; X32-NEXT: kmovd %esi, %k2 ; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $19, %k1, %k1 @@ -1074,12 +955,20 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kshiftrq $18, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $46, %k0, %k1 -; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $14, %esi +; X32-NEXT: andl $3, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $17, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $47, %k0, %k1 -; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $15, %esi +; X32-NEXT: andl $1, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $16, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -1105,8 +994,8 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $12, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k4 -; X32-NEXT: kshiftrq $52, %k4, %k0 +; X32-NEXT: kxorq %k0, %k1, %k3 +; X32-NEXT: kshiftrq $52, %k3, %k0 ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: shrb $4, %dl ; X32-NEXT: kmovd %edx, %k1 @@ -1130,19 +1019,19 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: andb $15, %cl ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: shrb $2, %dl -; X32-NEXT: kmovd %edx, %k3 +; X32-NEXT: kmovd %edx, %k4 ; X32-NEXT: kshiftlq $63, %k5, %k5 ; X32-NEXT: kshiftrq $11, %k5, %k5 -; X32-NEXT: kxorq %k4, %k5, %k4 -; X32-NEXT: kshiftrq $53, %k4, %k5 +; X32-NEXT: kxorq %k3, %k5, %k3 +; X32-NEXT: kshiftrq $53, %k3, %k5 ; X32-NEXT: kxorq %k6, %k5, %k5 ; X32-NEXT: kshiftlq $63, %k5, %k5 ; X32-NEXT: kshiftrq $10, %k5, %k5 -; X32-NEXT: kxorq %k4, %k5, %k5 -; X32-NEXT: 
kshiftrq $54, %k5, %k4 -; X32-NEXT: kxorq %k7, %k4, %k6 +; X32-NEXT: kxorq %k3, %k5, %k5 +; X32-NEXT: kshiftrq $54, %k5, %k3 +; X32-NEXT: kxorq %k7, %k3, %k6 ; X32-NEXT: shrb $3, %cl -; X32-NEXT: kmovd %ecx, %k4 +; X32-NEXT: kmovd %ecx, %k3 ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: shrl $29, %ecx ; X32-NEXT: andb $1, %cl @@ -1157,12 +1046,6 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kxorq %k5, %k0, %k0 ; X32-NEXT: kshiftrq $56, %k0, %k5 ; X32-NEXT: kxorq %k1, %k5, %k1 -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl $28, %ecx -; X32-NEXT: kmovd %ecx, %k5 -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl $30, %ecx -; X32-NEXT: kmovd %ecx, %k6 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $7, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -1172,17 +1055,20 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kshiftrq $6, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $58, %k0, %k1 -; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: kxorq %k4, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $5, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $59, %k0, %k1 -; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: kxorq %k3, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $4, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $60, %k0, %k1 -; X32-NEXT: kxorq %k5, %k1, %k1 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $28, %ecx +; X32-NEXT: kmovd %ecx, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $3, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -1192,7 +1078,10 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kshiftrq $2, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $62, %k0, %k1 -; X32-NEXT: kxorq %k6, %k1, %k1 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $30, %ecx +; X32-NEXT: kmovd %ecx, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: shrl $31, %eax ; X32-NEXT: kmovd %eax, %k2 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll index f19e09758f12..13aca464b9e2 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -2,46 +2,6 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32 -declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32) - -define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) { -; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: movzwl %di, %eax -; AVX512BW-NEXT: shll $16, %esi -; AVX512BW-NEXT: orl %esi, %eax -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: shll $16, %eax -; AVX512F-32-NEXT: orl %ecx, %eax -; AVX512F-32-NEXT: retl - %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1) - ret i32 %res -} - -declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64) - -define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) { -; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: movl %edi, %eax -; AVX512BW-NEXT: shlq $32, %rsi -; 
AVX512BW-NEXT: orq %rsi, %rax -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: retl - %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1) - ret i64 %res -} - declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64) define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) { diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 2fa7c2c5b8a8..7b5cc5feff0c 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1455,6 +1455,55 @@ define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> ret <8 x i64> %res2 } +declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32) + +define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) { +; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k0 +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: kunpckwd %k1, %k0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckwd %k0, %k1, %k0 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: retl + %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1) + ret i32 %res +} + +declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64) + +define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) { +; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovq %rdi, %k0 +; AVX512BW-NEXT: kmovq %rsi, %k1 +; AVX512BW-NEXT: kunpckdq %k1, %k0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: subl $12, %esp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k0 +; AVX512F-32-NEXT: kmovq %k0, (%esp) +; AVX512F-32-NEXT: movl (%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: retl + %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1) + ret i64 %res +} + declare i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8>) define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) { diff --git a/test/CodeGen/X86/domain-reassignment.mir b/test/CodeGen/X86/domain-reassignment.mir index 3cb4b5dd1396..7da9b083c22e 100644 --- a/test/CodeGen/X86/domain-reassignment.mir +++ b/test/CodeGen/X86/domain-reassignment.mir @@ -1,22 +1,23 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -run-pass x86-domain-reassignment -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq -o - %s | FileCheck %s --- | ; ModuleID = '../test/CodeGen/X86/gpr-to-mask.ll' source_filename = "../test/CodeGen/X86/gpr-to-mask.ll" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" - + define void @test_fcmp_storefloat(i1 %cond, float* %fptr, float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) #0 { entry: br i1 %cond, label %if, label %else - + if: ; preds = %entry %cmp1 = fcmp oeq float %f3, %f4 br label %exit - + else: ; 
preds = %entry %cmp2 = fcmp oeq float %f5, %f6 br label %exit - + exit: ; preds = %else, %if %val = phi i1 [ %cmp1, %if ], [ %cmp2, %else ] %selected = select i1 %val, float %f1, float %f2 @@ -48,14 +49,13 @@ ... --- name: test_fcmp_storefloat -# CHECK-LABEL: name: test_fcmp_storefloat alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr8, preferred-register: '' } - { id: 1, class: gr8, preferred-register: '' } - { id: 2, class: gr8, preferred-register: '' } @@ -79,7 +79,7 @@ registers: - { id: 20, class: fr128, preferred-register: '' } - { id: 21, class: fr128, preferred-register: '' } - { id: 22, class: fr32x, preferred-register: '' } -liveins: +liveins: - { reg: '%edi', virtual-reg: '%3' } - { reg: '%rsi', virtual-reg: '%4' } - { reg: '%xmm0', virtual-reg: '%5' } @@ -88,7 +88,7 @@ liveins: - { reg: '%xmm3', virtual-reg: '%8' } - { reg: '%xmm4', virtual-reg: '%9' } - { reg: '%xmm5', virtual-reg: '%10' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -105,14 +105,51 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | + ; CHECK-LABEL: name: test_fcmp_storefloat + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: liveins: %edi, %rsi, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 + ; CHECK: [[COPY:%[0-9]+]]:fr32x = COPY %xmm5 + ; CHECK: [[COPY1:%[0-9]+]]:fr32x = COPY %xmm4 + ; CHECK: [[COPY2:%[0-9]+]]:fr32x = COPY %xmm3 + ; CHECK: [[COPY3:%[0-9]+]]:fr32x = COPY %xmm2 + ; CHECK: [[COPY4:%[0-9]+]]:fr32x = COPY %xmm1 + ; CHECK: [[COPY5:%[0-9]+]]:vr128x = COPY %xmm0 + ; CHECK: [[COPY6:%[0-9]+]]:gr64 = COPY %rsi + ; CHECK: [[COPY7:%[0-9]+]]:gr32 = COPY %edi + ; CHECK: [[COPY8:%[0-9]+]]:gr8 = COPY [[COPY7]].sub_8bit + ; CHECK: TEST8ri killed [[COPY8]], 1, implicit-def %eflags + ; CHECK: JE_1 %bb.2, implicit %eflags + ; CHECK: JMP_1 %bb.1 + ; CHECK: bb.1.if: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: [[VCMPSSZrr:%[0-9]+]]:vk1 = VCMPSSZrr [[COPY3]], [[COPY2]], 0 + ; CHECK: [[COPY9:%[0-9]+]]:vk32 = COPY [[VCMPSSZrr]] + ; CHECK: [[COPY10:%[0-9]+]]:vk8 = COPY [[COPY9]] + ; CHECK: JMP_1 %bb.3 + ; CHECK: bb.2.else: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: [[VCMPSSZrr1:%[0-9]+]]:vk1 = VCMPSSZrr [[COPY1]], [[COPY]], 0 + ; CHECK: [[COPY11:%[0-9]+]]:vk32 = COPY [[VCMPSSZrr1]] + ; CHECK: [[COPY12:%[0-9]+]]:vk8 = COPY [[COPY11]] + ; CHECK: bb.3.exit: + ; CHECK: [[PHI:%[0-9]+]]:vk8 = PHI [[COPY12]], %bb.2, [[COPY10]], %bb.1 + ; CHECK: [[COPY13:%[0-9]+]]:vk32 = COPY [[PHI]] + ; CHECK: [[COPY14:%[0-9]+]]:vk1wm = COPY [[COPY13]] + ; CHECK: [[COPY15:%[0-9]+]]:vr128x = COPY [[COPY4]] + ; CHECK: [[DEF:%[0-9]+]]:fr128 = IMPLICIT_DEF + ; CHECK: [[VMOVSSZrrk:%[0-9]+]]:fr128 = VMOVSSZrrk [[COPY15]], killed [[COPY14]], killed [[DEF]], [[COPY5]] + ; CHECK: [[COPY16:%[0-9]+]]:fr32x = COPY [[VMOVSSZrrk]] + ; CHECK: VMOVSSZmr [[COPY6]], 1, %noreg, 0, %noreg, killed [[COPY16]] :: (store 4 into %ir.fptr) + ; CHECK: RET 0 bb.0.entry: successors: %bb.1(0x40000000), %bb.2(0x40000000) liveins: %edi, %rsi, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 - + %10 = COPY %xmm5 %9 = COPY %xmm4 %8 = COPY %xmm3 @@ -125,38 +162,31 @@ body: | TEST8ri killed %11, 1, implicit-def %eflags JE_1 %bb.2, implicit %eflags JMP_1 %bb.1 - + bb.1.if: successors: %bb.3(0x80000000) - + %14 = VCMPSSZrr %7, %8, 0 ; check that cross domain copies 
are replaced with same domain copies. - ; CHECK: %15:vk32 = COPY %14 - ; CHECK: %0:vk8 = COPY %15 - + %15 = COPY %14 %0 = COPY %15.sub_8bit JMP_1 %bb.3 - + bb.2.else: successors: %bb.3(0x80000000) %12 = VCMPSSZrr %9, %10, 0 ; check that cross domain copies are replaced with same domain copies. - ; CHECK: %13:vk32 = COPY %12 - ; CHECK: %1:vk8 = COPY %13 %13 = COPY %12 %1 = COPY %13.sub_8bit - + bb.3.exit: ; check PHI, IMPLICIT_DEF, and INSERT_SUBREG replacers. - ; CHECK: %2:vk8 = PHI %1, %bb.2, %0, %bb.1 - ; CHECK: %16:vk32 = COPY %2 - ; CHECK: %18:vk1wm = COPY %16 - + %2 = PHI %1, %bb.2, %0, %bb.1 %17 = IMPLICIT_DEF %16 = INSERT_SUBREG %17, %2, 1 @@ -171,14 +201,13 @@ body: | ... --- name: test_8bitops -# CHECK-LABEL: name: test_8bitops alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -198,13 +227,13 @@ registers: - { id: 16, class: gr8, preferred-register: '' } - { id: 17, class: gr8, preferred-register: '' } - { id: 18, class: gr8, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } - { reg: '%zmm2', virtual-reg: '%3' } - { reg: '%zmm3', virtual-reg: '%4' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -221,32 +250,50 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | + ; CHECK-LABEL: name: test_8bitops + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[COPY3:%[0-9]+]]:vr512 = COPY %zmm2 + ; CHECK: [[COPY4:%[0-9]+]]:vr512 = COPY %zmm3 + ; CHECK: [[VCMPPDZrri:%[0-9]+]]:vk8 = VCMPPDZrri [[COPY3]], [[COPY4]], 0 + ; CHECK: [[COPY5:%[0-9]+]]:vk32 = COPY [[VCMPPDZrri]] + ; CHECK: [[COPY6:%[0-9]+]]:vk8 = COPY [[COPY5]] + ; CHECK: [[KSHIFTRBri:%[0-9]+]]:vk8 = KSHIFTRBri [[COPY6]], 2 + ; CHECK: [[KSHIFTLBri:%[0-9]+]]:vk8 = KSHIFTLBri [[KSHIFTRBri]], 1 + ; CHECK: [[KNOTBrr:%[0-9]+]]:vk8 = KNOTBrr [[KSHIFTLBri]] + ; CHECK: [[KORBrr:%[0-9]+]]:vk8 = KORBrr [[KNOTBrr]], [[KSHIFTRBri]] + ; CHECK: [[KANDBrr:%[0-9]+]]:vk8 = KANDBrr [[KORBrr]], [[KSHIFTLBri]] + ; CHECK: [[KXORBrr:%[0-9]+]]:vk8 = KXORBrr [[KANDBrr]], [[KSHIFTRBri]] + ; CHECK: [[KADDBrr:%[0-9]+]]:vk8 = KADDBrr [[KXORBrr]], [[KNOTBrr]] + ; CHECK: [[COPY7:%[0-9]+]]:vk32 = COPY [[KADDBrr]] + ; CHECK: [[COPY8:%[0-9]+]]:vk8wm = COPY [[COPY7]] + ; CHECK: [[VMOVAPDZrrk:%[0-9]+]]:vr512 = VMOVAPDZrrk [[COPY2]], killed [[COPY8]], [[COPY1]] + ; CHECK: VMOVAPDZmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVAPDZrrk]] + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: bb.2: + ; CHECK: RET 0 bb.0: liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3 - + %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 %3 = COPY %zmm2 %4 = COPY %zmm3 - + %5 = VCMPPDZrri %3, %4, 0 - ; CHECK: %6:vk32 = COPY %5 - ; CHECK: %7:vk8 = COPY %6 %6 = COPY %5 %7 = COPY %6.sub_8bit - ; CHECK: %12:vk8 = KSHIFTRBri %7, 2 - ; CHECK: %13:vk8 = KSHIFTLBri %12, 1 - ; CHECK: %14:vk8 = KNOTBrr %13 - ; CHECK: %15:vk8 = KORBrr %14, %12 - ; CHECK: %16:vk8 = KANDBrr %15, %13 - ; 
CHECK: %17:vk8 = KXORBrr %16, %12 - ; CHECK: %18:vk8 = KADDBrr %17, %14 %12 = SHR8ri %7, 2, implicit-def dead %eflags %13 = SHL8ri %12, 1, implicit-def dead %eflags %14 = NOT8r %13 @@ -254,19 +301,17 @@ body: | %16 = AND8rr %15, %13, implicit-def dead %eflags %17 = XOR8rr %16, %12, implicit-def dead %eflags %18 = ADD8rr %17, %14, implicit-def dead %eflags - - ; CHECK: %9:vk32 = COPY %18 - ; CHECK: %10:vk8wm = COPY %9 + %8 = IMPLICIT_DEF %9 = INSERT_SUBREG %8, %18, 1 %10 = COPY %9 %11 = VMOVAPDZrrk %2, killed %10, %1 - VMOVAPDZmr %0, 1, %noreg, 0, %noreg, killed %11 + VMOVAPDZmr %0, 1, %noreg, 0, %noreg, killed %11 - ; CHECK: KTESTBrr %18, %18, implicit-def %eflags - TEST8rr %18, %18, implicit-def %eflags - JE_1 %bb.1, implicit %eflags - JMP_1 %bb.2 + ; FIXME We can't replace TEST with KTEST due to flag differences + ; TEST8rr %18, %18, implicit-def %eflags + ; JE_1 %bb.1, implicit %eflags + ; JMP_1 %bb.2 bb.1: @@ -276,14 +321,13 @@ body: | ... --- name: test_16bitops -# CHECK-LABEL: name: test_16bitops alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -302,13 +346,13 @@ registers: - { id: 15, class: gr16, preferred-register: '' } - { id: 16, class: gr16, preferred-register: '' } - { id: 17, class: gr16, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } - { reg: '%zmm2', virtual-reg: '%3' } - { reg: '%zmm3', virtual-reg: '%4' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -325,50 +369,66 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | + ; CHECK-LABEL: name: test_16bitops + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[COPY3:%[0-9]+]]:vr512 = COPY %zmm2 + ; CHECK: [[COPY4:%[0-9]+]]:vr512 = COPY %zmm3 + ; CHECK: [[VCMPPSZrri:%[0-9]+]]:vk16 = VCMPPSZrri [[COPY3]], [[COPY4]], 0 + ; CHECK: [[COPY5:%[0-9]+]]:vk32 = COPY [[VCMPPSZrri]] + ; CHECK: [[COPY6:%[0-9]+]]:vk16 = COPY [[COPY5]] + ; CHECK: [[KSHIFTRWri:%[0-9]+]]:vk16 = KSHIFTRWri [[COPY6]], 2 + ; CHECK: [[KSHIFTLWri:%[0-9]+]]:vk16 = KSHIFTLWri [[KSHIFTRWri]], 1 + ; CHECK: [[KNOTWrr:%[0-9]+]]:vk16 = KNOTWrr [[KSHIFTLWri]] + ; CHECK: [[KORWrr:%[0-9]+]]:vk16 = KORWrr [[KNOTWrr]], [[KSHIFTRWri]] + ; CHECK: [[KANDWrr:%[0-9]+]]:vk16 = KANDWrr [[KORWrr]], [[KSHIFTLWri]] + ; CHECK: [[KXORWrr:%[0-9]+]]:vk16 = KXORWrr [[KANDWrr]], [[KSHIFTRWri]] + ; CHECK: [[COPY7:%[0-9]+]]:vk32 = COPY [[KXORWrr]] + ; CHECK: [[COPY8:%[0-9]+]]:vk16wm = COPY [[COPY7]] + ; CHECK: [[VMOVAPSZrrk:%[0-9]+]]:vr512 = VMOVAPSZrrk [[COPY2]], killed [[COPY8]], [[COPY1]] + ; CHECK: VMOVAPSZmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVAPSZrrk]] + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: bb.2: + ; CHECK: RET 0 bb.0: liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3 - + %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 %3 = COPY %zmm2 %4 = COPY %zmm3 - + %5 = VCMPPSZrri %3, %4, 0 - ; CHECK: %6:vk32 = COPY %5 - ; CHECK: %7:vk16 = COPY %6 %6 = COPY %5 
%7 = COPY %6.sub_16bit - ; CHECK: %12:vk16 = KSHIFTRWri %7, 2 - ; CHECK: %13:vk16 = KSHIFTLWri %12, 1 - ; CHECK: %14:vk16 = KNOTWrr %13 - ; CHECK: %15:vk16 = KORWrr %14, %12 - ; CHECK: %16:vk16 = KANDWrr %15, %13 - ; CHECK: %17:vk16 = KXORWrr %16, %12 %12 = SHR16ri %7, 2, implicit-def dead %eflags %13 = SHL16ri %12, 1, implicit-def dead %eflags %14 = NOT16r %13 %15 = OR16rr %14, %12, implicit-def dead %eflags %16 = AND16rr %15, %13, implicit-def dead %eflags %17 = XOR16rr %16, %12, implicit-def dead %eflags - - ; CHECK: %9:vk32 = COPY %17 - ; CHECK: %10:vk16wm = COPY %9 + %8 = IMPLICIT_DEF %9 = INSERT_SUBREG %8, %17, 3 %10 = COPY %9 %11 = VMOVAPSZrrk %2, killed %10, %1 - VMOVAPSZmr %0, 1, %noreg, 0, %noreg, killed %11 + VMOVAPSZmr %0, 1, %noreg, 0, %noreg, killed %11 - ; CHECK: KTESTWrr %17, %17, implicit-def %eflags - TEST16rr %17, %17, implicit-def %eflags - JE_1 %bb.1, implicit %eflags - JMP_1 %bb.2 + ; FIXME We can't replace TEST with KTEST due to flag differences + ; TEST16rr %17, %17, implicit-def %eflags + ; JE_1 %bb.1, implicit %eflags + ; JMP_1 %bb.2 bb.1: @@ -378,14 +438,13 @@ body: | ... --- name: test_32bitops -# CHECK-LABEL: name: test_32bitops alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -400,11 +459,11 @@ registers: - { id: 11, class: gr32, preferred-register: '' } - { id: 12, class: gr32, preferred-register: '' } - { id: 13, class: gr32, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -421,26 +480,40 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | + ; CHECK-LABEL: name: test_32bitops + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: %rdi, %zmm0, %zmm1 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[KMOVDkm:%[0-9]+]]:vk32 = KMOVDkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[KSHIFTRDri:%[0-9]+]]:vk32 = KSHIFTRDri [[KMOVDkm]], 2 + ; CHECK: [[KSHIFTLDri:%[0-9]+]]:vk32 = KSHIFTLDri [[KSHIFTRDri]], 1 + ; CHECK: [[KNOTDrr:%[0-9]+]]:vk32 = KNOTDrr [[KSHIFTLDri]] + ; CHECK: [[KORDrr:%[0-9]+]]:vk32 = KORDrr [[KNOTDrr]], [[KSHIFTRDri]] + ; CHECK: [[KANDDrr:%[0-9]+]]:vk32 = KANDDrr [[KORDrr]], [[KSHIFTLDri]] + ; CHECK: [[KXORDrr:%[0-9]+]]:vk32 = KXORDrr [[KANDDrr]], [[KSHIFTRDri]] + ; CHECK: [[KANDNDrr:%[0-9]+]]:vk32 = KANDNDrr [[KXORDrr]], [[KORDrr]] + ; CHECK: [[KADDDrr:%[0-9]+]]:vk32 = KADDDrr [[KANDNDrr]], [[KXORDrr]] + ; CHECK: [[COPY3:%[0-9]+]]:vk32wm = COPY [[KADDDrr]] + ; CHECK: [[VMOVDQU16Zrrk:%[0-9]+]]:vr512 = VMOVDQU16Zrrk [[COPY2]], killed [[COPY3]], [[COPY1]] + ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU16Zrrk]] + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: bb.2: + ; CHECK: RET 0 bb.0: liveins: %rdi, %zmm0, %zmm1 - + %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 - - ; CHECK: %5:vk32 = KMOVDkm %0, 1, %noreg, 0, %noreg - ; CHECK: %6:vk32 = KSHIFTRDri %5, 2 - ; CHECK: %7:vk32 = KSHIFTLDri %6, 1 - ; CHECK: %8:vk32 = KNOTDrr %7 - ; CHECK: 
%9:vk32 = KORDrr %8, %6 - ; CHECK: %10:vk32 = KANDDrr %9, %7 - ; CHECK: %11:vk32 = KXORDrr %10, %6 - ; CHECK: %12:vk32 = KANDNDrr %11, %9 - ; CHECK: %13:vk32 = KADDDrr %12, %11 + %5 = MOV32rm %0, 1, %noreg, 0, %noreg %6 = SHR32ri %5, 2, implicit-def dead %eflags %7 = SHL32ri %6, 1, implicit-def dead %eflags @@ -450,16 +523,15 @@ body: | %11 = XOR32rr %10, %6, implicit-def dead %eflags %12 = ANDN32rr %11, %9, implicit-def dead %eflags %13 = ADD32rr %12, %11, implicit-def dead %eflags - - ; CHECK: %3:vk32wm = COPY %13 + %3 = COPY %13 %4 = VMOVDQU16Zrrk %2, killed %3, %1 VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4 - ; CHECK: KTESTDrr %13, %13, implicit-def %eflags - TEST32rr %13, %13, implicit-def %eflags - JE_1 %bb.1, implicit %eflags - JMP_1 %bb.2 + ; FIXME We can't replace TEST with KTEST due to flag differences + ; TEST32rr %13, %13, implicit-def %eflags + ; JE_1 %bb.1, implicit %eflags + ; JMP_1 %bb.2 bb.1: @@ -469,14 +541,13 @@ body: | ... --- name: test_64bitops -# CHECK-LABEL: name: test_64bitops alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -491,11 +562,11 @@ registers: - { id: 11, class: gr64, preferred-register: '' } - { id: 12, class: gr64, preferred-register: '' } - { id: 13, class: gr64, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -512,26 +583,40 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | + ; CHECK-LABEL: name: test_64bitops + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: %rdi, %zmm0, %zmm1 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[KMOVQkm:%[0-9]+]]:vk64 = KMOVQkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[KSHIFTRQri:%[0-9]+]]:vk64 = KSHIFTRQri [[KMOVQkm]], 2 + ; CHECK: [[KSHIFTLQri:%[0-9]+]]:vk64 = KSHIFTLQri [[KSHIFTRQri]], 1 + ; CHECK: [[KNOTQrr:%[0-9]+]]:vk64 = KNOTQrr [[KSHIFTLQri]] + ; CHECK: [[KORQrr:%[0-9]+]]:vk64 = KORQrr [[KNOTQrr]], [[KSHIFTRQri]] + ; CHECK: [[KANDQrr:%[0-9]+]]:vk64 = KANDQrr [[KORQrr]], [[KSHIFTLQri]] + ; CHECK: [[KXORQrr:%[0-9]+]]:vk64 = KXORQrr [[KANDQrr]], [[KSHIFTRQri]] + ; CHECK: [[KANDNQrr:%[0-9]+]]:vk64 = KANDNQrr [[KXORQrr]], [[KORQrr]] + ; CHECK: [[KADDQrr:%[0-9]+]]:vk64 = KADDQrr [[KANDNQrr]], [[KXORQrr]] + ; CHECK: [[COPY3:%[0-9]+]]:vk64wm = COPY [[KADDQrr]] + ; CHECK: [[VMOVDQU8Zrrk:%[0-9]+]]:vr512 = VMOVDQU8Zrrk [[COPY2]], killed [[COPY3]], [[COPY1]] + ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU8Zrrk]] + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: bb.2: + ; CHECK: RET 0 bb.0: liveins: %rdi, %zmm0, %zmm1 - + %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 - - ; CHECK: %5:vk64 = KMOVQkm %0, 1, %noreg, 0, %noreg - ; CHECK: %6:vk64 = KSHIFTRQri %5, 2 - ; CHECK: %7:vk64 = KSHIFTLQri %6, 1 - ; CHECK: %8:vk64 = KNOTQrr %7 - ; CHECK: %9:vk64 = KORQrr %8, %6 - ; CHECK: %10:vk64 = KANDQrr %9, %7 - ; CHECK: %11:vk64 = KXORQrr %10, %6 - ; CHECK: %12:vk64 = KANDNQrr %11, %9 - ; CHECK: %13:vk64 = 
KADDQrr %12, %11 + %5 = MOV64rm %0, 1, %noreg, 0, %noreg %6 = SHR64ri %5, 2, implicit-def dead %eflags %7 = SHL64ri %6, 1, implicit-def dead %eflags @@ -541,16 +626,15 @@ body: | %11 = XOR64rr %10, %6, implicit-def dead %eflags %12 = ANDN64rr %11, %9, implicit-def dead %eflags %13 = ADD64rr %12, %11, implicit-def dead %eflags - - ; CHECK: %3:vk64wm = COPY %13 + %3 = COPY %13 %4 = VMOVDQU8Zrrk %2, killed %3, %1 VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4 - ; CHECK: KTESTQrr %13, %13, implicit-def %eflags - TEST64rr %13, %13, implicit-def %eflags - JE_1 %bb.1, implicit %eflags - JMP_1 %bb.2 + ; FIXME We can't replace TEST with KTEST due to flag differences + ; TEST64rr %13, %13, implicit-def %eflags + ; JE_1 %bb.1, implicit %eflags + ; JMP_1 %bb.2 bb.1: @@ -560,14 +644,13 @@ body: | ... --- name: test_16bitext -# CHECK-LABEL: name: test_16bitext alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -575,11 +658,11 @@ registers: - { id: 4, class: vr512, preferred-register: '' } - { id: 5, class: gr16, preferred-register: '' } - { id: 6, class: gr16, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -596,24 +679,32 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | bb.0: liveins: %rdi, %zmm0, %zmm1 - + + ; CHECK-LABEL: name: test_16bitext + ; CHECK: liveins: %rdi, %zmm0, %zmm1 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[COPY3:%[0-9]+]]:vk16 = COPY [[KMOVBkm]] + ; CHECK: [[KNOTWrr:%[0-9]+]]:vk16 = KNOTWrr [[COPY3]] + ; CHECK: [[COPY4:%[0-9]+]]:vk16wm = COPY [[KNOTWrr]] + ; CHECK: [[VMOVAPSZrrk:%[0-9]+]]:vr512 = VMOVAPSZrrk [[COPY2]], killed [[COPY4]], [[COPY1]] + ; CHECK: VMOVAPSZmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVAPSZrrk]] + ; CHECK: RET 0 %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 - - ; CHECK: %7:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg - ; CHECK: %5:vk16 = COPY %7 - ; CHECK: %6:vk16 = KNOTWrr %5 + %5 = MOVZX16rm8 %0, 1, %noreg, 0, %noreg %6 = NOT16r %5 - ; CHECK: %3:vk16wm = COPY %6 %3 = COPY %6 %4 = VMOVAPSZrrk %2, killed %3, %1 VMOVAPSZmr %0, 1, %noreg, 0, %noreg, killed %4 @@ -622,14 +713,13 @@ body: | ... 
--- name: test_32bitext -# CHECK-LABEL: name: test_32bitext alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -638,11 +728,11 @@ registers: - { id: 5, class: gr32, preferred-register: '' } - { id: 6, class: gr32, preferred-register: '' } - { id: 7, class: gr32, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -659,27 +749,35 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | bb.0: liveins: %rdi, %zmm0, %zmm1 - + + ; CHECK-LABEL: name: test_32bitext + ; CHECK: liveins: %rdi, %zmm0, %zmm1 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[COPY3:%[0-9]+]]:vk32 = COPY [[KMOVBkm]] + ; CHECK: [[KMOVWkm:%[0-9]+]]:vk16 = KMOVWkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[COPY4:%[0-9]+]]:vk32 = COPY [[KMOVWkm]] + ; CHECK: [[KADDDrr:%[0-9]+]]:vk32 = KADDDrr [[COPY3]], [[COPY4]] + ; CHECK: [[COPY5:%[0-9]+]]:vk64wm = COPY [[KADDDrr]] + ; CHECK: [[VMOVDQU16Zrrk:%[0-9]+]]:vr512 = VMOVDQU16Zrrk [[COPY2]], killed [[COPY5]], [[COPY1]] + ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU16Zrrk]] + ; CHECK: RET 0 %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 - - ; CHECK: %8:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg - ; CHECK: %5:vk32 = COPY %8 - ; CHECK: %9:vk16 = KMOVWkm %0, 1, %noreg, 0, %noreg - ; CHECK: %6:vk32 = COPY %9 - ; CHECK: %7:vk32 = KADDDrr %5, %6 + %5 = MOVZX32rm8 %0, 1, %noreg, 0, %noreg %6 = MOVZX32rm16 %0, 1, %noreg, 0, %noreg %7 = ADD32rr %5, %6, implicit-def dead %eflags - ; CHECK: %3:vk64wm = COPY %7 %3 = COPY %7 %4 = VMOVDQU16Zrrk %2, killed %3, %1 VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4 @@ -688,14 +786,13 @@ body: | ... 
--- name: test_64bitext -# CHECK-LABEL: name: test_64bitext alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -704,11 +801,11 @@ registers: - { id: 5, class: gr64, preferred-register: '' } - { id: 6, class: gr64, preferred-register: '' } - { id: 7, class: gr64, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -725,27 +822,35 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | bb.0: liveins: %rdi, %zmm0, %zmm1 - + + ; CHECK-LABEL: name: test_64bitext + ; CHECK: liveins: %rdi, %zmm0, %zmm1 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[COPY3:%[0-9]+]]:vk64 = COPY [[KMOVBkm]] + ; CHECK: [[KMOVWkm:%[0-9]+]]:vk16 = KMOVWkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[COPY4:%[0-9]+]]:vk64 = COPY [[KMOVWkm]] + ; CHECK: [[KADDQrr:%[0-9]+]]:vk64 = KADDQrr [[COPY3]], [[COPY4]] + ; CHECK: [[COPY5:%[0-9]+]]:vk64wm = COPY [[KADDQrr]] + ; CHECK: [[VMOVDQU8Zrrk:%[0-9]+]]:vr512 = VMOVDQU8Zrrk [[COPY2]], killed [[COPY5]], [[COPY1]] + ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU8Zrrk]] + ; CHECK: RET 0 %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 - - ; CHECK: %8:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg - ; CHECK: %5:vk64 = COPY %8 - ; CHECK: %9:vk16 = KMOVWkm %0, 1, %noreg, 0, %noreg - ; CHECK: %6:vk64 = COPY %9 - ; CHECK: %7:vk64 = KADDQrr %5, %6 + %5 = MOVZX64rm8 %0, 1, %noreg, 0, %noreg %6 = MOVZX64rm16 %0, 1, %noreg, 0, %noreg %7 = ADD64rr %5, %6, implicit-def dead %eflags - ; CHECK: %3:vk64wm = COPY %7 %3 = COPY %7 %4 = VMOVDQU8Zrrk %2, killed %3, %1 VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4 diff --git a/test/CodeGen/X86/inline-asm-modifier-V.ll b/test/CodeGen/X86/inline-asm-modifier-V.ll new file mode 100644 index 000000000000..5a7f3fdd25fd --- /dev/null +++ b/test/CodeGen/X86/inline-asm-modifier-V.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=i686-- -no-integrated-as | FileCheck -check-prefix=X86 %s +; RUN: llc < %s -mtriple=x86_64-- -no-integrated-as | FileCheck -check-prefix=X64 %s + +; If the target does not have 64-bit integer registers, emit 32-bit register +; names. 
+ +; X86: call __x86_indirect_thunk_e{{[abcd]}}x +; X64: call __x86_indirect_thunk_r + +define void @q_modifier(i32* %p) { +entry: + tail call void asm sideeffect "call __x86_indirect_thunk_${0:V}", "r,~{dirflag},~{fpsr},~{flags}"(i32* %p) + ret void +} diff --git a/test/CodeGen/X86/pr36199.ll b/test/CodeGen/X86/pr36199.ll new file mode 100644 index 000000000000..84e17dba92e0 --- /dev/null +++ b/test/CodeGen/X86/pr36199.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s + +define void @foo() unnamed_addr #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddps %zmm0, %zmm0, %zmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; CHECK-NEXT: vmovups %zmm0, (%rax) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %1 = fadd <16 x float> undef, undef + %bc256 = bitcast <16 x float> %1 to <4 x i128> + %2 = extractelement <4 x i128> %bc256, i32 0 + %3 = bitcast i128 %2 to <4 x float> + %4 = shufflevector <4 x float> %3, <4 x float> undef, <16 x i32> + store <16 x float> %4, <16 x float>* undef, align 4 + ret void +} diff --git a/test/CodeGen/X86/retpoline-external.ll b/test/CodeGen/X86/retpoline-external.ll index 66d32ba5d73d..2f21bb2566de 100644 --- a/test/CodeGen/X86/retpoline-external.ll +++ b/test/CodeGen/X86/retpoline-external.ll @@ -23,18 +23,18 @@ entry: ; X64: callq bar ; X64-DAG: movl %[[x]], %edi ; X64-DAG: movq %[[fp]], %r11 -; X64: callq __llvm_external_retpoline_r11 +; X64: callq __x86_indirect_thunk_r11 ; X64: movl %[[x]], %edi ; X64: callq bar ; X64-DAG: movl %[[x]], %edi ; X64-DAG: movq %[[fp]], %r11 -; X64: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64: jmp __x86_indirect_thunk_r11 # TAILCALL ; X64FAST-LABEL: icall_reg: ; X64FAST: callq bar -; X64FAST: callq __llvm_external_retpoline_r11 +; X64FAST: callq __x86_indirect_thunk_r11 ; X64FAST: callq bar -; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL ; X86-LABEL: icall_reg: ; X86-DAG: movl 12(%esp), %[[fp:[^ ]*]] @@ -43,19 +43,19 @@ entry: ; X86: calll bar ; X86: movl %[[fp]], %eax ; X86: pushl %[[x]] -; X86: calll __llvm_external_retpoline_eax +; X86: calll __x86_indirect_thunk_eax ; X86: pushl %[[x]] ; X86: calll bar ; X86: movl %[[fp]], %eax ; X86: pushl %[[x]] -; X86: calll __llvm_external_retpoline_eax +; X86: calll __x86_indirect_thunk_eax ; X86-NOT: # TAILCALL ; X86FAST-LABEL: icall_reg: ; X86FAST: calll bar -; X86FAST: calll __llvm_external_retpoline_eax +; X86FAST: calll __x86_indirect_thunk_eax ; X86FAST: calll bar -; X86FAST: calll __llvm_external_retpoline_eax +; X86FAST: calll __x86_indirect_thunk_eax @global_fp = external global void (i32)* @@ -72,28 +72,28 @@ define void @icall_global_fp(i32 %x, void (i32)** %fpp) #0 { ; X64-LABEL: icall_global_fp: ; X64-DAG: movl %edi, %[[x:[^ ]*]] ; X64-DAG: movq global_fp(%rip), %r11 -; X64: callq __llvm_external_retpoline_r11 +; X64: callq __x86_indirect_thunk_r11 ; X64-DAG: movl %[[x]], %edi ; X64-DAG: movq global_fp(%rip), %r11 -; X64: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64: jmp __x86_indirect_thunk_r11 # TAILCALL ; X64FAST-LABEL: icall_global_fp: ; X64FAST: movq global_fp(%rip), %r11 -; X64FAST: callq __llvm_external_retpoline_r11 +; X64FAST: callq __x86_indirect_thunk_r11 ; X64FAST: movq global_fp(%rip), %r11 -; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64FAST: jmp 
__x86_indirect_thunk_r11 # TAILCALL ; X86-LABEL: icall_global_fp: ; X86: movl global_fp, %eax ; X86: pushl 4(%esp) -; X86: calll __llvm_external_retpoline_eax +; X86: calll __x86_indirect_thunk_eax ; X86: addl $4, %esp ; X86: movl global_fp, %eax -; X86: jmp __llvm_external_retpoline_eax # TAILCALL +; X86: jmp __x86_indirect_thunk_eax # TAILCALL ; X86FAST-LABEL: icall_global_fp: -; X86FAST: calll __llvm_external_retpoline_eax -; X86FAST: jmp __llvm_external_retpoline_eax # TAILCALL +; X86FAST: calll __x86_indirect_thunk_eax +; X86FAST: jmp __x86_indirect_thunk_eax # TAILCALL %struct.Foo = type { void (%struct.Foo*)** } @@ -114,14 +114,14 @@ define void @vcall(%struct.Foo* %obj) #0 { ; X64: movq (%[[obj]]), %[[vptr:[^ ]*]] ; X64: movq 8(%[[vptr]]), %[[fp:[^ ]*]] ; X64: movq %[[fp]], %r11 -; X64: callq __llvm_external_retpoline_r11 +; X64: callq __x86_indirect_thunk_r11 ; X64-DAG: movq %[[obj]], %rdi ; X64-DAG: movq %[[fp]], %r11 -; X64: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64: jmp __x86_indirect_thunk_r11 # TAILCALL ; X64FAST-LABEL: vcall: -; X64FAST: callq __llvm_external_retpoline_r11 -; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64FAST: callq __x86_indirect_thunk_r11 +; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL ; X86-LABEL: vcall: ; X86: movl 8(%esp), %[[obj:[^ ]*]] @@ -129,14 +129,14 @@ define void @vcall(%struct.Foo* %obj) #0 { ; X86: movl 4(%[[vptr]]), %[[fp:[^ ]*]] ; X86: movl %[[fp]], %eax ; X86: pushl %[[obj]] -; X86: calll __llvm_external_retpoline_eax +; X86: calll __x86_indirect_thunk_eax ; X86: addl $4, %esp ; X86: movl %[[fp]], %eax -; X86: jmp __llvm_external_retpoline_eax # TAILCALL +; X86: jmp __x86_indirect_thunk_eax # TAILCALL ; X86FAST-LABEL: vcall: -; X86FAST: calll __llvm_external_retpoline_eax -; X86FAST: jmp __llvm_external_retpoline_eax # TAILCALL +; X86FAST: calll __x86_indirect_thunk_eax +; X86FAST: jmp __x86_indirect_thunk_eax # TAILCALL declare void @direct_callee() diff --git a/test/CodeGen/X86/retpoline-regparm.ll b/test/CodeGen/X86/retpoline-regparm.ll new file mode 100644 index 000000000000..13b32740b287 --- /dev/null +++ b/test/CodeGen/X86/retpoline-regparm.ll @@ -0,0 +1,42 @@ +; RUN: llc -mtriple=i686-linux < %s | FileCheck --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" %s + +; Test 32-bit retpoline when -mregparm=3 is used. This case is interesting +; because there are no available scratch registers. The Linux kernel builds +; with -mregparm=3, so we need to support it. TCO should fail because we need +; to restore EDI. + +define void @call_edi(void (i32, i32, i32)* %fp) #0 { +entry: + tail call void %fp(i32 inreg 0, i32 inreg 0, i32 inreg 0) + ret void +} + +; CHECK-LABEL: call_edi: +; EDI is used, so it must be saved. 
+; CHECK: pushl %edi +; CHECK-DAG: xorl %eax, %eax +; CHECK-DAG: xorl %edx, %edx +; CHECK-DAG: xorl %ecx, %ecx +; CHECK-DAG: movl {{.*}}, %edi +; CHECK: calll __llvm_retpoline_edi +; CHECK: popl %edi +; CHECK: retl + +define void @edi_external(void (i32, i32, i32)* %fp) #1 { +entry: + tail call void %fp(i32 inreg 0, i32 inreg 0, i32 inreg 0) + ret void +} + +; CHECK-LABEL: edi_external: +; CHECK: pushl %edi +; CHECK-DAG: xorl %eax, %eax +; CHECK-DAG: xorl %edx, %edx +; CHECK-DAG: xorl %ecx, %ecx +; CHECK-DAG: movl {{.*}}, %edi +; CHECK: calll __x86_indirect_thunk_edi +; CHECK: popl %edi +; CHECK: retl + +attributes #0 = { "target-features"="+retpoline" } +attributes #1 = { "target-features"="+retpoline-external-thunk" } diff --git a/test/CodeGen/X86/retpoline.ll b/test/CodeGen/X86/retpoline.ll index 57d3388b812a..477609e2d10b 100644 --- a/test/CodeGen/X86/retpoline.ll +++ b/test/CodeGen/X86/retpoline.ll @@ -340,10 +340,10 @@ latch: ; X86-NEXT: movl %edx, (%esp) ; X86-NEXT: retl ; -; X86-LABEL: .section .text.__llvm_retpoline_push,{{.*}},__llvm_retpoline_push,comdat -; X86-NEXT: .hidden __llvm_retpoline_push -; X86-NEXT: .weak __llvm_retpoline_push -; X86: __llvm_retpoline_push: +; X86-LABEL: .section .text.__llvm_retpoline_edi,{{.*}},__llvm_retpoline_edi,comdat +; X86-NEXT: .hidden __llvm_retpoline_edi +; X86-NEXT: .weak __llvm_retpoline_edi +; X86: __llvm_retpoline_edi: ; X86-NEXT: # {{.*}} # %entry ; X86-NEXT: calll [[CALL_TARGET:.*]] ; X86-NEXT: [[CAPTURE_SPEC:.*]]: # Block address taken @@ -355,11 +355,7 @@ latch: ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: [[CALL_TARGET]]: # Block address taken ; X86-NEXT: # %entry -; X86-NEXT: addl $4, %esp -; X86-NEXT: pushl 4(%esp) -; X86-NEXT: pushl 4(%esp) -; X86-NEXT: popl 8(%esp) -; X86-NEXT: popl (%esp) +; X86-NEXT: movl %edi, (%esp) ; X86-NEXT: retl diff --git a/test/DebugInfo/X86/void-typedef.ll b/test/DebugInfo/X86/void-typedef.ll new file mode 100644 index 000000000000..2e6bf49bae78 --- /dev/null +++ b/test/DebugInfo/X86/void-typedef.ll @@ -0,0 +1,88 @@ +; Choosing CodeView generates debug metadata for class-scope typedefs that +; Dwarf would normally omit. Choosing both CodeView and Dwarf triggered +; assertion failures and crashes because the Dwarf handler wasn't prepared for +; those records (in particular, ones with the void type represented by a +; null pointer). +; +; This test was generated with: +; clang++ -cc1 -emit-llvm -debug-info-kind=limited -dwarf-version=4 -gcodeview -x c++ +; on the following source code: +; +; class A { +; typedef void _Nodeptr; +; }; +; class B { +; A FailedTestsCache; +; bool m_fn1(); +; }; +; bool B::m_fn1() {} +; +; CodeView generates a DIDerivedType for the _Nodeptr typedef. +; +; RUN: llc %s -o - 2>&1 | FileCheck %s +; CHECK-NOT: Assertion failed + +; ModuleID = 'bug.cpp' +source_filename = "bug.cpp" +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i686-pc-windows-msvc" + +%class.B = type { %class.A } +%class.A = type { i8 } + +; Function Attrs: noinline nounwind optnone +define x86_thiscallcc zeroext i1 @"\01?m_fn1@B@@AAE_NXZ"(%class.B* %this) #0 align 2 !dbg !9 { +entry: + %retval = alloca i1, align 1 + %this.addr = alloca %class.B*, align 4 + store %class.B* %this, %class.B** %this.addr, align 4 + call void @llvm.dbg.declare(metadata %class.B** %this.addr, metadata !22, metadata !DIExpression()), !dbg !24 + %this1 = load %class.B*, %class.B** %this.addr, align 4 + call void @llvm.trap(), !dbg !25 + unreachable, !dbg !25 + +return: ; No predecessors! 
+ %0 = load i1, i1* %retval, align 1, !dbg !25 + ret i1 %0, !dbg !25 +} + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +; Function Attrs: noreturn nounwind +declare void @llvm.trap() #2 + +attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone speculatable } +attributes #2 = { noreturn nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5, !6, !7} +!llvm.ident = !{!8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "", directory: "D:\5Csrc\5Cbug", checksumkind: CSK_MD5, checksum: "2216f11c5ddda8c48a6f92a6079ad4b6") +!2 = !{} +!3 = !{i32 1, !"NumRegisterParameters", i32 0} +!4 = !{i32 2, !"Dwarf Version", i32 4} +!5 = !{i32 2, !"CodeView", i32 1} +!6 = !{i32 2, !"Debug Info Version", i32 3} +!7 = !{i32 1, !"wchar_size", i32 2} +!8 = !{!"clang version 6.0.0 "} +!9 = distinct !DISubprogram(name: "m_fn1", linkageName: "\01?m_fn1@B@@AAE_NXZ", scope: !11, file: !10, line: 8, type: !18, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !17, variables: !2) +!10 = !DIFile(filename: "bug.cpp", directory: "D:\5Csrc\5Cbug", checksumkind: CSK_MD5, checksum: "2216f11c5ddda8c48a6f92a6079ad4b6") +!11 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "B", file: !10, line: 4, size: 8, elements: !12, identifier: ".?AVB@@") +!12 = !{!13, !17} +!13 = !DIDerivedType(tag: DW_TAG_member, name: "FailedTestsCache", scope: !11, file: !10, line: 5, baseType: !14, size: 8) +!14 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "A", file: !10, line: 1, size: 8, elements: !15, identifier: ".?AVA@@") +!15 = !{!16} +!16 = !DIDerivedType(tag: DW_TAG_typedef, name: "_Nodeptr", scope: !14, file: !10, line: 2, baseType: null) +!17 = !DISubprogram(name: "m_fn1", linkageName: "\01?m_fn1@B@@AAE_NXZ", scope: !11, file: !10, line: 6, type: !18, isLocal: false, isDefinition: false, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: false) +!18 = !DISubroutineType(cc: DW_CC_BORLAND_thiscall, types: !19) +!19 = !{!20, !21} +!20 = !DIBasicType(name: "bool", size: 8, encoding: DW_ATE_boolean) +!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 32, flags: DIFlagArtificial | DIFlagObjectPointer) +!22 = !DILocalVariable(name: "this", arg: 1, scope: !9, type: !23, flags: DIFlagArtificial | DIFlagObjectPointer) +!23 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 32) +!24 = !DILocation(line: 0, scope: !9) +!25 = !DILocation(line: 8, scope: !9) diff --git a/test/MC/AsmParser/inline_macro_duplication.ll b/test/MC/AsmParser/inline_macro_duplication.ll new file mode 100644 index 000000000000..9d7e22fde7b6 --- /dev/null +++ b/test/MC/AsmParser/inline_macro_duplication.ll @@ -0,0 +1,8 @@ +; RUN: not llc < %s 2>&1 | FileCheck %s + +define void @test() { + call void asm sideeffect ".macro FOO\0A.endm", "~{dirflag},~{fpsr},~{flags}"() #1 + call void asm sideeffect ".macro FOO\0A.endm", 
"~{dirflag},~{fpsr},~{flags}"() #1 +; CHECK: error: macro 'FOO' is already defined + ret void +} diff --git a/test/MC/X86/x86-64.s b/test/MC/X86/x86-64.s index 378af768fa99..01cd6b6fa006 100644 --- a/test/MC/X86/x86-64.s +++ b/test/MC/X86/x86-64.s @@ -622,6 +622,11 @@ movl $12, foo(%rip) // CHECK: encoding: [0xc7,0x05,A,A,A,A,0x0c,0x00,0x00,0x00] // CHECK: fixup A - offset: 2, value: foo-8, kind: reloc_riprel_4byte +// rdar://37247000 +movl $12, 1024(%rip) +// CHECK: movl $12, 1024(%rip) +// CHECK: encoding: [0xc7,0x05,0x00,0x04,0x00,0x00,0x0c,0x00,0x00,0x00] + movq $12, foo(%rip) // CHECK: movq $12, foo(%rip) // CHECK: encoding: [0x48,0xc7,0x05,A,A,A,A,0x0c,0x00,0x00,0x00] diff --git a/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll index f82bf81fbbf8..c8a05204bf5e 100644 --- a/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ b/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -722,6 +722,114 @@ define <2 x half> @constant_rtz_pkrtz() { ret <2 x half> %cvt } +; -------------------------------------------------------------------- +; llvm.amdgcn.cvt.pknorm.i16 +; -------------------------------------------------------------------- + +declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) nounwind readnone + +; CHECK-LABEL: @undef_lhs_cvt_pknorm_i16( +; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y) +define <2 x i16> @undef_lhs_cvt_pknorm_i16(float %y) { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y) + ret <2 x i16> %cvt +} + +; CHECK-LABEL: @undef_rhs_cvt_pknorm_i16( +; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef) +define <2 x i16> @undef_rhs_cvt_pknorm_i16(float %x) { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef) + ret <2 x i16> %cvt +} + +; CHECK-LABEL: @undef_cvt_pknorm_i16( +; CHECK: ret <2 x i16> undef +define <2 x i16> @undef_cvt_pknorm_i16() { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float undef) + ret <2 x i16> %cvt +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.cvt.pknorm.u16 +; -------------------------------------------------------------------- + +declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) nounwind readnone + +; CHECK-LABEL: @undef_lhs_cvt_pknorm_u16( +; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y) +define <2 x i16> @undef_lhs_cvt_pknorm_u16(float %y) { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y) + ret <2 x i16> %cvt +} + +; CHECK-LABEL: @undef_rhs_cvt_pknorm_u16( +; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef) +define <2 x i16> @undef_rhs_cvt_pknorm_u16(float %x) { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef) + ret <2 x i16> %cvt +} + +; CHECK-LABEL: @undef_cvt_pknorm_u16( +; CHECK: ret <2 x i16> undef +define <2 x i16> @undef_cvt_pknorm_u16() { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float undef) + ret <2 x i16> %cvt +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.cvt.pk.i16 +; -------------------------------------------------------------------- + +declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) nounwind readnone + +; CHECK-LABEL: @undef_lhs_cvt_pk_i16( +; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y) +define <2 x i16> @undef_lhs_cvt_pk_i16(i32 %y) { + 
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y) + ret <2 x i16> %cvt +} + +; CHECK-LABEL: @undef_rhs_cvt_pk_i16( +; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef) +define <2 x i16> @undef_rhs_cvt_pk_i16(i32 %x) { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef) + ret <2 x i16> %cvt +} + +; CHECK-LABEL: @undef_cvt_pk_i16( +; CHECK: ret <2 x i16> undef +define <2 x i16> @undef_cvt_pk_i16() { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 undef) + ret <2 x i16> %cvt +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.cvt.pk.u16 +; -------------------------------------------------------------------- + +declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) nounwind readnone + +; CHECK-LABEL: @undef_lhs_cvt_pk_u16( +; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y) +define <2 x i16> @undef_lhs_cvt_pk_u16(i32 %y) { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y) + ret <2 x i16> %cvt +} + +; CHECK-LABEL: @undef_rhs_cvt_pk_u16( +; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 undef) +define <2 x i16> @undef_rhs_cvt_pk_u16(i32 %x) { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 undef) + ret <2 x i16> %cvt +} + +; CHECK-LABEL: @undef_cvt_pk_u16( +; CHECK: ret <2 x i16> undef +define <2 x i16> @undef_cvt_pk_u16() { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 undef) + ret <2 x i16> %cvt +} + ; -------------------------------------------------------------------- ; llvm.amdgcn.ubfe ; --------------------------------------------------------------------