Vendor import of llvm release_60 branch r325330:
https://llvm.org/svn/llvm-project/llvm/branches/release_60@325330

parent 6d18171c19
commit 3c315f3a8e
@@ -71,6 +71,13 @@ Non-comprehensive list of changes in this release

Changes to the LLVM IR
----------------------

Changes to the AArch64 Target
-----------------------------

During this release:

* Enabled the new GlobalISel instruction selection framework by default at ``-O0``.

Changes to the ARM Target
-------------------------

@@ -80,6 +87,28 @@ During this release the ARM target has:

  isn't the default.

Changes to the Hexagon Target
-----------------------------

* The Hexagon backend now supports V65 ISA.

* The ``-mhvx`` option now takes an optional value that specifies the ISA
  version of the HVX coprocessor. The available values are v60, v62 and v65.
  By default, the value is set to be the same as the CPU version.

* The compiler option ``-mhvx-double`` is deprecated and will be removed in
  the next release of the compiler. Programmers should use the ``-mhvx-length``
  option to specify the desired vector length: ``-mhvx-length=64b`` for
  64-byte vectors and ``-mhvx-length=128b`` for 128-byte vectors. While the
  current default vector length is 64 bytes, users should always specify the
  length explicitly, since the default value may change in the future.

* The target feature ``hvx-double`` is deprecated and will be removed in the
  next release. LLVM IR generators should use the target features ``hvx-length64b``
  and ``hvx-length128b`` to indicate the vector length. The length should
  always be specified when HVX code generation is enabled.
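[Editor's sketch, not part of the imported notes: how an IR generator might follow the deprecation advice above in C++. The feature strings come from the note itself; the function name, the "+hvx" feature spelling, and the CPU name are illustrative assumptions.]

  #include "llvm/IR/Function.h"

  // Pin the HVX vector length explicitly via the new hvx-length128b
  // feature instead of the deprecated hvx-double, per the note above.
  void tagHexagonHvxFunction(llvm::Function &F) {
    // Relying on the default length is discouraged; it may change.
    F.addFnAttr("target-features", "+hvx,+hvx-length128b");
    F.addFnAttr("target-cpu", "hexagonv65"); // V65 ISA, newly supported
  }
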
Changes to the MIPS Target
--------------------------

@@ -91,6 +120,15 @@ Changes to the PowerPC Target

During this release ...

Changes to the SystemZ Target
-----------------------------

During this release the SystemZ target has:

* Added support for 128-bit atomic operations.

* Added support for the "o" constraint for inline asm statements.

Changes to the X86 Target
-------------------------

@@ -238,6 +238,26 @@ def int_amdgcn_cvt_pkrtz : Intrinsic<
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pknorm_i16 : Intrinsic<
  [llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pknorm_u16 : Intrinsic<
  [llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pk_i16 : Intrinsic<
  [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pk_u16 : Intrinsic<
  [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_class : Intrinsic<
  [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
@@ -3738,6 +3738,15 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
  def int_x86_avx512_kxnor_w : // TODO: remove this intrinsic
      Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
                [IntrNoMem]>;
  def int_x86_avx512_kunpck_bw : GCCBuiltin<"__builtin_ia32_kunpckhi">,
      Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
                [IntrNoMem]>;
  def int_x86_avx512_kunpck_wd : GCCBuiltin<"__builtin_ia32_kunpcksi">,
      Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
                [IntrNoMem]>;
  def int_x86_avx512_kunpck_dq : GCCBuiltin<"__builtin_ia32_kunpckdi">,
      Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
                [IntrNoMem]>;
  def int_x86_avx512_kortestz_w : GCCBuiltin<"__builtin_ia32_kortestzhi">,
      Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty],
                [IntrNoMem]>;
include/llvm/MC/MCAsmMacro.h (new file, 38 lines)
@@ -0,0 +1,38 @@
//===- MCAsmMacro.h - Assembly Macros ---------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_MC_MCASMMACRO_H
#define LLVM_MC_MCASMMACRO_H

#include "llvm/MC/MCParser/MCAsmLexer.h"

namespace llvm {

struct MCAsmMacroParameter {
  StringRef Name;
  std::vector<AsmToken> Value;
  bool Required = false;
  bool Vararg = false;

  MCAsmMacroParameter() = default;
};

typedef std::vector<MCAsmMacroParameter> MCAsmMacroParameters;
struct MCAsmMacro {
  StringRef Name;
  StringRef Body;
  MCAsmMacroParameters Parameters;

public:
  MCAsmMacro(StringRef N, StringRef B, MCAsmMacroParameters P)
      : Name(N), Body(B), Parameters(std::move(P)) {}
};
}; // namespace llvm

#endif
@@ -18,6 +18,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/MC/MCAsmMacro.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/SectionKind.h"
@@ -268,6 +269,9 @@ namespace llvm {
                                       unsigned UniqueID,
                                       const MCSymbolELF *Associated);

    /// \brief Map of currently defined macros.
    StringMap<MCAsmMacro> MacroMap;

  public:
    explicit MCContext(const MCAsmInfo *MAI, const MCRegisterInfo *MRI,
                       const MCObjectFileInfo *MOFI,
@@ -618,6 +622,17 @@ namespace llvm {
    // FIXME: We should really do something about that.
    LLVM_ATTRIBUTE_NORETURN void reportFatalError(SMLoc L,
                                                  const Twine &Msg);

    const MCAsmMacro *lookupMacro(StringRef Name) {
      StringMap<MCAsmMacro>::iterator I = MacroMap.find(Name);
      return (I == MacroMap.end()) ? nullptr : &I->getValue();
    }

    void defineMacro(StringRef Name, MCAsmMacro Macro) {
      MacroMap.insert(std::make_pair(Name, std::move(Macro)));
    }

    void undefineMacro(StringRef Name) { MacroMap.erase(Name); }
  };

} // end namespace llvm

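[Editor's sketch: this hunk moves the assembler macro table from AsmParser onto MCContext (see the AsmParser hunks below). A minimal usage example of the new interface, not taken from the patch — the macro name and body are invented for illustration:]

  #include "llvm/MC/MCAsmMacro.h"
  #include "llvm/MC/MCContext.h"

  void demoMacroTable(llvm::MCContext &Ctx) {
    llvm::MCAsmMacroParameters Params;  // a macro with no formal parameters
    Ctx.defineMacro("bump", llvm::MCAsmMacro("bump", "addl $1, %eax\n",
                                             std::move(Params)));

    if (const llvm::MCAsmMacro *M = Ctx.lookupMacro("bump")) {
      // A parser would expand M->Body here; M->Parameters is empty.
      (void)M;
    }

    Ctx.undefineMacro("bump");  // a no-op if "bump" were not defined
  }
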
@@ -698,24 +698,20 @@ struct SemiNCAInfo {
      return;

    // Recalculate the set of roots.
    DT.Roots = FindRoots(DT, BUI);
    for (const NodePtr R : DT.Roots) {
      const TreeNodePtr TN = DT.getNode(R);
      // A CFG node was selected as a tree root, but the corresponding tree node
      // is not connected to the virtual root. This is because the incremental
      // algorithm does not really know or use the set of roots and can make a
      // different (implicit) decision about which nodes within an infinite loop
      // becomes a root.
      if (TN && !DT.isVirtualRoot(TN->getIDom())) {
        DEBUG(dbgs() << "Root " << BlockNamePrinter(R)
                     << " is not virtual root's child\n"
                     << "The entire tree needs to be rebuilt\n");
        // It should be possible to rotate the subtree instead of recalculating
        // the whole tree, but this situation happens extremely rarely in
        // practice.
        CalculateFromScratch(DT, BUI);
        return;
      }
    auto Roots = FindRoots(DT, BUI);
    if (DT.Roots.size() != Roots.size() ||
        !std::is_permutation(DT.Roots.begin(), DT.Roots.end(), Roots.begin())) {
      // The roots chosen in the CFG have changed. This is because the
      // incremental algorithm does not really know or use the set of roots and
      // can make a different (implicit) decision about which node within an
      // infinite loop becomes a root.

      DEBUG(dbgs() << "Roots are different in updated trees\n"
                   << "The entire tree needs to be rebuilt\n");
      // It may be possible to update the tree without recalculating it, but
      // we do not know yet how to do it, and it happens rarely in practice.
      CalculateFromScratch(DT, BUI);
      return;
    }
  }

@@ -163,7 +163,8 @@ uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) {

  DIType *BaseType = DDTy->getBaseType().resolve();

  assert(BaseType && "Unexpected invalid base type");
  if (!BaseType)
    return 0;

  // If this is a derived type, go ahead and get the base type, unless it's a
  // reference then it's just the size of the field. Pointer types have no need
@@ -1391,7 +1391,8 @@ void DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) {
  if (!Name.empty())
    addString(MemberDie, dwarf::DW_AT_name, Name);

  addType(MemberDie, resolve(DT->getBaseType()));
  if (DIType *Resolved = resolve(DT->getBaseType()))
    addType(MemberDie, Resolved);

  addSourceLine(MemberDie, DT);

@@ -205,14 +205,18 @@ void LivePhysRegs::addPristines(const MachineFunction &MF) {
}

void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) {
  if (!MBB.succ_empty()) {
    // To get the live-outs we simply merge the live-ins of all successors.
    for (const MachineBasicBlock *Succ : MBB.successors())
      addBlockLiveIns(*Succ);
  } else if (MBB.isReturnBlock()) {
    // For the return block: Add all callee saved registers that are saved and
    // restored (somewhere); This does not include callee saved registers that
    // are unused and hence not saved and restored; they are called pristine.
  // To get the live-outs we simply merge the live-ins of all successors.
  for (const MachineBasicBlock *Succ : MBB.successors())
    addBlockLiveIns(*Succ);
  if (MBB.isReturnBlock()) {
    // Return blocks are a special case because we currently don't mark up
    // return instructions completely: specifically, there is no explicit
    // use for callee-saved registers. So we add all callee saved registers
    // that are saved and restored (somewhere). This does not include
    // callee saved registers that are unused and hence not saved and
    // restored; they are called pristine.
    // FIXME: PEI should add explicit markings to return instructions
    // instead of implicitly handling them here.
    const MachineFunction &MF = *MBB.getParent();
    const MachineFrameInfo &MFI = MF.getFrameInfo();
    if (MFI.isCalleeSavedInfoValid()) {
@@ -225,15 +229,8 @@ void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) {

void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) {
  const MachineFunction &MF = *MBB.getParent();
  if (!MBB.succ_empty()) {
    addPristines(MF);
    addLiveOutsNoPristines(MBB);
  } else if (MBB.isReturnBlock()) {
    // For the return block: Add all callee saved registers.
    const MachineFrameInfo &MFI = MF.getFrameInfo();
    if (MFI.isCalleeSavedInfoValid())
      addCalleeSavedRegs(*this, MF);
  }
  addPristines(MF);
  addLiveOutsNoPristines(MBB);
}

void LivePhysRegs::addLiveIns(const MachineBasicBlock &MBB) {

@@ -16409,7 +16409,9 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
      N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      N1.getOperand(0).getOperand(1) == N2 &&
      N1.getOperand(0).getOperand(0).getValueType().getVectorNumElements() ==
          VT.getVectorNumElements()) {
          VT.getVectorNumElements() &&
      N1.getOperand(0).getOperand(0).getValueType().getSizeInBits() ==
          VT.getSizeInBits()) {
    return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0));
  }

@@ -491,9 +491,8 @@ VNInfo *SplitEditor::defValue(unsigned RegIdx,
  return VNI;
}

void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) {
  assert(ParentVNI && "Mapping  NULL value");
  ValueForcePair &VFP = Values[std::make_pair(RegIdx, ParentVNI->id)];
void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI) {
  ValueForcePair &VFP = Values[std::make_pair(RegIdx, ParentVNI.id)];
  VNInfo *VNI = VFP.getPointer();

  // ParentVNI was either unmapped or already complex mapped. Either way, just
@@ -777,7 +776,7 @@ SlotIndex SplitEditor::leaveIntvAfter(SlotIndex Idx) {
  // the source live range. The spiller also won't try to hoist this copy.
  if (SpillMode && !SlotIndex::isSameInstr(ParentVNI->def, Idx) &&
      MI->readsVirtualRegister(Edit->getReg())) {
    forceRecompute(0, ParentVNI);
    forceRecompute(0, *ParentVNI);
    defFromParent(0, ParentVNI, Idx, *MI->getParent(), MI);
    return Idx;
  }
@@ -835,7 +834,7 @@ void SplitEditor::overlapIntv(SlotIndex Start, SlotIndex End) {

  // The complement interval will be extended as needed by LRCalc.extend().
  if (ParentVNI)
    forceRecompute(0, ParentVNI);
    forceRecompute(0, *ParentVNI);
  DEBUG(dbgs() << " overlapIntv [" << Start << ';' << End << "):");
  RegAssign.insert(Start, End, OpenIdx);
  DEBUG(dump());
@@ -878,7 +877,7 @@ void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
    unsigned RegIdx = AssignI.value();
    if (AtBegin || !MBBI->readsVirtualRegister(Edit->getReg())) {
      DEBUG(dbgs() << " cannot find simple kill of RegIdx " << RegIdx << '\n');
      forceRecompute(RegIdx, Edit->getParent().getVNInfoAt(Def));
      forceRecompute(RegIdx, *Edit->getParent().getVNInfoAt(Def));
    } else {
      SlotIndex Kill = LIS.getInstructionIndex(*MBBI).getRegSlot();
      DEBUG(dbgs() << " move kill to " << Kill << '\t' << *MBBI);
@@ -982,7 +981,7 @@ void SplitEditor::computeRedundantBackCopies(
      }
    }
    if (!DominatedVNIs.empty()) {
      forceRecompute(0, ParentVNI);
      forceRecompute(0, *ParentVNI);
      for (auto VNI : DominatedVNIs) {
        BackCopies.push_back(VNI);
      }
@@ -1102,7 +1101,7 @@ void SplitEditor::hoistCopies() {
        NotToHoistSet.count(ParentVNI->id))
      continue;
    BackCopies.push_back(VNI);
    forceRecompute(0, ParentVNI);
    forceRecompute(0, *ParentVNI);
  }

  // If it is not beneficial to hoist all the BackCopies, simply remove
@@ -1428,6 +1427,41 @@ void SplitEditor::deleteRematVictims() {
  Edit->eliminateDeadDefs(Dead, None, &AA);
}

void SplitEditor::forceRecomputeVNI(const VNInfo &ParentVNI) {
  // Fast-path for common case.
  if (!ParentVNI.isPHIDef()) {
    for (unsigned I = 0, E = Edit->size(); I != E; ++I)
      forceRecompute(I, ParentVNI);
    return;
  }

  // Trace value through phis.
  SmallPtrSet<const VNInfo *, 8> Visited; ///< whether VNI was/is in worklist.
  SmallVector<const VNInfo *, 4> WorkList;
  Visited.insert(&ParentVNI);
  WorkList.push_back(&ParentVNI);

  const LiveInterval &ParentLI = Edit->getParent();
  const SlotIndexes &Indexes = *LIS.getSlotIndexes();
  do {
    const VNInfo &VNI = *WorkList.back();
    WorkList.pop_back();
    for (unsigned I = 0, E = Edit->size(); I != E; ++I)
      forceRecompute(I, VNI);
    if (!VNI.isPHIDef())
      continue;

    MachineBasicBlock &MBB = *Indexes.getMBBFromIndex(VNI.def);
    for (const MachineBasicBlock *Pred : MBB.predecessors()) {
      SlotIndex PredEnd = Indexes.getMBBEndIdx(Pred);
      VNInfo *PredVNI = ParentLI.getVNInfoBefore(PredEnd);
      assert(PredVNI && "Value available in PhiVNI predecessor");
      if (Visited.insert(PredVNI).second)
        WorkList.push_back(PredVNI);
    }
  } while (!WorkList.empty());
}

void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
  ++NumFinished;

@@ -1444,8 +1478,7 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
    // Force rematted values to be recomputed everywhere.
    // The new live ranges may be truncated.
    if (Edit->didRematerialize(ParentVNI))
      for (unsigned i = 0, e = Edit->size(); i != e; ++i)
        forceRecompute(i, ParentVNI);
      forceRecomputeVNI(*ParentVNI);
  }

  // Hoist back-copies to the complement interval when in spill mode.

@@ -357,7 +357,11 @@ private:
  /// recomputed by LiveRangeCalc::extend regardless of the number of defs.
  /// This is used for values whose live range doesn't match RegAssign exactly.
  /// They could have rematerialized, or back-copies may have been moved.
  void forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI);
  void forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI);

  /// Calls forceRecompute() on any affected regidx and on ParentVNI
  /// predecessors in case of a phi definition.
  void forceRecomputeVNI(const VNInfo &ParentVNI);

  /// defFromParent - Define Reg from ParentVNI at UseIdx using either
  /// rematerialization or a COPY from parent. Return the new value.

@@ -75,7 +75,6 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
      Name=="ssse3.pabs.d.128" || // Added in 6.0
      Name.startswith("avx512.mask.shuf.i") || // Added in 6.0
      Name.startswith("avx512.mask.shuf.f") || // Added in 6.0
      Name.startswith("avx512.kunpck") || //added in 6.0
      Name.startswith("avx2.pabs.") || // Added in 6.0
      Name.startswith("avx512.mask.pabs.") || // Added in 6.0
      Name.startswith("avx512.broadcastm") || // Added in 6.0
@@ -1063,12 +1062,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
      Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0));
      Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
                          CI->getArgOperand(1));
    } else if (IsX86 && (Name.startswith("avx512.kunpck"))) {
      uint64_t Shift = CI->getType()->getScalarSizeInBits() / 2;
      uint64_t And = (1ULL << Shift) - 1;
      Value* LowBits = Builder.CreateAnd(CI->getArgOperand(0), And);
      Value* HighBits = Builder.CreateShl(CI->getArgOperand(1), Shift);
      Rep = Builder.CreateOr(LowBits, HighBits);
    } else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) {
      Type *I32Ty = Type::getInt32Ty(C);
      Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0),

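[Editor's sketch, not part of the commit: the replacement IR that UpgradeIntrinsicCall emits above for the retired avx512.kunpck.* intrinsics is a plain mask, shift, and or. Its scalar equivalent for the byte/word variant, with a hypothetical function name:]

  #include <stdint.h>

  // Scalar model of the upgrade for llvm.x86.avx512.kunpck.bw: Shift is
  // half the scalar width, And masks the low half of the first operand,
  // and the second operand supplies the high half.
  static inline uint16_t kunpck_bw_model(uint16_t A, uint16_t B) {
    const unsigned Shift = 16 / 2;            // getScalarSizeInBits() / 2
    const uint16_t And = (uint16_t)((1u << Shift) - 1);
    return (uint16_t)((A & And) | (uint16_t)(B << Shift));
  }
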
@@ -83,27 +83,6 @@ namespace {
typedef std::vector<AsmToken> MCAsmMacroArgument;
typedef std::vector<MCAsmMacroArgument> MCAsmMacroArguments;

struct MCAsmMacroParameter {
  StringRef Name;
  MCAsmMacroArgument Value;
  bool Required = false;
  bool Vararg = false;

  MCAsmMacroParameter() = default;
};

typedef std::vector<MCAsmMacroParameter> MCAsmMacroParameters;

struct MCAsmMacro {
  StringRef Name;
  StringRef Body;
  MCAsmMacroParameters Parameters;

public:
  MCAsmMacro(StringRef N, StringRef B, MCAsmMacroParameters P)
      : Name(N), Body(B), Parameters(std::move(P)) {}
};

/// \brief Helper class for storing information about an active macro
/// instantiation.
struct MacroInstantiation {
@@ -164,9 +143,6 @@ private:
  /// addDirectiveHandler.
  StringMap<ExtensionDirectiveHandler> ExtensionDirectiveMap;

  /// \brief Map of currently defined macros.
  StringMap<MCAsmMacro> MacroMap;

  /// \brief Stack of active macro instantiations.
  std::vector<MacroInstantiation*> ActiveMacros;

@@ -308,17 +284,6 @@ private:
  /// \brief Control a flag in the parser that enables or disables macros.
  void setMacrosEnabled(bool Flag) {MacrosEnabledFlag = Flag;}

  /// \brief Lookup a previously defined macro.
  /// \param Name Macro name.
  /// \returns Pointer to macro. NULL if no such macro was defined.
  const MCAsmMacro* lookupMacro(StringRef Name);

  /// \brief Define a new macro with the given name and information.
  void defineMacro(StringRef Name, MCAsmMacro Macro);

  /// \brief Undefine a macro. If no such macro was defined, it's a no-op.
  void undefineMacro(StringRef Name);

  /// \brief Are we inside a macro instantiation?
  bool isInsideMacroInstantiation() {return !ActiveMacros.empty();}

@@ -1841,7 +1806,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,

  // If macros are enabled, check to see if this is a macro instantiation.
  if (areMacrosEnabled())
    if (const MCAsmMacro *M = lookupMacro(IDVal)) {
    if (const MCAsmMacro *M = getContext().lookupMacro(IDVal)) {
      return handleMacroEntry(M, IDLoc);
    }

@@ -2720,17 +2685,6 @@ bool AsmParser::parseMacroArguments(const MCAsmMacro *M,
  return TokError("too many positional arguments");
}

const MCAsmMacro *AsmParser::lookupMacro(StringRef Name) {
  StringMap<MCAsmMacro>::iterator I = MacroMap.find(Name);
  return (I == MacroMap.end()) ? nullptr : &I->getValue();
}

void AsmParser::defineMacro(StringRef Name, MCAsmMacro Macro) {
  MacroMap.insert(std::make_pair(Name, std::move(Macro)));
}

void AsmParser::undefineMacro(StringRef Name) { MacroMap.erase(Name); }

bool AsmParser::handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
  // Arbitrarily limit macro nesting depth (default matches 'as'). We can
  // eliminate this, although we should protect against infinite loops.
@@ -4249,7 +4203,7 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
    eatToEndOfStatement();
  }

  if (lookupMacro(Name)) {
  if (getContext().lookupMacro(Name)) {
    return Error(DirectiveLoc, "macro '" + Name + "' is already defined");
  }

@@ -4257,7 +4211,7 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
  const char *BodyEnd = EndToken.getLoc().getPointer();
  StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
  checkForBadMacro(DirectiveLoc, Name, Body, Parameters);
  defineMacro(Name, MCAsmMacro(Name, Body, std::move(Parameters)));
  getContext().defineMacro(Name, MCAsmMacro(Name, Body, std::move(Parameters)));
  return false;
}

@@ -4416,10 +4370,10 @@ bool AsmParser::parseDirectivePurgeMacro(SMLoc DirectiveLoc) {
                   "unexpected token in '.purgem' directive"))
    return true;

  if (!lookupMacro(Name))
  if (!getContext().lookupMacro(Name))
    return Error(DirectiveLoc, "macro '" + Name + "' is not defined");

  undefineMacro(Name);
  getContext().undefineMacro(Name);
  return false;
}

@@ -3957,6 +3957,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
  NODE_NAME_CASE(CVT_F32_UBYTE2)
  NODE_NAME_CASE(CVT_F32_UBYTE3)
  NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
  NODE_NAME_CASE(CVT_PKNORM_I16_F32)
  NODE_NAME_CASE(CVT_PKNORM_U16_F32)
  NODE_NAME_CASE(CVT_PK_I16_I32)
  NODE_NAME_CASE(CVT_PK_U16_U32)
  NODE_NAME_CASE(FP_TO_FP16)
  NODE_NAME_CASE(FP16_ZEXT)
  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)

@@ -417,6 +417,10 @@ enum NodeType : unsigned {
  // Convert two float 32 numbers into a single register holding two packed f16
  // with round to zero.
  CVT_PKRTZ_F16_F32,
  CVT_PKNORM_I16_F32,
  CVT_PKNORM_U16_F32,
  CVT_PK_I16_I32,
  CVT_PK_U16_U32,

  // Same as the standard node, except the high bits of the resulting integer
  // are known 0.

@@ -108,3 +108,21 @@ int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {

  return MCOp;
}

// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
bool AMDGPUInstrInfo::isUniformMMO(const MachineMemOperand *MMO) {
  const Value *Ptr = MMO->getValue();
  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  if (!Ptr || isa<UndefValue>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  if (const Argument *Arg = dyn_cast<Argument>(Ptr))
    return AMDGPU::isArgPassedInSGPR(Arg);

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

@@ -50,6 +50,8 @@ public:
  /// Return -1 if the target-specific opcode for the pseudo instruction does
  /// not exist. If Opcode is not a pseudo instruction, this is identity.
  int pseudoToMCOpcode(int Opcode) const;

  static bool isUniformMMO(const MachineMemOperand *MMO);
};
} // End llvm namespace

@@ -35,6 +35,10 @@ def AMDGPUFPPackOp : SDTypeProfile<1, 2,
  [SDTCisFP<1>, SDTCisSameAs<1, 2>]
>;

def AMDGPUIntPackOp : SDTypeProfile<1, 2,
  [SDTCisInt<1>, SDTCisSameAs<1, 2>]
>;

def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
  [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
>;
@@ -142,6 +146,10 @@ def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;

def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
def AMDGPUpknorm_i16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>;
def AMDGPUpknorm_u16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>;
def AMDGPUpk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
def AMDGPUpk_u16_u32 : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;

@@ -120,7 +120,7 @@ static bool isInstrUniform(const MachineInstr &MI) {
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  return AMDGPU::isUniformMMO(MMO);
  return AMDGPUInstrInfo::isUniformMMO(MMO);
}

const RegisterBankInfo::InstructionMapping &

@@ -205,6 +205,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);

  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -1085,7 +1086,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
  const MemSDNode *MemNode = cast<MemSDNode>(N);

  return AMDGPU::isUniformMMO(MemNode->getMemOperand());
  return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
}

TargetLoweringBase::LegalizeTypeAction
@@ -3517,7 +3518,8 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    if (IID == Intrinsic::amdgcn_cvt_pkrtz) {
    switch (IID) {
    case Intrinsic::amdgcn_cvt_pkrtz: {
      SDValue Src0 = N->getOperand(1);
      SDValue Src1 = N->getOperand(2);
      SDLoc SL(N);
@@ -3526,6 +3528,29 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
      return;
    }
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {
      SDValue Src0 = N->getOperand(1);
      SDValue Src1 = N->getOperand(2);
      SDLoc SL(N);
      unsigned Opcode;

      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
        Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
        Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
        Opcode = AMDGPUISD::CVT_PK_I16_I32;
      else
        Opcode = AMDGPUISD::CVT_PK_U16_U32;

      SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
      return;
    }
    }
    break;
  }
  case ISD::SELECT: {
@@ -4424,10 +4449,27 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
  case Intrinsic::amdgcn_ubfe:
    return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_cvt_pkrtz: {
    // FIXME: Stop adding cast if v2f16 legal.
  case Intrinsic::amdgcn_cvt_pkrtz:
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    // FIXME: Stop adding cast if v2f16/v2i16 are legal.
    EVT VT = Op.getValueType();
    SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
    unsigned Opcode;

    if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
      Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
      Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
      Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
      Opcode = AMDGPUISD::CVT_PK_I16_I32;
    else
      Opcode = AMDGPUISD::CVT_PK_U16_U32;

    SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
                               Op.getOperand(1), Op.getOperand(2));
    return DAG.getNode(ISD::BITCAST, DL, VT, Node);
  }

@@ -871,24 +871,6 @@ bool isArgPassedInSGPR(const Argument *A) {
  }
}

// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
bool isUniformMMO(const MachineMemOperand *MMO) {
  const Value *Ptr = MMO->getValue();
  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  if (!Ptr || isa<UndefValue>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  if (const Argument *Arg = dyn_cast<Argument>(Ptr))
    return isArgPassedInSGPR(Arg);

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
  if (isGCN3Encoding(ST))
    return ByteOffset;

@@ -363,7 +363,6 @@ LLVM_READNONE
bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);

bool isArgPassedInSGPR(const Argument *Arg);
bool isUniformMMO(const MachineMemOperand *MMO);

/// \returns The encoding that will be used for \p ByteOffset in the SMRD
/// offset field.

@@ -407,11 +407,11 @@ defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32
defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_i16_f32>;
defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_u16_f32>;
defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>;
defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>>;
defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>>;
defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_u16_u32>;
defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_i16_i32>;

} // End SubtargetPredicate = isGCN

@@ -396,10 +396,14 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,

      // rip-relative addressing is actually relative to the *next* instruction.
      // Since an immediate can follow the mod/rm byte for an instruction, this
      // means that we need to bias the immediate field of the instruction with
      // the size of the immediate field. If we have this case, add it into the
      // means that we need to bias the displacement field of the instruction with
      // the size of the immediate field. If we have this case, add it into the
      // expression to emit.
      int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0;
      // Note: rip-relative addressing using immediate displacement values should
      // not be adjusted, assuming it was the user's intent.
      int ImmSize = !Disp.isImm() && X86II::hasImm(TSFlags)
                        ? X86II::getSizeOfImm(TSFlags)
                        : 0;

      EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind),
                    CurByte, OS, Fixups, -ImmSize);

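[Editor's illustration of the hunk above, not from the patch; the instructions and symbol are hypothetical:]

  // For "cmpl $42, foo(%rip)", the 4-byte immediate 42 sits between the
  // displacement and the end of the instruction, so the fixup for foo
  // must be biased by -4 (RIP points past the immediate). For
  // "cmpl $42, 16(%rip)", the displacement is itself a literal immediate
  // (Disp.isImm()), which the new code deliberately leaves unbiased.
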
@@ -370,6 +370,8 @@ static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
                              char Mode, raw_ostream &O) {
  unsigned Reg = MO.getReg();
  bool EmitPercent = true;

  switch (Mode) {
  default: return true; // Unknown mode.
  case 'b': // Print QImode register
@@ -384,6 +386,9 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
  case 'k': // Print SImode register
    Reg = getX86SubSuperRegister(Reg, 32);
    break;
  case 'V':
    EmitPercent = false;
    LLVM_FALLTHROUGH;
  case 'q':
    // Print 64-bit register names if 64-bit integer registers are available.
    // Otherwise, print 32-bit register names.
@@ -391,7 +396,10 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
    break;
  }

  O << '%' << X86ATTInstPrinter::getRegisterName(Reg);
  if (EmitPercent)
    O << '%';

  O << X86ATTInstPrinter::getRegisterName(Reg);
  return false;
}

@@ -464,6 +472,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
  case 'w': // Print HImode register
  case 'k': // Print SImode register
  case 'q': // Print DImode register
  case 'V': // Print native register without '%'
    if (MO.isReg())
      return printAsmMRegister(*this, MO, ExtraCode[0], O);
    printOperand(*this, MI, OpNo, O);

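[Editor's sketch of what the new 'V' operand modifier enables, not from the patch. The thunk symbol matches the external-thunk names defined elsewhere in this import; the function itself is hypothetical, and this usage pattern is an assumption:]

  // '%V0' prints the native register name with no '%' prefix, so the
  // template can splice the register into a symbol name: with the
  // operand allocated to %rax, this expands to
  // "call __x86_indirect_thunk_rax".
  void call_through_thunk(void (*Fn)(void)) {
    asm volatile("call __x86_indirect_thunk_%V0" : : "r"(Fn) : "memory");
  }
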
@@ -663,8 +663,10 @@ void X86DomainReassignment::initConverters() {
    createReplacer(X86::XOR32rr, X86::KXORDrr);
    createReplacer(X86::XOR64rr, X86::KXORQrr);

    createReplacer(X86::TEST32rr, X86::KTESTDrr);
    createReplacer(X86::TEST64rr, X86::KTESTQrr);
    // TODO: KTEST is not a replacement for TEST due to flag differences. Need
    // to prove only Z flag is used.
    //createReplacer(X86::TEST32rr, X86::KTESTDrr);
    //createReplacer(X86::TEST64rr, X86::KTESTQrr);
  }

  if (STI->hasDQI()) {
@@ -684,8 +686,10 @@ void X86DomainReassignment::initConverters() {
    createReplacer(X86::SHR8ri, X86::KSHIFTRBri);
    createReplacer(X86::SHL8ri, X86::KSHIFTLBri);

    createReplacer(X86::TEST8rr, X86::KTESTBrr);
    createReplacer(X86::TEST16rr, X86::KTESTWrr);
    // TODO: KTEST is not a replacement for TEST due to flag differences. Need
    // to prove only Z flag is used.
    //createReplacer(X86::TEST8rr, X86::KTESTBrr);
    //createReplacer(X86::TEST16rr, X86::KTESTWrr);

    createReplacer(X86::XOR8rr, X86::KXORBrr);
  }

@@ -17017,24 +17017,6 @@ static bool hasNonFlagsUse(SDValue Op) {
  return false;
}

// Emit KTEST instruction for bit vectors on AVX-512
static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
                         const X86Subtarget &Subtarget) {
  if (Op.getOpcode() == ISD::BITCAST) {
    auto hasKTEST = [&](MVT VT) {
      unsigned SizeInBits = VT.getSizeInBits();
      return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
             (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
    };
    SDValue Op0 = Op.getOperand(0);
    MVT Op0VT = Op0.getValueType().getSimpleVT();
    if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
        hasKTEST(Op0VT))
      return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
  }
  return SDValue();
}

/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
@@ -17079,9 +17061,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
    // doing a separate TEST. TEST always sets OF and CF to 0, so unless
    // we prove that the arithmetic won't overflow, we can't use OF or CF.
    if (Op.getResNo() != 0 || NeedOF || NeedCF) {
      // Emit KTEST for bit vectors
      if (auto Node = EmitKTEST(Op, DAG, Subtarget))
        return Node;
      // Emit a CMP with 0, which is the TEST pattern.
      return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                         DAG.getConstant(0, dl, Op.getValueType()));
@@ -17310,10 +17289,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
  }

  if (Opcode == 0) {
    // Emit KTEST for bit vectors
    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
      return Node;

    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
@@ -18093,6 +18068,34 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
  return Result;
}

// Try to select this as a KTEST+SETCC if possible.
static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
                         const SDLoc &dl, SelectionDAG &DAG,
                         const X86Subtarget &Subtarget) {
  // Only support equality comparisons.
  if (CC != ISD::SETEQ && CC != ISD::SETNE)
    return SDValue();

  // Must be a bitcast from vXi1.
  if (Op0.getOpcode() != ISD::BITCAST)
    return SDValue();

  Op0 = Op0.getOperand(0);
  MVT VT = Op0.getSimpleValueType();
  if (!(Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1)) &&
      !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
    return SDValue();

  X86::CondCode X86CC;
  if (isNullConstant(Op1)) {
    X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
  } else
    return SDValue();

  SDValue KTEST = DAG.getNode(X86ISD::KTEST, dl, MVT::i32, Op0, Op0);
  return getSETCC(X86CC, KTEST, dl, DAG);
}

SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

  MVT VT = Op.getSimpleValueType();
@@ -18115,6 +18118,10 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
    return NewSetCC;
  }

  // Try to lower using KTEST.
  if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
    return NewSetCC;

  // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
  // these.
  if ((isOneConstant(Op1) || isNullConstant(Op1)) &&

@@ -20525,6 +20532,18 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
      Mask = DAG.getBitcast(MaskVT, Mask);
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
    }
    case KUNPCK: {
      MVT VT = Op.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);

      SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
      SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
      // Arguments should be swapped.
      SDValue Res = DAG.getNode(IntrData->Opc0, dl,
                                MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
                                Src2, Src1);
      return DAG.getBitcast(VT, Res);
    }
    case MASK_BINOP: {
      MVT VT = Op.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

@@ -27094,28 +27113,57 @@ static unsigned getOpcodeForRetpoline(unsigned RPOpc) {

static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
                                      unsigned Reg) {
  if (Subtarget.useRetpolineExternalThunk()) {
    // When using an external thunk for retpolines, we pick names that match the
    // names GCC happens to use as well. This helps simplify the implementation
    // of the thunks for kernels where they have no easy ability to create
    // aliases and are doing non-trivial configuration of the thunk's body. For
    // example, the Linux kernel will do boot-time hot patching of the thunk
    // bodies and cannot easily export aliases of these to loaded modules.
    //
    // Note that at any point in the future, we may need to change the semantics
    // of how we implement retpolines and at that time will likely change the
    // name of the called thunk. Essentially, there is no hard guarantee that
    // LLVM will generate calls to specific thunks, we merely make a best-effort
    // attempt to help out kernels and other systems where duplicating the
    // thunks is costly.
    switch (Reg) {
    case X86::EAX:
      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
      return "__x86_indirect_thunk_eax";
    case X86::ECX:
      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
      return "__x86_indirect_thunk_ecx";
    case X86::EDX:
      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
      return "__x86_indirect_thunk_edx";
    case X86::EDI:
      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
      return "__x86_indirect_thunk_edi";
    case X86::R11:
      assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
      return "__x86_indirect_thunk_r11";
    }
    llvm_unreachable("unexpected reg for retpoline");
  }

  // When targeting an internal COMDAT thunk use an LLVM-specific name.
  switch (Reg) {
  case 0:
    assert(!Subtarget.is64Bit() && "R11 should always be available on x64");
    return Subtarget.useRetpolineExternalThunk()
               ? "__llvm_external_retpoline_push"
               : "__llvm_retpoline_push";
  case X86::EAX:
    return Subtarget.useRetpolineExternalThunk()
               ? "__llvm_external_retpoline_eax"
               : "__llvm_retpoline_eax";
    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
    return "__llvm_retpoline_eax";
  case X86::ECX:
    return Subtarget.useRetpolineExternalThunk()
               ? "__llvm_external_retpoline_ecx"
               : "__llvm_retpoline_ecx";
    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
    return "__llvm_retpoline_ecx";
  case X86::EDX:
    return Subtarget.useRetpolineExternalThunk()
               ? "__llvm_external_retpoline_edx"
               : "__llvm_retpoline_edx";
    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
    return "__llvm_retpoline_edx";
  case X86::EDI:
    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
    return "__llvm_retpoline_edi";
  case X86::R11:
    return Subtarget.useRetpolineExternalThunk()
               ? "__llvm_external_retpoline_r11"
               : "__llvm_retpoline_r11";
    assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
    return "__llvm_retpoline_r11";
  }
  llvm_unreachable("unexpected reg for retpoline");
}
@@ -27134,15 +27182,13 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
  // just use R11, but we scan for uses anyway to ensure we don't generate
  // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
  // already a register use operand to the call to hold the callee. If none
  // are available, push the callee instead. This is less efficient, but is
  // necessary for functions using 3 regparms. Such function calls are
  // (currently) not eligible for tail call optimization, because there is no
  // scratch register available to hold the address of the callee.
  // are available, use EDI instead. EDI is chosen because EBX is the PIC base
  // register and ESI is the base pointer to realigned stack frames with VLAs.
  SmallVector<unsigned, 3> AvailableRegs;
  if (Subtarget.is64Bit())
    AvailableRegs.push_back(X86::R11);
  else
    AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX});
    AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});

  // Zero out any registers that are already used.
  for (const auto &MO : MI.operands()) {
@@ -27160,30 +27206,18 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
      break;
    }
  }
  if (!AvailableReg)
    report_fatal_error("calling convention incompatible with retpoline, no "
                       "available registers");

  const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);

  if (AvailableReg == 0) {
    // No register available. Use PUSH. This must not be a tailcall, and this
    // must not be x64.
    if (Subtarget.is64Bit())
      report_fatal_error(
          "Cannot make an indirect call on x86-64 using both retpoline and a "
          "calling convention that preservers r11");
    if (Opc != X86::CALLpcrel32)
      report_fatal_error("Cannot make an indirect tail call on x86 using "
                         "retpoline without a preserved register");
    BuildMI(*BB, MI, DL, TII->get(X86::PUSH32r)).addReg(CalleeVReg);
    MI.getOperand(0).ChangeToES(Symbol);
    MI.setDesc(TII->get(Opc));
  } else {
    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
        .addReg(CalleeVReg);
    MI.getOperand(0).ChangeToES(Symbol);
    MI.setDesc(TII->get(Opc));
    MachineInstrBuilder(*BB->getParent(), &MI)
        .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
  }
  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
      .addReg(CalleeVReg);
  MI.getOperand(0).ChangeToES(Symbol);
  MI.setDesc(TII->get(Opc));
  MachineInstrBuilder(*BB->getParent(), &MI)
      .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
  return BB;
}

@@ -30432,53 +30466,6 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
  SDValue N0 = BitCast.getOperand(0);
  EVT VecVT = N0->getValueType(0);

  if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() &&
      N0->getOpcode() == ISD::OR) {
    SDValue Op0 = N0->getOperand(0);
    SDValue Op1 = N0->getOperand(1);
    MVT TrunckVT;
    MVT BitcastVT;
    switch (VT.getSimpleVT().SimpleTy) {
    default:
      return SDValue();
    case MVT::v16i1:
      TrunckVT = MVT::i8;
      BitcastVT = MVT::v8i1;
      break;
    case MVT::v32i1:
      TrunckVT = MVT::i16;
      BitcastVT = MVT::v16i1;
      break;
    case MVT::v64i1:
      TrunckVT = MVT::i32;
      BitcastVT = MVT::v32i1;
      break;
    }
    bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL;
    bool isArg0UndefLeft =
        Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND;
    bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL;
    bool isArg1UndefLeft =
        Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND;
    SDValue OpLeft;
    SDValue OpRight;
    if (isArg0UndefRight && isArg1UndefLeft) {
      OpLeft = Op0;
      OpRight = Op1;
    } else if (isArg1UndefRight && isArg0UndefLeft) {
      OpLeft = Op1;
      OpRight = Op0;
    } else
      return SDValue();
    SDLoc DL(BitCast);
    SDValue Shr = OpLeft->getOperand(0);
    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr);
    SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1);
    SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight);
    SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2);
  }

  if (!VT.isScalarInteger() || !VecVT.isSimple())
    return SDValue();

@@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t {
  COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
  TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
  EXPAND_FROM_MEM,
  TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
  TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
  FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
  ROUNDP, ROUNDS
};
@@ -479,6 +479,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
  X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
  X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0),
  X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0),
  X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
  X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
  X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
  X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0),
  X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
                     X86ISD::FADD_RND),

@ -43,7 +43,7 @@ static const char R11ThunkName[] = "__llvm_retpoline_r11";
|
||||
static const char EAXThunkName[] = "__llvm_retpoline_eax";
|
||||
static const char ECXThunkName[] = "__llvm_retpoline_ecx";
|
||||
static const char EDXThunkName[] = "__llvm_retpoline_edx";
|
||||
static const char PushThunkName[] = "__llvm_retpoline_push";
|
||||
static const char EDIThunkName[] = "__llvm_retpoline_edi";
|
||||
|
||||
namespace {
|
||||
class X86RetpolineThunks : public MachineFunctionPass {
|
||||
@ -74,7 +74,6 @@ private:
|
||||
|
||||
void createThunkFunction(Module &M, StringRef Name);
|
||||
void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg);
|
||||
void insert32BitPushReturnAddrClobber(MachineBasicBlock &MBB);
|
||||
void populateThunk(MachineFunction &MF, Optional<unsigned> Reg = None);
|
||||
};
|
||||
|
||||
@ -127,7 +126,7 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
|
||||
createThunkFunction(M, R11ThunkName);
|
||||
else
|
||||
for (StringRef Name :
|
||||
{EAXThunkName, ECXThunkName, EDXThunkName, PushThunkName})
|
||||
{EAXThunkName, ECXThunkName, EDXThunkName, EDIThunkName})
|
||||
createThunkFunction(M, Name);
|
||||
InsertedThunks = true;
|
||||
return true;
|
||||
@ -151,9 +150,8 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
|
||||
populateThunk(MF, X86::R11);
|
||||
} else {
|
||||
// For 32-bit targets we need to emit a collection of thunks for various
|
||||
// possible scratch registers as well as a fallback that is used when
|
||||
// there are no scratch registers and assumes the retpoline target has
|
||||
// been pushed.
|
||||
// possible scratch registers as well as a fallback that uses EDI, which is
|
||||
// normally callee saved.
|
||||
// __llvm_retpoline_eax:
|
||||
// calll .Leax_call_target
|
||||
// .Leax_capture_spec:
|
||||
@ -174,32 +172,18 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
|
||||
// movl %edx, (%esp)
|
||||
// retl
|
||||
//
|
||||
// This last one is a bit more special and so needs a little extra
|
||||
// handling.
|
||||
// __llvm_retpoline_push:
|
||||
// calll .Lpush_call_target
|
||||
// .Lpush_capture_spec:
|
||||
// pause
|
||||
// lfence
|
||||
// jmp .Lpush_capture_spec
|
||||
// .align 16
|
||||
// .Lpush_call_target:
|
||||
// # Clear pause_loop return address.
|
||||
// addl $4, %esp
|
||||
// # Top of stack words are: Callee, RA. Exchange Callee and RA.
|
||||
// pushl 4(%esp) # Push callee
|
||||
// pushl 4(%esp) # Push RA
|
||||
// popl 8(%esp) # Pop RA to final RA
|
||||
// popl (%esp) # Pop callee to next top of stack
|
||||
// retl # Ret to callee
|
||||
// __llvm_retpoline_edi:
|
||||
// ... # Same setup
|
||||
// movl %edi, (%esp)
|
||||
// retl
|
||||
if (MF.getName() == EAXThunkName)
|
||||
populateThunk(MF, X86::EAX);
|
||||
else if (MF.getName() == ECXThunkName)
|
||||
populateThunk(MF, X86::ECX);
|
||||
else if (MF.getName() == EDXThunkName)
|
||||
populateThunk(MF, X86::EDX);
|
||||
else if (MF.getName() == PushThunkName)
|
||||
populateThunk(MF);
|
||||
else if (MF.getName() == EDIThunkName)
|
||||
populateThunk(MF, X86::EDI);
|
||||
else
|
||||
llvm_unreachable("Invalid thunk name on x86-32!");
|
||||
}
@ -240,31 +224,6 @@ void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB,
      .addReg(Reg);
}

void X86RetpolineThunks::insert32BitPushReturnAddrClobber(
    MachineBasicBlock &MBB) {
  // The instruction sequence we use to replace the return address without
  // a scratch register is somewhat complicated:
  //         # Clear capture_spec from return address.
  //         addl $4, %esp
  //         # Top of stack words are: Callee, RA. Exchange Callee and RA.
  //         pushl 4(%esp)  # Push callee
  //         pushl 4(%esp)  # Push RA
  //         popl 8(%esp)   # Pop RA to final RA
  //         popl (%esp)    # Pop callee to next top of stack
  //         retl           # Ret to callee
  BuildMI(&MBB, DebugLoc(), TII->get(X86::ADD32ri), X86::ESP)
      .addReg(X86::ESP)
      .addImm(4);
  addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP,
               false, 4);
  addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP,
               false, 4);
  addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP,
               false, 8);
  addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP,
               false, 0);
}

void X86RetpolineThunks::populateThunk(MachineFunction &MF,
                                       Optional<unsigned> Reg) {
  // Set MF properties. We never use vregs...
@ -301,11 +260,6 @@ void X86RetpolineThunks::populateThunk(MachineFunction &MF,
  CaptureSpec->addSuccessor(CaptureSpec);

  CallTarget->setAlignment(4);
  if (Reg) {
    insertRegReturnAddrClobber(*CallTarget, *Reg);
  } else {
    assert(!Is64Bit && "We only support non-reg thunks on 32-bit x86!");
    insert32BitPushReturnAddrClobber(*CallTarget);
  }
  insertRegReturnAddrClobber(*CallTarget, *Reg);
  BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
}
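
// For reference, an editorial sketch (not part of this diff) of the full
// 32-bit register thunk that populateThunk produces for the EAX case,
// assembled from the comments above:
//
// __llvm_retpoline_eax:
//         calll .Leax_call_target
// .Leax_capture_spec:
//         pause
//         lfence
//         jmp .Leax_capture_spec
//         .align 16
// .Leax_call_target:
//         movl %eax, (%esp)   # overwrite the speculated return address
//         retl                # "return" to the real target held in %eax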
@ -3264,6 +3264,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II->getArgOperand(0);
    Value *Src1 = II->getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
      return replaceInstUsesWith(*II, UndefValue::get(II->getType()));

    break;
  }
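  // An IR-level illustration of the fold above (an editorial sketch, not
  // taken from this commit): with both operands undef,
  //   %v = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 undef)
  // folds to undef, since packing two undefined values cannot produce
  // anything more defined.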
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll (new file, 84 lines)
@ -0,0 +1,84 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s

; GCN-LABEL: {{^}}s_cvt_pk_i16_i32:
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[X]], [[VY]]
; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[X]], [[VY]]
define amdgpu_kernel void @s_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
  %result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %y)
  %r = bitcast <2 x i16> %result to i32
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}s_cvt_pk_i16_samereg_i32:
; GCN: s_load_dword [[X:s[0-9]+]]
; GCN: v_cvt_pk_i16_i32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
define amdgpu_kernel void @s_cvt_pk_i16_samereg_i32(i32 addrspace(1)* %out, i32 %x) #0 {
  %result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %x)
  %r = bitcast <2 x i16> %result to i32
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pk_i16_i32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[A]], [[B]]
; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @v_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile i32, i32 addrspace(1)* %a.gep
  %b = load volatile i32, i32 addrspace(1)* %b.gep
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %a, i32 %b)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pk_i16_i32_reg_imm:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_pk_i16_i32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1
define amdgpu_kernel void @v_cvt_pk_i16_i32_reg_imm(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile i32, i32 addrspace(1)* %a.gep
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %a, i32 1)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pk_i16_i32_imm_reg:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, 1, [[A]]
; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, 1, [[A]]
define amdgpu_kernel void @v_cvt_pk_i16_i32_imm_reg(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile i32, i32 addrspace(1)* %a.gep
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 1, i32 %a)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll (new file, 84 lines)
@ -0,0 +1,84 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s

; GCN-LABEL: {{^}}s_cvt_pk_u16_u32:
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[X]], [[VY]]
; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[X]], [[VY]]
define amdgpu_kernel void @s_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
  %result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %y)
  %r = bitcast <2 x i16> %result to i32
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}s_cvt_pk_u16_samereg_i32:
; GCN: s_load_dword [[X:s[0-9]+]]
; GCN: v_cvt_pk_u16_u32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
define amdgpu_kernel void @s_cvt_pk_u16_samereg_i32(i32 addrspace(1)* %out, i32 %x) #0 {
  %result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %x)
  %r = bitcast <2 x i16> %result to i32
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pk_u16_u32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[A]], [[B]]
; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @v_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile i32, i32 addrspace(1)* %a.gep
  %b = load volatile i32, i32 addrspace(1)* %b.gep
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %a, i32 %b)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pk_u16_u32_reg_imm:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_pk_u16_u32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1
define amdgpu_kernel void @v_cvt_pk_u16_u32_reg_imm(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile i32, i32 addrspace(1)* %a.gep
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %a, i32 1)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pk_u16_u32_imm_reg:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, 1, [[A]]
; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, 1, [[A]]
define amdgpu_kernel void @v_cvt_pk_u16_u32_imm_reg(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile i32, i32 addrspace(1)* %a.gep
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 1, i32 %a)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll (new file, 164 lines)
@ -0,0 +1,164 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s

; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32:
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[X]], [[VY]]
define amdgpu_kernel void @s_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
  %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %y)
  %r = bitcast <2 x i16> %result to i32
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}s_cvt_pknorm_i16_samereg_f32:
; GCN: s_load_dword [[X:s[0-9]+]]
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
define amdgpu_kernel void @s_cvt_pknorm_i16_samereg_f32(i32 addrspace(1)* %out, float %x) #0 {
  %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %x)
  %r = bitcast <2 x i16> %result to i32
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @v_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %b)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_reg_imm:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0
define amdgpu_kernel void @v_cvt_pknorm_i16_f32_reg_imm(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float 1.0)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_imm_reg:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, 1.0, [[A]]
define amdgpu_kernel void @v_cvt_pknorm_i16_f32_imm_reg(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float 1.0, float %a)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_lo:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]]
define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %neg.a = fsub float -0.0, %a
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.a, float %b)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_hi:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]]
define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %neg.b = fsub float -0.0, %b
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %neg.b)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_lo_hi:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]]
define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %neg.a = fsub float -0.0, %a
  %neg.b = fsub float -0.0, %b
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.a, float %neg.b)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_fabs_lo_fneg_hi:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]]
define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_fabs_lo_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fabs.a = call float @llvm.fabs.f32(float %a)
  %neg.fabs.a = fsub float -0.0, %fabs.a
  %neg.b = fsub float -0.0, %b
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.fabs.a, float %neg.b)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) #1
declare float @llvm.fabs.f32(float) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
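
; Editorial note, not part of the imported test: "%neg.a = fsub float -0.0, %a"
; is the canonical IR spelling of fneg at this LLVM version, and checks such as
; "-|[[A]]|" above verify that the backend folds the fneg/fabs into the
; instruction's neg/abs source modifiers rather than emitting separate bit
; operations on the sign bit.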
test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll (new file, 164 lines)
@ -0,0 +1,164 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s

; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32:
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[X]], [[VY]]
define amdgpu_kernel void @s_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
  %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %y)
  %r = bitcast <2 x i16> %result to i32
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}s_cvt_pknorm_u16_samereg_f32:
; GCN: s_load_dword [[X:s[0-9]+]]
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
define amdgpu_kernel void @s_cvt_pknorm_u16_samereg_f32(i32 addrspace(1)* %out, float %x) #0 {
  %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %x)
  %r = bitcast <2 x i16> %result to i32
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @v_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float %b)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_reg_imm:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0
define amdgpu_kernel void @v_cvt_pknorm_u16_f32_reg_imm(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float 1.0)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_imm_reg:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, 1.0, [[A]]
define amdgpu_kernel void @v_cvt_pknorm_u16_f32_imm_reg(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float 1.0, float %a)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_lo:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]]
define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %neg.a = fsub float -0.0, %a
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.a, float %b)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_hi:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]]
define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %neg.b = fsub float -0.0, %b
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float %neg.b)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_lo_hi:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]]
define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %neg.a = fsub float -0.0, %a
  %neg.b = fsub float -0.0, %b
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.a, float %neg.b)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_fabs_lo_fneg_hi:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]]
define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_fabs_lo_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fabs.a = call float @llvm.fabs.f32(float %a)
  %neg.fabs.a = fsub float -0.0, %fabs.a
  %neg.b = fsub float -0.0, %b
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.fabs.a, float %neg.b)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out.gep
  ret void
}

declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) #1
declare float @llvm.fabs.f32(float) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
@ -1,4 +1,4 @@
; RUN: llc < %s
; RUN: llc -verify-machineinstrs < %s
; PR25838

target triple = "armv7--linux-android"
test/CodeGen/ARM/splitkit.ll (new file, 245 lines)
@ -0,0 +1,245 @@
; RUN: llc -o - %s | FileCheck %s
; Make sure RegAllocGreedy/SplitKit do not produce invalid liveness information
; and crash when splitting a liverange twice and rematerializing each time.
; (Sorry for the testcase; this was run through bugpoint and then manually
; reduced for several hours but is still big...)
target triple = "thumbv7-apple-ios"

%struct.ham = type { %struct.wombat.0 }
%struct.wombat.0 = type { %struct.barney }
%struct.barney = type { %struct.snork.1 }
%struct.snork.1 = type { %struct.wobble.2 }
%struct.wobble.2 = type { %struct.blam }
%struct.blam = type { i32, i32, i8* }
%struct.ham.3 = type { %struct.pluto }
%struct.pluto = type { %struct.zot*, %struct.snork.5, %struct.wibble }
%struct.zot = type { %struct.blam.4* }
%struct.blam.4 = type <{ %struct.zot, %struct.blam.4*, %struct.zot*, i8, [3 x i8] }>
%struct.snork.5 = type { %struct.quux }
%struct.quux = type { %struct.zot }
%struct.wibble = type { %struct.widget }
%struct.widget = type { i32 }
%struct.bar = type { %struct.spam }
%struct.spam = type { %struct.zot*, %struct.wobble, %struct.zot.7 }
%struct.wobble = type { %struct.wibble.6 }
%struct.wibble.6 = type { %struct.zot }
%struct.zot.7 = type { %struct.ham.8 }
%struct.ham.8 = type { i32 }
%struct.hoge = type { %struct.ham, %struct.foo }
%struct.foo = type { float, float }
%struct.wombat = type { %struct.ham, float }
%struct.snork = type { %struct.ham.9, [11 x i8] }
%struct.ham.9 = type { i8 }

@global = external global i8
@global.1 = private constant [20 x i8] c"aaaaaaaaaaaaaaaaaa0\00"
@global.2 = external constant [27 x i8]
@global.3 = external global %struct.ham
@global.4 = external constant [47 x i8]
@global.5 = external constant [61 x i8]
@global.6 = external constant [40 x i8]
@global.7 = external constant [24 x i8]
@global.8 = external constant [20 x i8]
@global.9 = external global %struct.ham
@global.10 = external global %struct.ham
@global.11 = external global %struct.ham
@global.12 = external global %struct.ham
@global.13 = external global %struct.ham
@global.14 = external global %struct.ham
@global.15 = external global %struct.ham
@global.16 = external global %struct.ham
@global.17 = external global %struct.ham
@global.18 = external constant [35 x i8]
@global.19 = external global %struct.ham
@global.20 = external constant [53 x i8]
@global.21 = external global %struct.ham
@global.22 = external global %struct.ham
@global.23 = external global %struct.ham
@global.24 = external constant [32 x i8]
@global.25 = external global %struct.ham
@global.26 = external constant [47 x i8]
@global.27 = external global %struct.ham
@global.28 = external constant [45 x i8]
@global.29 = external global %struct.ham
@global.30 = external global %struct.ham
@global.31 = external constant [24 x i8]
@global.32 = external global %struct.ham
@global.33 = external global %struct.ham
@global.34 = external global %struct.ham
@global.35 = external global %struct.ham
@global.36 = external constant [27 x i8]
@global.37 = external global %struct.ham
@global.38 = external constant [10 x i8]
@global.39 = external global %struct.ham
@global.40 = external global %struct.ham
@global.41 = external global %struct.ham
@global.42 = external global %struct.ham
@global.43 = external global %struct.ham
@global.44 = external constant [41 x i8]
@global.45 = external global %struct.ham
@global.46 = external global %struct.ham
@global.47 = external global %struct.ham
@global.48 = external global %struct.ham
@global.49 = external constant [52 x i8]
@global.50 = external constant [47 x i8]
@global.51 = external global %struct.ham
@global.52 = external global %struct.ham
@global.53 = external global %struct.ham
@global.54 = external global %struct.ham
@global.55 = external global %struct.ham.3
@global.56 = external global %struct.bar
@global.57 = external global i8

declare %struct.ham* @bar(%struct.ham* returned)

declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*)

declare %struct.ham* @wobble(%struct.ham* returned, %struct.ham* )

declare i32 @quux(...)

declare i8* @_Znwm(i32)

declare i32 @wobble.58(%struct.pluto*, [1 x i32], %struct.ham* , %struct.hoge* )

declare i32 @widget(%struct.spam*, [1 x i32], %struct.ham* , %struct.wombat* )

; Just check we didn't crash and did output something...
; CHECK-LABEL: func:
; CHECK: trap
define internal void @func() section "__TEXT,__StaticInit,regular,pure_instructions" personality i32 (...)* @quux {
  %tmp = tail call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.3 to i8*), i8* @global) #0
  %tmp2 = invoke %struct.ham* @wobble(%struct.ham* undef, %struct.ham* @global.9)
          to label %bb14 unwind label %bbunwind

bb14:
  %tmp15 = getelementptr i8, i8* undef, i32 12
  store i8 0, i8* %tmp15
  %tmp16 = icmp eq i8 undef, 0
  br i1 %tmp16, label %bb28, label %bb18

bb18:
  br i1 undef, label %bb21, label %bb29

bb21:
  %tmp22 = call i8* @_Znwm(i32 16)
  store i32 17, i32* getelementptr (%struct.ham, %struct.ham* @global.10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tmp23 = call i8* @_Znwm(i32 32)
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([27 x i8], [27 x i8]* @global.2, i32 0, i32 0), i32 26, i32 1, i1 false)
  store i32 33, i32* getelementptr (%struct.ham, %struct.ham* @global.11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  store i32 23, i32* getelementptr (%struct.ham, %struct.ham* @global.11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([24 x i8], [24 x i8]* @global.7, i32 0, i32 0), i32 23, i32 1, i1 false)
  %tmp24 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.11 to i8*), i8* @global) #0
  store i32 49, i32* getelementptr (%struct.ham, %struct.ham* @global.12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  store i32 37, i32* getelementptr (%struct.ham, %struct.ham* @global.13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
  call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.14 to i8*), i8 0, i32 12, i32 1, i1 false)
  %tmp25 = call i8* @_Znwm(i32 48)
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp25, i8* align 1 getelementptr ([40 x i8], [40 x i8]* @global.6, i32 0, i32 0), i32 39, i32 1, i1 false)
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([47 x i8], [47 x i8]* @global.4, i32 0, i32 0), i32 46, i32 1, i1 false)
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([61 x i8], [61 x i8]* @global.5, i32 0, i32 0), i32 60, i32 1, i1 false)
  %tmp26 = call i8* @_Znwm(i32 48)
  store i32 65, i32* getelementptr (%struct.ham, %struct.ham* @global.15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tmp27 = icmp eq i8 undef, 0
  br i1 %tmp27, label %bb30, label %bb33

bb28:
  call void @llvm.trap()
  unreachable

bb29:
  call void @llvm.trap()
  unreachable

bb30:
  %tmp31 = icmp eq i32 undef, 37
  br i1 %tmp31, label %bb32, label %bb30

bb32:
  store i8 1, i8* @global.57
  br label %bb33

bb33:
  %tmp34 = call i8* @_Znwm(i32 32)
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([20 x i8], [20 x i8]* @global.1, i32 0, i32 0), i32 19, i32 1, i1 false)
  store i32 17, i32* getelementptr (%struct.ham, %struct.ham* @global.16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  store i32 65, i32* getelementptr (%struct.ham, %struct.ham* @global.17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([35 x i8], [35 x i8]* @global.18, i32 0, i32 0), i32 34, i32 1, i1 false)
  store i32 65, i32* getelementptr (%struct.ham, %struct.ham* @global.19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([53 x i8], [53 x i8]* @global.20, i32 0, i32 0), i32 52, i32 1, i1 false)
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([20 x i8], [20 x i8]* @global.8, i32 0, i32 0), i32 19, i32 1, i1 false)
  store i32 37, i32* getelementptr (%struct.ham, %struct.ham* @global.21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
  %tmp35 = call i8* @_Znwm(i32 32)
  store i8 16, i8* bitcast (%struct.ham* @global.22 to i8*)
  %tmp36 = call i8* @_Znwm(i32 32)
  store i32 31, i32* getelementptr (%struct.ham, %struct.ham* @global.23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp36, i8* align 1 getelementptr ([32 x i8], [32 x i8]* @global.24, i32 0, i32 0), i32 31, i32 1, i1 false)
  %tmp37 = getelementptr i8, i8* %tmp36, i32 31
  store i8 0, i8* %tmp37
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([47 x i8], [47 x i8]* @global.26, i32 0, i32 0), i32 46, i32 1, i1 false)
  %tmp38 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.25 to i8*), i8* @global) #0
  %tmp39 = call i8* @_Znwm(i32 48)
  store i32 44, i32* getelementptr (%struct.ham, %struct.ham* @global.27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp39, i8* align 1 getelementptr ([45 x i8], [45 x i8]* @global.28, i32 0, i32 0), i32 44, i32 1, i1 false)
  %tmp40 = getelementptr i8, i8* %tmp39, i32 44
  store i8 0, i8* %tmp40
  call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.29 to i8*), i8 0, i32 12, i32 1, i1 false)
  %tmp41 = call i8* @_Znwm(i32 32)
  store i32 23, i32* getelementptr (%struct.ham, %struct.ham* @global.30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp41, i8* align 1 getelementptr ([24 x i8], [24 x i8]* @global.31, i32 0, i32 0), i32 23, i32 1, i1 false)
  %tmp42 = getelementptr i8, i8* %tmp41, i32 23
  store i8 0, i8* %tmp42
  call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.32 to i8*), i8 0, i32 12, i32 1, i1 false)
  store i8 16, i8* bitcast (%struct.ham* @global.32 to i8*)
  %tmp43 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.33 to i8*), i8* @global) #0
  %tmp44 = call i8* @_Znwm(i32 16)
  call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.34 to i8*), i8 0, i32 12, i32 1, i1 false)
  call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.9 to i8*), i8 0, i32 12, i32 1, i1 false)
  %tmp45 = call i8* @_Znwm(i32 32)
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp45, i8* align 1 getelementptr ([27 x i8], [27 x i8]* @global.36, i32 0, i32 0), i32 26, i32 1, i1 false)
  call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.37 to i8*), i8 0, i32 12, i32 1, i1 false)
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 getelementptr (%struct.snork, %struct.snork* bitcast (%struct.ham* @global.37 to %struct.snork*), i32 0, i32 1, i32 0), i8* align 1 getelementptr ([10 x i8], [10 x i8]* @global.38, i32 0, i32 0), i32 9, i32 1, i1 false)
  store i32 17, i32* getelementptr (%struct.ham, %struct.ham* @global.39, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tmp46 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.40 to i8*), i8* @global) #0
  %tmp47 = call i8* @_Znwm(i32 32)
  %tmp48 = getelementptr i8, i8* %tmp47, i32 21
  store i8 0, i8* %tmp48
  store i32 33, i32* getelementptr (%struct.ham, %struct.ham* @global.41, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  store i32 15, i32* getelementptr (%struct.ham, %struct.ham* @global.42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
  %tmp49 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.43 to i8*), i8* @global) #0
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([41 x i8], [41 x i8]* @global.44, i32 0, i32 0), i32 40, i32 1, i1 false)
  %tmp50 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.45 to i8*), i8* @global) #0
  %tmp51 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.46 to i8*), i8* @global) #0
  %tmp52 = call i8* @_Znwm(i32 32)
  store i8* %tmp52, i8** getelementptr (%struct.ham, %struct.ham* @global.47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2)
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([52 x i8], [52 x i8]* @global.49, i32 0, i32 0), i32 51, i32 1, i1 false)
  %tmp53 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.48 to i8*), i8* @global) #0
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([47 x i8], [47 x i8]* @global.50, i32 0, i32 0), i32 46, i32 1, i1 false)
  store i32 33, i32* getelementptr (%struct.ham, %struct.ham* @global.51, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  store i32 37, i32* getelementptr (%struct.ham, %struct.ham* @global.52, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
  %tmp54 = invoke %struct.ham* @wobble(%struct.ham* undef, %struct.ham* @global.54)
          to label %bb58 unwind label %bbunwind

bb58:
  %tmp59 = invoke i32 @wobble.58(%struct.pluto* getelementptr (%struct.ham.3, %struct.ham.3* @global.55, i32 0, i32 0), [1 x i32] [i32 ptrtoint (%struct.zot* getelementptr (%struct.ham.3, %struct.ham.3* @global.55, i32 0, i32 0, i32 1, i32 0, i32 0) to i32)], %struct.ham* undef, %struct.hoge* undef)
          to label %bb71 unwind label %bbunwind

bb71:
  %tmp72 = invoke i32 @widget(%struct.spam* getelementptr (%struct.bar, %struct.bar* @global.56, i32 0, i32 0), [1 x i32] [i32 ptrtoint (%struct.zot* getelementptr (%struct.bar, %struct.bar* @global.56, i32 0, i32 0, i32 1, i32 0, i32 0) to i32)], %struct.ham* undef, %struct.wombat* undef)
          to label %bb73 unwind label %bbunwind

bb73:
  ret void

bbunwind:
  %tmp75 = landingpad { i8*, i32 }
          cleanup
  resume { i8*, i32 } undef
}

declare void @llvm.trap()

declare void @llvm.memcpy.p0i8.p0i8.i32(i8* , i8* , i32, i32, i1)

declare void @llvm.memset.p0i8.i32(i8* , i8, i32, i32, i1)

attributes #0 = { nounwind }
test/CodeGen/Thumb/stm-scavenging.ll (new file, 46 lines)
@ -0,0 +1,46 @@
; RUN: llc < %s | FileCheck %s
target triple = "thumbv6---gnueabi"

; Use STM to save the three registers
; CHECK-LABEL: use_stm:
; CHECK: .save {r7, lr}
; CHECK: .setfp r7, sp
; CHECK: stm r3!, {r0, r1, r2}
; CHECK: bl throws_1
define void @use_stm(i32 %a, i32 %b, i32 %c, i32* %d) local_unnamed_addr noreturn "no-frame-pointer-elim"="true" {
entry:
  %arrayidx = getelementptr inbounds i32, i32* %d, i32 2
  store i32 %a, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %d, i32 3
  store i32 %b, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %d, i32 4
  store i32 %c, i32* %arrayidx2, align 4
  tail call void @throws_1(i32 %a, i32 %b, i32 %c) noreturn
  unreachable
}

; Don't use STM: there is no available register to store
; the address. We could transform this with some extra math, but
; that currently isn't implemented.
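; A hypothetical sketch of that extra math (assumed, not from this commit):
; since r3 still holds %d after the stores, the base could be advanced and
; then rewound around the stm, e.g.
;   adds r3, r3, #8         @ point at d[2]
;   stm  r3!, {r0, r1, r2}  @ writeback leaves r3 = d + 20
;   subs r3, r3, #20        @ restore r3 = d for the call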
; CHECK-LABEL: no_stm:
; CHECK: .save {r7, lr}
; CHECK: .setfp r7, sp
; CHECK: str r0,
; CHECK: str r1,
; CHECK: str r2,
; CHECK: bl throws_2
define void @no_stm(i32 %a, i32 %b, i32 %c, i32* %d) local_unnamed_addr noreturn "no-frame-pointer-elim"="true" {
entry:
  %arrayidx = getelementptr inbounds i32, i32* %d, i32 2
  store i32 %a, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %d, i32 3
  store i32 %b, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %d, i32 4
  store i32 %c, i32* %arrayidx2, align 4
  tail call void @throws_2(i32 %a, i32 %b, i32 %c, i32* %d) noreturn
  unreachable
}

declare void @throws_1(i32, i32, i32) noreturn
declare void @throws_2(i32, i32, i32, i32*) noreturn
@ -5,59 +5,6 @@
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c

define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 {
; X32-LABEL: test_mm512_kunpackb:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebp
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: .cfi_offset %ebp, -8
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: .cfi_def_cfa_register %ebp
; X32-NEXT: andl $-64, %esp
; X32-NEXT: subl $64, %esp
; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
; X32-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X32-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
; X32-NEXT: kunpckbw %k0, %k1, %k1
; X32-NEXT: vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1}
; X32-NEXT: kmovw %k0, %eax
; X32-NEXT: movzwl %ax, %eax
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_kunpackb:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT: kunpckbw %k0, %k1, %k1
; X64-NEXT: vpcmpneqd %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = icmp ne <16 x i32> %0, %1
  %3 = bitcast <16 x i1> %2 to i16
  %4 = bitcast <8 x i64> %__C to <16 x i32>
  %5 = bitcast <8 x i64> %__D to <16 x i32>
  %6 = icmp ne <16 x i32> %4, %5
  %7 = bitcast <16 x i1> %6 to i16
  %8 = and i16 %7, 255
  %shl.i = shl i16 %3, 8
  %or.i = or i16 %8, %shl.i
  %9 = bitcast <8 x i64> %__E to <16 x i32>
  %10 = bitcast <8 x i64> %__F to <16 x i32>
  %11 = icmp ne <16 x i32> %9, %10
  %12 = bitcast i16 %or.i to <16 x i1>
  %13 = and <16 x i1> %11, %12
  %14 = bitcast <16 x i1> %13 to i16
  ret i16 %14
}

define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
; X32-LABEL: test_mm512_shuffle_f32x4:
; X32: # %bb.0: # %entry
@ -1,20 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s

declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone

define i16 @unpckbw_test(i16 %a0, i16 %a1) {
; CHECK-LABEL: unpckbw_test:
; CHECK: ## %bb.0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: shll $8, %esi
; CHECK-NEXT: orl %esi, %eax
; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
  %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
  ret i16 %res
}

define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
; CHECK: ## %bb.0:
@ -96,6 +96,21 @@ define i16 @test_kor(i16 %a0, i16 %a1) {
  ret i16 %t2
}

declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone

define i16 @unpckbw_test(i16 %a0, i16 %a1) {
; CHECK-LABEL: unpckbw_test:
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: kunpckbw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
  %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
  ret i16 %res
}
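
; For context (an editorial note, not stated in this diff): kunpckbw packs
; the low byte of each source mask into one 16-bit mask, roughly
;   result = ((src_hi & 0xff) << 8) | (src_lo & 0xff)
; The relocated CHECK lines above show the intrinsic now selecting the mask
; instruction directly, where the test removed from avx512-intrinsics.ll had
; checked a movzbl/shll/orl expansion through general-purpose registers.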

declare i16 @llvm.x86.avx512.kxnor.w(i16, i16) nounwind readnone
; TODO: the two kxnor instructions here are a no-op and should be eliminated,
; probably by FoldConstantArithmetic in SelectionDAG.
@ -2775,3 +2775,99 @@ define i8 @test_v8i1_mul(i8 %x, i8 %y) {
  %ret = bitcast <8 x i1> %m2 to i8
  ret i8 %ret
}

; Make sure we don't emit a ktest for signed comparisons.
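; (Explanatory note, not part of the imported test: KTEST only produces ZF
; and CF, so a signed predicate such as the icmp sgt at the end of this
; function cannot be answered from its flags; the mask is moved to a GPR
; and tested with testw instead, as the CHECK lines below expect.)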
define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
; KNL-LABEL: ktest_signed:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rax
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0
; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testw %ax, %ax
; KNL-NEXT: jle LBB63_1
; KNL-NEXT: ## %bb.2: ## %bb.2
; KNL-NEXT: popq %rax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
; KNL-NEXT: LBB63_1: ## %bb.1
; KNL-NEXT: vzeroupper
; KNL-NEXT: callq _foo
; KNL-NEXT: popq %rax
; KNL-NEXT: retq
;
; SKX-LABEL: ktest_signed:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rax
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testw %ax, %ax
; SKX-NEXT: jle LBB63_1
; SKX-NEXT: ## %bb.2: ## %bb.2
; SKX-NEXT: popq %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB63_1: ## %bb.1
; SKX-NEXT: vzeroupper
; SKX-NEXT: callq _foo
; SKX-NEXT: popq %rax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: ktest_signed:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: pushq %rax
; AVX512BW-NEXT: .cfi_def_cfa_offset 16
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: testw %ax, %ax
; AVX512BW-NEXT: jle LBB63_1
; AVX512BW-NEXT: ## %bb.2: ## %bb.2
; AVX512BW-NEXT: popq %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512BW-NEXT: LBB63_1: ## %bb.1
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: callq _foo
; AVX512BW-NEXT: popq %rax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: ktest_signed:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: pushq %rax
; AVX512DQ-NEXT: .cfi_def_cfa_offset 16
; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: testw %ax, %ax
; AVX512DQ-NEXT: jle LBB63_1
; AVX512DQ-NEXT: ## %bb.2: ## %bb.2
; AVX512DQ-NEXT: popq %rax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512DQ-NEXT: LBB63_1: ## %bb.1
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: callq _foo
; AVX512DQ-NEXT: popq %rax
; AVX512DQ-NEXT: retq
  %a = icmp eq <16 x i32> %x, zeroinitializer
  %b = icmp eq <16 x i32> %y, zeroinitializer
  %c = and <16 x i1> %a, %b
  %d = bitcast <16 x i1> %c to i16
  %e = icmp sgt i16 %d, 0
  br i1 %e, label %bb.2, label %bb.1
bb.1:
  call void @foo()
  br label %bb.2
bb.2:
  ret void
}
declare void @foo()
@ -4,117 +4,6 @@
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c

define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
; X32-LABEL: test_mm512_kunpackd:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebp
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: .cfi_offset %ebp, -8
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: .cfi_def_cfa_register %ebp
; X32-NEXT: andl $-64, %esp
; X32-NEXT: subl $64, %esp
; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
; X32-NEXT: vmovdqa64 72(%ebp), %zmm4
; X32-NEXT: vmovdqa64 8(%ebp), %zmm5
; X32-NEXT: vpcmpneqb %zmm0, %zmm1, %k0
; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; X32-NEXT: vpcmpneqb %zmm5, %zmm2, %k0
; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X32-NEXT: kunpckdq %k0, %k1, %k1
; X32-NEXT: vpcmpneqb %zmm3, %zmm4, %k0 {%k1}
; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_kunpackd:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqb %zmm0, %zmm1, %k0
; X64-NEXT: vpcmpneqb %zmm3, %zmm2, %k1
; X64-NEXT: kunpckdq %k0, %k1, %k1
; X64-NEXT: vpcmpneqb %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT: kmovq %k0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__B to <64 x i8>
  %1 = bitcast <8 x i64> %__A to <64 x i8>
  %2 = icmp ne <64 x i8> %0, %1
  %3 = bitcast <64 x i1> %2 to i64
  %4 = bitcast <8 x i64> %__C to <64 x i8>
  %5 = bitcast <8 x i64> %__D to <64 x i8>
  %6 = icmp ne <64 x i8> %4, %5
  %7 = bitcast <64 x i1> %6 to i64
  %and.i = and i64 %7, 4294967295
  %shl.i = shl i64 %3, 32
  %or.i = or i64 %and.i, %shl.i
  %8 = bitcast <8 x i64> %__E to <64 x i8>
  %9 = bitcast <8 x i64> %__F to <64 x i8>
  %10 = icmp ne <64 x i8> %8, %9
  %11 = bitcast i64 %or.i to <64 x i1>
  %12 = and <64 x i1> %10, %11
  %13 = bitcast <64 x i1> %12 to i64
  ret i64 %13
}
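; kunpackd builds a 64-bit mask from two 32-bit halves: the A/B compare
; supplies the high half (%shl.i) and the C/D compare the low half (%and.i);
; the combined mask then predicates the E/F compare.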

define i32 @test_mm512_kunpackw(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
; X32-LABEL: test_mm512_kunpackw:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebp
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: .cfi_offset %ebp, -8
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: .cfi_def_cfa_register %ebp
; X32-NEXT: andl $-64, %esp
; X32-NEXT: subl $64, %esp
; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
; X32-NEXT: vpcmpneqw %zmm0, %zmm1, %k0
; X32-NEXT: vpcmpneqw 8(%ebp), %zmm2, %k1
; X32-NEXT: kunpckwd %k0, %k1, %k1
; X32-NEXT: vpcmpneqw 72(%ebp), %zmm3, %k0 {%k1}
; X32-NEXT: kmovd %k0, %eax
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_kunpackw:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqw %zmm0, %zmm1, %k0
; X64-NEXT: vpcmpneqw %zmm3, %zmm2, %k1
; X64-NEXT: kunpckwd %k0, %k1, %k1
; X64-NEXT: vpcmpneqw %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT: kmovd %k0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__B to <32 x i16>
  %1 = bitcast <8 x i64> %__A to <32 x i16>
  %2 = icmp ne <32 x i16> %0, %1
  %3 = bitcast <32 x i1> %2 to i32
  %4 = bitcast <8 x i64> %__C to <32 x i16>
  %5 = bitcast <8 x i64> %__D to <32 x i16>
  %6 = icmp ne <32 x i16> %4, %5
  %7 = bitcast <32 x i1> %6 to i32
  %and.i = and i32 %7, 65535
  %shl.i = shl i32 %3, 16
  %or.i = or i32 %and.i, %shl.i
  %8 = bitcast <8 x i64> %__E to <32 x i16>
  %9 = bitcast <8 x i64> %__F to <32 x i16>
  %10 = icmp ne <32 x i16> %8, %9
  %11 = bitcast i32 %or.i to <32 x i1>
  %12 = and <32 x i1> %10, %11
  %13 = bitcast <32 x i1> %12 to i32
  ret i32 %13
}

define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A) {
; X32-LABEL: test_mm512_mask_set1_epi8:
; X32: # %bb.0: # %entry
@ -189,46 +78,19 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: movb %ch, %al
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $55, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $9, %k0, %k1
; X32-NEXT: andb $2, %al
; X32-NEXT: shrb %al
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $54, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $10, %k0, %k1
; X32-NEXT: movb %ch, %al
; X32-NEXT: andb $15, %al
; X32-NEXT: movl %eax, %edx
; X32-NEXT: shrb $2, %dl
; X32-NEXT: kmovd %edx, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $53, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $11, %k0, %k1
; X32-NEXT: kmovd %edx, %k3
; X32-NEXT: shrb $3, %al
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $12, %eax
; X32-NEXT: andl $15, %eax
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kmovd %eax, %k4
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $13, %eax
; X32-NEXT: andb $1, %al
; X32-NEXT: kmovd %eax, %k3
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $14, %eax
; X32-NEXT: andl $3, %eax
; X32-NEXT: kmovd %eax, %k4
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $15, %eax
; X32-NEXT: andl $1, %eax
; X32-NEXT: kmovd %eax, %k5
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrl $16, %edx
@ -243,25 +105,52 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kmovd %eax, %k7
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $55, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $9, %k0, %k1
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $54, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $10, %k0, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $53, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $11, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $52, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $12, %k0, %k1
; X32-NEXT: movl %ecx, %esi
; X32-NEXT: shrl $12, %esi
; X32-NEXT: andl $15, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $51, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $13, %k0, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kxorq %k5, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $50, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $14, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: movl %ecx, %esi
; X32-NEXT: shrl $14, %esi
; X32-NEXT: andl $3, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $49, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $15, %k0, %k1
; X32-NEXT: kxorq %k5, %k1, %k1
; X32-NEXT: movl %ecx, %esi
; X32-NEXT: shrl $15, %esi
; X32-NEXT: andl $1, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $48, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@ -494,22 +383,14 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $43, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $12, %esi
; X32-NEXT: andl $15, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $14, %esi
; X32-NEXT: andl $3, %esi
; X32-NEXT: kmovd %esi, %k3
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $15, %esi
; X32-NEXT: andl $1, %esi
; X32-NEXT: kmovd %esi, %k4
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $20, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $44, %k0, %k1
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $12, %esi
; X32-NEXT: andl $15, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $19, %k1, %k1
@ -520,12 +401,20 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kshiftrq $18, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $46, %k0, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $14, %esi
; X32-NEXT: andl $3, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $17, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $47, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $15, %esi
; X32-NEXT: andl $1, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $16, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@ -551,8 +440,8 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $12, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k4
; X32-NEXT: kshiftrq $52, %k4, %k0
; X32-NEXT: kxorq %k0, %k1, %k3
; X32-NEXT: kshiftrq $52, %k3, %k0
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $4, %dl
; X32-NEXT: kmovd %edx, %k1
@ -576,19 +465,19 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: andb $15, %cl
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $2, %dl
; X32-NEXT: kmovd %edx, %k3
; X32-NEXT: kmovd %edx, %k4
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $11, %k5, %k5
; X32-NEXT: kxorq %k4, %k5, %k4
; X32-NEXT: kshiftrq $53, %k4, %k5
; X32-NEXT: kxorq %k3, %k5, %k3
; X32-NEXT: kshiftrq $53, %k3, %k5
; X32-NEXT: kxorq %k6, %k5, %k5
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $10, %k5, %k5
; X32-NEXT: kxorq %k4, %k5, %k5
; X32-NEXT: kshiftrq $54, %k5, %k4
; X32-NEXT: kxorq %k7, %k4, %k6
; X32-NEXT: kxorq %k3, %k5, %k5
; X32-NEXT: kshiftrq $54, %k5, %k3
; X32-NEXT: kxorq %k7, %k3, %k6
; X32-NEXT: shrb $3, %cl
; X32-NEXT: kmovd %ecx, %k4
; X32-NEXT: kmovd %ecx, %k3
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $29, %ecx
; X32-NEXT: andb $1, %cl
@ -603,12 +492,6 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kxorq %k5, %k0, %k0
; X32-NEXT: kshiftrq $56, %k0, %k5
; X32-NEXT: kxorq %k1, %k5, %k1
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $28, %ecx
; X32-NEXT: kmovd %ecx, %k5
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $30, %ecx
; X32-NEXT: kmovd %ecx, %k6
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $7, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@ -618,17 +501,20 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kshiftrq $6, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $58, %k0, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $5, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $59, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $4, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $60, %k0, %k1
; X32-NEXT: kxorq %k5, %k1, %k1
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $28, %ecx
; X32-NEXT: kmovd %ecx, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $3, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@ -638,7 +524,10 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kshiftrq $2, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $62, %k0, %k1
; X32-NEXT: kxorq %k6, %k1, %k1
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $30, %ecx
; X32-NEXT: kmovd %ecx, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: shrl $31, %eax
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
@ -743,46 +632,19 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: movb %ch, %al
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $55, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $9, %k0, %k1
; X32-NEXT: andb $2, %al
; X32-NEXT: shrb %al
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $54, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $10, %k0, %k1
; X32-NEXT: movb %ch, %al
; X32-NEXT: andb $15, %al
; X32-NEXT: movl %eax, %edx
; X32-NEXT: shrb $2, %dl
; X32-NEXT: kmovd %edx, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $53, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $11, %k0, %k1
; X32-NEXT: kmovd %edx, %k3
; X32-NEXT: shrb $3, %al
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $12, %eax
; X32-NEXT: andl $15, %eax
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kmovd %eax, %k4
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $13, %eax
; X32-NEXT: andb $1, %al
; X32-NEXT: kmovd %eax, %k3
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $14, %eax
; X32-NEXT: andl $3, %eax
; X32-NEXT: kmovd %eax, %k4
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $15, %eax
; X32-NEXT: andl $1, %eax
; X32-NEXT: kmovd %eax, %k5
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrl $16, %edx
@ -797,25 +659,52 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kmovd %eax, %k7
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $55, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $9, %k0, %k1
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $54, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $10, %k0, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $53, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $11, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $52, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $12, %k0, %k1
; X32-NEXT: movl %ecx, %esi
; X32-NEXT: shrl $12, %esi
; X32-NEXT: andl $15, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $51, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $13, %k0, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kxorq %k5, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $50, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $14, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: movl %ecx, %esi
; X32-NEXT: shrl $14, %esi
; X32-NEXT: andl $3, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $49, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $15, %k0, %k1
; X32-NEXT: kxorq %k5, %k1, %k1
; X32-NEXT: movl %ecx, %esi
; X32-NEXT: shrl $15, %esi
; X32-NEXT: andl $1, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $48, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@ -1048,22 +937,14 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $43, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $12, %esi
; X32-NEXT: andl $15, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $14, %esi
; X32-NEXT: andl $3, %esi
; X32-NEXT: kmovd %esi, %k3
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $15, %esi
; X32-NEXT: andl $1, %esi
; X32-NEXT: kmovd %esi, %k4
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $20, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $44, %k0, %k1
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $12, %esi
; X32-NEXT: andl $15, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $19, %k1, %k1
@ -1074,12 +955,20 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kshiftrq $18, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $46, %k0, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $14, %esi
; X32-NEXT: andl $3, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $17, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $47, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $15, %esi
; X32-NEXT: andl $1, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $16, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@ -1105,8 +994,8 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $12, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k4
; X32-NEXT: kshiftrq $52, %k4, %k0
; X32-NEXT: kxorq %k0, %k1, %k3
; X32-NEXT: kshiftrq $52, %k3, %k0
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $4, %dl
; X32-NEXT: kmovd %edx, %k1
@ -1130,19 +1019,19 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: andb $15, %cl
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $2, %dl
; X32-NEXT: kmovd %edx, %k3
; X32-NEXT: kmovd %edx, %k4
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $11, %k5, %k5
; X32-NEXT: kxorq %k4, %k5, %k4
; X32-NEXT: kshiftrq $53, %k4, %k5
; X32-NEXT: kxorq %k3, %k5, %k3
; X32-NEXT: kshiftrq $53, %k3, %k5
; X32-NEXT: kxorq %k6, %k5, %k5
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $10, %k5, %k5
; X32-NEXT: kxorq %k4, %k5, %k5
; X32-NEXT: kshiftrq $54, %k5, %k4
; X32-NEXT: kxorq %k7, %k4, %k6
; X32-NEXT: kxorq %k3, %k5, %k5
; X32-NEXT: kshiftrq $54, %k5, %k3
; X32-NEXT: kxorq %k7, %k3, %k6
; X32-NEXT: shrb $3, %cl
; X32-NEXT: kmovd %ecx, %k4
; X32-NEXT: kmovd %ecx, %k3
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $29, %ecx
; X32-NEXT: andb $1, %cl
@ -1157,12 +1046,6 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kxorq %k5, %k0, %k0
; X32-NEXT: kshiftrq $56, %k0, %k5
; X32-NEXT: kxorq %k1, %k5, %k1
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $28, %ecx
; X32-NEXT: kmovd %ecx, %k5
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $30, %ecx
; X32-NEXT: kmovd %ecx, %k6
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $7, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@ -1172,17 +1055,20 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kshiftrq $6, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $58, %k0, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $5, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $59, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $4, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $60, %k0, %k1
; X32-NEXT: kxorq %k5, %k1, %k1
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $28, %ecx
; X32-NEXT: kmovd %ecx, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $3, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@ -1192,7 +1078,10 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kshiftrq $2, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $62, %k0, %k1
; X32-NEXT: kxorq %k6, %k1, %k1
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $30, %ecx
; X32-NEXT: kmovd %ecx, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: shrl $31, %eax
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
@ -2,46 +2,6 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32

declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32)

define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) {
; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: movzwl %di, %eax
; AVX512BW-NEXT: shll $16, %esi
; AVX512BW-NEXT: orl %esi, %eax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: shll $16, %eax
; AVX512F-32-NEXT: orl %ecx, %eax
; AVX512F-32-NEXT: retl
  %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
  ret i32 %res
}
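; kunpck.wd concatenates the low 16 bits of each operand, i.e.
; (%x1 << 16) | (%x0 & 0xffff), which is why it can lower to plain scalar
; integer instructions (movzwl/shll/orl) rather than a mask-register kunpckwd.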

declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64)

define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: movl %edi, %eax
; AVX512BW-NEXT: shlq $32, %rsi
; AVX512BW-NEXT: orq %rsi, %rax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd:
; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: retl
  %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
  ret i64 %res
}

declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)

define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) {
@ -1455,6 +1455,55 @@ define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8>
  ret <8 x i64> %res2
}

declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32)

define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) {
; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: kunpckwd %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckwd %k0, %k1, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: retl
  %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
  ret i32 %res
}

declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64)

define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k0
; AVX512BW-NEXT: kmovq %rsi, %k1
; AVX512BW-NEXT: kunpckdq %k1, %k0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd:
; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: subl $12, %esp
; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
; AVX512F-32-NEXT: retl
  %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
  ret i64 %res
}

declare i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8>)

define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) {
@ -1,22 +1,23 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -run-pass x86-domain-reassignment -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq -o - %s | FileCheck %s
--- |
  ; ModuleID = '../test/CodeGen/X86/gpr-to-mask.ll'
  source_filename = "../test/CodeGen/X86/gpr-to-mask.ll"
  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-unknown-unknown"

  define void @test_fcmp_storefloat(i1 %cond, float* %fptr, float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) #0 {
  entry:
    br i1 %cond, label %if, label %else

  if: ; preds = %entry
    %cmp1 = fcmp oeq float %f3, %f4
    br label %exit

  else: ; preds = %entry
    %cmp2 = fcmp oeq float %f5, %f6
    br label %exit

  exit: ; preds = %else, %if
    %val = phi i1 [ %cmp1, %if ], [ %cmp2, %else ]
    %selected = select i1 %val, float %f1, float %f2
@ -48,14 +49,13 @@
...
---
name: test_fcmp_storefloat
# CHECK-LABEL: name: test_fcmp_storefloat
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
  - { id: 0, class: gr8, preferred-register: '' }
  - { id: 1, class: gr8, preferred-register: '' }
  - { id: 2, class: gr8, preferred-register: '' }
@ -79,7 +79,7 @@ registers:
  - { id: 20, class: fr128, preferred-register: '' }
  - { id: 21, class: fr128, preferred-register: '' }
  - { id: 22, class: fr32x, preferred-register: '' }
liveins:
  - { reg: '%edi', virtual-reg: '%3' }
  - { reg: '%rsi', virtual-reg: '%4' }
  - { reg: '%xmm0', virtual-reg: '%5' }
@ -88,7 +88,7 @@ liveins:
  - { reg: '%xmm3', virtual-reg: '%8' }
  - { reg: '%xmm4', virtual-reg: '%9' }
  - { reg: '%xmm5', virtual-reg: '%10' }
frameInfo:
  isFrameAddressTaken: false
  isReturnAddressTaken: false
  hasStackMap: false
@ -105,14 +105,51 @@ frameInfo:
  hasMustTailInVarArgFunc: false
  savePoint: ''
  restorePoint: ''
fixedStack:
stack:
constants:
body: |
  ; CHECK-LABEL: name: test_fcmp_storefloat
  ; CHECK: bb.0.entry:
  ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
  ; CHECK: liveins: %edi, %rsi, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
  ; CHECK: [[COPY:%[0-9]+]]:fr32x = COPY %xmm5
  ; CHECK: [[COPY1:%[0-9]+]]:fr32x = COPY %xmm4
  ; CHECK: [[COPY2:%[0-9]+]]:fr32x = COPY %xmm3
  ; CHECK: [[COPY3:%[0-9]+]]:fr32x = COPY %xmm2
  ; CHECK: [[COPY4:%[0-9]+]]:fr32x = COPY %xmm1
  ; CHECK: [[COPY5:%[0-9]+]]:vr128x = COPY %xmm0
  ; CHECK: [[COPY6:%[0-9]+]]:gr64 = COPY %rsi
  ; CHECK: [[COPY7:%[0-9]+]]:gr32 = COPY %edi
  ; CHECK: [[COPY8:%[0-9]+]]:gr8 = COPY [[COPY7]].sub_8bit
  ; CHECK: TEST8ri killed [[COPY8]], 1, implicit-def %eflags
  ; CHECK: JE_1 %bb.2, implicit %eflags
  ; CHECK: JMP_1 %bb.1
  ; CHECK: bb.1.if:
  ; CHECK: successors: %bb.3(0x80000000)
  ; CHECK: [[VCMPSSZrr:%[0-9]+]]:vk1 = VCMPSSZrr [[COPY3]], [[COPY2]], 0
  ; CHECK: [[COPY9:%[0-9]+]]:vk32 = COPY [[VCMPSSZrr]]
  ; CHECK: [[COPY10:%[0-9]+]]:vk8 = COPY [[COPY9]]
  ; CHECK: JMP_1 %bb.3
  ; CHECK: bb.2.else:
  ; CHECK: successors: %bb.3(0x80000000)
  ; CHECK: [[VCMPSSZrr1:%[0-9]+]]:vk1 = VCMPSSZrr [[COPY1]], [[COPY]], 0
  ; CHECK: [[COPY11:%[0-9]+]]:vk32 = COPY [[VCMPSSZrr1]]
  ; CHECK: [[COPY12:%[0-9]+]]:vk8 = COPY [[COPY11]]
  ; CHECK: bb.3.exit:
  ; CHECK: [[PHI:%[0-9]+]]:vk8 = PHI [[COPY12]], %bb.2, [[COPY10]], %bb.1
  ; CHECK: [[COPY13:%[0-9]+]]:vk32 = COPY [[PHI]]
  ; CHECK: [[COPY14:%[0-9]+]]:vk1wm = COPY [[COPY13]]
  ; CHECK: [[COPY15:%[0-9]+]]:vr128x = COPY [[COPY4]]
  ; CHECK: [[DEF:%[0-9]+]]:fr128 = IMPLICIT_DEF
  ; CHECK: [[VMOVSSZrrk:%[0-9]+]]:fr128 = VMOVSSZrrk [[COPY15]], killed [[COPY14]], killed [[DEF]], [[COPY5]]
  ; CHECK: [[COPY16:%[0-9]+]]:fr32x = COPY [[VMOVSSZrrk]]
  ; CHECK: VMOVSSZmr [[COPY6]], 1, %noreg, 0, %noreg, killed [[COPY16]] :: (store 4 into %ir.fptr)
  ; CHECK: RET 0
  bb.0.entry:
    successors: %bb.1(0x40000000), %bb.2(0x40000000)
    liveins: %edi, %rsi, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5

    %10 = COPY %xmm5
    %9 = COPY %xmm4
    %8 = COPY %xmm3
@ -125,38 +162,31 @@ body: |
    TEST8ri killed %11, 1, implicit-def %eflags
    JE_1 %bb.2, implicit %eflags
    JMP_1 %bb.1

  bb.1.if:
    successors: %bb.3(0x80000000)

    %14 = VCMPSSZrr %7, %8, 0

    ; check that cross domain copies are replaced with same domain copies.
    ; CHECK: %15:vk32 = COPY %14
    ; CHECK: %0:vk8 = COPY %15

    %15 = COPY %14
    %0 = COPY %15.sub_8bit
    JMP_1 %bb.3

  bb.2.else:
    successors: %bb.3(0x80000000)
    %12 = VCMPSSZrr %9, %10, 0

    ; check that cross domain copies are replaced with same domain copies.
    ; CHECK: %13:vk32 = COPY %12
    ; CHECK: %1:vk8 = COPY %13

    %13 = COPY %12
    %1 = COPY %13.sub_8bit

  bb.3.exit:

    ; check PHI, IMPLICIT_DEF, and INSERT_SUBREG replacers.
    ; CHECK: %2:vk8 = PHI %1, %bb.2, %0, %bb.1
    ; CHECK: %16:vk32 = COPY %2
    ; CHECK: %18:vk1wm = COPY %16

    %2 = PHI %1, %bb.2, %0, %bb.1
    %17 = IMPLICIT_DEF
    %16 = INSERT_SUBREG %17, %2, 1
@ -171,14 +201,13 @@ body: |
...
---
name: test_8bitops
# CHECK-LABEL: name: test_8bitops
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
  - { id: 0, class: gr64, preferred-register: '' }
  - { id: 1, class: vr512, preferred-register: '' }
  - { id: 2, class: vr512, preferred-register: '' }
@ -198,13 +227,13 @@ registers:
  - { id: 16, class: gr8, preferred-register: '' }
  - { id: 17, class: gr8, preferred-register: '' }
  - { id: 18, class: gr8, preferred-register: '' }
liveins:
  - { reg: '%rdi', virtual-reg: '%0' }
  - { reg: '%zmm0', virtual-reg: '%1' }
  - { reg: '%zmm1', virtual-reg: '%2' }
  - { reg: '%zmm2', virtual-reg: '%3' }
  - { reg: '%zmm3', virtual-reg: '%4' }
frameInfo:
  isFrameAddressTaken: false
  isReturnAddressTaken: false
  hasStackMap: false
@ -221,32 +250,50 @@ frameInfo:
  hasMustTailInVarArgFunc: false
  savePoint: ''
  restorePoint: ''
fixedStack:
stack:
constants:
body: |
  ; CHECK-LABEL: name: test_8bitops
  ; CHECK: bb.0:
  ; CHECK: successors: %bb.1(0x80000000)
  ; CHECK: liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3
  ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
  ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
  ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
  ; CHECK: [[COPY3:%[0-9]+]]:vr512 = COPY %zmm2
  ; CHECK: [[COPY4:%[0-9]+]]:vr512 = COPY %zmm3
  ; CHECK: [[VCMPPDZrri:%[0-9]+]]:vk8 = VCMPPDZrri [[COPY3]], [[COPY4]], 0
  ; CHECK: [[COPY5:%[0-9]+]]:vk32 = COPY [[VCMPPDZrri]]
  ; CHECK: [[COPY6:%[0-9]+]]:vk8 = COPY [[COPY5]]
  ; CHECK: [[KSHIFTRBri:%[0-9]+]]:vk8 = KSHIFTRBri [[COPY6]], 2
  ; CHECK: [[KSHIFTLBri:%[0-9]+]]:vk8 = KSHIFTLBri [[KSHIFTRBri]], 1
  ; CHECK: [[KNOTBrr:%[0-9]+]]:vk8 = KNOTBrr [[KSHIFTLBri]]
  ; CHECK: [[KORBrr:%[0-9]+]]:vk8 = KORBrr [[KNOTBrr]], [[KSHIFTRBri]]
  ; CHECK: [[KANDBrr:%[0-9]+]]:vk8 = KANDBrr [[KORBrr]], [[KSHIFTLBri]]
  ; CHECK: [[KXORBrr:%[0-9]+]]:vk8 = KXORBrr [[KANDBrr]], [[KSHIFTRBri]]
  ; CHECK: [[KADDBrr:%[0-9]+]]:vk8 = KADDBrr [[KXORBrr]], [[KNOTBrr]]
  ; CHECK: [[COPY7:%[0-9]+]]:vk32 = COPY [[KADDBrr]]
  ; CHECK: [[COPY8:%[0-9]+]]:vk8wm = COPY [[COPY7]]
  ; CHECK: [[VMOVAPDZrrk:%[0-9]+]]:vr512 = VMOVAPDZrrk [[COPY2]], killed [[COPY8]], [[COPY1]]
  ; CHECK: VMOVAPDZmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVAPDZrrk]]
  ; CHECK: bb.1:
  ; CHECK: successors: %bb.2(0x80000000)
  ; CHECK: bb.2:
  ; CHECK: RET 0
  bb.0:
    liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3

    %0 = COPY %rdi
    %1 = COPY %zmm0
    %2 = COPY %zmm1
    %3 = COPY %zmm2
    %4 = COPY %zmm3

    %5 = VCMPPDZrri %3, %4, 0
    ; CHECK: %6:vk32 = COPY %5
    ; CHECK: %7:vk8 = COPY %6
    %6 = COPY %5
    %7 = COPY %6.sub_8bit

    ; CHECK: %12:vk8 = KSHIFTRBri %7, 2
    ; CHECK: %13:vk8 = KSHIFTLBri %12, 1
    ; CHECK: %14:vk8 = KNOTBrr %13
    ; CHECK: %15:vk8 = KORBrr %14, %12
    ; CHECK: %16:vk8 = KANDBrr %15, %13
    ; CHECK: %17:vk8 = KXORBrr %16, %12
    ; CHECK: %18:vk8 = KADDBrr %17, %14
    %12 = SHR8ri %7, 2, implicit-def dead %eflags
    %13 = SHL8ri %12, 1, implicit-def dead %eflags
    %14 = NOT8r %13
@ -254,19 +301,17 @@ body: |
|
    %16 = AND8rr %15, %13, implicit-def dead %eflags
    %17 = XOR8rr %16, %12, implicit-def dead %eflags
    %18 = ADD8rr %17, %14, implicit-def dead %eflags

    ; CHECK: %9:vk32 = COPY %18
    ; CHECK: %10:vk8wm = COPY %9

    %8 = IMPLICIT_DEF
    %9 = INSERT_SUBREG %8, %18, 1
    %10 = COPY %9
    %11 = VMOVAPDZrrk %2, killed %10, %1
    VMOVAPDZmr %0, 1, %noreg, 0, %noreg, killed %11

    ; CHECK: KTESTBrr %18, %18, implicit-def %eflags
    TEST8rr %18, %18, implicit-def %eflags
    JE_1 %bb.1, implicit %eflags
    JMP_1 %bb.2
    ; FIXME We can't replace TEST with KTEST due to flag differences
    ; TEST8rr %18, %18, implicit-def %eflags
    ; JE_1 %bb.1, implicit %eflags
    ; JMP_1 %bb.2
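    ; (TEST defines SF/ZF/PF from its result, while KTEST only defines ZF and
    ; CF, so a condition that reads any other flag would observe different
    ; values after the substitution.)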

  bb.1:
@ -276,14 +321,13 @@ body: |
...
---
name: test_16bitops
# CHECK-LABEL: name: test_16bitops
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
  - { id: 0, class: gr64, preferred-register: '' }
  - { id: 1, class: vr512, preferred-register: '' }
  - { id: 2, class: vr512, preferred-register: '' }
@ -302,13 +346,13 @@ registers:
  - { id: 15, class: gr16, preferred-register: '' }
  - { id: 16, class: gr16, preferred-register: '' }
  - { id: 17, class: gr16, preferred-register: '' }
liveins:
  - { reg: '%rdi', virtual-reg: '%0' }
  - { reg: '%zmm0', virtual-reg: '%1' }
  - { reg: '%zmm1', virtual-reg: '%2' }
  - { reg: '%zmm2', virtual-reg: '%3' }
  - { reg: '%zmm3', virtual-reg: '%4' }
frameInfo:
  isFrameAddressTaken: false
  isReturnAddressTaken: false
  hasStackMap: false
@ -325,50 +369,66 @@ frameInfo:
  hasMustTailInVarArgFunc: false
  savePoint: ''
  restorePoint: ''
fixedStack:
stack:
constants:
body: |
  ; CHECK-LABEL: name: test_16bitops
  ; CHECK: bb.0:
  ; CHECK: successors: %bb.1(0x80000000)
  ; CHECK: liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3
  ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
  ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
  ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
  ; CHECK: [[COPY3:%[0-9]+]]:vr512 = COPY %zmm2
  ; CHECK: [[COPY4:%[0-9]+]]:vr512 = COPY %zmm3
  ; CHECK: [[VCMPPSZrri:%[0-9]+]]:vk16 = VCMPPSZrri [[COPY3]], [[COPY4]], 0
  ; CHECK: [[COPY5:%[0-9]+]]:vk32 = COPY [[VCMPPSZrri]]
  ; CHECK: [[COPY6:%[0-9]+]]:vk16 = COPY [[COPY5]]
  ; CHECK: [[KSHIFTRWri:%[0-9]+]]:vk16 = KSHIFTRWri [[COPY6]], 2
  ; CHECK: [[KSHIFTLWri:%[0-9]+]]:vk16 = KSHIFTLWri [[KSHIFTRWri]], 1
  ; CHECK: [[KNOTWrr:%[0-9]+]]:vk16 = KNOTWrr [[KSHIFTLWri]]
  ; CHECK: [[KORWrr:%[0-9]+]]:vk16 = KORWrr [[KNOTWrr]], [[KSHIFTRWri]]
  ; CHECK: [[KANDWrr:%[0-9]+]]:vk16 = KANDWrr [[KORWrr]], [[KSHIFTLWri]]
  ; CHECK: [[KXORWrr:%[0-9]+]]:vk16 = KXORWrr [[KANDWrr]], [[KSHIFTRWri]]
  ; CHECK: [[COPY7:%[0-9]+]]:vk32 = COPY [[KXORWrr]]
  ; CHECK: [[COPY8:%[0-9]+]]:vk16wm = COPY [[COPY7]]
  ; CHECK: [[VMOVAPSZrrk:%[0-9]+]]:vr512 = VMOVAPSZrrk [[COPY2]], killed [[COPY8]], [[COPY1]]
  ; CHECK: VMOVAPSZmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVAPSZrrk]]
  ; CHECK: bb.1:
  ; CHECK: successors: %bb.2(0x80000000)
  ; CHECK: bb.2:
  ; CHECK: RET 0
  bb.0:
    liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3

    %0 = COPY %rdi
    %1 = COPY %zmm0
    %2 = COPY %zmm1
    %3 = COPY %zmm2
    %4 = COPY %zmm3

    %5 = VCMPPSZrri %3, %4, 0
    ; CHECK: %6:vk32 = COPY %5
    ; CHECK: %7:vk16 = COPY %6
    %6 = COPY %5
    %7 = COPY %6.sub_16bit

    ; CHECK: %12:vk16 = KSHIFTRWri %7, 2
    ; CHECK: %13:vk16 = KSHIFTLWri %12, 1
    ; CHECK: %14:vk16 = KNOTWrr %13
    ; CHECK: %15:vk16 = KORWrr %14, %12
    ; CHECK: %16:vk16 = KANDWrr %15, %13
    ; CHECK: %17:vk16 = KXORWrr %16, %12
    %12 = SHR16ri %7, 2, implicit-def dead %eflags
    %13 = SHL16ri %12, 1, implicit-def dead %eflags
    %14 = NOT16r %13
    %15 = OR16rr %14, %12, implicit-def dead %eflags
    %16 = AND16rr %15, %13, implicit-def dead %eflags
    %17 = XOR16rr %16, %12, implicit-def dead %eflags

    ; CHECK: %9:vk32 = COPY %17
    ; CHECK: %10:vk16wm = COPY %9

    %8 = IMPLICIT_DEF
    %9 = INSERT_SUBREG %8, %17, 3
    %10 = COPY %9
    %11 = VMOVAPSZrrk %2, killed %10, %1
    VMOVAPSZmr %0, 1, %noreg, 0, %noreg, killed %11

    ; CHECK: KTESTWrr %17, %17, implicit-def %eflags
    TEST16rr %17, %17, implicit-def %eflags
    JE_1 %bb.1, implicit %eflags
    JMP_1 %bb.2
    ; FIXME We can't replace TEST with KTEST due to flag differences
    ; TEST16rr %17, %17, implicit-def %eflags
    ; JE_1 %bb.1, implicit %eflags
    ; JMP_1 %bb.2

  bb.1:
@ -378,14 +438,13 @@ body: |
...
---
name: test_32bitops
# CHECK-LABEL: name: test_32bitops
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
  - { id: 0, class: gr64, preferred-register: '' }
  - { id: 1, class: vr512, preferred-register: '' }
  - { id: 2, class: vr512, preferred-register: '' }
@ -400,11 +459,11 @@ registers:
  - { id: 11, class: gr32, preferred-register: '' }
  - { id: 12, class: gr32, preferred-register: '' }
  - { id: 13, class: gr32, preferred-register: '' }
liveins:
  - { reg: '%rdi', virtual-reg: '%0' }
  - { reg: '%zmm0', virtual-reg: '%1' }
  - { reg: '%zmm1', virtual-reg: '%2' }
frameInfo:
  isFrameAddressTaken: false
  isReturnAddressTaken: false
  hasStackMap: false
@ -421,26 +480,40 @@ frameInfo:
  hasMustTailInVarArgFunc: false
  savePoint: ''
  restorePoint: ''
fixedStack:
stack:
constants:
body: |
  ; CHECK-LABEL: name: test_32bitops
  ; CHECK: bb.0:
  ; CHECK: successors: %bb.1(0x80000000)
  ; CHECK: liveins: %rdi, %zmm0, %zmm1
  ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
  ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
  ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
  ; CHECK: [[KMOVDkm:%[0-9]+]]:vk32 = KMOVDkm [[COPY]], 1, %noreg, 0, %noreg
  ; CHECK: [[KSHIFTRDri:%[0-9]+]]:vk32 = KSHIFTRDri [[KMOVDkm]], 2
  ; CHECK: [[KSHIFTLDri:%[0-9]+]]:vk32 = KSHIFTLDri [[KSHIFTRDri]], 1
  ; CHECK: [[KNOTDrr:%[0-9]+]]:vk32 = KNOTDrr [[KSHIFTLDri]]
  ; CHECK: [[KORDrr:%[0-9]+]]:vk32 = KORDrr [[KNOTDrr]], [[KSHIFTRDri]]
  ; CHECK: [[KANDDrr:%[0-9]+]]:vk32 = KANDDrr [[KORDrr]], [[KSHIFTLDri]]
  ; CHECK: [[KXORDrr:%[0-9]+]]:vk32 = KXORDrr [[KANDDrr]], [[KSHIFTRDri]]
  ; CHECK: [[KANDNDrr:%[0-9]+]]:vk32 = KANDNDrr [[KXORDrr]], [[KORDrr]]
  ; CHECK: [[KADDDrr:%[0-9]+]]:vk32 = KADDDrr [[KANDNDrr]], [[KXORDrr]]
  ; CHECK: [[COPY3:%[0-9]+]]:vk32wm = COPY [[KADDDrr]]
  ; CHECK: [[VMOVDQU16Zrrk:%[0-9]+]]:vr512 = VMOVDQU16Zrrk [[COPY2]], killed [[COPY3]], [[COPY1]]
  ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU16Zrrk]]
  ; CHECK: bb.1:
  ; CHECK: successors: %bb.2(0x80000000)
  ; CHECK: bb.2:
  ; CHECK: RET 0
  bb.0:
    liveins: %rdi, %zmm0, %zmm1

    %0 = COPY %rdi
    %1 = COPY %zmm0
    %2 = COPY %zmm1

    ; CHECK: %5:vk32 = KMOVDkm %0, 1, %noreg, 0, %noreg
    ; CHECK: %6:vk32 = KSHIFTRDri %5, 2
    ; CHECK: %7:vk32 = KSHIFTLDri %6, 1
    ; CHECK: %8:vk32 = KNOTDrr %7
    ; CHECK: %9:vk32 = KORDrr %8, %6
    ; CHECK: %10:vk32 = KANDDrr %9, %7
    ; CHECK: %11:vk32 = KXORDrr %10, %6
    ; CHECK: %12:vk32 = KANDNDrr %11, %9
    ; CHECK: %13:vk32 = KADDDrr %12, %11

    %5 = MOV32rm %0, 1, %noreg, 0, %noreg
    %6 = SHR32ri %5, 2, implicit-def dead %eflags
    %7 = SHL32ri %6, 1, implicit-def dead %eflags
@ -450,16 +523,15 @@ body: |
    %11 = XOR32rr %10, %6, implicit-def dead %eflags
    %12 = ANDN32rr %11, %9, implicit-def dead %eflags
    %13 = ADD32rr %12, %11, implicit-def dead %eflags

    ; CHECK: %3:vk32wm = COPY %13

    %3 = COPY %13
    %4 = VMOVDQU16Zrrk %2, killed %3, %1
    VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4

    ; CHECK: KTESTDrr %13, %13, implicit-def %eflags
    TEST32rr %13, %13, implicit-def %eflags
    JE_1 %bb.1, implicit %eflags
    JMP_1 %bb.2
    ; FIXME We can't replace TEST with KTEST due to flag differences
    ; TEST32rr %13, %13, implicit-def %eflags
    ; JE_1 %bb.1, implicit %eflags
    ; JMP_1 %bb.2

  bb.1:
@ -469,14 +541,13 @@ body: |
...
---
name: test_64bitops
# CHECK-LABEL: name: test_64bitops
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
  - { id: 0, class: gr64, preferred-register: '' }
  - { id: 1, class: vr512, preferred-register: '' }
  - { id: 2, class: vr512, preferred-register: '' }
@ -491,11 +562,11 @@ registers:
  - { id: 11, class: gr64, preferred-register: '' }
  - { id: 12, class: gr64, preferred-register: '' }
  - { id: 13, class: gr64, preferred-register: '' }
liveins:
  - { reg: '%rdi', virtual-reg: '%0' }
  - { reg: '%zmm0', virtual-reg: '%1' }
  - { reg: '%zmm1', virtual-reg: '%2' }
frameInfo:
  isFrameAddressTaken: false
  isReturnAddressTaken: false
  hasStackMap: false
@ -512,26 +583,40 @@ frameInfo:
  hasMustTailInVarArgFunc: false
  savePoint: ''
  restorePoint: ''
fixedStack:
stack:
constants:
body: |
  ; CHECK-LABEL: name: test_64bitops
  ; CHECK: bb.0:
  ; CHECK: successors: %bb.1(0x80000000)
  ; CHECK: liveins: %rdi, %zmm0, %zmm1
  ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
  ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
  ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
  ; CHECK: [[KMOVQkm:%[0-9]+]]:vk64 = KMOVQkm [[COPY]], 1, %noreg, 0, %noreg
  ; CHECK: [[KSHIFTRQri:%[0-9]+]]:vk64 = KSHIFTRQri [[KMOVQkm]], 2
  ; CHECK: [[KSHIFTLQri:%[0-9]+]]:vk64 = KSHIFTLQri [[KSHIFTRQri]], 1
  ; CHECK: [[KNOTQrr:%[0-9]+]]:vk64 = KNOTQrr [[KSHIFTLQri]]
  ; CHECK: [[KORQrr:%[0-9]+]]:vk64 = KORQrr [[KNOTQrr]], [[KSHIFTRQri]]
  ; CHECK: [[KANDQrr:%[0-9]+]]:vk64 = KANDQrr [[KORQrr]], [[KSHIFTLQri]]
  ; CHECK: [[KXORQrr:%[0-9]+]]:vk64 = KXORQrr [[KANDQrr]], [[KSHIFTRQri]]
  ; CHECK: [[KANDNQrr:%[0-9]+]]:vk64 = KANDNQrr [[KXORQrr]], [[KORQrr]]
  ; CHECK: [[KADDQrr:%[0-9]+]]:vk64 = KADDQrr [[KANDNQrr]], [[KXORQrr]]
  ; CHECK: [[COPY3:%[0-9]+]]:vk64wm = COPY [[KADDQrr]]
  ; CHECK: [[VMOVDQU8Zrrk:%[0-9]+]]:vr512 = VMOVDQU8Zrrk [[COPY2]], killed [[COPY3]], [[COPY1]]
  ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU8Zrrk]]
  ; CHECK: bb.1:
  ; CHECK: successors: %bb.2(0x80000000)
  ; CHECK: bb.2:
  ; CHECK: RET 0
  bb.0:
    liveins: %rdi, %zmm0, %zmm1

    %0 = COPY %rdi
    %1 = COPY %zmm0
    %2 = COPY %zmm1

    ; CHECK: %5:vk64 = KMOVQkm %0, 1, %noreg, 0, %noreg
    ; CHECK: %6:vk64 = KSHIFTRQri %5, 2
    ; CHECK: %7:vk64 = KSHIFTLQri %6, 1
    ; CHECK: %8:vk64 = KNOTQrr %7
    ; CHECK: %9:vk64 = KORQrr %8, %6
    ; CHECK: %10:vk64 = KANDQrr %9, %7
    ; CHECK: %11:vk64 = KXORQrr %10, %6
    ; CHECK: %12:vk64 = KANDNQrr %11, %9
    ; CHECK: %13:vk64 = KADDQrr %12, %11

    %5 = MOV64rm %0, 1, %noreg, 0, %noreg
    %6 = SHR64ri %5, 2, implicit-def dead %eflags
    %7 = SHL64ri %6, 1, implicit-def dead %eflags
@ -541,16 +626,15 @@ body: |
    %11 = XOR64rr %10, %6, implicit-def dead %eflags
    %12 = ANDN64rr %11, %9, implicit-def dead %eflags
    %13 = ADD64rr %12, %11, implicit-def dead %eflags

    ; CHECK: %3:vk64wm = COPY %13

    %3 = COPY %13
    %4 = VMOVDQU8Zrrk %2, killed %3, %1
    VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4

    ; CHECK: KTESTQrr %13, %13, implicit-def %eflags
    TEST64rr %13, %13, implicit-def %eflags
    JE_1 %bb.1, implicit %eflags
    JMP_1 %bb.2
    ; FIXME We can't replace TEST with KTEST due to flag differences
    ; TEST64rr %13, %13, implicit-def %eflags
    ; JE_1 %bb.1, implicit %eflags
    ; JMP_1 %bb.2

  bb.1:
@ -560,14 +644,13 @@ body: |
...
---
name: test_16bitext
# CHECK-LABEL: name: test_16bitext
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
  - { id: 0, class: gr64, preferred-register: '' }
  - { id: 1, class: vr512, preferred-register: '' }
  - { id: 2, class: vr512, preferred-register: '' }
@ -575,11 +658,11 @@ registers:
  - { id: 4, class: vr512, preferred-register: '' }
  - { id: 5, class: gr16, preferred-register: '' }
  - { id: 6, class: gr16, preferred-register: '' }
liveins:
  - { reg: '%rdi', virtual-reg: '%0' }
  - { reg: '%zmm0', virtual-reg: '%1' }
  - { reg: '%zmm1', virtual-reg: '%2' }
frameInfo:
  isFrameAddressTaken: false
  isReturnAddressTaken: false
  hasStackMap: false
@ -596,24 +679,32 @@ frameInfo:
  hasMustTailInVarArgFunc: false
  savePoint: ''
  restorePoint: ''
fixedStack:
stack:
constants:
body: |
  bb.0:
    liveins: %rdi, %zmm0, %zmm1

    ; CHECK-LABEL: name: test_16bitext
    ; CHECK: liveins: %rdi, %zmm0, %zmm1
    ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
    ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
    ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
    ; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, %noreg, 0, %noreg
    ; CHECK: [[COPY3:%[0-9]+]]:vk16 = COPY [[KMOVBkm]]
    ; CHECK: [[KNOTWrr:%[0-9]+]]:vk16 = KNOTWrr [[COPY3]]
    ; CHECK: [[COPY4:%[0-9]+]]:vk16wm = COPY [[KNOTWrr]]
    ; CHECK: [[VMOVAPSZrrk:%[0-9]+]]:vr512 = VMOVAPSZrrk [[COPY2]], killed [[COPY4]], [[COPY1]]
    ; CHECK: VMOVAPSZmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVAPSZrrk]]
    ; CHECK: RET 0
    %0 = COPY %rdi
    %1 = COPY %zmm0
    %2 = COPY %zmm1

    ; CHECK: %7:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg
    ; CHECK: %5:vk16 = COPY %7
    ; CHECK: %6:vk16 = KNOTWrr %5

    %5 = MOVZX16rm8 %0, 1, %noreg, 0, %noreg
    %6 = NOT16r %5

    ; CHECK: %3:vk16wm = COPY %6
    %3 = COPY %6
    %4 = VMOVAPSZrrk %2, killed %3, %1
    VMOVAPSZmr %0, 1, %noreg, 0, %noreg, killed %4
@ -622,14 +713,13 @@ body: |
...
---
name: test_32bitext
# CHECK-LABEL: name: test_32bitext
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
  - { id: 0, class: gr64, preferred-register: '' }
  - { id: 1, class: vr512, preferred-register: '' }
  - { id: 2, class: vr512, preferred-register: '' }
@ -638,11 +728,11 @@ registers:
  - { id: 5, class: gr32, preferred-register: '' }
  - { id: 6, class: gr32, preferred-register: '' }
  - { id: 7, class: gr32, preferred-register: '' }
liveins:
  - { reg: '%rdi', virtual-reg: '%0' }
  - { reg: '%zmm0', virtual-reg: '%1' }
  - { reg: '%zmm1', virtual-reg: '%2' }
frameInfo:
  isFrameAddressTaken: false
  isReturnAddressTaken: false
  hasStackMap: false
@ -659,27 +749,35 @@ frameInfo:
  hasMustTailInVarArgFunc: false
  savePoint: ''
  restorePoint: ''
fixedStack:
stack:
constants:
body: |
  bb.0:
    liveins: %rdi, %zmm0, %zmm1

    ; CHECK-LABEL: name: test_32bitext
    ; CHECK: liveins: %rdi, %zmm0, %zmm1
    ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
    ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
    ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
    ; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, %noreg, 0, %noreg
    ; CHECK: [[COPY3:%[0-9]+]]:vk32 = COPY [[KMOVBkm]]
    ; CHECK: [[KMOVWkm:%[0-9]+]]:vk16 = KMOVWkm [[COPY]], 1, %noreg, 0, %noreg
    ; CHECK: [[COPY4:%[0-9]+]]:vk32 = COPY [[KMOVWkm]]
    ; CHECK: [[KADDDrr:%[0-9]+]]:vk32 = KADDDrr [[COPY3]], [[COPY4]]
    ; CHECK: [[COPY5:%[0-9]+]]:vk64wm = COPY [[KADDDrr]]
    ; CHECK: [[VMOVDQU16Zrrk:%[0-9]+]]:vr512 = VMOVDQU16Zrrk [[COPY2]], killed [[COPY5]], [[COPY1]]
    ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU16Zrrk]]
    ; CHECK: RET 0
    %0 = COPY %rdi
    %1 = COPY %zmm0
    %2 = COPY %zmm1

    ; CHECK: %8:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg
    ; CHECK: %5:vk32 = COPY %8
    ; CHECK: %9:vk16 = KMOVWkm %0, 1, %noreg, 0, %noreg
    ; CHECK: %6:vk32 = COPY %9
    ; CHECK: %7:vk32 = KADDDrr %5, %6

    %5 = MOVZX32rm8 %0, 1, %noreg, 0, %noreg
    %6 = MOVZX32rm16 %0, 1, %noreg, 0, %noreg
    %7 = ADD32rr %5, %6, implicit-def dead %eflags

    ; CHECK: %3:vk64wm = COPY %7
    %3 = COPY %7
    %4 = VMOVDQU16Zrrk %2, killed %3, %1
    VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4
|
...
---
name: test_64bitext
# CHECK-LABEL: name: test_64bitext
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
  - { id: 0, class: gr64, preferred-register: '' }
  - { id: 1, class: vr512, preferred-register: '' }
  - { id: 2, class: vr512, preferred-register: '' }
@ -704,11 +801,11 @@ registers:
|
  - { id: 5, class: gr64, preferred-register: '' }
  - { id: 6, class: gr64, preferred-register: '' }
  - { id: 7, class: gr64, preferred-register: '' }
liveins:
  - { reg: '%rdi', virtual-reg: '%0' }
  - { reg: '%zmm0', virtual-reg: '%1' }
  - { reg: '%zmm1', virtual-reg: '%2' }
frameInfo:
  isFrameAddressTaken: false
  isReturnAddressTaken: false
  hasStackMap: false
@ -725,27 +822,35 @@ frameInfo:
|
||||
hasMustTailInVarArgFunc: false
|
||||
savePoint: ''
|
||||
restorePoint: ''
|
||||
fixedStack:
|
||||
stack:
|
||||
constants:
|
||||
fixedStack:
|
||||
stack:
|
||||
constants:
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: %rdi, %zmm0, %zmm1
|
||||
|
||||
|
||||
; CHECK-LABEL: name: test_64bitext
|
||||
; CHECK: liveins: %rdi, %zmm0, %zmm1
|
||||
; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
|
||||
; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
|
||||
; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
|
||||
; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, %noreg, 0, %noreg
|
||||
; CHECK: [[COPY3:%[0-9]+]]:vk64 = COPY [[KMOVBkm]]
|
||||
; CHECK: [[KMOVWkm:%[0-9]+]]:vk16 = KMOVWkm [[COPY]], 1, %noreg, 0, %noreg
|
||||
; CHECK: [[COPY4:%[0-9]+]]:vk64 = COPY [[KMOVWkm]]
|
||||
; CHECK: [[KADDQrr:%[0-9]+]]:vk64 = KADDQrr [[COPY3]], [[COPY4]]
|
||||
; CHECK: [[COPY5:%[0-9]+]]:vk64wm = COPY [[KADDQrr]]
|
||||
; CHECK: [[VMOVDQU8Zrrk:%[0-9]+]]:vr512 = VMOVDQU8Zrrk [[COPY2]], killed [[COPY5]], [[COPY1]]
|
||||
; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU8Zrrk]]
|
||||
; CHECK: RET 0
|
||||
%0 = COPY %rdi
|
||||
%1 = COPY %zmm0
|
||||
%2 = COPY %zmm1
|
||||
|
||||
; CHECK: %8:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg
|
||||
; CHECK: %5:vk64 = COPY %8
|
||||
; CHECK: %9:vk16 = KMOVWkm %0, 1, %noreg, 0, %noreg
|
||||
; CHECK: %6:vk64 = COPY %9
|
||||
; CHECK: %7:vk64 = KADDQrr %5, %6
|
||||
|
||||
%5 = MOVZX64rm8 %0, 1, %noreg, 0, %noreg
|
||||
%6 = MOVZX64rm16 %0, 1, %noreg, 0, %noreg
|
||||
%7 = ADD64rr %5, %6, implicit-def dead %eflags
|
||||
|
||||
; CHECK: %3:vk64wm = COPY %7
|
||||
%3 = COPY %7
|
||||
%4 = VMOVDQU8Zrrk %2, killed %3, %1
|
||||
VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4
|
||||
|
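# A rough gloss on the two tests above (a sketch added for clarity, assuming
# this file exercises the X86 domain-reassignment pass; the RUN line is not
# part of this hunk): the zero-extending scalar loads and the GPR add
# (MOVZX*rm8/MOVZX*rm16 feeding ADD32rr/ADD64rr) are rewritten into mask
# instructions (KMOVBkm/KMOVWkm feeding KADDDrr/KADDQrr), so the mask consumed
# by VMOVDQU16Zrrk/VMOVDQU8Zrrk never has to cross from a GPR into a
# k-register.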
14 test/CodeGen/X86/inline-asm-modifier-V.ll Normal file
@ -0,0 +1,14 @@
; RUN: llc < %s -mtriple=i686-- -no-integrated-as | FileCheck -check-prefix=X86 %s
; RUN: llc < %s -mtriple=x86_64-- -no-integrated-as | FileCheck -check-prefix=X64 %s

; If the target does not have 64-bit integer registers, emit 32-bit register
; names.

; X86: call __x86_indirect_thunk_e{{[abcd]}}x
; X64: call __x86_indirect_thunk_r

define void @q_modifier(i32* %p) {
entry:
  tail call void asm sideeffect "call __x86_indirect_thunk_${0:V}", "r,~{dirflag},~{fpsr},~{flags}"(i32* %p)
  ret void
}
22 test/CodeGen/X86/pr36199.ll Normal file
@ -0,0 +1,22 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s

define void @foo() unnamed_addr #0 {
; CHECK-LABEL: foo:
; CHECK: # %bb.0:
; CHECK-NEXT: vaddps %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; CHECK-NEXT: vmovups %zmm0, (%rax)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %1 = fadd <16 x float> undef, undef
  %bc256 = bitcast <16 x float> %1 to <4 x i128>
  %2 = extractelement <4 x i128> %bc256, i32 0
  %3 = bitcast i128 %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <16 x i32> <i32 0, i32
1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0,
i32 1, i32 2, i32 3>
  store <16 x float> %4, <16 x float>* undef, align 4
  ret void
}
@ -23,18 +23,18 @@ entry:
; X64: callq bar
; X64-DAG: movl %[[x]], %edi
; X64-DAG: movq %[[fp]], %r11
; X64: callq __llvm_external_retpoline_r11
; X64: callq __x86_indirect_thunk_r11
; X64: movl %[[x]], %edi
; X64: callq bar
; X64-DAG: movl %[[x]], %edi
; X64-DAG: movq %[[fp]], %r11
; X64: jmp __llvm_external_retpoline_r11 # TAILCALL
; X64: jmp __x86_indirect_thunk_r11 # TAILCALL

; X64FAST-LABEL: icall_reg:
; X64FAST: callq bar
; X64FAST: callq __llvm_external_retpoline_r11
; X64FAST: callq __x86_indirect_thunk_r11
; X64FAST: callq bar
; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL
; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL

; X86-LABEL: icall_reg:
; X86-DAG: movl 12(%esp), %[[fp:[^ ]*]]
@ -43,19 +43,19 @@ entry:
; X86: calll bar
; X86: movl %[[fp]], %eax
; X86: pushl %[[x]]
; X86: calll __llvm_external_retpoline_eax
; X86: calll __x86_indirect_thunk_eax
; X86: pushl %[[x]]
; X86: calll bar
; X86: movl %[[fp]], %eax
; X86: pushl %[[x]]
; X86: calll __llvm_external_retpoline_eax
; X86: calll __x86_indirect_thunk_eax
; X86-NOT: # TAILCALL

; X86FAST-LABEL: icall_reg:
; X86FAST: calll bar
; X86FAST: calll __llvm_external_retpoline_eax
; X86FAST: calll __x86_indirect_thunk_eax
; X86FAST: calll bar
; X86FAST: calll __llvm_external_retpoline_eax
; X86FAST: calll __x86_indirect_thunk_eax


@global_fp = external global void (i32)*
@ -72,28 +72,28 @@ define void @icall_global_fp(i32 %x, void (i32)** %fpp) #0 {
; X64-LABEL: icall_global_fp:
; X64-DAG: movl %edi, %[[x:[^ ]*]]
; X64-DAG: movq global_fp(%rip), %r11
; X64: callq __llvm_external_retpoline_r11
; X64: callq __x86_indirect_thunk_r11
; X64-DAG: movl %[[x]], %edi
; X64-DAG: movq global_fp(%rip), %r11
; X64: jmp __llvm_external_retpoline_r11 # TAILCALL
; X64: jmp __x86_indirect_thunk_r11 # TAILCALL

; X64FAST-LABEL: icall_global_fp:
; X64FAST: movq global_fp(%rip), %r11
; X64FAST: callq __llvm_external_retpoline_r11
; X64FAST: callq __x86_indirect_thunk_r11
; X64FAST: movq global_fp(%rip), %r11
; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL
; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL

; X86-LABEL: icall_global_fp:
; X86: movl global_fp, %eax
; X86: pushl 4(%esp)
; X86: calll __llvm_external_retpoline_eax
; X86: calll __x86_indirect_thunk_eax
; X86: addl $4, %esp
; X86: movl global_fp, %eax
; X86: jmp __llvm_external_retpoline_eax # TAILCALL
; X86: jmp __x86_indirect_thunk_eax # TAILCALL

; X86FAST-LABEL: icall_global_fp:
; X86FAST: calll __llvm_external_retpoline_eax
; X86FAST: jmp __llvm_external_retpoline_eax # TAILCALL
; X86FAST: calll __x86_indirect_thunk_eax
; X86FAST: jmp __x86_indirect_thunk_eax # TAILCALL


%struct.Foo = type { void (%struct.Foo*)** }
@ -114,14 +114,14 @@ define void @vcall(%struct.Foo* %obj) #0 {
; X64: movq (%[[obj]]), %[[vptr:[^ ]*]]
; X64: movq 8(%[[vptr]]), %[[fp:[^ ]*]]
; X64: movq %[[fp]], %r11
; X64: callq __llvm_external_retpoline_r11
; X64: callq __x86_indirect_thunk_r11
; X64-DAG: movq %[[obj]], %rdi
; X64-DAG: movq %[[fp]], %r11
; X64: jmp __llvm_external_retpoline_r11 # TAILCALL
; X64: jmp __x86_indirect_thunk_r11 # TAILCALL

; X64FAST-LABEL: vcall:
; X64FAST: callq __llvm_external_retpoline_r11
; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL
; X64FAST: callq __x86_indirect_thunk_r11
; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL

; X86-LABEL: vcall:
; X86: movl 8(%esp), %[[obj:[^ ]*]]
@ -129,14 +129,14 @@ define void @vcall(%struct.Foo* %obj) #0 {
; X86: movl 4(%[[vptr]]), %[[fp:[^ ]*]]
; X86: movl %[[fp]], %eax
; X86: pushl %[[obj]]
; X86: calll __llvm_external_retpoline_eax
; X86: calll __x86_indirect_thunk_eax
; X86: addl $4, %esp
; X86: movl %[[fp]], %eax
; X86: jmp __llvm_external_retpoline_eax # TAILCALL
; X86: jmp __x86_indirect_thunk_eax # TAILCALL

; X86FAST-LABEL: vcall:
; X86FAST: calll __llvm_external_retpoline_eax
; X86FAST: jmp __llvm_external_retpoline_eax # TAILCALL
; X86FAST: calll __x86_indirect_thunk_eax
; X86FAST: jmp __x86_indirect_thunk_eax # TAILCALL


declare void @direct_callee()
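; For context, a sketch of the thunk body these calls target (the usual
; retpoline shape; label names are illustrative and this test does not check
; the thunk itself):
;   __x86_indirect_thunk_r11:
;     callq .Lset_up_target
;   .Lcapture_spec:
;     pause
;     lfence
;     jmp .Lcapture_spec
;   .Lset_up_target:
;     movq %r11, (%rsp)
;     retq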
42 test/CodeGen/X86/retpoline-regparm.ll Normal file
@ -0,0 +1,42 @@
; RUN: llc -mtriple=i686-linux < %s | FileCheck --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" %s

; Test 32-bit retpoline when -mregparm=3 is used. This case is interesting
; because there are no available scratch registers. The Linux kernel builds
; with -mregparm=3, so we need to support it. TCO should fail because we need
; to restore EDI.

define void @call_edi(void (i32, i32, i32)* %fp) #0 {
entry:
  tail call void %fp(i32 inreg 0, i32 inreg 0, i32 inreg 0)
  ret void
}

; CHECK-LABEL: call_edi:
; EDI is used, so it must be saved.
; CHECK: pushl %edi
; CHECK-DAG: xorl %eax, %eax
; CHECK-DAG: xorl %edx, %edx
; CHECK-DAG: xorl %ecx, %ecx
; CHECK-DAG: movl {{.*}}, %edi
; CHECK: calll __llvm_retpoline_edi
; CHECK: popl %edi
; CHECK: retl

define void @edi_external(void (i32, i32, i32)* %fp) #1 {
entry:
  tail call void %fp(i32 inreg 0, i32 inreg 0, i32 inreg 0)
  ret void
}

; CHECK-LABEL: edi_external:
; CHECK: pushl %edi
; CHECK-DAG: xorl %eax, %eax
; CHECK-DAG: xorl %edx, %edx
; CHECK-DAG: xorl %ecx, %ecx
; CHECK-DAG: movl {{.*}}, %edi
; CHECK: calll __x86_indirect_thunk_edi
; CHECK: popl %edi
; CHECK: retl

attributes #0 = { "target-features"="+retpoline" }
attributes #1 = { "target-features"="+retpoline-external-thunk" }
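; An assumed C equivalent of the scenario above (not part of the original
; file): under -mregparm=3 the three inreg i32 arguments occupy EAX, EDX and
; ECX, i.e. every caller-saved GPR, so the indirect-call target must go in a
; callee-saved register such as EDI. That is why EDI is pushed and popped
; around the thunk call and why the tail call cannot be kept.
;   void (*fp)(int, int, int);
;   fp(0, 0, 0);  /* args in EAX/EDX/ECX; fp moved into EDI */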
@ -340,10 +340,10 @@ latch:
; X86-NEXT: movl %edx, (%esp)
; X86-NEXT: retl
;
; X86-LABEL: .section .text.__llvm_retpoline_push,{{.*}},__llvm_retpoline_push,comdat
; X86-NEXT: .hidden __llvm_retpoline_push
; X86-NEXT: .weak __llvm_retpoline_push
; X86: __llvm_retpoline_push:
; X86-LABEL: .section .text.__llvm_retpoline_edi,{{.*}},__llvm_retpoline_edi,comdat
; X86-NEXT: .hidden __llvm_retpoline_edi
; X86-NEXT: .weak __llvm_retpoline_edi
; X86: __llvm_retpoline_edi:
; X86-NEXT: # {{.*}} # %entry
; X86-NEXT: calll [[CALL_TARGET:.*]]
; X86-NEXT: [[CAPTURE_SPEC:.*]]: # Block address taken
@ -355,11 +355,7 @@ latch:
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: [[CALL_TARGET]]: # Block address taken
; X86-NEXT: # %entry
; X86-NEXT: addl $4, %esp
; X86-NEXT: pushl 4(%esp)
; X86-NEXT: pushl 4(%esp)
; X86-NEXT: popl 8(%esp)
; X86-NEXT: popl (%esp)
; X86-NEXT: movl %edi, (%esp)
; X86-NEXT: retl

88 test/DebugInfo/X86/void-typedef.ll Normal file
@ -0,0 +1,88 @@
; Choosing CodeView generates debug metadata for class-scope typedefs that
; Dwarf would normally omit. Choosing both CodeView and Dwarf triggered
; assertion failures and crashes because the Dwarf handler wasn't prepared for
; those records (in particular, ones with the void type represented by a
; null pointer).
;
; This test was generated with:
; clang++ -cc1 -emit-llvm -debug-info-kind=limited -dwarf-version=4 -gcodeview -x c++
; on the following source code:
;
; class A {
;   typedef void _Nodeptr;
; };
; class B {
;   A FailedTestsCache;
;   bool m_fn1();
; };
; bool B::m_fn1() {}
;
; CodeView generates a DIDerivedType for the _Nodeptr typedef.
;
; RUN: llc %s -o - 2>&1 | FileCheck %s
; CHECK-NOT: Assertion failed

; ModuleID = 'bug.cpp'
source_filename = "bug.cpp"
target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
target triple = "i686-pc-windows-msvc"

%class.B = type { %class.A }
%class.A = type { i8 }

; Function Attrs: noinline nounwind optnone
define x86_thiscallcc zeroext i1 @"\01?m_fn1@B@@AAE_NXZ"(%class.B* %this) #0 align 2 !dbg !9 {
entry:
  %retval = alloca i1, align 1
  %this.addr = alloca %class.B*, align 4
  store %class.B* %this, %class.B** %this.addr, align 4
  call void @llvm.dbg.declare(metadata %class.B** %this.addr, metadata !22, metadata !DIExpression()), !dbg !24
  %this1 = load %class.B*, %class.B** %this.addr, align 4
  call void @llvm.trap(), !dbg !25
  unreachable, !dbg !25

return: ; No predecessors!
  %0 = load i1, i1* %retval, align 1, !dbg !25
  ret i1 %0, !dbg !25
}

; Function Attrs: nounwind readnone speculatable
declare void @llvm.dbg.declare(metadata, metadata, metadata) #1

; Function Attrs: noreturn nounwind
declare void @llvm.trap() #2

attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { noreturn nounwind }

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5, !6, !7}
!llvm.ident = !{!8}

!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "<stdin>", directory: "D:\5Csrc\5Cbug", checksumkind: CSK_MD5, checksum: "2216f11c5ddda8c48a6f92a6079ad4b6")
!2 = !{}
!3 = !{i32 1, !"NumRegisterParameters", i32 0}
!4 = !{i32 2, !"Dwarf Version", i32 4}
!5 = !{i32 2, !"CodeView", i32 1}
!6 = !{i32 2, !"Debug Info Version", i32 3}
!7 = !{i32 1, !"wchar_size", i32 2}
!8 = !{!"clang version 6.0.0 "}
!9 = distinct !DISubprogram(name: "m_fn1", linkageName: "\01?m_fn1@B@@AAE_NXZ", scope: !11, file: !10, line: 8, type: !18, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !17, variables: !2)
!10 = !DIFile(filename: "bug.cpp", directory: "D:\5Csrc\5Cbug", checksumkind: CSK_MD5, checksum: "2216f11c5ddda8c48a6f92a6079ad4b6")
!11 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "B", file: !10, line: 4, size: 8, elements: !12, identifier: ".?AVB@@")
!12 = !{!13, !17}
!13 = !DIDerivedType(tag: DW_TAG_member, name: "FailedTestsCache", scope: !11, file: !10, line: 5, baseType: !14, size: 8)
!14 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "A", file: !10, line: 1, size: 8, elements: !15, identifier: ".?AVA@@")
!15 = !{!16}
!16 = !DIDerivedType(tag: DW_TAG_typedef, name: "_Nodeptr", scope: !14, file: !10, line: 2, baseType: null)
!17 = !DISubprogram(name: "m_fn1", linkageName: "\01?m_fn1@B@@AAE_NXZ", scope: !11, file: !10, line: 6, type: !18, isLocal: false, isDefinition: false, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: false)
!18 = !DISubroutineType(cc: DW_CC_BORLAND_thiscall, types: !19)
!19 = !{!20, !21}
!20 = !DIBasicType(name: "bool", size: 8, encoding: DW_ATE_boolean)
!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 32, flags: DIFlagArtificial | DIFlagObjectPointer)
!22 = !DILocalVariable(name: "this", arg: 1, scope: !9, type: !23, flags: DIFlagArtificial | DIFlagObjectPointer)
!23 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 32)
!24 = !DILocation(line: 0, scope: !9)
!25 = !DILocation(line: 8, scope: !9)
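; The record that triggered the crashes described at the top of this file is
; !16 above: a class-scope typedef whose baseType is null, which is how the
; metadata encodes "void". CodeView keeps such typedefs, so with both
; emitters enabled the DWARF side must tolerate the null base type.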
8 test/MC/AsmParser/inline_macro_duplication.ll Normal file
@ -0,0 +1,8 @@
; RUN: not llc < %s 2>&1 | FileCheck %s

define void @test() {
  call void asm sideeffect ".macro FOO\0A.endm", "~{dirflag},~{fpsr},~{flags}"() #1
  call void asm sideeffect ".macro FOO\0A.endm", "~{dirflag},~{fpsr},~{flags}"() #1
; CHECK: error: macro 'FOO' is already defined
  ret void
}
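; Note: "\0A" is an embedded newline, so each asm blob expands to
; ".macro FOO" / ".endm". Both blobs feed one assembler session, so the
; second definition of FOO is a redefinition, producing the error checked
; above.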
@ -622,6 +622,11 @@ movl $12, foo(%rip)
// CHECK: encoding: [0xc7,0x05,A,A,A,A,0x0c,0x00,0x00,0x00]
// CHECK: fixup A - offset: 2, value: foo-8, kind: reloc_riprel_4byte

// rdar://37247000
movl $12, 1024(%rip)
// CHECK: movl $12, 1024(%rip)
// CHECK: encoding: [0xc7,0x05,0x00,0x04,0x00,0x00,0x0c,0x00,0x00,0x00]
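// A decode of the fixed-displacement encoding above, per the standard
// x86-64 ModRM rules: 0xc7 /0 is MOV r/m32, imm32; ModRM 0x05 selects
// RIP-relative addressing; 0x00,0x04,0x00,0x00 is disp32 1024 in little
// endian; 0x0c,0x00,0x00,0x00 is the immediate 12. A literal displacement
// needs no relocation, hence no "A" fixup bytes here.
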
movq $12, foo(%rip)
// CHECK: movq $12, foo(%rip)
// CHECK: encoding: [0x48,0xc7,0x05,A,A,A,A,0x0c,0x00,0x00,0x00]
@ -722,6 +722,114 @@ define <2 x half> @constant_rtz_pkrtz() {
  ret <2 x half> %cvt
}

; --------------------------------------------------------------------
; llvm.amdgcn.cvt.pknorm.i16
; --------------------------------------------------------------------

declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) nounwind readnone

; CHECK-LABEL: @undef_lhs_cvt_pknorm_i16(
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y)
define <2 x i16> @undef_lhs_cvt_pknorm_i16(float %y) {
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y)
  ret <2 x i16> %cvt
}

; CHECK-LABEL: @undef_rhs_cvt_pknorm_i16(
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef)
define <2 x i16> @undef_rhs_cvt_pknorm_i16(float %x) {
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef)
  ret <2 x i16> %cvt
}

; CHECK-LABEL: @undef_cvt_pknorm_i16(
; CHECK: ret <2 x i16> undef
define <2 x i16> @undef_cvt_pknorm_i16() {
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float undef)
  ret <2 x i16> %cvt
}

; --------------------------------------------------------------------
; llvm.amdgcn.cvt.pknorm.u16
; --------------------------------------------------------------------

declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) nounwind readnone

; CHECK-LABEL: @undef_lhs_cvt_pknorm_u16(
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y)
define <2 x i16> @undef_lhs_cvt_pknorm_u16(float %y) {
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y)
  ret <2 x i16> %cvt
}

; CHECK-LABEL: @undef_rhs_cvt_pknorm_u16(
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef)
define <2 x i16> @undef_rhs_cvt_pknorm_u16(float %x) {
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef)
  ret <2 x i16> %cvt
}

; CHECK-LABEL: @undef_cvt_pknorm_u16(
; CHECK: ret <2 x i16> undef
define <2 x i16> @undef_cvt_pknorm_u16() {
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float undef)
  ret <2 x i16> %cvt
}

; --------------------------------------------------------------------
; llvm.amdgcn.cvt.pk.i16
; --------------------------------------------------------------------

declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) nounwind readnone

; CHECK-LABEL: @undef_lhs_cvt_pk_i16(
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y)
define <2 x i16> @undef_lhs_cvt_pk_i16(i32 %y) {
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y)
  ret <2 x i16> %cvt
}

; CHECK-LABEL: @undef_rhs_cvt_pk_i16(
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef)
define <2 x i16> @undef_rhs_cvt_pk_i16(i32 %x) {
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef)
  ret <2 x i16> %cvt
}

; CHECK-LABEL: @undef_cvt_pk_i16(
; CHECK: ret <2 x i16> undef
define <2 x i16> @undef_cvt_pk_i16() {
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 undef)
  ret <2 x i16> %cvt
}

; --------------------------------------------------------------------
; llvm.amdgcn.cvt.pk.u16
; --------------------------------------------------------------------

declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) nounwind readnone

; CHECK-LABEL: @undef_lhs_cvt_pk_u16(
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y)
define <2 x i16> @undef_lhs_cvt_pk_u16(i32 %y) {
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y)
  ret <2 x i16> %cvt
}

; CHECK-LABEL: @undef_rhs_cvt_pk_u16(
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 undef)
define <2 x i16> @undef_rhs_cvt_pk_u16(i32 %x) {
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 undef)
  ret <2 x i16> %cvt
}

; CHECK-LABEL: @undef_cvt_pk_u16(
; CHECK: ret <2 x i16> undef
define <2 x i16> @undef_cvt_pk_u16() {
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 undef)
  ret <2 x i16> %cvt
}

; --------------------------------------------------------------------
; llvm.amdgcn.ubfe
; --------------------------------------------------------------------
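; The pattern throughout the four sections above: a pack intrinsic folds to
; undef only when both operands are undef; with a single undef operand the
; call is left in place, since the other operand still defines part of the
; packed result.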