From 3c315f3a8e8f326948fc789f146794ecd33cc540 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Fri, 16 Feb 2018 19:10:15 +0000 Subject: [PATCH] Vendor import of llvm release_60 branch r325330: https://llvm.org/svn/llvm-project/llvm/branches/release_60@325330 --- docs/ReleaseNotes.rst | 38 ++ include/llvm/IR/IntrinsicsAMDGPU.td | 20 + include/llvm/IR/IntrinsicsX86.td | 9 + include/llvm/MC/MCAsmMacro.h | 38 ++ include/llvm/MC/MCContext.h | 15 + .../llvm/Support/GenericDomTreeConstruction.h | 32 +- lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 3 +- lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 3 +- lib/CodeGen/LivePhysRegs.cpp | 31 +- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 +- lib/CodeGen/SplitKit.cpp | 53 ++- lib/CodeGen/SplitKit.h | 6 +- lib/IR/AutoUpgrade.cpp | 7 - lib/MC/MCParser/AsmParser.cpp | 56 +-- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 + lib/Target/AMDGPU/AMDGPUISelLowering.h | 4 + lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 18 + lib/Target/AMDGPU/AMDGPUInstrInfo.h | 2 + lib/Target/AMDGPU/AMDGPUInstrInfo.td | 8 + lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 +- lib/Target/AMDGPU/SIISelLowering.cpp | 52 ++- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 18 - lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 1 - lib/Target/AMDGPU/VOP2Instructions.td | 8 +- .../X86/MCTargetDesc/X86MCCodeEmitter.cpp | 10 +- lib/Target/X86/X86AsmPrinter.cpp | 11 +- lib/Target/X86/X86DomainReassignment.cpp | 12 +- lib/Target/X86/X86ISelLowering.cpp | 217 ++++----- lib/Target/X86/X86IntrinsicsInfo.h | 5 +- lib/Target/X86/X86RetpolineThunks.cpp | 68 +-- .../InstCombine/InstCombineCalls.cpp | 12 + test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll | 84 ++++ test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll | 84 ++++ .../AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll | 164 +++++++ .../AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll | 164 +++++++ test/CodeGen/ARM/pr25838.ll | 2 +- test/CodeGen/ARM/splitkit.ll | 245 ++++++++++ test/CodeGen/Thumb/stm-scavenging.ll | 46 ++ .../X86/avx512-intrinsics-fast-isel.ll | 53 --- test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 14 - test/CodeGen/X86/avx512-intrinsics.ll | 15 + test/CodeGen/X86/avx512-mask-op.ll | 96 ++++ .../X86/avx512bw-intrinsics-fast-isel.ll | 371 ++++++--------- .../X86/avx512bw-intrinsics-upgrade.ll | 40 -- test/CodeGen/X86/avx512bw-intrinsics.ll | 49 ++ test/CodeGen/X86/domain-reassignment.mir | 439 +++++++++++------- test/CodeGen/X86/inline-asm-modifier-V.ll | 14 + test/CodeGen/X86/pr36199.ll | 22 + test/CodeGen/X86/retpoline-external.ll | 48 +- test/CodeGen/X86/retpoline-regparm.ll | 42 ++ test/CodeGen/X86/retpoline.ll | 14 +- test/DebugInfo/X86/void-typedef.ll | 88 ++++ test/MC/AsmParser/inline_macro_duplication.ll | 8 + test/MC/X86/x86-64.s | 5 + .../InstCombine/AMDGPU/amdgcn-intrinsics.ll | 108 +++++ 55 files changed, 2116 insertions(+), 866 deletions(-) create mode 100644 include/llvm/MC/MCAsmMacro.h create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll create mode 100644 test/CodeGen/ARM/splitkit.ll create mode 100644 test/CodeGen/Thumb/stm-scavenging.ll create mode 100644 test/CodeGen/X86/inline-asm-modifier-V.ll create mode 100644 test/CodeGen/X86/pr36199.ll create mode 100644 test/CodeGen/X86/retpoline-regparm.ll create mode 100644 test/DebugInfo/X86/void-typedef.ll create mode 100644 test/MC/AsmParser/inline_macro_duplication.ll diff --git a/docs/ReleaseNotes.rst 
b/docs/ReleaseNotes.rst
index c28a3829bfee..949ec85c270b 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -71,6 +71,13 @@ Non-comprehensive list of changes in this release
 Changes to the LLVM IR
 ----------------------
 
+Changes to the AArch64 Target
+-----------------------------
+
+During this release:
+
+ * Enabled the new GlobalISel instruction selection framework by default at ``-O0``.
+
 Changes to the ARM Target
 -------------------------
 
@@ -80,6 +87,28 @@ During this release the ARM target has:
   isn't the default.
 
+Changes to the Hexagon Target
+-----------------------------
+
+* The Hexagon backend now supports the V65 ISA.
+
+* The ``-mhvx`` option now takes an optional value that specifies the ISA
+  version of the HVX coprocessor. The available values are v60, v62 and v65.
+  By default, the value is set to be the same as the CPU version.
+
+* The compiler option ``-mhvx-double`` is deprecated and will be removed in
+  the next release of the compiler. Programmers should use the ``-mhvx-length``
+  option to specify the desired vector length: ``-mhvx-length=64b`` for
+  64-byte vectors and ``-mhvx-length=128b`` for 128-byte vectors. While the
+  current default vector length is 64 bytes, users should always specify the
+  length explicitly, since the default value may change in the future.
+
+* The target feature ``hvx-double`` is deprecated and will be removed in the
+  next release. LLVM IR generators should use target features ``hvx-length64b``
+  and ``hvx-length128b`` to indicate the vector length. The length should
+  always be specified when HVX code generation is enabled.
+
+
 Changes to the MIPS Target
 --------------------------
 
@@ -91,6 +120,15 @@ Changes to the PowerPC Target
 
 During this release ...
 
+Changes to the SystemZ Target
+-----------------------------
+
+During this release the SystemZ target has:
+
+* Added support for 128-bit atomic operations.
+
+* Added support for the "o" constraint for inline asm statements.
+
 Changes to the X86 Target
 -------------------------
 
diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td
index 22a3a0fe618f..3397fa41db1b 100644
--- a/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -238,6 +238,26 @@ def int_amdgcn_cvt_pkrtz : Intrinsic<
   [IntrNoMem, IntrSpeculatable]
 >;
 
+def int_amdgcn_cvt_pknorm_i16 : Intrinsic<
+  [llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
+  [IntrNoMem, IntrSpeculatable]
+>;
+
+def int_amdgcn_cvt_pknorm_u16 : Intrinsic<
+  [llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
+  [IntrNoMem, IntrSpeculatable]
+>;
+
+def int_amdgcn_cvt_pk_i16 : Intrinsic<
+  [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
+  [IntrNoMem, IntrSpeculatable]
+>;
+
+def int_amdgcn_cvt_pk_u16 : Intrinsic<
+  [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
+  [IntrNoMem, IntrSpeculatable]
+>;
+
 def int_amdgcn_class : Intrinsic<
   [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
   [IntrNoMem, IntrSpeculatable]
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index bd6177c5b3d9..7c000e2b1dc7 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -3738,6 +3738,15 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx512_kxnor_w : // TODO: remove this intrinsic
               Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
                         [IntrNoMem]>;
+  def int_x86_avx512_kunpck_bw : GCCBuiltin<"__builtin_ia32_kunpckhi">,
+              Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
+              [IntrNoMem]>;
+  def int_x86_avx512_kunpck_wd : GCCBuiltin<"__builtin_ia32_kunpcksi">,
+              Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+              [IntrNoMem]>;
+  def int_x86_avx512_kunpck_dq : GCCBuiltin<"__builtin_ia32_kunpckdi">,
+              Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
+              [IntrNoMem]>;
   def int_x86_avx512_kortestz_w : GCCBuiltin<"__builtin_ia32_kortestzhi">,
               Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty],
                         [IntrNoMem]>;
diff --git a/include/llvm/MC/MCAsmMacro.h b/include/llvm/MC/MCAsmMacro.h
new file mode 100644
index 000000000000..34d14abc9645
--- /dev/null
+++ b/include/llvm/MC/MCAsmMacro.h
@@ -0,0 +1,38 @@
+//===- MCAsmMacro.h - Assembly Macros ---------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MC_MCASMMACRO_H
+#define LLVM_MC_MCASMMACRO_H
+
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+
+namespace llvm {
+
+struct MCAsmMacroParameter {
+  StringRef Name;
+  std::vector<AsmToken> Value;
+  bool Required = false;
+  bool Vararg = false;
+
+  MCAsmMacroParameter() = default;
+};
+
+typedef std::vector<MCAsmMacroParameter> MCAsmMacroParameters;
+struct MCAsmMacro {
+  StringRef Name;
+  StringRef Body;
+  MCAsmMacroParameters Parameters;
+
+public:
+  MCAsmMacro(StringRef N, StringRef B, MCAsmMacroParameters P)
+      : Name(N), Body(B), Parameters(std::move(P)) {}
+};
+}; // namespace llvm
+
+#endif
diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h
index 432fc0ede072..358f67c4db6d 100644
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/MC/MCAsmMacro.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/SectionKind.h"
@@ -268,6 +269,9 @@
                                        unsigned UniqueID,
                                        const MCSymbolELF *Associated);
 
+    /// \brief Map of currently defined macros.
+    StringMap<MCAsmMacro> MacroMap;
+
   public:
     explicit MCContext(const MCAsmInfo *MAI, const MCRegisterInfo *MRI,
                        const MCObjectFileInfo *MOFI,
@@ -618,6 +622,17 @@ namespace llvm {
     // FIXME: We should really do something about that.
     LLVM_ATTRIBUTE_NORETURN void reportFatalError(SMLoc L, const Twine &Msg);
+
+    const MCAsmMacro *lookupMacro(StringRef Name) {
+      StringMap<MCAsmMacro>::iterator I = MacroMap.find(Name);
+      return (I == MacroMap.end()) ? nullptr : &I->getValue();
+    }
+
+    void defineMacro(StringRef Name, MCAsmMacro Macro) {
+      MacroMap.insert(std::make_pair(Name, std::move(Macro)));
+    }
+
+    void undefineMacro(StringRef Name) { MacroMap.erase(Name); }
   };
 
 } // end namespace llvm
 
diff --git a/include/llvm/Support/GenericDomTreeConstruction.h b/include/llvm/Support/GenericDomTreeConstruction.h
index 25175fe66aa8..9438c9e08850 100644
--- a/include/llvm/Support/GenericDomTreeConstruction.h
+++ b/include/llvm/Support/GenericDomTreeConstruction.h
@@ -698,24 +698,20 @@ struct SemiNCAInfo {
       return;
 
     // Recalculate the set of roots.
- DT.Roots = FindRoots(DT, BUI); - for (const NodePtr R : DT.Roots) { - const TreeNodePtr TN = DT.getNode(R); - // A CFG node was selected as a tree root, but the corresponding tree node - // is not connected to the virtual root. This is because the incremental - // algorithm does not really know or use the set of roots and can make a - // different (implicit) decision about which nodes within an infinite loop - // becomes a root. - if (TN && !DT.isVirtualRoot(TN->getIDom())) { - DEBUG(dbgs() << "Root " << BlockNamePrinter(R) - << " is not virtual root's child\n" - << "The entire tree needs to be rebuilt\n"); - // It should be possible to rotate the subtree instead of recalculating - // the whole tree, but this situation happens extremely rarely in - // practice. - CalculateFromScratch(DT, BUI); - return; - } + auto Roots = FindRoots(DT, BUI); + if (DT.Roots.size() != Roots.size() || + !std::is_permutation(DT.Roots.begin(), DT.Roots.end(), Roots.begin())) { + // The roots chosen in the CFG have changed. This is because the + // incremental algorithm does not really know or use the set of roots and + // can make a different (implicit) decision about which node within an + // infinite loop becomes a root. + + DEBUG(dbgs() << "Roots are different in updated trees\n" + << "The entire tree needs to be rebuilt\n"); + // It may be possible to update the tree without recalculating it, but + // we do not know yet how to do it, and it happens rarely in practise. + CalculateFromScratch(DT, BUI); + return; } } diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp index d94b0e5c2118..2e5c22447936 100644 --- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp +++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp @@ -163,7 +163,8 @@ uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) { DIType *BaseType = DDTy->getBaseType().resolve(); - assert(BaseType && "Unexpected invalid base type"); + if (!BaseType) + return 0; // If this is a derived type, go ahead and get the base type, unless it's a // reference then it's just the size of the field. Pointer types have no need diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 911e46235781..4ea59f504bd4 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -1391,7 +1391,8 @@ void DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) { if (!Name.empty()) addString(MemberDie, dwarf::DW_AT_name, Name); - addType(MemberDie, resolve(DT->getBaseType())); + if (DIType *Resolved = resolve(DT->getBaseType())) + addType(MemberDie, Resolved); addSourceLine(MemberDie, DT); diff --git a/lib/CodeGen/LivePhysRegs.cpp b/lib/CodeGen/LivePhysRegs.cpp index f4b43a9b8ead..277212cf7dac 100644 --- a/lib/CodeGen/LivePhysRegs.cpp +++ b/lib/CodeGen/LivePhysRegs.cpp @@ -205,14 +205,18 @@ void LivePhysRegs::addPristines(const MachineFunction &MF) { } void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) { - if (!MBB.succ_empty()) { - // To get the live-outs we simply merge the live-ins of all successors. - for (const MachineBasicBlock *Succ : MBB.successors()) - addBlockLiveIns(*Succ); - } else if (MBB.isReturnBlock()) { - // For the return block: Add all callee saved registers that are saved and - // restored (somewhere); This does not include callee saved registers that - // are unused and hence not saved and restored; they are called pristine. + // To get the live-outs we simply merge the live-ins of all successors. 
+ for (const MachineBasicBlock *Succ : MBB.successors()) + addBlockLiveIns(*Succ); + if (MBB.isReturnBlock()) { + // Return blocks are a special case because we currently don't mark up + // return instructions completely: specifically, there is no explicit + // use for callee-saved registers. So we add all callee saved registers + // that are saved and restored (somewhere). This does not include + // callee saved registers that are unused and hence not saved and + // restored; they are called pristine. + // FIXME: PEI should add explicit markings to return instructions + // instead of implicitly handling them here. const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); if (MFI.isCalleeSavedInfoValid()) { @@ -225,15 +229,8 @@ void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) { void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) { const MachineFunction &MF = *MBB.getParent(); - if (!MBB.succ_empty()) { - addPristines(MF); - addLiveOutsNoPristines(MBB); - } else if (MBB.isReturnBlock()) { - // For the return block: Add all callee saved registers. - const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (MFI.isCalleeSavedInfoValid()) - addCalleeSavedRegs(*this, MF); - } + addPristines(MF); + addLiveOutsNoPristines(MBB); } void LivePhysRegs::addLiveIns(const MachineBasicBlock &MBB) { diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 2c6b724c02df..03cb2e310c7e 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16409,7 +16409,9 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR && N1.getOperand(0).getOperand(1) == N2 && N1.getOperand(0).getOperand(0).getValueType().getVectorNumElements() == - VT.getVectorNumElements()) { + VT.getVectorNumElements() && + N1.getOperand(0).getOperand(0).getValueType().getSizeInBits() == + VT.getSizeInBits()) { return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0)); } diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp index c99c3b09d88a..1628ee28b8a3 100644 --- a/lib/CodeGen/SplitKit.cpp +++ b/lib/CodeGen/SplitKit.cpp @@ -491,9 +491,8 @@ VNInfo *SplitEditor::defValue(unsigned RegIdx, return VNI; } -void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) { - assert(ParentVNI && "Mapping NULL value"); - ValueForcePair &VFP = Values[std::make_pair(RegIdx, ParentVNI->id)]; +void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI) { + ValueForcePair &VFP = Values[std::make_pair(RegIdx, ParentVNI.id)]; VNInfo *VNI = VFP.getPointer(); // ParentVNI was either unmapped or already complex mapped. Either way, just @@ -777,7 +776,7 @@ SlotIndex SplitEditor::leaveIntvAfter(SlotIndex Idx) { // the source live range. The spiller also won't try to hoist this copy. if (SpillMode && !SlotIndex::isSameInstr(ParentVNI->def, Idx) && MI->readsVirtualRegister(Edit->getReg())) { - forceRecompute(0, ParentVNI); + forceRecompute(0, *ParentVNI); defFromParent(0, ParentVNI, Idx, *MI->getParent(), MI); return Idx; } @@ -835,7 +834,7 @@ void SplitEditor::overlapIntv(SlotIndex Start, SlotIndex End) { // The complement interval will be extended as needed by LRCalc.extend(). 
   if (ParentVNI)
-    forceRecompute(0, ParentVNI);
+    forceRecompute(0, *ParentVNI);
   DEBUG(dbgs() << " overlapIntv [" << Start << ';' << End << "):");
   RegAssign.insert(Start, End, OpenIdx);
   DEBUG(dump());
@@ -878,7 +877,7 @@ void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
     unsigned RegIdx = AssignI.value();
     if (AtBegin || !MBBI->readsVirtualRegister(Edit->getReg())) {
       DEBUG(dbgs() << " cannot find simple kill of RegIdx " << RegIdx << '\n');
-      forceRecompute(RegIdx, Edit->getParent().getVNInfoAt(Def));
+      forceRecompute(RegIdx, *Edit->getParent().getVNInfoAt(Def));
     } else {
       SlotIndex Kill = LIS.getInstructionIndex(*MBBI).getRegSlot();
       DEBUG(dbgs() << " move kill to " << Kill << '\t' << *MBBI);
@@ -982,7 +981,7 @@ void SplitEditor::computeRedundantBackCopies(
       }
     }
     if (!DominatedVNIs.empty()) {
-      forceRecompute(0, ParentVNI);
+      forceRecompute(0, *ParentVNI);
       for (auto VNI : DominatedVNIs) {
         BackCopies.push_back(VNI);
       }
@@ -1102,7 +1101,7 @@ void SplitEditor::hoistCopies() {
         NotToHoistSet.count(ParentVNI->id))
       continue;
     BackCopies.push_back(VNI);
-    forceRecompute(0, ParentVNI);
+    forceRecompute(0, *ParentVNI);
   }
 
   // If it is not beneficial to hoist all the BackCopies, simply remove
@@ -1428,6 +1427,41 @@ void SplitEditor::deleteRematVictims() {
   Edit->eliminateDeadDefs(Dead, None, &AA);
 }
 
+void SplitEditor::forceRecomputeVNI(const VNInfo &ParentVNI) {
+  // Fast-path for common case.
+  if (!ParentVNI.isPHIDef()) {
+    for (unsigned I = 0, E = Edit->size(); I != E; ++I)
+      forceRecompute(I, ParentVNI);
+    return;
+  }
+
+  // Trace value through phis.
+  SmallPtrSet<const VNInfo *, 4> Visited; ///< whether VNI was/is in worklist.
+  SmallVector<const VNInfo *, 4> WorkList;
+  Visited.insert(&ParentVNI);
+  WorkList.push_back(&ParentVNI);
+
+  const LiveInterval &ParentLI = Edit->getParent();
+  const SlotIndexes &Indexes = *LIS.getSlotIndexes();
+  do {
+    const VNInfo &VNI = *WorkList.back();
+    WorkList.pop_back();
+    for (unsigned I = 0, E = Edit->size(); I != E; ++I)
+      forceRecompute(I, VNI);
+    if (!VNI.isPHIDef())
+      continue;
+
+    MachineBasicBlock &MBB = *Indexes.getMBBFromIndex(VNI.def);
+    for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+      SlotIndex PredEnd = Indexes.getMBBEndIdx(Pred);
+      VNInfo *PredVNI = ParentLI.getVNInfoBefore(PredEnd);
+      assert(PredVNI && "Value available in PhiVNI predecessor");
+      if (Visited.insert(PredVNI).second)
+        WorkList.push_back(PredVNI);
+    }
+  } while(!WorkList.empty());
+}
+
 void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
   ++NumFinished;
 
@@ -1444,8 +1478,7 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
     // Force rematted values to be recomputed everywhere.
     // The new live ranges may be truncated.
     if (Edit->didRematerialize(ParentVNI))
-      for (unsigned i = 0, e = Edit->size(); i != e; ++i)
-        forceRecompute(i, ParentVNI);
+      forceRecomputeVNI(*ParentVNI);
   }
 
   // Hoist back-copies to the complement interval when in spill mode.
diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h
index c0608893d4e5..2dafaf587801 100644
--- a/lib/CodeGen/SplitKit.h
+++ b/lib/CodeGen/SplitKit.h
@@ -357,7 +357,11 @@ class LLVM_LIBRARY_VISIBILITY SplitEditor {
   /// recomputed by LiveRangeCalc::extend regardless of the number of defs.
   /// This is used for values whose live range doesn't match RegAssign exactly.
   /// They could have rematerialized, or back-copies may have been moved.
- void forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI); + void forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI); + + /// Calls forceRecompute() on any affected regidx and on ParentVNI + /// predecessors in case of a phi definition. + void forceRecomputeVNI(const VNInfo &ParentVNI); /// defFromParent - Define Reg from ParentVNI at UseIdx using either /// rematerialization or a COPY from parent. Return the new value. diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index c258d1a4e3ad..c56a022c6705 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -75,7 +75,6 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name=="ssse3.pabs.d.128" || // Added in 6.0 Name.startswith("avx512.mask.shuf.i") || // Added in 6.0 Name.startswith("avx512.mask.shuf.f") || // Added in 6.0 - Name.startswith("avx512.kunpck") || //added in 6.0 Name.startswith("avx2.pabs.") || // Added in 6.0 Name.startswith("avx512.mask.pabs.") || // Added in 6.0 Name.startswith("avx512.broadcastm") || // Added in 6.0 @@ -1063,12 +1062,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0)); Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); - } else if (IsX86 && (Name.startswith("avx512.kunpck"))) { - uint64_t Shift = CI->getType()->getScalarSizeInBits() / 2; - uint64_t And = (1ULL << Shift) - 1; - Value* LowBits = Builder.CreateAnd(CI->getArgOperand(0), And); - Value* HighBits = Builder.CreateShl(CI->getArgOperand(1), Shift); - Rep = Builder.CreateOr(LowBits, HighBits); } else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) { Type *I32Ty = Type::getInt32Ty(C); Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0), diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 2259136c6ec4..ce3b70bed740 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -83,27 +83,6 @@ namespace { typedef std::vector MCAsmMacroArgument; typedef std::vector MCAsmMacroArguments; -struct MCAsmMacroParameter { - StringRef Name; - MCAsmMacroArgument Value; - bool Required = false; - bool Vararg = false; - - MCAsmMacroParameter() = default; -}; - -typedef std::vector MCAsmMacroParameters; - -struct MCAsmMacro { - StringRef Name; - StringRef Body; - MCAsmMacroParameters Parameters; - -public: - MCAsmMacro(StringRef N, StringRef B, MCAsmMacroParameters P) - : Name(N), Body(B), Parameters(std::move(P)) {} -}; - /// \brief Helper class for storing information about an active macro /// instantiation. struct MacroInstantiation { @@ -164,9 +143,6 @@ class AsmParser : public MCAsmParser { /// addDirectiveHandler. StringMap ExtensionDirectiveMap; - /// \brief Map of currently defined macros. - StringMap MacroMap; - /// \brief Stack of active macro instantiations. std::vector ActiveMacros; @@ -308,17 +284,6 @@ class AsmParser : public MCAsmParser { /// \brief Control a flag in the parser that enables or disables macros. void setMacrosEnabled(bool Flag) {MacrosEnabledFlag = Flag;} - /// \brief Lookup a previously defined macro. - /// \param Name Macro name. - /// \returns Pointer to macro. NULL if no such macro was defined. - const MCAsmMacro* lookupMacro(StringRef Name); - - /// \brief Define a new macro with the given name and information. - void defineMacro(StringRef Name, MCAsmMacro Macro); - - /// \brief Undefine a macro. If no such macro was defined, it's a no-op. 
- void undefineMacro(StringRef Name); - /// \brief Are we inside a macro instantiation? bool isInsideMacroInstantiation() {return !ActiveMacros.empty();} @@ -1841,7 +1806,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, // If macros are enabled, check to see if this is a macro instantiation. if (areMacrosEnabled()) - if (const MCAsmMacro *M = lookupMacro(IDVal)) { + if (const MCAsmMacro *M = getContext().lookupMacro(IDVal)) { return handleMacroEntry(M, IDLoc); } @@ -2720,17 +2685,6 @@ bool AsmParser::parseMacroArguments(const MCAsmMacro *M, return TokError("too many positional arguments"); } -const MCAsmMacro *AsmParser::lookupMacro(StringRef Name) { - StringMap::iterator I = MacroMap.find(Name); - return (I == MacroMap.end()) ? nullptr : &I->getValue(); -} - -void AsmParser::defineMacro(StringRef Name, MCAsmMacro Macro) { - MacroMap.insert(std::make_pair(Name, std::move(Macro))); -} - -void AsmParser::undefineMacro(StringRef Name) { MacroMap.erase(Name); } - bool AsmParser::handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) { // Arbitrarily limit macro nesting depth (default matches 'as'). We can // eliminate this, although we should protect against infinite loops. @@ -4249,7 +4203,7 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) { eatToEndOfStatement(); } - if (lookupMacro(Name)) { + if (getContext().lookupMacro(Name)) { return Error(DirectiveLoc, "macro '" + Name + "' is already defined"); } @@ -4257,7 +4211,7 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) { const char *BodyEnd = EndToken.getLoc().getPointer(); StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart); checkForBadMacro(DirectiveLoc, Name, Body, Parameters); - defineMacro(Name, MCAsmMacro(Name, Body, std::move(Parameters))); + getContext().defineMacro(Name, MCAsmMacro(Name, Body, std::move(Parameters))); return false; } @@ -4416,10 +4370,10 @@ bool AsmParser::parseDirectivePurgeMacro(SMLoc DirectiveLoc) { "unexpected token in '.purgem' directive")) return true; - if (!lookupMacro(Name)) + if (!getContext().lookupMacro(Name)) return Error(DirectiveLoc, "macro '" + Name + "' is not defined"); - undefineMacro(Name); + getContext().undefineMacro(Name); return false; } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 49929441ef21..21192a2c1cc8 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3957,6 +3957,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CVT_F32_UBYTE2) NODE_NAME_CASE(CVT_F32_UBYTE3) NODE_NAME_CASE(CVT_PKRTZ_F16_F32) + NODE_NAME_CASE(CVT_PKNORM_I16_F32) + NODE_NAME_CASE(CVT_PKNORM_U16_F32) + NODE_NAME_CASE(CVT_PK_I16_I32) + NODE_NAME_CASE(CVT_PK_U16_U32) NODE_NAME_CASE(FP_TO_FP16) NODE_NAME_CASE(FP16_ZEXT) NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 5c31bddd9b1a..039ee174e5b7 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -417,6 +417,10 @@ enum NodeType : unsigned { // Convert two float 32 numbers into a single register holding two packed f16 // with round to zero. CVT_PKRTZ_F16_F32, + CVT_PKNORM_I16_F32, + CVT_PKNORM_U16_F32, + CVT_PK_I16_I32, + CVT_PK_U16_U32, // Same as the standard node, except the high bits of the resulting integer // are known 0. 
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 8156599528c2..61892efe39e0 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -108,3 +108,21 @@ int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
 
   return MCOp;
 }
+
+// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
+bool AMDGPUInstrInfo::isUniformMMO(const MachineMemOperand *MMO) {
+  const Value *Ptr = MMO->getValue();
+  // UndefValue means this is a load of a kernel input. These are uniform.
+  // Sometimes LDS instructions have constant pointers.
+  // If Ptr is null, then that means this mem operand contains a
+  // PseudoSourceValue like GOT.
+  if (!Ptr || isa<UndefValue>(Ptr) ||
+      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
+    return true;
+
+  if (const Argument *Arg = dyn_cast<Argument>(Ptr))
+    return AMDGPU::isArgPassedInSGPR(Arg);
+
+  const Instruction *I = dyn_cast<Instruction>(Ptr);
+  return I && I->getMetadata("amdgpu.uniform");
+}
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index a9fcd4834638..74e14ef8fbd8 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -50,6 +50,8 @@ class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
   /// Return -1 if the target-specific opcode for the pseudo instruction does
   /// not exist. If Opcode is not a pseudo instruction, this is identity.
   int pseudoToMCOpcode(int Opcode) const;
+
+  static bool isUniformMMO(const MachineMemOperand *MMO);
 };
 
 } // End llvm namespace
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index c024010f3e96..65c483d85c5a 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -35,6 +35,10 @@ def AMDGPUFPPackOp : SDTypeProfile<1, 2,
   [SDTCisFP<1>, SDTCisSameAs<1, 2>]
 >;
 
+def AMDGPUIntPackOp : SDTypeProfile<1, 2,
+  [SDTCisInt<1>, SDTCisSameAs<1, 2>]
+>;
+
 def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
   [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
 >;
@@ -142,6 +146,10 @@ def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
 def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
 
 def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
+def AMDGPUpknorm_i16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>;
+def AMDGPUpknorm_u16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>;
+def AMDGPUpk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
+def AMDGPUpk_u16_u32 : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
 def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
 def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
 
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 1ed02fae085a..e3df6d9bee88 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -120,7 +120,7 @@ static bool isInstrUniform(const MachineInstr &MI) {
     return false;
 
   const MachineMemOperand *MMO = *MI.memoperands_begin();
-  return AMDGPU::isUniformMMO(MMO);
+  return AMDGPUInstrInfo::isUniformMMO(MMO);
 }
 
 const RegisterBankInfo::InstructionMapping &
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 415d8a512aa8..6d89aa6968e9 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -205,6 +205,7 @@ SITargetLowering::SITargetLowering(const
TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); @@ -1085,7 +1086,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS, bool SITargetLowering::isMemOpUniform(const SDNode *N) const { const MemSDNode *MemNode = cast(N); - return AMDGPU::isUniformMMO(MemNode->getMemOperand()); + return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand()); } TargetLoweringBase::LegalizeTypeAction @@ -3517,7 +3518,8 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::INTRINSIC_WO_CHAIN: { unsigned IID = cast(N->getOperand(0))->getZExtValue(); - if (IID == Intrinsic::amdgcn_cvt_pkrtz) { + switch (IID) { + case Intrinsic::amdgcn_cvt_pkrtz: { SDValue Src0 = N->getOperand(1); SDValue Src1 = N->getOperand(2); SDLoc SL(N); @@ -3526,6 +3528,29 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt)); return; } + case Intrinsic::amdgcn_cvt_pknorm_i16: + case Intrinsic::amdgcn_cvt_pknorm_u16: + case Intrinsic::amdgcn_cvt_pk_i16: + case Intrinsic::amdgcn_cvt_pk_u16: { + SDValue Src0 = N->getOperand(1); + SDValue Src1 = N->getOperand(2); + SDLoc SL(N); + unsigned Opcode; + + if (IID == Intrinsic::amdgcn_cvt_pknorm_i16) + Opcode = AMDGPUISD::CVT_PKNORM_I16_F32; + else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16) + Opcode = AMDGPUISD::CVT_PKNORM_U16_F32; + else if (IID == Intrinsic::amdgcn_cvt_pk_i16) + Opcode = AMDGPUISD::CVT_PK_I16_I32; + else + Opcode = AMDGPUISD::CVT_PK_U16_U32; + + SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1); + Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt)); + return; + } + } break; } case ISD::SELECT: { @@ -4424,10 +4449,27 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_ubfe: return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case Intrinsic::amdgcn_cvt_pkrtz: { - // FIXME: Stop adding cast if v2f16 legal. + case Intrinsic::amdgcn_cvt_pkrtz: + case Intrinsic::amdgcn_cvt_pknorm_i16: + case Intrinsic::amdgcn_cvt_pknorm_u16: + case Intrinsic::amdgcn_cvt_pk_i16: + case Intrinsic::amdgcn_cvt_pk_u16: { + // FIXME: Stop adding cast if v2f16/v2i16 are legal. 
EVT VT = Op.getValueType(); - SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32, + unsigned Opcode; + + if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz) + Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32; + else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16) + Opcode = AMDGPUISD::CVT_PKNORM_I16_F32; + else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16) + Opcode = AMDGPUISD::CVT_PKNORM_U16_F32; + else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16) + Opcode = AMDGPUISD::CVT_PK_I16_I32; + else + Opcode = AMDGPUISD::CVT_PK_U16_U32; + + SDValue Node = DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2)); return DAG.getNode(ISD::BITCAST, DL, VT, Node); } diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 125a3b22d0cf..bf9d5bc6ebdc 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -871,24 +871,6 @@ bool isArgPassedInSGPR(const Argument *A) { } } -// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence. -bool isUniformMMO(const MachineMemOperand *MMO) { - const Value *Ptr = MMO->getValue(); - // UndefValue means this is a load of a kernel input. These are uniform. - // Sometimes LDS instructions have constant pointers. - // If Ptr is null, then that means this mem operand contains a - // PseudoSourceValue like GOT. - if (!Ptr || isa(Ptr) || - isa(Ptr) || isa(Ptr)) - return true; - - if (const Argument *Arg = dyn_cast(Ptr)) - return isArgPassedInSGPR(Arg); - - const Instruction *I = dyn_cast(Ptr); - return I && I->getMetadata("amdgpu.uniform"); -} - int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { if (isGCN3Encoding(ST)) return ByteOffset; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index a215b445378e..9515001b63d2 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -363,7 +363,6 @@ LLVM_READNONE bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi); bool isArgPassedInSGPR(const Argument *Arg); -bool isUniformMMO(const MachineMemOperand *MMO); /// \returns The encoding that will be used for \p ByteOffset in the SMRD /// offset field. 
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index ef90b68db1a8..56b934f92f61 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -407,11 +407,11 @@ defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT, int_amdgcn_mbcnt_hi>; defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT, AMDGPUldexp>; defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT>; // TODO: set "Uses = dst" -defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT>; -defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT>; +defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT, AMDGPUpknorm_i16_f32>; +defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT, AMDGPUpknorm_u16_f32>; defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT, AMDGPUpkrtz_f16_f32>; -defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT>; -defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT>; +defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT, AMDGPUpk_u16_u32>; +defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT, AMDGPUpk_i16_i32>; } // End SubtargetPredicate = isGCN diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index a7059c6914df..4ddc1f0ba429 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -396,10 +396,14 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // rip-relative addressing is actually relative to the *next* instruction. // Since an immediate can follow the mod/rm byte for an instruction, this - // means that we need to bias the immediate field of the instruction with - // the size of the immediate field. If we have this case, add it into the + // means that we need to bias the displacement field of the instruction with + // the size of the immediate field. If we have this case, add it into the // expression to emit. - int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0; + // Note: rip-relative addressing using immediate displacement values should + // not be adjusted, assuming it was the user's intent. + int ImmSize = !Disp.isImm() && X86II::hasImm(TSFlags) + ? X86II::getSizeOfImm(TSFlags) + : 0; EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS, Fixups, -ImmSize); diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 71526dd77f11..2a501efbc1bf 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -370,6 +370,8 @@ static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI, static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, char Mode, raw_ostream &O) { unsigned Reg = MO.getReg(); + bool EmitPercent = true; + switch (Mode) { default: return true; // Unknown mode. case 'b': // Print QImode register @@ -384,6 +386,9 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, case 'k': // Print SImode register Reg = getX86SubSuperRegister(Reg, 32); break; + case 'V': + EmitPercent = false; + LLVM_FALLTHROUGH; case 'q': // Print 64-bit register names if 64-bit integer registers are available. // Otherwise, print 32-bit register names. 
@@ -391,7 +396,10 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, break; } - O << '%' << X86ATTInstPrinter::getRegisterName(Reg); + if (EmitPercent) + O << '%'; + + O << X86ATTInstPrinter::getRegisterName(Reg); return false; } @@ -464,6 +472,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, case 'w': // Print HImode register case 'k': // Print SImode register case 'q': // Print DImode register + case 'V': // Print native register without '%' if (MO.isReg()) return printAsmMRegister(*this, MO, ExtraCode[0], O); printOperand(*this, MI, OpNo, O); diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp index ba7280c29cc9..bc0f55f581ff 100644 --- a/lib/Target/X86/X86DomainReassignment.cpp +++ b/lib/Target/X86/X86DomainReassignment.cpp @@ -663,8 +663,10 @@ void X86DomainReassignment::initConverters() { createReplacer(X86::XOR32rr, X86::KXORDrr); createReplacer(X86::XOR64rr, X86::KXORQrr); - createReplacer(X86::TEST32rr, X86::KTESTDrr); - createReplacer(X86::TEST64rr, X86::KTESTQrr); + // TODO: KTEST is not a replacement for TEST due to flag differences. Need + // to prove only Z flag is used. + //createReplacer(X86::TEST32rr, X86::KTESTDrr); + //createReplacer(X86::TEST64rr, X86::KTESTQrr); } if (STI->hasDQI()) { @@ -684,8 +686,10 @@ void X86DomainReassignment::initConverters() { createReplacer(X86::SHR8ri, X86::KSHIFTRBri); createReplacer(X86::SHL8ri, X86::KSHIFTLBri); - createReplacer(X86::TEST8rr, X86::KTESTBrr); - createReplacer(X86::TEST16rr, X86::KTESTWrr); + // TODO: KTEST is not a replacement for TEST due to flag differences. Need + // to prove only Z flag is used. + //createReplacer(X86::TEST8rr, X86::KTESTBrr); + //createReplacer(X86::TEST16rr, X86::KTESTWrr); createReplacer(X86::XOR8rr, X86::KXORBrr); } diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 38885c42b529..9237833a2cd0 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -17017,24 +17017,6 @@ static bool hasNonFlagsUse(SDValue Op) { return false; } -// Emit KTEST instruction for bit vectors on AVX-512 -static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - if (Op.getOpcode() == ISD::BITCAST) { - auto hasKTEST = [&](MVT VT) { - unsigned SizeInBits = VT.getSizeInBits(); - return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) || - (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64)); - }; - SDValue Op0 = Op.getOperand(0); - MVT Op0VT = Op0.getValueType().getSimpleVT(); - if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 && - hasKTEST(Op0VT)) - return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0); - } - return SDValue(); -} - /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, @@ -17079,9 +17061,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, // doing a separate TEST. TEST always sets OF and CF to 0, so unless // we prove that the arithmetic won't overflow, we can't use OF or CF. if (Op.getResNo() != 0 || NeedOF || NeedCF) { - // Emit KTEST for bit vectors - if (auto Node = EmitKTEST(Op, DAG, Subtarget)) - return Node; // Emit a CMP with 0, which is the TEST pattern. 
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); @@ -17310,10 +17289,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, } if (Opcode == 0) { - // Emit KTEST for bit vectors - if (auto Node = EmitKTEST(Op, DAG, Subtarget)) - return Node; - // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); @@ -18093,6 +18068,34 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, return Result; } +// Try to select this as a KTEST+SETCC if possible. +static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC, + const SDLoc &dl, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // Only support equality comparisons. + if (CC != ISD::SETEQ && CC != ISD::SETNE) + return SDValue(); + + // Must be a bitcast from vXi1. + if (Op0.getOpcode() != ISD::BITCAST) + return SDValue(); + + Op0 = Op0.getOperand(0); + MVT VT = Op0.getSimpleValueType(); + if (!(Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1)) && + !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))) + return SDValue(); + + X86::CondCode X86CC; + if (isNullConstant(Op1)) { + X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE; + } else + return SDValue(); + + SDValue KTEST = DAG.getNode(X86ISD::KTEST, dl, MVT::i32, Op0, Op0); + return getSETCC(X86CC, KTEST, dl, DAG); +} + SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); @@ -18115,6 +18118,10 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return NewSetCC; } + // Try to lower using KTEST. + if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget)) + return NewSetCC; + // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. if ((isOneConstant(Op1) || isNullConstant(Op1)) && @@ -20525,6 +20532,18 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Mask = DAG.getBitcast(MaskVT, Mask); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); } + case KUNPCK: { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); + + SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); + SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); + // Arguments should be swapped. + SDValue Res = DAG.getNode(IntrData->Opc0, dl, + MVT::getVectorVT(MVT::i1, VT.getSizeInBits()), + Src2, Src1); + return DAG.getBitcast(VT, Res); + } case MASK_BINOP: { MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); @@ -27094,28 +27113,57 @@ static unsigned getOpcodeForRetpoline(unsigned RPOpc) { static const char *getRetpolineSymbol(const X86Subtarget &Subtarget, unsigned Reg) { + if (Subtarget.useRetpolineExternalThunk()) { + // When using an external thunk for retpolines, we pick names that match the + // names GCC happens to use as well. This helps simplify the implementation + // of the thunks for kernels where they have no easy ability to create + // aliases and are doing non-trivial configuration of the thunk's body. For + // example, the Linux kernel will do boot-time hot patching of the thunk + // bodies and cannot easily export aliases of these to loaded modules. 
+ // + // Note that at any point in the future, we may need to change the semantics + // of how we implement retpolines and at that time will likely change the + // name of the called thunk. Essentially, there is no hard guarantee that + // LLVM will generate calls to specific thunks, we merely make a best-effort + // attempt to help out kernels and other systems where duplicating the + // thunks is costly. + switch (Reg) { + case X86::EAX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_eax"; + case X86::ECX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_ecx"; + case X86::EDX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_edx"; + case X86::EDI: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_edi"; + case X86::R11: + assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); + return "__x86_indirect_thunk_r11"; + } + llvm_unreachable("unexpected reg for retpoline"); + } + + // When targeting an internal COMDAT thunk use an LLVM-specific name. switch (Reg) { - case 0: - assert(!Subtarget.is64Bit() && "R11 should always be available on x64"); - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_push" - : "__llvm_retpoline_push"; case X86::EAX: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_eax" - : "__llvm_retpoline_eax"; + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_eax"; case X86::ECX: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_ecx" - : "__llvm_retpoline_ecx"; + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_ecx"; case X86::EDX: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_edx" - : "__llvm_retpoline_edx"; + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_edx"; + case X86::EDI: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_edi"; case X86::R11: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_r11" - : "__llvm_retpoline_r11"; + assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); + return "__llvm_retpoline_r11"; } llvm_unreachable("unexpected reg for retpoline"); } @@ -27134,15 +27182,13 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, // just use R11, but we scan for uses anyway to ensure we don't generate // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't // already a register use operand to the call to hold the callee. If none - // are available, push the callee instead. This is less efficient, but is - // necessary for functions using 3 regparms. Such function calls are - // (currently) not eligible for tail call optimization, because there is no - // scratch register available to hold the address of the callee. + // are available, use EDI instead. EDI is chosen because EBX is the PIC base + // register and ESI is the base pointer to realigned stack frames with VLAs. SmallVector AvailableRegs; if (Subtarget.is64Bit()) AvailableRegs.push_back(X86::R11); else - AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX}); + AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI}); // Zero out any registers that are already used. 
for (const auto &MO : MI.operands()) { @@ -27160,30 +27206,18 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, break; } } + if (!AvailableReg) + report_fatal_error("calling convention incompatible with retpoline, no " + "available registers"); const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg); - if (AvailableReg == 0) { - // No register available. Use PUSH. This must not be a tailcall, and this - // must not be x64. - if (Subtarget.is64Bit()) - report_fatal_error( - "Cannot make an indirect call on x86-64 using both retpoline and a " - "calling convention that preservers r11"); - if (Opc != X86::CALLpcrel32) - report_fatal_error("Cannot make an indirect tail call on x86 using " - "retpoline without a preserved register"); - BuildMI(*BB, MI, DL, TII->get(X86::PUSH32r)).addReg(CalleeVReg); - MI.getOperand(0).ChangeToES(Symbol); - MI.setDesc(TII->get(Opc)); - } else { - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg) - .addReg(CalleeVReg); - MI.getOperand(0).ChangeToES(Symbol); - MI.setDesc(TII->get(Opc)); - MachineInstrBuilder(*BB->getParent(), &MI) - .addReg(AvailableReg, RegState::Implicit | RegState::Kill); - } + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg) + .addReg(CalleeVReg); + MI.getOperand(0).ChangeToES(Symbol); + MI.setDesc(TII->get(Opc)); + MachineInstrBuilder(*BB->getParent(), &MI) + .addReg(AvailableReg, RegState::Implicit | RegState::Kill); return BB; } @@ -30432,53 +30466,6 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, SDValue N0 = BitCast.getOperand(0); EVT VecVT = N0->getValueType(0); - if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() && - N0->getOpcode() == ISD::OR) { - SDValue Op0 = N0->getOperand(0); - SDValue Op1 = N0->getOperand(1); - MVT TrunckVT; - MVT BitcastVT; - switch (VT.getSimpleVT().SimpleTy) { - default: - return SDValue(); - case MVT::v16i1: - TrunckVT = MVT::i8; - BitcastVT = MVT::v8i1; - break; - case MVT::v32i1: - TrunckVT = MVT::i16; - BitcastVT = MVT::v16i1; - break; - case MVT::v64i1: - TrunckVT = MVT::i32; - BitcastVT = MVT::v32i1; - break; - } - bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL; - bool isArg0UndefLeft = - Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND; - bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL; - bool isArg1UndefLeft = - Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND; - SDValue OpLeft; - SDValue OpRight; - if (isArg0UndefRight && isArg1UndefLeft) { - OpLeft = Op0; - OpRight = Op1; - } else if (isArg1UndefRight && isArg0UndefLeft) { - OpLeft = Op1; - OpRight = Op0; - } else - return SDValue(); - SDLoc DL(BitCast); - SDValue Shr = OpLeft->getOperand(0); - SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr); - SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1); - SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight); - SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2); - } - if (!VT.isScalarInteger() || !VecVT.isSimple()) return SDValue(); diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 0782d5598746..fae0889950b2 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t { COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, EXPAND_FROM_MEM, - TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, FIXUPIMM, 
FIXUPIMM_MASKZ, FIXUPIMMS, + TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS, FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP, ROUNDP, ROUNDS }; @@ -479,6 +479,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0), X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0), X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD, X86ISD::FADD_RND), diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp index 223fa5771498..d03826bbe992 100644 --- a/lib/Target/X86/X86RetpolineThunks.cpp +++ b/lib/Target/X86/X86RetpolineThunks.cpp @@ -43,7 +43,7 @@ static const char R11ThunkName[] = "__llvm_retpoline_r11"; static const char EAXThunkName[] = "__llvm_retpoline_eax"; static const char ECXThunkName[] = "__llvm_retpoline_ecx"; static const char EDXThunkName[] = "__llvm_retpoline_edx"; -static const char PushThunkName[] = "__llvm_retpoline_push"; +static const char EDIThunkName[] = "__llvm_retpoline_edi"; namespace { class X86RetpolineThunks : public MachineFunctionPass { @@ -74,7 +74,6 @@ class X86RetpolineThunks : public MachineFunctionPass { void createThunkFunction(Module &M, StringRef Name); void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg); - void insert32BitPushReturnAddrClobber(MachineBasicBlock &MBB); void populateThunk(MachineFunction &MF, Optional Reg = None); }; @@ -127,7 +126,7 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { createThunkFunction(M, R11ThunkName); else for (StringRef Name : - {EAXThunkName, ECXThunkName, EDXThunkName, PushThunkName}) + {EAXThunkName, ECXThunkName, EDXThunkName, EDIThunkName}) createThunkFunction(M, Name); InsertedThunks = true; return true; @@ -151,9 +150,8 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { populateThunk(MF, X86::R11); } else { // For 32-bit targets we need to emit a collection of thunks for various - // possible scratch registers as well as a fallback that is used when - // there are no scratch registers and assumes the retpoline target has - // been pushed. + // possible scratch registers as well as a fallback that uses EDI, which is + // normally callee saved. // __llvm_retpoline_eax: // calll .Leax_call_target // .Leax_capture_spec: @@ -174,32 +172,18 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { // movl %edx, (%esp) // retl // - // This last one is a bit more special and so needs a little extra - // handling. - // __llvm_retpoline_push: - // calll .Lpush_call_target - // .Lpush_capture_spec: - // pause - // lfence - // jmp .Lpush_capture_spec - // .align 16 - // .Lpush_call_target: - // # Clear pause_loop return address. - // addl $4, %esp - // # Top of stack words are: Callee, RA. Exchange Callee and RA. - // pushl 4(%esp) # Push callee - // pushl 4(%esp) # Push RA - // popl 8(%esp) # Pop RA to final RA - // popl (%esp) # Pop callee to next top of stack - // retl # Ret to callee + // __llvm_retpoline_edi: + // ... 
# Same setup + // movl %edi, (%esp) + // retl if (MF.getName() == EAXThunkName) populateThunk(MF, X86::EAX); else if (MF.getName() == ECXThunkName) populateThunk(MF, X86::ECX); else if (MF.getName() == EDXThunkName) populateThunk(MF, X86::EDX); - else if (MF.getName() == PushThunkName) - populateThunk(MF); + else if (MF.getName() == EDIThunkName) + populateThunk(MF, X86::EDI); else llvm_unreachable("Invalid thunk name on x86-32!"); } @@ -240,31 +224,6 @@ void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB, .addReg(Reg); } -void X86RetpolineThunks::insert32BitPushReturnAddrClobber( - MachineBasicBlock &MBB) { - // The instruction sequence we use to replace the return address without - // a scratch register is somewhat complicated: - // # Clear capture_spec from return address. - // addl $4, %esp - // # Top of stack words are: Callee, RA. Exchange Callee and RA. - // pushl 4(%esp) # Push callee - // pushl 4(%esp) # Push RA - // popl 8(%esp) # Pop RA to final RA - // popl (%esp) # Pop callee to next top of stack - // retl # Ret to callee - BuildMI(&MBB, DebugLoc(), TII->get(X86::ADD32ri), X86::ESP) - .addReg(X86::ESP) - .addImm(4); - addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP, - false, 4); - addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP, - false, 4); - addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP, - false, 8); - addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP, - false, 0); -} - void X86RetpolineThunks::populateThunk(MachineFunction &MF, Optional Reg) { // Set MF properties. We never use vregs... @@ -301,11 +260,6 @@ void X86RetpolineThunks::populateThunk(MachineFunction &MF, CaptureSpec->addSuccessor(CaptureSpec); CallTarget->setAlignment(4); - if (Reg) { - insertRegReturnAddrClobber(*CallTarget, *Reg); - } else { - assert(!Is64Bit && "We only support non-reg thunks on 32-bit x86!"); - insert32BitPushReturnAddrClobber(*CallTarget); - } + insertRegReturnAddrClobber(*CallTarget, *Reg); BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc)); } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 40e52ee755e5..2f2f0696366c 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3264,6 +3264,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::amdgcn_cvt_pknorm_i16: + case Intrinsic::amdgcn_cvt_pknorm_u16: + case Intrinsic::amdgcn_cvt_pk_i16: + case Intrinsic::amdgcn_cvt_pk_u16: { + Value *Src0 = II->getArgOperand(0); + Value *Src1 = II->getArgOperand(1); + + if (isa(Src0) && isa(Src1)) + return replaceInstUsesWith(*II, UndefValue::get(II->getType())); + + break; + } case Intrinsic::amdgcn_ubfe: case Intrinsic::amdgcn_sbfe: { // Decompose simple cases into standard shifts. 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll new file mode 100644 index 000000000000..241b708e7baf --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll @@ -0,0 +1,84 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}s_cvt_pk_i16_i32: +; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} +; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] +; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[X]], [[VY]] +; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[X]], [[VY]] +define amdgpu_kernel void @s_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { + %result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %y) + %r = bitcast <2 x i16> %result to i32 + store i32 %r, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_cvt_pk_i16_samereg_i32: +; GCN: s_load_dword [[X:s[0-9]+]] +; GCN: v_cvt_pk_i16_i32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]] +define amdgpu_kernel void @s_cvt_pk_i16_samereg_i32(i32 addrspace(1)* %out, i32 %x) #0 { + %result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %x) + %r = bitcast <2 x i16> %result to i32 + store i32 %r, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pk_i16_i32: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[A]], [[B]] +; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @v_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.gep + %b = load volatile i32, i32 addrspace(1)* %b.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %a, i32 %b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pk_i16_i32_reg_imm: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: v_cvt_pk_i16_i32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1 +define amdgpu_kernel void @v_cvt_pk_i16_i32_reg_imm(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %a, i32 1) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pk_i16_i32_imm_reg: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, 1, [[A]] 
+; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, 1, [[A]] +define amdgpu_kernel void @v_cvt_pk_i16_i32_imm_reg(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 1, i32 %a) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 + + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll new file mode 100644 index 000000000000..8d5c9aa95219 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll @@ -0,0 +1,84 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}s_cvt_pk_u16_u32: +; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} +; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] +; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[X]], [[VY]] +; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[X]], [[VY]] +define amdgpu_kernel void @s_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { + %result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %y) + %r = bitcast <2 x i16> %result to i32 + store i32 %r, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_cvt_pk_u16_samereg_i32: +; GCN: s_load_dword [[X:s[0-9]+]] +; GCN: v_cvt_pk_u16_u32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]] +define amdgpu_kernel void @s_cvt_pk_u16_samereg_i32(i32 addrspace(1)* %out, i32 %x) #0 { + %result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %x) + %r = bitcast <2 x i16> %result to i32 + store i32 %r, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pk_u16_u32: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[A]], [[B]] +; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @v_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.gep + %b = load volatile i32, i32 addrspace(1)* %b.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %a, i32 %b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pk_u16_u32_reg_imm: +; GCN: {{buffer|flat|global}}_load_dword 
[[A:v[0-9]+]] +; GCN: v_cvt_pk_u16_u32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1 +define amdgpu_kernel void @v_cvt_pk_u16_u32_reg_imm(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %a, i32 1) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pk_u16_u32_imm_reg: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, 1, [[A]] +; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, 1, [[A]] +define amdgpu_kernel void @v_cvt_pk_u16_u32_imm_reg(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile i32, i32 addrspace(1)* %a.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 1, i32 %a) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 + + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll new file mode 100644 index 000000000000..822e8c2886ba --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll @@ -0,0 +1,164 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32: +; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} +; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] +; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]] +; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[X]], [[VY]] +define amdgpu_kernel void @s_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 { + %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %y) + %r = bitcast <2 x i16> %result to i32 + store i32 %r, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_cvt_pknorm_i16_samereg_f32: +; GCN: s_load_dword [[X:s[0-9]+]] +; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]] +define amdgpu_kernel void @s_cvt_pknorm_i16_samereg_f32(i32 addrspace(1)* %out, float %x) #0 { + %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %x) + %r = bitcast <2 x i16> %result to i32 + store i32 %r, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[A]], 
[[B]] +; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @v_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_reg_imm: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0 +define amdgpu_kernel void @v_cvt_pknorm_i16_f32_reg_imm(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float 1.0) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_imm_reg: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, 1.0, [[A]] +; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, 1.0, [[A]] +define amdgpu_kernel void @v_cvt_pknorm_i16_f32_imm_reg(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float 1.0, float %a) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_lo: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]] +define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %neg.a = fsub float -0.0, %a + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.a, float %b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_hi: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, 
[[A]], -[[B]] +define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %neg.b = fsub float -0.0, %b + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %neg.b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_lo_hi: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]] +define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %neg.a = fsub float -0.0, %a + %neg.b = fsub float -0.0, %b + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.a, float %neg.b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_fabs_lo_fneg_hi: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]] +define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_fabs_lo_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fabs.a = call float @llvm.fabs.f32(float %a) + %neg.fabs.a = fsub float -0.0, %fabs.a + %neg.b = fsub float -0.0, %b + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.fabs.a, float %neg.b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) #1 +declare float @llvm.fabs.f32(float) #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 + + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll new file mode 100644 index 000000000000..c2b8f3cb28ca --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll @@ -0,0 +1,164 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope 
-check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32: +; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}} +; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] +; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]] +; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[X]], [[VY]] +define amdgpu_kernel void @s_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 { + %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %y) + %r = bitcast <2 x i16> %result to i32 + store i32 %r, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_cvt_pknorm_u16_samereg_f32: +; GCN: s_load_dword [[X:s[0-9]+]] +; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]] +define amdgpu_kernel void @s_cvt_pknorm_u16_samereg_f32(i32 addrspace(1)* %out, float %x) #0 { + %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %x) + %r = bitcast <2 x i16> %result to i32 + store i32 %r, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[A]], [[B]] +; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @v_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float %b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_reg_imm: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0 +define amdgpu_kernel void @v_cvt_pknorm_u16_f32_reg_imm(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float 1.0) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_imm_reg: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, 1.0, [[A]] +; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, 1.0, [[A]] +define amdgpu_kernel void @v_cvt_pknorm_u16_f32_imm_reg(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 
@llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float 1.0, float %a) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_lo: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]] +define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %neg.a = fsub float -0.0, %a + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.a, float %b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_hi: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]] +define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %neg.b = fsub float -0.0, %b + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float %neg.b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_lo_hi: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]] +define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %neg.a = fsub float -0.0, %a + %neg.b = fsub float -0.0, %b + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.a, float %neg.b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: 
{{^}}v_cvt_pknorm_u16_f32_fneg_fabs_lo_fneg_hi: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]] +define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_fabs_lo_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fabs.a = call float @llvm.fabs.f32(float %a) + %neg.fabs.a = fsub float -0.0, %fabs.a + %neg.b = fsub float -0.0, %b + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.fabs.a, float %neg.b) + %r = bitcast <2 x i16> %cvt to i32 + store i32 %r, i32 addrspace(1)* %out.gep + ret void +} + +declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) #1 +declare float @llvm.fabs.f32(float) #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 + + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/ARM/pr25838.ll b/test/CodeGen/ARM/pr25838.ll index 0aa95fd2d720..f3bb98f4260c 100644 --- a/test/CodeGen/ARM/pr25838.ll +++ b/test/CodeGen/ARM/pr25838.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s +; RUN: llc -verify-machineinstrs < %s ; PR25838 target triple = "armv7--linux-android" diff --git a/test/CodeGen/ARM/splitkit.ll b/test/CodeGen/ARM/splitkit.ll new file mode 100644 index 000000000000..d51d35174450 --- /dev/null +++ b/test/CodeGen/ARM/splitkit.ll @@ -0,0 +1,245 @@ +; RUN: llc -o - %s | FileCheck %s +; Make sure RegAllocGreedy/SplitKit do not produce invalid liveness information +; and crash when splitting a liverange twice and rematerializing each time. +; (Sorry for the testcase; this was ran through bugpoint and then manually +; reduced for several hours but is still big...) 
+target triple = "thumbv7-apple-ios" + +%struct.ham = type { %struct.wombat.0 } +%struct.wombat.0 = type { %struct.barney } +%struct.barney = type { %struct.snork.1 } +%struct.snork.1 = type { %struct.wobble.2 } +%struct.wobble.2 = type { %struct.blam } +%struct.blam = type { i32, i32, i8* } +%struct.ham.3 = type { %struct.pluto } +%struct.pluto = type { %struct.zot*, %struct.snork.5, %struct.wibble } +%struct.zot = type { %struct.blam.4* } +%struct.blam.4 = type <{ %struct.zot, %struct.blam.4*, %struct.zot*, i8, [3 x i8] }> +%struct.snork.5 = type { %struct.quux } +%struct.quux = type { %struct.zot } +%struct.wibble = type { %struct.widget } +%struct.widget = type { i32 } +%struct.bar = type { %struct.spam } +%struct.spam = type { %struct.zot*, %struct.wobble, %struct.zot.7 } +%struct.wobble = type { %struct.wibble.6 } +%struct.wibble.6 = type { %struct.zot } +%struct.zot.7 = type { %struct.ham.8 } +%struct.ham.8 = type { i32 } +%struct.hoge = type { %struct.ham, %struct.foo } +%struct.foo = type { float, float } +%struct.wombat = type { %struct.ham, float } +%struct.snork = type { %struct.ham.9, [11 x i8] } +%struct.ham.9 = type { i8 } + +@global = external global i8 +@global.1 = private constant [20 x i8] c"aaaaaaaaaaaaaaaaaa0\00" +@global.2 = external constant [27 x i8] +@global.3 = external global %struct.ham +@global.4 = external constant [47 x i8] +@global.5 = external constant [61 x i8] +@global.6 = external constant [40 x i8] +@global.7 = external constant [24 x i8] +@global.8 = external constant [20 x i8] +@global.9 = external global %struct.ham +@global.10 = external global %struct.ham +@global.11 = external global %struct.ham +@global.12 = external global %struct.ham +@global.13 = external global %struct.ham +@global.14 = external global %struct.ham +@global.15 = external global %struct.ham +@global.16 = external global %struct.ham +@global.17 = external global %struct.ham +@global.18 = external constant [35 x i8] +@global.19 = external global %struct.ham +@global.20 = external constant [53 x i8] +@global.21 = external global %struct.ham +@global.22 = external global %struct.ham +@global.23 = external global %struct.ham +@global.24 = external constant [32 x i8] +@global.25 = external global %struct.ham +@global.26 = external constant [47 x i8] +@global.27 = external global %struct.ham +@global.28 = external constant [45 x i8] +@global.29 = external global %struct.ham +@global.30 = external global %struct.ham +@global.31 = external constant [24 x i8] +@global.32 = external global %struct.ham +@global.33 = external global %struct.ham +@global.34 = external global %struct.ham +@global.35 = external global %struct.ham +@global.36 = external constant [27 x i8] +@global.37 = external global %struct.ham +@global.38 = external constant [10 x i8] +@global.39 = external global %struct.ham +@global.40 = external global %struct.ham +@global.41 = external global %struct.ham +@global.42 = external global %struct.ham +@global.43 = external global %struct.ham +@global.44 = external constant [41 x i8] +@global.45 = external global %struct.ham +@global.46 = external global %struct.ham +@global.47 = external global %struct.ham +@global.48 = external global %struct.ham +@global.49 = external constant [52 x i8] +@global.50 = external constant [47 x i8] +@global.51 = external global %struct.ham +@global.52 = external global %struct.ham +@global.53 = external global %struct.ham +@global.54 = external global %struct.ham +@global.55 = external global %struct.ham.3 +@global.56 = external global 
%struct.bar +@global.57 = external global i8 + +declare %struct.ham* @bar(%struct.ham* returned) + +declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) + +declare %struct.ham* @wobble(%struct.ham* returned, %struct.ham* ) + +declare i32 @quux(...) + +declare i8* @_Znwm(i32) + +declare i32 @wobble.58(%struct.pluto*, [1 x i32], %struct.ham* , %struct.hoge* ) + +declare i32 @widget(%struct.spam*, [1 x i32], %struct.ham* , %struct.wombat* ) + +; Just check we didn't crash and did output something... +; CHECK-LABEL: func: +; CHECK: trap +define internal void @func() section "__TEXT,__StaticInit,regular,pure_instructions" personality i32 (...)* @quux { + %tmp = tail call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.3 to i8*), i8* @global) #0 + %tmp2 = invoke %struct.ham* @wobble(%struct.ham* undef, %struct.ham* @global.9) + to label %bb14 unwind label %bbunwind + +bb14: + %tmp15 = getelementptr i8, i8* undef, i32 12 + store i8 0, i8* %tmp15 + %tmp16 = icmp eq i8 undef, 0 + br i1 %tmp16, label %bb28, label %bb18 + +bb18: + br i1 undef, label %bb21, label %bb29 + +bb21: + %tmp22 = call i8* @_Znwm(i32 16) + store i32 17, i32* getelementptr (%struct.ham, %struct.ham* @global.10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp23 = call i8* @_Znwm(i32 32) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([27 x i8], [27 x i8]* @global.2, i32 0, i32 0), i32 26, i32 1, i1 false) + store i32 33, i32* getelementptr (%struct.ham, %struct.ham* @global.11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + store i32 23, i32* getelementptr (%struct.ham, %struct.ham* @global.11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([24 x i8], [24 x i8]* @global.7, i32 0, i32 0), i32 23, i32 1, i1 false) + %tmp24 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.11 to i8*), i8* @global) #0 + store i32 49, i32* getelementptr (%struct.ham, %struct.ham* @global.12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + store i32 37, i32* getelementptr (%struct.ham, %struct.ham* @global.13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.14 to i8*), i8 0, i32 12, i32 1, i1 false) + %tmp25 = call i8* @_Znwm(i32 48) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp25, i8* align 1 getelementptr ([40 x i8], [40 x i8]* @global.6, i32 0, i32 0), i32 39, i32 1, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([47 x i8], [47 x i8]* @global.4, i32 0, i32 0), i32 46, i32 1, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([61 x i8], [61 x i8]* @global.5, i32 0, i32 0), i32 60, i32 1, i1 false) + %tmp26 = call i8* @_Znwm(i32 48) + store i32 65, i32* getelementptr (%struct.ham, %struct.ham* @global.15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp27 = icmp eq i8 undef, 0 + br i1 %tmp27, label %bb30, label %bb33 + +bb28: + call void @llvm.trap() + unreachable + +bb29: + call void @llvm.trap() + unreachable + +bb30: + %tmp31 = icmp eq i32 undef, 37 + br i1 %tmp31, label %bb32, label %bb30 + +bb32: + store i8 1, i8* @global.57 + br label %bb33 + +bb33: + %tmp34 = call i8* @_Znwm(i32 32) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([20 x 
i8], [20 x i8]* @global.1, i32 0, i32 0), i32 19, i32 1, i1 false) + store i32 17, i32* getelementptr (%struct.ham, %struct.ham* @global.16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + store i32 65, i32* getelementptr (%struct.ham, %struct.ham* @global.17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([35 x i8], [35 x i8]* @global.18, i32 0, i32 0), i32 34, i32 1, i1 false) + store i32 65, i32* getelementptr (%struct.ham, %struct.ham* @global.19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([53 x i8], [53 x i8]* @global.20, i32 0, i32 0), i32 52, i32 1, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([20 x i8], [20 x i8]* @global.8, i32 0, i32 0), i32 19, i32 1, i1 false) + store i32 37, i32* getelementptr (%struct.ham, %struct.ham* @global.21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + %tmp35 = call i8* @_Znwm(i32 32) + store i8 16, i8* bitcast (%struct.ham* @global.22 to i8*) + %tmp36 = call i8* @_Znwm(i32 32) + store i32 31, i32* getelementptr (%struct.ham, %struct.ham* @global.23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp36, i8* align 1 getelementptr ([32 x i8], [32 x i8]* @global.24, i32 0, i32 0), i32 31, i32 1, i1 false) + %tmp37 = getelementptr i8, i8* %tmp36, i32 31 + store i8 0, i8* %tmp37 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([47 x i8], [47 x i8]* @global.26, i32 0, i32 0), i32 46, i32 1, i1 false) + %tmp38 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.25 to i8*), i8* @global) #0 + %tmp39 = call i8* @_Znwm(i32 48) + store i32 44, i32* getelementptr (%struct.ham, %struct.ham* @global.27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp39, i8* align 1 getelementptr ([45 x i8], [45 x i8]* @global.28, i32 0, i32 0), i32 44, i32 1, i1 false) + %tmp40 = getelementptr i8, i8* %tmp39, i32 44 + store i8 0, i8* %tmp40 + call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.29 to i8*), i8 0, i32 12, i32 1, i1 false) + %tmp41 = call i8* @_Znwm(i32 32) + store i32 23, i32* getelementptr (%struct.ham, %struct.ham* @global.30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp41, i8* align 1 getelementptr ([24 x i8], [24 x i8]* @global.31, i32 0, i32 0), i32 23, i32 1, i1 false) + %tmp42 = getelementptr i8, i8* %tmp41, i32 23 + store i8 0, i8* %tmp42 + call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.32 to i8*), i8 0, i32 12, i32 1, i1 false) + store i8 16, i8* bitcast (%struct.ham* @global.32 to i8*) + %tmp43 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.33 to i8*), i8* @global) #0 + %tmp44 = call i8* @_Znwm(i32 16) + call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.34 to i8*), i8 0, i32 12, i32 1, i1 false) + call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.9 to i8*), i8 0, i32 12, i32 1, i1 false) + %tmp45 = call i8* @_Znwm(i32 32) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp45, i8* align 1 getelementptr ([27 x i8], [27 x i8]* @global.36, i32 0, i32 0), i32 26, i32 1, 
i1 false) + call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.37 to i8*), i8 0, i32 12, i32 1, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 getelementptr (%struct.snork, %struct.snork* bitcast (%struct.ham* @global.37 to %struct.snork*), i32 0, i32 1, i32 0), i8* align 1 getelementptr ([10 x i8], [10 x i8]* @global.38, i32 0, i32 0), i32 9, i32 1, i1 false) + store i32 17, i32* getelementptr (%struct.ham, %struct.ham* @global.39, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp46 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.40 to i8*), i8* @global) #0 + %tmp47 = call i8* @_Znwm(i32 32) + %tmp48 = getelementptr i8, i8* %tmp47, i32 21 + store i8 0, i8* %tmp48 + store i32 33, i32* getelementptr (%struct.ham, %struct.ham* @global.41, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + store i32 15, i32* getelementptr (%struct.ham, %struct.ham* @global.42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + %tmp49 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.43 to i8*), i8* @global) #0 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([41 x i8], [41 x i8]* @global.44, i32 0, i32 0), i32 40, i32 1, i1 false) + %tmp50 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.45 to i8*), i8* @global) #0 + %tmp51 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.46 to i8*), i8* @global) #0 + %tmp52 = call i8* @_Znwm(i32 32) + store i8* %tmp52, i8** getelementptr (%struct.ham, %struct.ham* @global.47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([52 x i8], [52 x i8]* @global.49, i32 0, i32 0), i32 51, i32 1, i1 false) + %tmp53 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.48 to i8*), i8* @global) #0 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([47 x i8], [47 x i8]* @global.50, i32 0, i32 0), i32 46, i32 1, i1 false) + store i32 33, i32* getelementptr (%struct.ham, %struct.ham* @global.51, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + store i32 37, i32* getelementptr (%struct.ham, %struct.ham* @global.52, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1) + %tmp54 = invoke %struct.ham* @wobble(%struct.ham* undef, %struct.ham* @global.54) + to label %bb58 unwind label %bbunwind + +bb58: + %tmp59 = invoke i32 @wobble.58(%struct.pluto* getelementptr (%struct.ham.3, %struct.ham.3* @global.55, i32 0, i32 0), [1 x i32] [i32 ptrtoint (%struct.zot* getelementptr (%struct.ham.3, %struct.ham.3* @global.55, i32 0, i32 0, i32 1, i32 0, i32 0) to i32)], %struct.ham* undef, %struct.hoge* undef) + to label %bb71 unwind label %bbunwind + +bb71: + %tmp72 = invoke i32 @widget(%struct.spam* getelementptr (%struct.bar, %struct.bar* @global.56, i32 0, i32 0), [1 x i32] [i32 ptrtoint (%struct.zot* getelementptr (%struct.bar, %struct.bar* @global.56, i32 0, i32 0, i32 1, i32 0, i32 0) to i32)], %struct.ham* undef, %struct.wombat* undef) + to label %bb73 unwind label %bbunwind + +bb73: + ret void + +bbunwind: + %tmp75 = landingpad { i8*, i32 } + cleanup + resume { i8*, i32 } undef +} + +declare void @llvm.trap() 
+ +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* , i8* , i32, i32, i1) + +declare void @llvm.memset.p0i8.i32(i8* , i8, i32, i32, i1) + +attributes #0 = { nounwind } diff --git a/test/CodeGen/Thumb/stm-scavenging.ll b/test/CodeGen/Thumb/stm-scavenging.ll new file mode 100644 index 000000000000..3ed5763f2955 --- /dev/null +++ b/test/CodeGen/Thumb/stm-scavenging.ll @@ -0,0 +1,46 @@ +; RUN: llc < %s | FileCheck %s +target triple = "thumbv6---gnueabi" + +; Use STM to save the three registers +; CHECK-LABEL: use_stm: +; CHECK: .save {r7, lr} +; CHECK: .setfp r7, sp +; CHECK: stm r3!, {r0, r1, r2} +; CHECK: bl throws_1 +define void @use_stm(i32 %a, i32 %b, i32 %c, i32* %d) local_unnamed_addr noreturn "no-frame-pointer-elim"="true" { +entry: + %arrayidx = getelementptr inbounds i32, i32* %d, i32 2 + store i32 %a, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %d, i32 3 + store i32 %b, i32* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %d, i32 4 + store i32 %c, i32* %arrayidx2, align 4 + tail call void @throws_1(i32 %a, i32 %b, i32 %c) noreturn + unreachable +} + +; Don't use STM: there is no available register to store +; the address. We could transform this with some extra math, but +; that currently isn't implemented. +; CHECK-LABEL: no_stm: +; CHECK: .save {r7, lr} +; CHECK: .setfp r7, sp +; CHECK: str r0, +; CHECK: str r1, +; CHECK: str r2, +; CHECK: bl throws_2 +define void @no_stm(i32 %a, i32 %b, i32 %c, i32* %d) local_unnamed_addr noreturn "no-frame-pointer-elim"="true" { +entry: + %arrayidx = getelementptr inbounds i32, i32* %d, i32 2 + store i32 %a, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %d, i32 3 + store i32 %b, i32* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %d, i32 4 + store i32 %c, i32* %arrayidx2, align 4 + tail call void @throws_2(i32 %a, i32 %b, i32 %c, i32* %d) noreturn + unreachable +} + + +declare void @throws_1(i32, i32, i32) noreturn +declare void @throws_2(i32, i32, i32, i32*) noreturn diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index 50de773af001..80127f66bdfe 100644 --- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -5,59 +5,6 @@ ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c -define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 { -; X32-LABEL: test_mm512_kunpackb: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: andl $-64, %esp -; X32-NEXT: subl $64, %esp -; X32-NEXT: vmovdqa64 136(%ebp), %zmm3 -; X32-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; X32-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1 -; X32-NEXT: kunpckbw %k0, %k1, %k1 -; X32-NEXT: vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1} -; X32-NEXT: kmovw %k0, %eax -; X32-NEXT: movzwl %ax, %eax -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm512_kunpackb: -; X64: # %bb.0: # %entry -; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 -; X64-NEXT: kunpckbw %k0, %k1, %k1 -; X64-NEXT: vpcmpneqd %zmm5, %zmm4, %k0 {%k1} -; X64-NEXT: kmovw %k0, %eax -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: vzeroupper -; X64-NEXT: retq -entry: 
- %0 = bitcast <8 x i64> %__A to <16 x i32> - %1 = bitcast <8 x i64> %__B to <16 x i32> - %2 = icmp ne <16 x i32> %0, %1 - %3 = bitcast <16 x i1> %2 to i16 - %4 = bitcast <8 x i64> %__C to <16 x i32> - %5 = bitcast <8 x i64> %__D to <16 x i32> - %6 = icmp ne <16 x i32> %4, %5 - %7 = bitcast <16 x i1> %6 to i16 - %8 = and i16 %7, 255 - %shl.i = shl i16 %3, 8 - %or.i = or i16 %8, %shl.i - %9 = bitcast <8 x i64> %__E to <16 x i32> - %10 = bitcast <8 x i64> %__F to <16 x i32> - %11 = icmp ne <16 x i32> %9, %10 - %12 = bitcast i16 %or.i to <16 x i1> - %13 = and <16 x i1> %11, %12 - %14 = bitcast <16 x i1> %13 to i16 - ret i16 %14 -} - define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) { ; X32-LABEL: test_mm512_shuffle_f32x4: ; X32: # %bb.0: # %entry diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index f3ca0644e463..378dbda2dc0a 100644 --- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -1,20 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone - -define i16 @unpckbw_test(i16 %a0, i16 %a1) { -; CHECK-LABEL: unpckbw_test: -; CHECK: ## %bb.0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: shll $8, %esi -; CHECK-NEXT: orl %esi, %eax -; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax -; CHECK-NEXT: retq - %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1) - ret i16 %res -} - define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512: ; CHECK: ## %bb.0: diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index 5faa202c30f3..628199d4ac9e 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -96,6 +96,21 @@ define i16 @test_kor(i16 %a0, i16 %a1) { ret i16 %t2 } +declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone + +define i16 @unpckbw_test(i16 %a0, i16 %a1) { +; CHECK-LABEL: unpckbw_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovw %edi, %k0 +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: kunpckbw %k1, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax +; CHECK-NEXT: retq + %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1) + ret i16 %res +} + declare i16 @llvm.x86.avx512.kxnor.w(i16, i16) nounwind readnone ; TODO: the two kxnor instructions here a no op and should be elimintaed, ; probably by FoldConstantArithmetic in SelectionDAG. diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index 4877157d911d..d112577a6104 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -2775,3 +2775,99 @@ define i8 @test_v8i1_mul(i8 %x, i8 %y) { %ret = bitcast <8 x i1> %m2 to i8 ret i8 %ret } + +; Make sure we don't emit a ktest for signed comparisons. 
+define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) { +; KNL-LABEL: ktest_signed: +; KNL: ## %bb.0: +; KNL-NEXT: pushq %rax +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testw %ax, %ax +; KNL-NEXT: jle LBB63_1 +; KNL-NEXT: ## %bb.2: ## %bb.2 +; KNL-NEXT: popq %rax +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; KNL-NEXT: LBB63_1: ## %bb.1 +; KNL-NEXT: vzeroupper +; KNL-NEXT: callq _foo +; KNL-NEXT: popq %rax +; KNL-NEXT: retq +; +; SKX-LABEL: ktest_signed: +; SKX: ## %bb.0: +; SKX-NEXT: pushq %rax +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: testw %ax, %ax +; SKX-NEXT: jle LBB63_1 +; SKX-NEXT: ## %bb.2: ## %bb.2 +; SKX-NEXT: popq %rax +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq +; SKX-NEXT: LBB63_1: ## %bb.1 +; SKX-NEXT: vzeroupper +; SKX-NEXT: callq _foo +; SKX-NEXT: popq %rax +; SKX-NEXT: retq +; +; AVX512BW-LABEL: ktest_signed: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: pushq %rax +; AVX512BW-NEXT: .cfi_def_cfa_offset 16 +; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testw %ax, %ax +; AVX512BW-NEXT: jle LBB63_1 +; AVX512BW-NEXT: ## %bb.2: ## %bb.2 +; AVX512BW-NEXT: popq %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; AVX512BW-NEXT: LBB63_1: ## %bb.1 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: callq _foo +; AVX512BW-NEXT: popq %rax +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: ktest_signed: +; AVX512DQ: ## %bb.0: +; AVX512DQ-NEXT: pushq %rax +; AVX512DQ-NEXT: .cfi_def_cfa_offset 16 +; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: testw %ax, %ax +; AVX512DQ-NEXT: jle LBB63_1 +; AVX512DQ-NEXT: ## %bb.2: ## %bb.2 +; AVX512DQ-NEXT: popq %rax +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; AVX512DQ-NEXT: LBB63_1: ## %bb.1 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: callq _foo +; AVX512DQ-NEXT: popq %rax +; AVX512DQ-NEXT: retq + %a = icmp eq <16 x i32> %x, zeroinitializer + %b = icmp eq <16 x i32> %y, zeroinitializer + %c = and <16 x i1> %a, %b + %d = bitcast <16 x i1> %c to i16 + %e = icmp sgt i16 %d, 0 + br i1 %e, label %bb.2, label %bb.1 +bb.1: + call void @foo() + br label %bb.2 +bb.2: + ret void +} +declare void @foo() + diff --git a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll index 1e754be6fe49..a56111f3453e 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll @@ -4,117 +4,6 @@ ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c -define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) { -; X32-LABEL: test_mm512_kunpackd: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: andl $-64, %esp -; X32-NEXT: subl $64, %esp -; X32-NEXT: vmovdqa64 136(%ebp), %zmm3 -; X32-NEXT: vmovdqa64 72(%ebp), %zmm4 -; X32-NEXT: vmovdqa64 
8(%ebp), %zmm5 -; X32-NEXT: vpcmpneqb %zmm0, %zmm1, %k0 -; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; X32-NEXT: vpcmpneqb %zmm5, %zmm2, %k0 -; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 -; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; X32-NEXT: kunpckdq %k0, %k1, %k1 -; X32-NEXT: vpcmpneqb %zmm3, %zmm4, %k0 {%k1} -; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm512_kunpackd: -; X64: # %bb.0: # %entry -; X64-NEXT: vpcmpneqb %zmm0, %zmm1, %k0 -; X64-NEXT: vpcmpneqb %zmm3, %zmm2, %k1 -; X64-NEXT: kunpckdq %k0, %k1, %k1 -; X64-NEXT: vpcmpneqb %zmm5, %zmm4, %k0 {%k1} -; X64-NEXT: kmovq %k0, %rax -; X64-NEXT: vzeroupper -; X64-NEXT: retq -entry: - %0 = bitcast <8 x i64> %__B to <64 x i8> - %1 = bitcast <8 x i64> %__A to <64 x i8> - %2 = icmp ne <64 x i8> %0, %1 - %3 = bitcast <64 x i1> %2 to i64 - %4 = bitcast <8 x i64> %__C to <64 x i8> - %5 = bitcast <8 x i64> %__D to <64 x i8> - %6 = icmp ne <64 x i8> %4, %5 - %7 = bitcast <64 x i1> %6 to i64 - %and.i = and i64 %7, 4294967295 - %shl.i = shl i64 %3, 32 - %or.i = or i64 %and.i, %shl.i - %8 = bitcast <8 x i64> %__E to <64 x i8> - %9 = bitcast <8 x i64> %__F to <64 x i8> - %10 = icmp ne <64 x i8> %8, %9 - %11 = bitcast i64 %or.i to <64 x i1> - %12 = and <64 x i1> %10, %11 - %13 = bitcast <64 x i1> %12 to i64 - ret i64 %13 -} - -define i32 @test_mm512_kunpackw(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) { -; X32-LABEL: test_mm512_kunpackw: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: andl $-64, %esp -; X32-NEXT: subl $64, %esp -; X32-NEXT: vmovdqa64 136(%ebp), %zmm3 -; X32-NEXT: vpcmpneqw %zmm0, %zmm1, %k0 -; X32-NEXT: vpcmpneqw 8(%ebp), %zmm2, %k1 -; X32-NEXT: kunpckwd %k0, %k1, %k1 -; X32-NEXT: vpcmpneqw 72(%ebp), %zmm3, %k0 {%k1} -; X32-NEXT: kmovd %k0, %eax -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm512_kunpackw: -; X64: # %bb.0: # %entry -; X64-NEXT: vpcmpneqw %zmm0, %zmm1, %k0 -; X64-NEXT: vpcmpneqw %zmm3, %zmm2, %k1 -; X64-NEXT: kunpckwd %k0, %k1, %k1 -; X64-NEXT: vpcmpneqw %zmm5, %zmm4, %k0 {%k1} -; X64-NEXT: kmovd %k0, %eax -; X64-NEXT: vzeroupper -; X64-NEXT: retq -entry: - %0 = bitcast <8 x i64> %__B to <32 x i16> - %1 = bitcast <8 x i64> %__A to <32 x i16> - %2 = icmp ne <32 x i16> %0, %1 - %3 = bitcast <32 x i1> %2 to i32 - %4 = bitcast <8 x i64> %__C to <32 x i16> - %5 = bitcast <8 x i64> %__D to <32 x i16> - %6 = icmp ne <32 x i16> %4, %5 - %7 = bitcast <32 x i1> %6 to i32 - %and.i = and i32 %7, 65535 - %shl.i = shl i32 %3, 16 - %or.i = or i32 %and.i, %shl.i - %8 = bitcast <8 x i64> %__E to <32 x i16> - %9 = bitcast <8 x i64> %__F to <32 x i16> - %10 = icmp ne <32 x i16> %8, %9 - %11 = bitcast i32 %or.i to <32 x i1> - %12 = and <32 x i1> %10, %11 - %13 = bitcast <32 x i1> %12 to i32 - ret i32 %13 -} - - define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A) { ; X32-LABEL: test_mm512_mask_set1_epi8: ; X32: # %bb.0: # %entry @@ -189,46 +78,19 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: movb %ch, %al ; X32-NEXT: kmovd %eax, %k2 ; X32-NEXT: kxorq %k2, %k1, %k1 
-; X32-NEXT: kshiftlq $63, %k1, %k1 -; X32-NEXT: kshiftrq $55, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k0 -; X32-NEXT: kshiftrq $9, %k0, %k1 ; X32-NEXT: andb $2, %al ; X32-NEXT: shrb %al ; X32-NEXT: kmovd %eax, %k2 -; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: kshiftlq $63, %k1, %k1 -; X32-NEXT: kshiftrq $54, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k0 -; X32-NEXT: kshiftrq $10, %k0, %k1 ; X32-NEXT: movb %ch, %al ; X32-NEXT: andb $15, %al ; X32-NEXT: movl %eax, %edx ; X32-NEXT: shrb $2, %dl -; X32-NEXT: kmovd %edx, %k2 -; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: kshiftlq $63, %k1, %k1 -; X32-NEXT: kshiftrq $53, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k0 -; X32-NEXT: kshiftrq $11, %k0, %k1 +; X32-NEXT: kmovd %edx, %k3 ; X32-NEXT: shrb $3, %al -; X32-NEXT: kmovd %eax, %k2 -; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrl $12, %eax -; X32-NEXT: andl $15, %eax -; X32-NEXT: kmovd %eax, %k2 +; X32-NEXT: kmovd %eax, %k4 ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: shrl $13, %eax ; X32-NEXT: andb $1, %al -; X32-NEXT: kmovd %eax, %k3 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrl $14, %eax -; X32-NEXT: andl $3, %eax -; X32-NEXT: kmovd %eax, %k4 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrl $15, %eax -; X32-NEXT: andl $1, %eax ; X32-NEXT: kmovd %eax, %k5 ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: shrl $16, %edx @@ -243,25 +105,52 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kmovd %eax, %k7 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: kshiftrq $55, %k1, %k1 +; X32-NEXT: kxorq %k0, %k1, %k0 +; X32-NEXT: kshiftrq $9, %k0, %k1 +; X32-NEXT: kxorq %k2, %k1, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: kshiftrq $54, %k1, %k1 +; X32-NEXT: kxorq %k0, %k1, %k0 +; X32-NEXT: kshiftrq $10, %k0, %k1 +; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: kshiftrq $53, %k1, %k1 +; X32-NEXT: kxorq %k0, %k1, %k0 +; X32-NEXT: kshiftrq $11, %k0, %k1 +; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $52, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $12, %k0, %k1 +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shrl $12, %esi +; X32-NEXT: andl $15, %esi +; X32-NEXT: kmovd %esi, %k2 ; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $51, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $13, %k0, %k1 -; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: kxorq %k5, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $50, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $14, %k0, %k1 -; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shrl $14, %esi +; X32-NEXT: andl $3, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $49, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $15, %k0, %k1 -; X32-NEXT: kxorq %k5, %k1, %k1 +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shrl $15, %esi +; X32-NEXT: andl $1, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $48, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -494,22 +383,14 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $43, %k0, %k1 ; X32-NEXT: kxorq %k4, %k1, %k1 -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $12, %esi -; X32-NEXT: andl $15, %esi -; 
X32-NEXT: kmovd %esi, %k2 -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $14, %esi -; X32-NEXT: andl $3, %esi -; X32-NEXT: kmovd %esi, %k3 -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $15, %esi -; X32-NEXT: andl $1, %esi -; X32-NEXT: kmovd %esi, %k4 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $20, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $44, %k0, %k1 +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $12, %esi +; X32-NEXT: andl $15, %esi +; X32-NEXT: kmovd %esi, %k2 ; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $19, %k1, %k1 @@ -520,12 +401,20 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kshiftrq $18, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $46, %k0, %k1 -; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $14, %esi +; X32-NEXT: andl $3, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $17, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $47, %k0, %k1 -; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $15, %esi +; X32-NEXT: andl $1, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $16, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -551,8 +440,8 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $12, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k4 -; X32-NEXT: kshiftrq $52, %k4, %k0 +; X32-NEXT: kxorq %k0, %k1, %k3 +; X32-NEXT: kshiftrq $52, %k3, %k0 ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: shrb $4, %dl ; X32-NEXT: kmovd %edx, %k1 @@ -576,19 +465,19 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: andb $15, %cl ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: shrb $2, %dl -; X32-NEXT: kmovd %edx, %k3 +; X32-NEXT: kmovd %edx, %k4 ; X32-NEXT: kshiftlq $63, %k5, %k5 ; X32-NEXT: kshiftrq $11, %k5, %k5 -; X32-NEXT: kxorq %k4, %k5, %k4 -; X32-NEXT: kshiftrq $53, %k4, %k5 +; X32-NEXT: kxorq %k3, %k5, %k3 +; X32-NEXT: kshiftrq $53, %k3, %k5 ; X32-NEXT: kxorq %k6, %k5, %k5 ; X32-NEXT: kshiftlq $63, %k5, %k5 ; X32-NEXT: kshiftrq $10, %k5, %k5 -; X32-NEXT: kxorq %k4, %k5, %k5 -; X32-NEXT: kshiftrq $54, %k5, %k4 -; X32-NEXT: kxorq %k7, %k4, %k6 +; X32-NEXT: kxorq %k3, %k5, %k5 +; X32-NEXT: kshiftrq $54, %k5, %k3 +; X32-NEXT: kxorq %k7, %k3, %k6 ; X32-NEXT: shrb $3, %cl -; X32-NEXT: kmovd %ecx, %k4 +; X32-NEXT: kmovd %ecx, %k3 ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: shrl $29, %ecx ; X32-NEXT: andb $1, %cl @@ -603,12 +492,6 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kxorq %k5, %k0, %k0 ; X32-NEXT: kshiftrq $56, %k0, %k5 ; X32-NEXT: kxorq %k1, %k5, %k1 -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl $28, %ecx -; X32-NEXT: kmovd %ecx, %k5 -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl $30, %ecx -; X32-NEXT: kmovd %ecx, %k6 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $7, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -618,17 +501,20 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kshiftrq $6, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $58, %k0, %k1 -; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: kxorq %k4, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $5, %k1, 
%k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $59, %k0, %k1 -; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: kxorq %k3, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $4, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $60, %k0, %k1 -; X32-NEXT: kxorq %k5, %k1, %k1 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $28, %ecx +; X32-NEXT: kmovd %ecx, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $3, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -638,7 +524,10 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kshiftrq $2, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $62, %k0, %k1 -; X32-NEXT: kxorq %k6, %k1, %k1 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $30, %ecx +; X32-NEXT: kmovd %ecx, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: shrl $31, %eax ; X32-NEXT: kmovd %eax, %k2 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al @@ -743,46 +632,19 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: movb %ch, %al ; X32-NEXT: kmovd %eax, %k2 ; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: kshiftlq $63, %k1, %k1 -; X32-NEXT: kshiftrq $55, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k0 -; X32-NEXT: kshiftrq $9, %k0, %k1 ; X32-NEXT: andb $2, %al ; X32-NEXT: shrb %al ; X32-NEXT: kmovd %eax, %k2 -; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: kshiftlq $63, %k1, %k1 -; X32-NEXT: kshiftrq $54, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k0 -; X32-NEXT: kshiftrq $10, %k0, %k1 ; X32-NEXT: movb %ch, %al ; X32-NEXT: andb $15, %al ; X32-NEXT: movl %eax, %edx ; X32-NEXT: shrb $2, %dl -; X32-NEXT: kmovd %edx, %k2 -; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: kshiftlq $63, %k1, %k1 -; X32-NEXT: kshiftrq $53, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k0 -; X32-NEXT: kshiftrq $11, %k0, %k1 +; X32-NEXT: kmovd %edx, %k3 ; X32-NEXT: shrb $3, %al -; X32-NEXT: kmovd %eax, %k2 -; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrl $12, %eax -; X32-NEXT: andl $15, %eax -; X32-NEXT: kmovd %eax, %k2 +; X32-NEXT: kmovd %eax, %k4 ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: shrl $13, %eax ; X32-NEXT: andb $1, %al -; X32-NEXT: kmovd %eax, %k3 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrl $14, %eax -; X32-NEXT: andl $3, %eax -; X32-NEXT: kmovd %eax, %k4 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrl $15, %eax -; X32-NEXT: andl $1, %eax ; X32-NEXT: kmovd %eax, %k5 ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: shrl $16, %edx @@ -797,25 +659,52 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kmovd %eax, %k7 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: kshiftrq $55, %k1, %k1 +; X32-NEXT: kxorq %k0, %k1, %k0 +; X32-NEXT: kshiftrq $9, %k0, %k1 +; X32-NEXT: kxorq %k2, %k1, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: kshiftrq $54, %k1, %k1 +; X32-NEXT: kxorq %k0, %k1, %k0 +; X32-NEXT: kshiftrq $10, %k0, %k1 +; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: kshiftrq $53, %k1, %k1 +; X32-NEXT: kxorq %k0, %k1, %k0 +; X32-NEXT: kshiftrq $11, %k0, %k1 +; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $52, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $12, %k0, %k1 +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shrl $12, %esi +; X32-NEXT: andl $15, %esi +; X32-NEXT: kmovd %esi, %k2 ; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $51, %k1, %k1 ; X32-NEXT: 
kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $13, %k0, %k1 -; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: kxorq %k5, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $50, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $14, %k0, %k1 -; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shrl $14, %esi +; X32-NEXT: andl $3, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $49, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $15, %k0, %k1 -; X32-NEXT: kxorq %k5, %k1, %k1 +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shrl $15, %esi +; X32-NEXT: andl $1, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $48, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -1048,22 +937,14 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $43, %k0, %k1 ; X32-NEXT: kxorq %k4, %k1, %k1 -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $12, %esi -; X32-NEXT: andl $15, %esi -; X32-NEXT: kmovd %esi, %k2 -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $14, %esi -; X32-NEXT: andl $3, %esi -; X32-NEXT: kmovd %esi, %k3 -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $15, %esi -; X32-NEXT: andl $1, %esi -; X32-NEXT: kmovd %esi, %k4 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $20, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $44, %k0, %k1 +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $12, %esi +; X32-NEXT: andl $15, %esi +; X32-NEXT: kmovd %esi, %k2 ; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $19, %k1, %k1 @@ -1074,12 +955,20 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kshiftrq $18, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $46, %k0, %k1 -; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $14, %esi +; X32-NEXT: andl $3, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $17, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $47, %k0, %k1 -; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $15, %esi +; X32-NEXT: andl $1, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $16, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -1105,8 +994,8 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $12, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k4 -; X32-NEXT: kshiftrq $52, %k4, %k0 +; X32-NEXT: kxorq %k0, %k1, %k3 +; X32-NEXT: kshiftrq $52, %k3, %k0 ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: shrb $4, %dl ; X32-NEXT: kmovd %edx, %k1 @@ -1130,19 +1019,19 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: andb $15, %cl ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: shrb $2, %dl -; X32-NEXT: kmovd %edx, %k3 +; X32-NEXT: kmovd %edx, %k4 ; X32-NEXT: kshiftlq $63, %k5, %k5 ; X32-NEXT: kshiftrq $11, %k5, %k5 -; X32-NEXT: kxorq %k4, %k5, %k4 -; X32-NEXT: kshiftrq $53, %k4, %k5 +; X32-NEXT: kxorq %k3, %k5, %k3 +; X32-NEXT: kshiftrq $53, %k3, %k5 ; X32-NEXT: kxorq %k6, %k5, %k5 ; X32-NEXT: kshiftlq $63, %k5, %k5 ; X32-NEXT: kshiftrq $10, %k5, %k5 -; X32-NEXT: kxorq %k4, %k5, %k5 -; X32-NEXT: 
kshiftrq $54, %k5, %k4 -; X32-NEXT: kxorq %k7, %k4, %k6 +; X32-NEXT: kxorq %k3, %k5, %k5 +; X32-NEXT: kshiftrq $54, %k5, %k3 +; X32-NEXT: kxorq %k7, %k3, %k6 ; X32-NEXT: shrb $3, %cl -; X32-NEXT: kmovd %ecx, %k4 +; X32-NEXT: kmovd %ecx, %k3 ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: shrl $29, %ecx ; X32-NEXT: andb $1, %cl @@ -1157,12 +1046,6 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kxorq %k5, %k0, %k0 ; X32-NEXT: kshiftrq $56, %k0, %k5 ; X32-NEXT: kxorq %k1, %k5, %k1 -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl $28, %ecx -; X32-NEXT: kmovd %ecx, %k5 -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl $30, %ecx -; X32-NEXT: kmovd %ecx, %k6 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $7, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -1172,17 +1055,20 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kshiftrq $6, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $58, %k0, %k1 -; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: kxorq %k4, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $5, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $59, %k0, %k1 -; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: kxorq %k3, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $4, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $60, %k0, %k1 -; X32-NEXT: kxorq %k5, %k1, %k1 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $28, %ecx +; X32-NEXT: kmovd %ecx, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $3, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -1192,7 +1078,10 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kshiftrq $2, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $62, %k0, %k1 -; X32-NEXT: kxorq %k6, %k1, %k1 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $30, %ecx +; X32-NEXT: kmovd %ecx, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: shrl $31, %eax ; X32-NEXT: kmovd %eax, %k2 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll index f19e09758f12..13aca464b9e2 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -2,46 +2,6 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32 -declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32) - -define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) { -; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: movzwl %di, %eax -; AVX512BW-NEXT: shll $16, %esi -; AVX512BW-NEXT: orl %esi, %eax -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: shll $16, %eax -; AVX512F-32-NEXT: orl %ecx, %eax -; AVX512F-32-NEXT: retl - %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1) - ret i32 %res -} - -declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64) - -define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) { -; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: movl %edi, %eax -; AVX512BW-NEXT: shlq $32, %rsi -; 
AVX512BW-NEXT: orq %rsi, %rax -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: retl - %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1) - ret i64 %res -} - declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64) define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) { diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 2fa7c2c5b8a8..7b5cc5feff0c 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1455,6 +1455,55 @@ define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> ret <8 x i64> %res2 } +declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32) + +define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) { +; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k0 +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: kunpckwd %k1, %k0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckwd %k0, %k1, %k0 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: retl + %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1) + ret i32 %res +} + +declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64) + +define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) { +; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovq %rdi, %k0 +; AVX512BW-NEXT: kmovq %rsi, %k1 +; AVX512BW-NEXT: kunpckdq %k1, %k0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: subl $12, %esp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k0 +; AVX512F-32-NEXT: kmovq %k0, (%esp) +; AVX512F-32-NEXT: movl (%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: retl + %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1) + ret i64 %res +} + declare i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8>) define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) { diff --git a/test/CodeGen/X86/domain-reassignment.mir b/test/CodeGen/X86/domain-reassignment.mir index 3cb4b5dd1396..7da9b083c22e 100644 --- a/test/CodeGen/X86/domain-reassignment.mir +++ b/test/CodeGen/X86/domain-reassignment.mir @@ -1,22 +1,23 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -run-pass x86-domain-reassignment -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq -o - %s | FileCheck %s --- | ; ModuleID = '../test/CodeGen/X86/gpr-to-mask.ll' source_filename = "../test/CodeGen/X86/gpr-to-mask.ll" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" - + define void @test_fcmp_storefloat(i1 %cond, float* %fptr, float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) #0 { entry: br i1 %cond, label %if, label %else - + if: ; preds = %entry %cmp1 = fcmp oeq float %f3, %f4 br label %exit - + else: ; 
preds = %entry %cmp2 = fcmp oeq float %f5, %f6 br label %exit - + exit: ; preds = %else, %if %val = phi i1 [ %cmp1, %if ], [ %cmp2, %else ] %selected = select i1 %val, float %f1, float %f2 @@ -48,14 +49,13 @@ ... --- name: test_fcmp_storefloat -# CHECK-LABEL: name: test_fcmp_storefloat alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr8, preferred-register: '' } - { id: 1, class: gr8, preferred-register: '' } - { id: 2, class: gr8, preferred-register: '' } @@ -79,7 +79,7 @@ registers: - { id: 20, class: fr128, preferred-register: '' } - { id: 21, class: fr128, preferred-register: '' } - { id: 22, class: fr32x, preferred-register: '' } -liveins: +liveins: - { reg: '%edi', virtual-reg: '%3' } - { reg: '%rsi', virtual-reg: '%4' } - { reg: '%xmm0', virtual-reg: '%5' } @@ -88,7 +88,7 @@ liveins: - { reg: '%xmm3', virtual-reg: '%8' } - { reg: '%xmm4', virtual-reg: '%9' } - { reg: '%xmm5', virtual-reg: '%10' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -105,14 +105,51 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | + ; CHECK-LABEL: name: test_fcmp_storefloat + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: liveins: %edi, %rsi, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 + ; CHECK: [[COPY:%[0-9]+]]:fr32x = COPY %xmm5 + ; CHECK: [[COPY1:%[0-9]+]]:fr32x = COPY %xmm4 + ; CHECK: [[COPY2:%[0-9]+]]:fr32x = COPY %xmm3 + ; CHECK: [[COPY3:%[0-9]+]]:fr32x = COPY %xmm2 + ; CHECK: [[COPY4:%[0-9]+]]:fr32x = COPY %xmm1 + ; CHECK: [[COPY5:%[0-9]+]]:vr128x = COPY %xmm0 + ; CHECK: [[COPY6:%[0-9]+]]:gr64 = COPY %rsi + ; CHECK: [[COPY7:%[0-9]+]]:gr32 = COPY %edi + ; CHECK: [[COPY8:%[0-9]+]]:gr8 = COPY [[COPY7]].sub_8bit + ; CHECK: TEST8ri killed [[COPY8]], 1, implicit-def %eflags + ; CHECK: JE_1 %bb.2, implicit %eflags + ; CHECK: JMP_1 %bb.1 + ; CHECK: bb.1.if: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: [[VCMPSSZrr:%[0-9]+]]:vk1 = VCMPSSZrr [[COPY3]], [[COPY2]], 0 + ; CHECK: [[COPY9:%[0-9]+]]:vk32 = COPY [[VCMPSSZrr]] + ; CHECK: [[COPY10:%[0-9]+]]:vk8 = COPY [[COPY9]] + ; CHECK: JMP_1 %bb.3 + ; CHECK: bb.2.else: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: [[VCMPSSZrr1:%[0-9]+]]:vk1 = VCMPSSZrr [[COPY1]], [[COPY]], 0 + ; CHECK: [[COPY11:%[0-9]+]]:vk32 = COPY [[VCMPSSZrr1]] + ; CHECK: [[COPY12:%[0-9]+]]:vk8 = COPY [[COPY11]] + ; CHECK: bb.3.exit: + ; CHECK: [[PHI:%[0-9]+]]:vk8 = PHI [[COPY12]], %bb.2, [[COPY10]], %bb.1 + ; CHECK: [[COPY13:%[0-9]+]]:vk32 = COPY [[PHI]] + ; CHECK: [[COPY14:%[0-9]+]]:vk1wm = COPY [[COPY13]] + ; CHECK: [[COPY15:%[0-9]+]]:vr128x = COPY [[COPY4]] + ; CHECK: [[DEF:%[0-9]+]]:fr128 = IMPLICIT_DEF + ; CHECK: [[VMOVSSZrrk:%[0-9]+]]:fr128 = VMOVSSZrrk [[COPY15]], killed [[COPY14]], killed [[DEF]], [[COPY5]] + ; CHECK: [[COPY16:%[0-9]+]]:fr32x = COPY [[VMOVSSZrrk]] + ; CHECK: VMOVSSZmr [[COPY6]], 1, %noreg, 0, %noreg, killed [[COPY16]] :: (store 4 into %ir.fptr) + ; CHECK: RET 0 bb.0.entry: successors: %bb.1(0x40000000), %bb.2(0x40000000) liveins: %edi, %rsi, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 - + %10 = COPY %xmm5 %9 = COPY %xmm4 %8 = COPY %xmm3 @@ -125,38 +162,31 @@ body: | TEST8ri killed %11, 1, implicit-def %eflags JE_1 %bb.2, implicit %eflags JMP_1 %bb.1 - + bb.1.if: successors: %bb.3(0x80000000) - + %14 = VCMPSSZrr %7, %8, 0 ; check that cross domain copies 
are replaced with same domain copies. - ; CHECK: %15:vk32 = COPY %14 - ; CHECK: %0:vk8 = COPY %15 - + %15 = COPY %14 %0 = COPY %15.sub_8bit JMP_1 %bb.3 - + bb.2.else: successors: %bb.3(0x80000000) %12 = VCMPSSZrr %9, %10, 0 ; check that cross domain copies are replaced with same domain copies. - ; CHECK: %13:vk32 = COPY %12 - ; CHECK: %1:vk8 = COPY %13 %13 = COPY %12 %1 = COPY %13.sub_8bit - + bb.3.exit: ; check PHI, IMPLICIT_DEF, and INSERT_SUBREG replacers. - ; CHECK: %2:vk8 = PHI %1, %bb.2, %0, %bb.1 - ; CHECK: %16:vk32 = COPY %2 - ; CHECK: %18:vk1wm = COPY %16 - + %2 = PHI %1, %bb.2, %0, %bb.1 %17 = IMPLICIT_DEF %16 = INSERT_SUBREG %17, %2, 1 @@ -171,14 +201,13 @@ body: | ... --- name: test_8bitops -# CHECK-LABEL: name: test_8bitops alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -198,13 +227,13 @@ registers: - { id: 16, class: gr8, preferred-register: '' } - { id: 17, class: gr8, preferred-register: '' } - { id: 18, class: gr8, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } - { reg: '%zmm2', virtual-reg: '%3' } - { reg: '%zmm3', virtual-reg: '%4' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -221,32 +250,50 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | + ; CHECK-LABEL: name: test_8bitops + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[COPY3:%[0-9]+]]:vr512 = COPY %zmm2 + ; CHECK: [[COPY4:%[0-9]+]]:vr512 = COPY %zmm3 + ; CHECK: [[VCMPPDZrri:%[0-9]+]]:vk8 = VCMPPDZrri [[COPY3]], [[COPY4]], 0 + ; CHECK: [[COPY5:%[0-9]+]]:vk32 = COPY [[VCMPPDZrri]] + ; CHECK: [[COPY6:%[0-9]+]]:vk8 = COPY [[COPY5]] + ; CHECK: [[KSHIFTRBri:%[0-9]+]]:vk8 = KSHIFTRBri [[COPY6]], 2 + ; CHECK: [[KSHIFTLBri:%[0-9]+]]:vk8 = KSHIFTLBri [[KSHIFTRBri]], 1 + ; CHECK: [[KNOTBrr:%[0-9]+]]:vk8 = KNOTBrr [[KSHIFTLBri]] + ; CHECK: [[KORBrr:%[0-9]+]]:vk8 = KORBrr [[KNOTBrr]], [[KSHIFTRBri]] + ; CHECK: [[KANDBrr:%[0-9]+]]:vk8 = KANDBrr [[KORBrr]], [[KSHIFTLBri]] + ; CHECK: [[KXORBrr:%[0-9]+]]:vk8 = KXORBrr [[KANDBrr]], [[KSHIFTRBri]] + ; CHECK: [[KADDBrr:%[0-9]+]]:vk8 = KADDBrr [[KXORBrr]], [[KNOTBrr]] + ; CHECK: [[COPY7:%[0-9]+]]:vk32 = COPY [[KADDBrr]] + ; CHECK: [[COPY8:%[0-9]+]]:vk8wm = COPY [[COPY7]] + ; CHECK: [[VMOVAPDZrrk:%[0-9]+]]:vr512 = VMOVAPDZrrk [[COPY2]], killed [[COPY8]], [[COPY1]] + ; CHECK: VMOVAPDZmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVAPDZrrk]] + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: bb.2: + ; CHECK: RET 0 bb.0: liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3 - + %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 %3 = COPY %zmm2 %4 = COPY %zmm3 - + %5 = VCMPPDZrri %3, %4, 0 - ; CHECK: %6:vk32 = COPY %5 - ; CHECK: %7:vk8 = COPY %6 %6 = COPY %5 %7 = COPY %6.sub_8bit - ; CHECK: %12:vk8 = KSHIFTRBri %7, 2 - ; CHECK: %13:vk8 = KSHIFTLBri %12, 1 - ; CHECK: %14:vk8 = KNOTBrr %13 - ; CHECK: %15:vk8 = KORBrr %14, %12 - ; CHECK: %16:vk8 = KANDBrr %15, %13 - ; 
CHECK: %17:vk8 = KXORBrr %16, %12 - ; CHECK: %18:vk8 = KADDBrr %17, %14 %12 = SHR8ri %7, 2, implicit-def dead %eflags %13 = SHL8ri %12, 1, implicit-def dead %eflags %14 = NOT8r %13 @@ -254,19 +301,17 @@ body: | %16 = AND8rr %15, %13, implicit-def dead %eflags %17 = XOR8rr %16, %12, implicit-def dead %eflags %18 = ADD8rr %17, %14, implicit-def dead %eflags - - ; CHECK: %9:vk32 = COPY %18 - ; CHECK: %10:vk8wm = COPY %9 + %8 = IMPLICIT_DEF %9 = INSERT_SUBREG %8, %18, 1 %10 = COPY %9 %11 = VMOVAPDZrrk %2, killed %10, %1 - VMOVAPDZmr %0, 1, %noreg, 0, %noreg, killed %11 + VMOVAPDZmr %0, 1, %noreg, 0, %noreg, killed %11 - ; CHECK: KTESTBrr %18, %18, implicit-def %eflags - TEST8rr %18, %18, implicit-def %eflags - JE_1 %bb.1, implicit %eflags - JMP_1 %bb.2 + ; FIXME We can't replace TEST with KTEST due to flag differences + ; TEST8rr %18, %18, implicit-def %eflags + ; JE_1 %bb.1, implicit %eflags + ; JMP_1 %bb.2 bb.1: @@ -276,14 +321,13 @@ body: | ... --- name: test_16bitops -# CHECK-LABEL: name: test_16bitops alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -302,13 +346,13 @@ registers: - { id: 15, class: gr16, preferred-register: '' } - { id: 16, class: gr16, preferred-register: '' } - { id: 17, class: gr16, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } - { reg: '%zmm2', virtual-reg: '%3' } - { reg: '%zmm3', virtual-reg: '%4' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -325,50 +369,66 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | + ; CHECK-LABEL: name: test_16bitops + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[COPY3:%[0-9]+]]:vr512 = COPY %zmm2 + ; CHECK: [[COPY4:%[0-9]+]]:vr512 = COPY %zmm3 + ; CHECK: [[VCMPPSZrri:%[0-9]+]]:vk16 = VCMPPSZrri [[COPY3]], [[COPY4]], 0 + ; CHECK: [[COPY5:%[0-9]+]]:vk32 = COPY [[VCMPPSZrri]] + ; CHECK: [[COPY6:%[0-9]+]]:vk16 = COPY [[COPY5]] + ; CHECK: [[KSHIFTRWri:%[0-9]+]]:vk16 = KSHIFTRWri [[COPY6]], 2 + ; CHECK: [[KSHIFTLWri:%[0-9]+]]:vk16 = KSHIFTLWri [[KSHIFTRWri]], 1 + ; CHECK: [[KNOTWrr:%[0-9]+]]:vk16 = KNOTWrr [[KSHIFTLWri]] + ; CHECK: [[KORWrr:%[0-9]+]]:vk16 = KORWrr [[KNOTWrr]], [[KSHIFTRWri]] + ; CHECK: [[KANDWrr:%[0-9]+]]:vk16 = KANDWrr [[KORWrr]], [[KSHIFTLWri]] + ; CHECK: [[KXORWrr:%[0-9]+]]:vk16 = KXORWrr [[KANDWrr]], [[KSHIFTRWri]] + ; CHECK: [[COPY7:%[0-9]+]]:vk32 = COPY [[KXORWrr]] + ; CHECK: [[COPY8:%[0-9]+]]:vk16wm = COPY [[COPY7]] + ; CHECK: [[VMOVAPSZrrk:%[0-9]+]]:vr512 = VMOVAPSZrrk [[COPY2]], killed [[COPY8]], [[COPY1]] + ; CHECK: VMOVAPSZmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVAPSZrrk]] + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: bb.2: + ; CHECK: RET 0 bb.0: liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3 - + %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 %3 = COPY %zmm2 %4 = COPY %zmm3 - + %5 = VCMPPSZrri %3, %4, 0 - ; CHECK: %6:vk32 = COPY %5 - ; CHECK: %7:vk16 = COPY %6 %6 = COPY %5 
%7 = COPY %6.sub_16bit - ; CHECK: %12:vk16 = KSHIFTRWri %7, 2 - ; CHECK: %13:vk16 = KSHIFTLWri %12, 1 - ; CHECK: %14:vk16 = KNOTWrr %13 - ; CHECK: %15:vk16 = KORWrr %14, %12 - ; CHECK: %16:vk16 = KANDWrr %15, %13 - ; CHECK: %17:vk16 = KXORWrr %16, %12 %12 = SHR16ri %7, 2, implicit-def dead %eflags %13 = SHL16ri %12, 1, implicit-def dead %eflags %14 = NOT16r %13 %15 = OR16rr %14, %12, implicit-def dead %eflags %16 = AND16rr %15, %13, implicit-def dead %eflags %17 = XOR16rr %16, %12, implicit-def dead %eflags - - ; CHECK: %9:vk32 = COPY %17 - ; CHECK: %10:vk16wm = COPY %9 + %8 = IMPLICIT_DEF %9 = INSERT_SUBREG %8, %17, 3 %10 = COPY %9 %11 = VMOVAPSZrrk %2, killed %10, %1 - VMOVAPSZmr %0, 1, %noreg, 0, %noreg, killed %11 + VMOVAPSZmr %0, 1, %noreg, 0, %noreg, killed %11 - ; CHECK: KTESTWrr %17, %17, implicit-def %eflags - TEST16rr %17, %17, implicit-def %eflags - JE_1 %bb.1, implicit %eflags - JMP_1 %bb.2 + ; FIXME We can't replace TEST with KTEST due to flag differences + ; TEST16rr %17, %17, implicit-def %eflags + ; JE_1 %bb.1, implicit %eflags + ; JMP_1 %bb.2 bb.1: @@ -378,14 +438,13 @@ body: | ... --- name: test_32bitops -# CHECK-LABEL: name: test_32bitops alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -400,11 +459,11 @@ registers: - { id: 11, class: gr32, preferred-register: '' } - { id: 12, class: gr32, preferred-register: '' } - { id: 13, class: gr32, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -421,26 +480,40 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | + ; CHECK-LABEL: name: test_32bitops + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: %rdi, %zmm0, %zmm1 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[KMOVDkm:%[0-9]+]]:vk32 = KMOVDkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[KSHIFTRDri:%[0-9]+]]:vk32 = KSHIFTRDri [[KMOVDkm]], 2 + ; CHECK: [[KSHIFTLDri:%[0-9]+]]:vk32 = KSHIFTLDri [[KSHIFTRDri]], 1 + ; CHECK: [[KNOTDrr:%[0-9]+]]:vk32 = KNOTDrr [[KSHIFTLDri]] + ; CHECK: [[KORDrr:%[0-9]+]]:vk32 = KORDrr [[KNOTDrr]], [[KSHIFTRDri]] + ; CHECK: [[KANDDrr:%[0-9]+]]:vk32 = KANDDrr [[KORDrr]], [[KSHIFTLDri]] + ; CHECK: [[KXORDrr:%[0-9]+]]:vk32 = KXORDrr [[KANDDrr]], [[KSHIFTRDri]] + ; CHECK: [[KANDNDrr:%[0-9]+]]:vk32 = KANDNDrr [[KXORDrr]], [[KORDrr]] + ; CHECK: [[KADDDrr:%[0-9]+]]:vk32 = KADDDrr [[KANDNDrr]], [[KXORDrr]] + ; CHECK: [[COPY3:%[0-9]+]]:vk32wm = COPY [[KADDDrr]] + ; CHECK: [[VMOVDQU16Zrrk:%[0-9]+]]:vr512 = VMOVDQU16Zrrk [[COPY2]], killed [[COPY3]], [[COPY1]] + ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU16Zrrk]] + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: bb.2: + ; CHECK: RET 0 bb.0: liveins: %rdi, %zmm0, %zmm1 - + %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 - - ; CHECK: %5:vk32 = KMOVDkm %0, 1, %noreg, 0, %noreg - ; CHECK: %6:vk32 = KSHIFTRDri %5, 2 - ; CHECK: %7:vk32 = KSHIFTLDri %6, 1 - ; CHECK: %8:vk32 = KNOTDrr %7 - ; CHECK: 
%9:vk32 = KORDrr %8, %6 - ; CHECK: %10:vk32 = KANDDrr %9, %7 - ; CHECK: %11:vk32 = KXORDrr %10, %6 - ; CHECK: %12:vk32 = KANDNDrr %11, %9 - ; CHECK: %13:vk32 = KADDDrr %12, %11 + %5 = MOV32rm %0, 1, %noreg, 0, %noreg %6 = SHR32ri %5, 2, implicit-def dead %eflags %7 = SHL32ri %6, 1, implicit-def dead %eflags @@ -450,16 +523,15 @@ body: | %11 = XOR32rr %10, %6, implicit-def dead %eflags %12 = ANDN32rr %11, %9, implicit-def dead %eflags %13 = ADD32rr %12, %11, implicit-def dead %eflags - - ; CHECK: %3:vk32wm = COPY %13 + %3 = COPY %13 %4 = VMOVDQU16Zrrk %2, killed %3, %1 VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4 - ; CHECK: KTESTDrr %13, %13, implicit-def %eflags - TEST32rr %13, %13, implicit-def %eflags - JE_1 %bb.1, implicit %eflags - JMP_1 %bb.2 + ; FIXME We can't replace TEST with KTEST due to flag differences + ; TEST32rr %13, %13, implicit-def %eflags + ; JE_1 %bb.1, implicit %eflags + ; JMP_1 %bb.2 bb.1: @@ -469,14 +541,13 @@ body: | ... --- name: test_64bitops -# CHECK-LABEL: name: test_64bitops alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -491,11 +562,11 @@ registers: - { id: 11, class: gr64, preferred-register: '' } - { id: 12, class: gr64, preferred-register: '' } - { id: 13, class: gr64, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -512,26 +583,40 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | + ; CHECK-LABEL: name: test_64bitops + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: %rdi, %zmm0, %zmm1 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[KMOVQkm:%[0-9]+]]:vk64 = KMOVQkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[KSHIFTRQri:%[0-9]+]]:vk64 = KSHIFTRQri [[KMOVQkm]], 2 + ; CHECK: [[KSHIFTLQri:%[0-9]+]]:vk64 = KSHIFTLQri [[KSHIFTRQri]], 1 + ; CHECK: [[KNOTQrr:%[0-9]+]]:vk64 = KNOTQrr [[KSHIFTLQri]] + ; CHECK: [[KORQrr:%[0-9]+]]:vk64 = KORQrr [[KNOTQrr]], [[KSHIFTRQri]] + ; CHECK: [[KANDQrr:%[0-9]+]]:vk64 = KANDQrr [[KORQrr]], [[KSHIFTLQri]] + ; CHECK: [[KXORQrr:%[0-9]+]]:vk64 = KXORQrr [[KANDQrr]], [[KSHIFTRQri]] + ; CHECK: [[KANDNQrr:%[0-9]+]]:vk64 = KANDNQrr [[KXORQrr]], [[KORQrr]] + ; CHECK: [[KADDQrr:%[0-9]+]]:vk64 = KADDQrr [[KANDNQrr]], [[KXORQrr]] + ; CHECK: [[COPY3:%[0-9]+]]:vk64wm = COPY [[KADDQrr]] + ; CHECK: [[VMOVDQU8Zrrk:%[0-9]+]]:vr512 = VMOVDQU8Zrrk [[COPY2]], killed [[COPY3]], [[COPY1]] + ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU8Zrrk]] + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: bb.2: + ; CHECK: RET 0 bb.0: liveins: %rdi, %zmm0, %zmm1 - + %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 - - ; CHECK: %5:vk64 = KMOVQkm %0, 1, %noreg, 0, %noreg - ; CHECK: %6:vk64 = KSHIFTRQri %5, 2 - ; CHECK: %7:vk64 = KSHIFTLQri %6, 1 - ; CHECK: %8:vk64 = KNOTQrr %7 - ; CHECK: %9:vk64 = KORQrr %8, %6 - ; CHECK: %10:vk64 = KANDQrr %9, %7 - ; CHECK: %11:vk64 = KXORQrr %10, %6 - ; CHECK: %12:vk64 = KANDNQrr %11, %9 - ; CHECK: %13:vk64 = 
KADDQrr %12, %11 + %5 = MOV64rm %0, 1, %noreg, 0, %noreg %6 = SHR64ri %5, 2, implicit-def dead %eflags %7 = SHL64ri %6, 1, implicit-def dead %eflags @@ -541,16 +626,15 @@ body: | %11 = XOR64rr %10, %6, implicit-def dead %eflags %12 = ANDN64rr %11, %9, implicit-def dead %eflags %13 = ADD64rr %12, %11, implicit-def dead %eflags - - ; CHECK: %3:vk64wm = COPY %13 + %3 = COPY %13 %4 = VMOVDQU8Zrrk %2, killed %3, %1 VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4 - ; CHECK: KTESTQrr %13, %13, implicit-def %eflags - TEST64rr %13, %13, implicit-def %eflags - JE_1 %bb.1, implicit %eflags - JMP_1 %bb.2 + ; FIXME We can't replace TEST with KTEST due to flag differences + ; TEST64rr %13, %13, implicit-def %eflags + ; JE_1 %bb.1, implicit %eflags + ; JMP_1 %bb.2 bb.1: @@ -560,14 +644,13 @@ body: | ... --- name: test_16bitext -# CHECK-LABEL: name: test_16bitext alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -575,11 +658,11 @@ registers: - { id: 4, class: vr512, preferred-register: '' } - { id: 5, class: gr16, preferred-register: '' } - { id: 6, class: gr16, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -596,24 +679,32 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | bb.0: liveins: %rdi, %zmm0, %zmm1 - + + ; CHECK-LABEL: name: test_16bitext + ; CHECK: liveins: %rdi, %zmm0, %zmm1 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[COPY3:%[0-9]+]]:vk16 = COPY [[KMOVBkm]] + ; CHECK: [[KNOTWrr:%[0-9]+]]:vk16 = KNOTWrr [[COPY3]] + ; CHECK: [[COPY4:%[0-9]+]]:vk16wm = COPY [[KNOTWrr]] + ; CHECK: [[VMOVAPSZrrk:%[0-9]+]]:vr512 = VMOVAPSZrrk [[COPY2]], killed [[COPY4]], [[COPY1]] + ; CHECK: VMOVAPSZmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVAPSZrrk]] + ; CHECK: RET 0 %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 - - ; CHECK: %7:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg - ; CHECK: %5:vk16 = COPY %7 - ; CHECK: %6:vk16 = KNOTWrr %5 + %5 = MOVZX16rm8 %0, 1, %noreg, 0, %noreg %6 = NOT16r %5 - ; CHECK: %3:vk16wm = COPY %6 %3 = COPY %6 %4 = VMOVAPSZrrk %2, killed %3, %1 VMOVAPSZmr %0, 1, %noreg, 0, %noreg, killed %4 @@ -622,14 +713,13 @@ body: | ... 
--- name: test_32bitext -# CHECK-LABEL: name: test_32bitext alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -638,11 +728,11 @@ registers: - { id: 5, class: gr32, preferred-register: '' } - { id: 6, class: gr32, preferred-register: '' } - { id: 7, class: gr32, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -659,27 +749,35 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | bb.0: liveins: %rdi, %zmm0, %zmm1 - + + ; CHECK-LABEL: name: test_32bitext + ; CHECK: liveins: %rdi, %zmm0, %zmm1 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[COPY3:%[0-9]+]]:vk32 = COPY [[KMOVBkm]] + ; CHECK: [[KMOVWkm:%[0-9]+]]:vk16 = KMOVWkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[COPY4:%[0-9]+]]:vk32 = COPY [[KMOVWkm]] + ; CHECK: [[KADDDrr:%[0-9]+]]:vk32 = KADDDrr [[COPY3]], [[COPY4]] + ; CHECK: [[COPY5:%[0-9]+]]:vk64wm = COPY [[KADDDrr]] + ; CHECK: [[VMOVDQU16Zrrk:%[0-9]+]]:vr512 = VMOVDQU16Zrrk [[COPY2]], killed [[COPY5]], [[COPY1]] + ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU16Zrrk]] + ; CHECK: RET 0 %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 - - ; CHECK: %8:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg - ; CHECK: %5:vk32 = COPY %8 - ; CHECK: %9:vk16 = KMOVWkm %0, 1, %noreg, 0, %noreg - ; CHECK: %6:vk32 = COPY %9 - ; CHECK: %7:vk32 = KADDDrr %5, %6 + %5 = MOVZX32rm8 %0, 1, %noreg, 0, %noreg %6 = MOVZX32rm16 %0, 1, %noreg, 0, %noreg %7 = ADD32rr %5, %6, implicit-def dead %eflags - ; CHECK: %3:vk64wm = COPY %7 %3 = COPY %7 %4 = VMOVDQU16Zrrk %2, killed %3, %1 VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4 @@ -688,14 +786,13 @@ body: | ... 
--- name: test_64bitext -# CHECK-LABEL: name: test_64bitext alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -704,11 +801,11 @@ registers: - { id: 5, class: gr64, preferred-register: '' } - { id: 6, class: gr64, preferred-register: '' } - { id: 7, class: gr64, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -725,27 +822,35 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | bb.0: liveins: %rdi, %zmm0, %zmm1 - + + ; CHECK-LABEL: name: test_64bitext + ; CHECK: liveins: %rdi, %zmm0, %zmm1 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[COPY3:%[0-9]+]]:vk64 = COPY [[KMOVBkm]] + ; CHECK: [[KMOVWkm:%[0-9]+]]:vk16 = KMOVWkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[COPY4:%[0-9]+]]:vk64 = COPY [[KMOVWkm]] + ; CHECK: [[KADDQrr:%[0-9]+]]:vk64 = KADDQrr [[COPY3]], [[COPY4]] + ; CHECK: [[COPY5:%[0-9]+]]:vk64wm = COPY [[KADDQrr]] + ; CHECK: [[VMOVDQU8Zrrk:%[0-9]+]]:vr512 = VMOVDQU8Zrrk [[COPY2]], killed [[COPY5]], [[COPY1]] + ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU8Zrrk]] + ; CHECK: RET 0 %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 - - ; CHECK: %8:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg - ; CHECK: %5:vk64 = COPY %8 - ; CHECK: %9:vk16 = KMOVWkm %0, 1, %noreg, 0, %noreg - ; CHECK: %6:vk64 = COPY %9 - ; CHECK: %7:vk64 = KADDQrr %5, %6 + %5 = MOVZX64rm8 %0, 1, %noreg, 0, %noreg %6 = MOVZX64rm16 %0, 1, %noreg, 0, %noreg %7 = ADD64rr %5, %6, implicit-def dead %eflags - ; CHECK: %3:vk64wm = COPY %7 %3 = COPY %7 %4 = VMOVDQU8Zrrk %2, killed %3, %1 VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4 diff --git a/test/CodeGen/X86/inline-asm-modifier-V.ll b/test/CodeGen/X86/inline-asm-modifier-V.ll new file mode 100644 index 000000000000..5a7f3fdd25fd --- /dev/null +++ b/test/CodeGen/X86/inline-asm-modifier-V.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=i686-- -no-integrated-as | FileCheck -check-prefix=X86 %s +; RUN: llc < %s -mtriple=x86_64-- -no-integrated-as | FileCheck -check-prefix=X64 %s + +; If the target does not have 64-bit integer registers, emit 32-bit register +; names. 
+ +; X86: call __x86_indirect_thunk_e{{[abcd]}}x +; X64: call __x86_indirect_thunk_r + +define void @q_modifier(i32* %p) { +entry: + tail call void asm sideeffect "call __x86_indirect_thunk_${0:V}", "r,~{dirflag},~{fpsr},~{flags}"(i32* %p) + ret void +} diff --git a/test/CodeGen/X86/pr36199.ll b/test/CodeGen/X86/pr36199.ll new file mode 100644 index 000000000000..84e17dba92e0 --- /dev/null +++ b/test/CodeGen/X86/pr36199.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s + +define void @foo() unnamed_addr #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddps %zmm0, %zmm0, %zmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; CHECK-NEXT: vmovups %zmm0, (%rax) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %1 = fadd <16 x float> undef, undef + %bc256 = bitcast <16 x float> %1 to <4 x i128> + %2 = extractelement <4 x i128> %bc256, i32 0 + %3 = bitcast i128 %2 to <4 x float> + %4 = shufflevector <4 x float> %3, <4 x float> undef, <16 x i32> + store <16 x float> %4, <16 x float>* undef, align 4 + ret void +} diff --git a/test/CodeGen/X86/retpoline-external.ll b/test/CodeGen/X86/retpoline-external.ll index 66d32ba5d73d..2f21bb2566de 100644 --- a/test/CodeGen/X86/retpoline-external.ll +++ b/test/CodeGen/X86/retpoline-external.ll @@ -23,18 +23,18 @@ entry: ; X64: callq bar ; X64-DAG: movl %[[x]], %edi ; X64-DAG: movq %[[fp]], %r11 -; X64: callq __llvm_external_retpoline_r11 +; X64: callq __x86_indirect_thunk_r11 ; X64: movl %[[x]], %edi ; X64: callq bar ; X64-DAG: movl %[[x]], %edi ; X64-DAG: movq %[[fp]], %r11 -; X64: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64: jmp __x86_indirect_thunk_r11 # TAILCALL ; X64FAST-LABEL: icall_reg: ; X64FAST: callq bar -; X64FAST: callq __llvm_external_retpoline_r11 +; X64FAST: callq __x86_indirect_thunk_r11 ; X64FAST: callq bar -; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL ; X86-LABEL: icall_reg: ; X86-DAG: movl 12(%esp), %[[fp:[^ ]*]] @@ -43,19 +43,19 @@ entry: ; X86: calll bar ; X86: movl %[[fp]], %eax ; X86: pushl %[[x]] -; X86: calll __llvm_external_retpoline_eax +; X86: calll __x86_indirect_thunk_eax ; X86: pushl %[[x]] ; X86: calll bar ; X86: movl %[[fp]], %eax ; X86: pushl %[[x]] -; X86: calll __llvm_external_retpoline_eax +; X86: calll __x86_indirect_thunk_eax ; X86-NOT: # TAILCALL ; X86FAST-LABEL: icall_reg: ; X86FAST: calll bar -; X86FAST: calll __llvm_external_retpoline_eax +; X86FAST: calll __x86_indirect_thunk_eax ; X86FAST: calll bar -; X86FAST: calll __llvm_external_retpoline_eax +; X86FAST: calll __x86_indirect_thunk_eax @global_fp = external global void (i32)* @@ -72,28 +72,28 @@ define void @icall_global_fp(i32 %x, void (i32)** %fpp) #0 { ; X64-LABEL: icall_global_fp: ; X64-DAG: movl %edi, %[[x:[^ ]*]] ; X64-DAG: movq global_fp(%rip), %r11 -; X64: callq __llvm_external_retpoline_r11 +; X64: callq __x86_indirect_thunk_r11 ; X64-DAG: movl %[[x]], %edi ; X64-DAG: movq global_fp(%rip), %r11 -; X64: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64: jmp __x86_indirect_thunk_r11 # TAILCALL ; X64FAST-LABEL: icall_global_fp: ; X64FAST: movq global_fp(%rip), %r11 -; X64FAST: callq __llvm_external_retpoline_r11 +; X64FAST: callq __x86_indirect_thunk_r11 ; X64FAST: movq global_fp(%rip), %r11 -; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64FAST: jmp 
__x86_indirect_thunk_r11 # TAILCALL ; X86-LABEL: icall_global_fp: ; X86: movl global_fp, %eax ; X86: pushl 4(%esp) -; X86: calll __llvm_external_retpoline_eax +; X86: calll __x86_indirect_thunk_eax ; X86: addl $4, %esp ; X86: movl global_fp, %eax -; X86: jmp __llvm_external_retpoline_eax # TAILCALL +; X86: jmp __x86_indirect_thunk_eax # TAILCALL ; X86FAST-LABEL: icall_global_fp: -; X86FAST: calll __llvm_external_retpoline_eax -; X86FAST: jmp __llvm_external_retpoline_eax # TAILCALL +; X86FAST: calll __x86_indirect_thunk_eax +; X86FAST: jmp __x86_indirect_thunk_eax # TAILCALL %struct.Foo = type { void (%struct.Foo*)** } @@ -114,14 +114,14 @@ define void @vcall(%struct.Foo* %obj) #0 { ; X64: movq (%[[obj]]), %[[vptr:[^ ]*]] ; X64: movq 8(%[[vptr]]), %[[fp:[^ ]*]] ; X64: movq %[[fp]], %r11 -; X64: callq __llvm_external_retpoline_r11 +; X64: callq __x86_indirect_thunk_r11 ; X64-DAG: movq %[[obj]], %rdi ; X64-DAG: movq %[[fp]], %r11 -; X64: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64: jmp __x86_indirect_thunk_r11 # TAILCALL ; X64FAST-LABEL: vcall: -; X64FAST: callq __llvm_external_retpoline_r11 -; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64FAST: callq __x86_indirect_thunk_r11 +; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL ; X86-LABEL: vcall: ; X86: movl 8(%esp), %[[obj:[^ ]*]] @@ -129,14 +129,14 @@ define void @vcall(%struct.Foo* %obj) #0 { ; X86: movl 4(%[[vptr]]), %[[fp:[^ ]*]] ; X86: movl %[[fp]], %eax ; X86: pushl %[[obj]] -; X86: calll __llvm_external_retpoline_eax +; X86: calll __x86_indirect_thunk_eax ; X86: addl $4, %esp ; X86: movl %[[fp]], %eax -; X86: jmp __llvm_external_retpoline_eax # TAILCALL +; X86: jmp __x86_indirect_thunk_eax # TAILCALL ; X86FAST-LABEL: vcall: -; X86FAST: calll __llvm_external_retpoline_eax -; X86FAST: jmp __llvm_external_retpoline_eax # TAILCALL +; X86FAST: calll __x86_indirect_thunk_eax +; X86FAST: jmp __x86_indirect_thunk_eax # TAILCALL declare void @direct_callee() diff --git a/test/CodeGen/X86/retpoline-regparm.ll b/test/CodeGen/X86/retpoline-regparm.ll new file mode 100644 index 000000000000..13b32740b287 --- /dev/null +++ b/test/CodeGen/X86/retpoline-regparm.ll @@ -0,0 +1,42 @@ +; RUN: llc -mtriple=i686-linux < %s | FileCheck --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" %s + +; Test 32-bit retpoline when -mregparm=3 is used. This case is interesting +; because there are no available scratch registers. The Linux kernel builds +; with -mregparm=3, so we need to support it. TCO should fail because we need +; to restore EDI. + +define void @call_edi(void (i32, i32, i32)* %fp) #0 { +entry: + tail call void %fp(i32 inreg 0, i32 inreg 0, i32 inreg 0) + ret void +} + +; CHECK-LABEL: call_edi: +; EDI is used, so it must be saved. 
+; CHECK: pushl %edi +; CHECK-DAG: xorl %eax, %eax +; CHECK-DAG: xorl %edx, %edx +; CHECK-DAG: xorl %ecx, %ecx +; CHECK-DAG: movl {{.*}}, %edi +; CHECK: calll __llvm_retpoline_edi +; CHECK: popl %edi +; CHECK: retl + +define void @edi_external(void (i32, i32, i32)* %fp) #1 { +entry: + tail call void %fp(i32 inreg 0, i32 inreg 0, i32 inreg 0) + ret void +} + +; CHECK-LABEL: edi_external: +; CHECK: pushl %edi +; CHECK-DAG: xorl %eax, %eax +; CHECK-DAG: xorl %edx, %edx +; CHECK-DAG: xorl %ecx, %ecx +; CHECK-DAG: movl {{.*}}, %edi +; CHECK: calll __x86_indirect_thunk_edi +; CHECK: popl %edi +; CHECK: retl + +attributes #0 = { "target-features"="+retpoline" } +attributes #1 = { "target-features"="+retpoline-external-thunk" } diff --git a/test/CodeGen/X86/retpoline.ll b/test/CodeGen/X86/retpoline.ll index 57d3388b812a..477609e2d10b 100644 --- a/test/CodeGen/X86/retpoline.ll +++ b/test/CodeGen/X86/retpoline.ll @@ -340,10 +340,10 @@ latch: ; X86-NEXT: movl %edx, (%esp) ; X86-NEXT: retl ; -; X86-LABEL: .section .text.__llvm_retpoline_push,{{.*}},__llvm_retpoline_push,comdat -; X86-NEXT: .hidden __llvm_retpoline_push -; X86-NEXT: .weak __llvm_retpoline_push -; X86: __llvm_retpoline_push: +; X86-LABEL: .section .text.__llvm_retpoline_edi,{{.*}},__llvm_retpoline_edi,comdat +; X86-NEXT: .hidden __llvm_retpoline_edi +; X86-NEXT: .weak __llvm_retpoline_edi +; X86: __llvm_retpoline_edi: ; X86-NEXT: # {{.*}} # %entry ; X86-NEXT: calll [[CALL_TARGET:.*]] ; X86-NEXT: [[CAPTURE_SPEC:.*]]: # Block address taken @@ -355,11 +355,7 @@ latch: ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: [[CALL_TARGET]]: # Block address taken ; X86-NEXT: # %entry -; X86-NEXT: addl $4, %esp -; X86-NEXT: pushl 4(%esp) -; X86-NEXT: pushl 4(%esp) -; X86-NEXT: popl 8(%esp) -; X86-NEXT: popl (%esp) +; X86-NEXT: movl %edi, (%esp) ; X86-NEXT: retl diff --git a/test/DebugInfo/X86/void-typedef.ll b/test/DebugInfo/X86/void-typedef.ll new file mode 100644 index 000000000000..2e6bf49bae78 --- /dev/null +++ b/test/DebugInfo/X86/void-typedef.ll @@ -0,0 +1,88 @@ +; Choosing CodeView generates debug metadata for class-scope typedefs that +; Dwarf would normally omit. Choosing both CodeView and Dwarf triggered +; assertion failures and crashes because the Dwarf handler wasn't prepared for +; those records (in particular, ones with the void type represented by a +; null pointer). +; +; This test was generated with: +; clang++ -cc1 -emit-llvm -debug-info-kind=limited -dwarf-version=4 -gcodeview -x c++ +; on the following source code: +; +; class A { +; typedef void _Nodeptr; +; }; +; class B { +; A FailedTestsCache; +; bool m_fn1(); +; }; +; bool B::m_fn1() {} +; +; CodeView generates a DIDerivedType for the _Nodeptr typedef. +; +; RUN: llc %s -o - 2>&1 | FileCheck %s +; CHECK-NOT: Assertion failed + +; ModuleID = 'bug.cpp' +source_filename = "bug.cpp" +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i686-pc-windows-msvc" + +%class.B = type { %class.A } +%class.A = type { i8 } + +; Function Attrs: noinline nounwind optnone +define x86_thiscallcc zeroext i1 @"\01?m_fn1@B@@AAE_NXZ"(%class.B* %this) #0 align 2 !dbg !9 { +entry: + %retval = alloca i1, align 1 + %this.addr = alloca %class.B*, align 4 + store %class.B* %this, %class.B** %this.addr, align 4 + call void @llvm.dbg.declare(metadata %class.B** %this.addr, metadata !22, metadata !DIExpression()), !dbg !24 + %this1 = load %class.B*, %class.B** %this.addr, align 4 + call void @llvm.trap(), !dbg !25 + unreachable, !dbg !25 + +return: ; No predecessors! 
+ %0 = load i1, i1* %retval, align 1, !dbg !25 + ret i1 %0, !dbg !25 +} + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +; Function Attrs: noreturn nounwind +declare void @llvm.trap() #2 + +attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone speculatable } +attributes #2 = { noreturn nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5, !6, !7} +!llvm.ident = !{!8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "", directory: "D:\5Csrc\5Cbug", checksumkind: CSK_MD5, checksum: "2216f11c5ddda8c48a6f92a6079ad4b6") +!2 = !{} +!3 = !{i32 1, !"NumRegisterParameters", i32 0} +!4 = !{i32 2, !"Dwarf Version", i32 4} +!5 = !{i32 2, !"CodeView", i32 1} +!6 = !{i32 2, !"Debug Info Version", i32 3} +!7 = !{i32 1, !"wchar_size", i32 2} +!8 = !{!"clang version 6.0.0 "} +!9 = distinct !DISubprogram(name: "m_fn1", linkageName: "\01?m_fn1@B@@AAE_NXZ", scope: !11, file: !10, line: 8, type: !18, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !17, variables: !2) +!10 = !DIFile(filename: "bug.cpp", directory: "D:\5Csrc\5Cbug", checksumkind: CSK_MD5, checksum: "2216f11c5ddda8c48a6f92a6079ad4b6") +!11 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "B", file: !10, line: 4, size: 8, elements: !12, identifier: ".?AVB@@") +!12 = !{!13, !17} +!13 = !DIDerivedType(tag: DW_TAG_member, name: "FailedTestsCache", scope: !11, file: !10, line: 5, baseType: !14, size: 8) +!14 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "A", file: !10, line: 1, size: 8, elements: !15, identifier: ".?AVA@@") +!15 = !{!16} +!16 = !DIDerivedType(tag: DW_TAG_typedef, name: "_Nodeptr", scope: !14, file: !10, line: 2, baseType: null) +!17 = !DISubprogram(name: "m_fn1", linkageName: "\01?m_fn1@B@@AAE_NXZ", scope: !11, file: !10, line: 6, type: !18, isLocal: false, isDefinition: false, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: false) +!18 = !DISubroutineType(cc: DW_CC_BORLAND_thiscall, types: !19) +!19 = !{!20, !21} +!20 = !DIBasicType(name: "bool", size: 8, encoding: DW_ATE_boolean) +!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 32, flags: DIFlagArtificial | DIFlagObjectPointer) +!22 = !DILocalVariable(name: "this", arg: 1, scope: !9, type: !23, flags: DIFlagArtificial | DIFlagObjectPointer) +!23 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 32) +!24 = !DILocation(line: 0, scope: !9) +!25 = !DILocation(line: 8, scope: !9) diff --git a/test/MC/AsmParser/inline_macro_duplication.ll b/test/MC/AsmParser/inline_macro_duplication.ll new file mode 100644 index 000000000000..9d7e22fde7b6 --- /dev/null +++ b/test/MC/AsmParser/inline_macro_duplication.ll @@ -0,0 +1,8 @@ +; RUN: not llc < %s 2>&1 | FileCheck %s + +define void @test() { + call void asm sideeffect ".macro FOO\0A.endm", "~{dirflag},~{fpsr},~{flags}"() #1 + call void asm sideeffect ".macro FOO\0A.endm", 
"~{dirflag},~{fpsr},~{flags}"() #1 +; CHECK: error: macro 'FOO' is already defined + ret void +} diff --git a/test/MC/X86/x86-64.s b/test/MC/X86/x86-64.s index 378af768fa99..01cd6b6fa006 100644 --- a/test/MC/X86/x86-64.s +++ b/test/MC/X86/x86-64.s @@ -622,6 +622,11 @@ movl $12, foo(%rip) // CHECK: encoding: [0xc7,0x05,A,A,A,A,0x0c,0x00,0x00,0x00] // CHECK: fixup A - offset: 2, value: foo-8, kind: reloc_riprel_4byte +// rdar://37247000 +movl $12, 1024(%rip) +// CHECK: movl $12, 1024(%rip) +// CHECK: encoding: [0xc7,0x05,0x00,0x04,0x00,0x00,0x0c,0x00,0x00,0x00] + movq $12, foo(%rip) // CHECK: movq $12, foo(%rip) // CHECK: encoding: [0x48,0xc7,0x05,A,A,A,A,0x0c,0x00,0x00,0x00] diff --git a/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll index f82bf81fbbf8..c8a05204bf5e 100644 --- a/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ b/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -722,6 +722,114 @@ define <2 x half> @constant_rtz_pkrtz() { ret <2 x half> %cvt } +; -------------------------------------------------------------------- +; llvm.amdgcn.cvt.pknorm.i16 +; -------------------------------------------------------------------- + +declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) nounwind readnone + +; CHECK-LABEL: @undef_lhs_cvt_pknorm_i16( +; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y) +define <2 x i16> @undef_lhs_cvt_pknorm_i16(float %y) { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y) + ret <2 x i16> %cvt +} + +; CHECK-LABEL: @undef_rhs_cvt_pknorm_i16( +; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef) +define <2 x i16> @undef_rhs_cvt_pknorm_i16(float %x) { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef) + ret <2 x i16> %cvt +} + +; CHECK-LABEL: @undef_cvt_pknorm_i16( +; CHECK: ret <2 x i16> undef +define <2 x i16> @undef_cvt_pknorm_i16() { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float undef) + ret <2 x i16> %cvt +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.cvt.pknorm.u16 +; -------------------------------------------------------------------- + +declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) nounwind readnone + +; CHECK-LABEL: @undef_lhs_cvt_pknorm_u16( +; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y) +define <2 x i16> @undef_lhs_cvt_pknorm_u16(float %y) { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y) + ret <2 x i16> %cvt +} + +; CHECK-LABEL: @undef_rhs_cvt_pknorm_u16( +; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef) +define <2 x i16> @undef_rhs_cvt_pknorm_u16(float %x) { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef) + ret <2 x i16> %cvt +} + +; CHECK-LABEL: @undef_cvt_pknorm_u16( +; CHECK: ret <2 x i16> undef +define <2 x i16> @undef_cvt_pknorm_u16() { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float undef) + ret <2 x i16> %cvt +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.cvt.pk.i16 +; -------------------------------------------------------------------- + +declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) nounwind readnone + +; CHECK-LABEL: @undef_lhs_cvt_pk_i16( +; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y) +define <2 x i16> @undef_lhs_cvt_pk_i16(i32 %y) { + 
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y) + ret <2 x i16> %cvt +} + +; CHECK-LABEL: @undef_rhs_cvt_pk_i16( +; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef) +define <2 x i16> @undef_rhs_cvt_pk_i16(i32 %x) { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef) + ret <2 x i16> %cvt +} + +; CHECK-LABEL: @undef_cvt_pk_i16( +; CHECK: ret <2 x i16> undef +define <2 x i16> @undef_cvt_pk_i16() { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 undef) + ret <2 x i16> %cvt +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.cvt.pk.u16 +; -------------------------------------------------------------------- + +declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) nounwind readnone + +; CHECK-LABEL: @undef_lhs_cvt_pk_u16( +; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y) +define <2 x i16> @undef_lhs_cvt_pk_u16(i32 %y) { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y) + ret <2 x i16> %cvt +} + +; CHECK-LABEL: @undef_rhs_cvt_pk_u16( +; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 undef) +define <2 x i16> @undef_rhs_cvt_pk_u16(i32 %x) { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 undef) + ret <2 x i16> %cvt +} + +; CHECK-LABEL: @undef_cvt_pk_u16( +; CHECK: ret <2 x i16> undef +define <2 x i16> @undef_cvt_pk_u16() { + %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 undef) + ret <2 x i16> %cvt +} + ; -------------------------------------------------------------------- ; llvm.amdgcn.ubfe ; --------------------------------------------------------------------