Vendor import of llvm release_60 branch r325330:

https://llvm.org/svn/llvm-project/llvm/branches/release_60@325330
This commit is contained in:
Dimitry Andric 2018-02-16 19:10:15 +00:00
parent 6d18171c19
commit 3c315f3a8e
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/vendor/llvm/dist-release_60/; revision=329394
svn path=/vendor/llvm/llvm-release_60-r325330/; revision=329395; tag=vendor/llvm/llvm-release_60-r325330
55 changed files with 2116 additions and 866 deletions

View File

@ -71,6 +71,13 @@ Non-comprehensive list of changes in this release
Changes to the LLVM IR
----------------------
Changes to the AArch64 Target
-----------------------------
During this release:
* Enabled the new GlobalISel instruction selection framework by default at ``-O0``.
Changes to the ARM Target
-------------------------
@ -80,6 +87,28 @@ During this release the ARM target has:
isn't the default.
Changes to the Hexagon Target
-----------------------------
* The Hexagon backend now supports V65 ISA.
* The ``-mhvx`` option now takes an optional value that specifies the ISA
version of the HVX coprocessor. The available values are v60, v62 and v65.
By default, the value is set to be the same as the CPU version.
* The compiler option ``-mhvx-double`` is deprecated and will be removed in
the next release of the compiler. Programmers should use the ``-mhvx-length``
option to specify the desired vector length: ``-mhvx-length=64b`` for
64-byte vectors and ``-mhvx-length=128b`` for 128-byte vectors. While the
current default vector length is 64 bytes, users should always specify the
length explicitly, since the default value may change in the future (see the
example after this list).
* The target feature ``hvx-double`` is deprecated and will be removed in the
next release. LLVM IR generators should use target features ``hvx-length64b``
and ``hvx-length128b`` to indicate the vector length. The length should
always be specified when HVX code generation is enabled.
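For example (hypothetical invocations; the exact target and CPU spellings
depend on the driver, but the HVX flags are as documented above):

    clang --target=hexagon -mcpu=hexagonv62 -mhvx -mhvx-length=128b -c f.c
    clang --target=hexagon -mcpu=hexagonv65 -mhvx=v62 -mhvx-length=64b -c f.c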
Changes to the MIPS Target
--------------------------
@ -91,6 +120,15 @@ Changes to the PowerPC Target
During this release ...
Changes to the SystemZ Target
-----------------------------
During this release the SystemZ target has:
* Added support for 128-bit atomic operations (see the sketch after this list).
* Added support for the "o" constraint for inline asm statements.
Changes to the X86 Target
-------------------------

View File

@ -238,6 +238,26 @@ def int_amdgcn_cvt_pkrtz : Intrinsic<
[IntrNoMem, IntrSpeculatable]
>;
def int_amdgcn_cvt_pknorm_i16 : Intrinsic<
[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
[IntrNoMem, IntrSpeculatable]
>;
def int_amdgcn_cvt_pknorm_u16 : Intrinsic<
[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
[IntrNoMem, IntrSpeculatable]
>;
def int_amdgcn_cvt_pk_i16 : Intrinsic<
[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]
>;
def int_amdgcn_cvt_pk_u16 : Intrinsic<
[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]
>;
def int_amdgcn_class : Intrinsic<
[llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]

View File

@ -3738,6 +3738,15 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_kxnor_w : // TODO: remove this intrinsic
Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
[IntrNoMem]>;
def int_x86_avx512_kunpck_bw : GCCBuiltin<"__builtin_ia32_kunpckhi">,
Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
[IntrNoMem]>;
def int_x86_avx512_kunpck_wd : GCCBuiltin<"__builtin_ia32_kunpcksi">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem]>;
def int_x86_avx512_kunpck_dq : GCCBuiltin<"__builtin_ia32_kunpckdi">,
Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
[IntrNoMem]>;
def int_x86_avx512_kortestz_w : GCCBuiltin<"__builtin_ia32_kortestzhi">,
Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty],
[IntrNoMem]>;

View File

@ -0,0 +1,38 @@
//===- MCAsmMacro.h - Assembly Macros ---------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_MC_MCASMMACRO_H
#define LLVM_MC_MCASMMACRO_H
#include "llvm/MC/MCParser/MCAsmLexer.h"
namespace llvm {
struct MCAsmMacroParameter {
StringRef Name;
std::vector<AsmToken> Value;
bool Required = false;
bool Vararg = false;
MCAsmMacroParameter() = default;
};
typedef std::vector<MCAsmMacroParameter> MCAsmMacroParameters;
struct MCAsmMacro {
StringRef Name;
StringRef Body;
MCAsmMacroParameters Parameters;
public:
MCAsmMacro(StringRef N, StringRef B, MCAsmMacroParameters P)
: Name(N), Body(B), Parameters(std::move(P)) {}
};
} // namespace llvm
#endif

View File

@ -18,6 +18,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/MC/MCAsmMacro.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/SectionKind.h"
@ -268,6 +269,9 @@ namespace llvm {
unsigned UniqueID,
const MCSymbolELF *Associated);
/// \brief Map of currently defined macros.
StringMap<MCAsmMacro> MacroMap;
public:
explicit MCContext(const MCAsmInfo *MAI, const MCRegisterInfo *MRI,
const MCObjectFileInfo *MOFI,
@ -618,6 +622,17 @@ namespace llvm {
// FIXME: We should really do something about that.
LLVM_ATTRIBUTE_NORETURN void reportFatalError(SMLoc L,
const Twine &Msg);
const MCAsmMacro *lookupMacro(StringRef Name) {
StringMap<MCAsmMacro>::iterator I = MacroMap.find(Name);
return (I == MacroMap.end()) ? nullptr : &I->getValue();
}
void defineMacro(StringRef Name, MCAsmMacro Macro) {
MacroMap.insert(std::make_pair(Name, std::move(Macro)));
}
void undefineMacro(StringRef Name) { MacroMap.erase(Name); }
};
} // end namespace llvm
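A minimal sketch (not part of the patch) of how a client might use the
relocated macro table through MCContext:

    #include "llvm/MC/MCContext.h"

    // Define Name -> Body as a parameterless macro unless one already exists;
    // a real parser would diagnose the redefinition instead.
    static bool tryDefineMacro(llvm::MCContext &Ctx, llvm::StringRef Name,
                               llvm::StringRef Body) {
      if (Ctx.lookupMacro(Name))
        return false;
      Ctx.defineMacro(Name, llvm::MCAsmMacro(Name, Body,
                                             llvm::MCAsmMacroParameters()));
      return true;
    }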

View File

@ -698,24 +698,20 @@ struct SemiNCAInfo {
return;
// Recalculate the set of roots.
DT.Roots = FindRoots(DT, BUI);
for (const NodePtr R : DT.Roots) {
const TreeNodePtr TN = DT.getNode(R);
// A CFG node was selected as a tree root, but the corresponding tree node
// is not connected to the virtual root. This is because the incremental
// algorithm does not really know or use the set of roots and can make a
different (implicit) decision about which node within an infinite loop
becomes a root.
if (TN && !DT.isVirtualRoot(TN->getIDom())) {
DEBUG(dbgs() << "Root " << BlockNamePrinter(R)
<< " is not virtual root's child\n"
<< "The entire tree needs to be rebuilt\n");
// It should be possible to rotate the subtree instead of recalculating
// the whole tree, but this situation happens extremely rarely in
// practice.
CalculateFromScratch(DT, BUI);
return;
}
auto Roots = FindRoots(DT, BUI);
if (DT.Roots.size() != Roots.size() ||
!std::is_permutation(DT.Roots.begin(), DT.Roots.end(), Roots.begin())) {
// The roots chosen in the CFG have changed. This is because the
// incremental algorithm does not really know or use the set of roots and
// can make a different (implicit) decision about which node within an
// infinite loop becomes a root.
DEBUG(dbgs() << "Roots are different in updated trees\n"
<< "The entire tree needs to be rebuilt\n");
// It may be possible to update the tree without recalculating it, but
// we do not know yet how to do it, and it happens rarely in practice.
CalculateFromScratch(DT, BUI);
return;
}
}
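The new check is deliberately order-insensitive; a self-contained
illustration of the same idiom:

    #include <algorithm>
    #include <vector>

    // Mirrors the comparison above: equal sizes plus std::is_permutation
    // treat {A, B} and {B, A} as the same root set.
    bool sameRootSet(const std::vector<int> &Old, const std::vector<int> &New) {
      return Old.size() == New.size() &&
             std::is_permutation(Old.begin(), Old.end(), New.begin());
    }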

View File

@ -163,7 +163,8 @@ uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) {
DIType *BaseType = DDTy->getBaseType().resolve();
assert(BaseType && "Unexpected invalid base type");
if (!BaseType)
return 0;
// If this is a derived type, go ahead and get the base type, unless it's a
// reference then it's just the size of the field. Pointer types have no need

View File

@ -1391,7 +1391,8 @@ void DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) {
if (!Name.empty())
addString(MemberDie, dwarf::DW_AT_name, Name);
addType(MemberDie, resolve(DT->getBaseType()));
if (DIType *Resolved = resolve(DT->getBaseType()))
addType(MemberDie, Resolved);
addSourceLine(MemberDie, DT);

View File

@ -205,14 +205,18 @@ void LivePhysRegs::addPristines(const MachineFunction &MF) {
}
void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) {
if (!MBB.succ_empty()) {
// To get the live-outs we simply merge the live-ins of all successors.
for (const MachineBasicBlock *Succ : MBB.successors())
addBlockLiveIns(*Succ);
} else if (MBB.isReturnBlock()) {
// For the return block: Add all callee saved registers that are saved and
// restored (somewhere); This does not include callee saved registers that
// are unused and hence not saved and restored; they are called pristine.
// To get the live-outs we simply merge the live-ins of all successors.
for (const MachineBasicBlock *Succ : MBB.successors())
addBlockLiveIns(*Succ);
if (MBB.isReturnBlock()) {
// Return blocks are a special case because we currently don't mark up
// return instructions completely: specifically, there is no explicit
// use for callee-saved registers. So we add all callee saved registers
// that are saved and restored (somewhere). This does not include
// callee saved registers that are unused and hence not saved and
// restored; they are called pristine.
// FIXME: PEI should add explicit markings to return instructions
// instead of implicitly handling them here.
const MachineFunction &MF = *MBB.getParent();
const MachineFrameInfo &MFI = MF.getFrameInfo();
if (MFI.isCalleeSavedInfoValid()) {
@ -225,15 +229,8 @@ void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) {
void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) {
const MachineFunction &MF = *MBB.getParent();
if (!MBB.succ_empty()) {
addPristines(MF);
addLiveOutsNoPristines(MBB);
} else if (MBB.isReturnBlock()) {
// For the return block: Add all callee saved registers.
const MachineFrameInfo &MFI = MF.getFrameInfo();
if (MFI.isCalleeSavedInfoValid())
addCalleeSavedRegs(*this, MF);
}
addPristines(MF);
addLiveOutsNoPristines(MBB);
}
void LivePhysRegs::addLiveIns(const MachineBasicBlock &MBB) {

View File

@ -16409,7 +16409,9 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N1.getOperand(0).getOperand(1) == N2 &&
N1.getOperand(0).getOperand(0).getValueType().getVectorNumElements() ==
VT.getVectorNumElements()) {
VT.getVectorNumElements() &&
N1.getOperand(0).getOperand(0).getValueType().getSizeInBits() ==
VT.getSizeInBits()) {
return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0));
}

View File

@ -491,9 +491,8 @@ VNInfo *SplitEditor::defValue(unsigned RegIdx,
return VNI;
}
void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) {
assert(ParentVNI && "Mapping NULL value");
ValueForcePair &VFP = Values[std::make_pair(RegIdx, ParentVNI->id)];
void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI) {
ValueForcePair &VFP = Values[std::make_pair(RegIdx, ParentVNI.id)];
VNInfo *VNI = VFP.getPointer();
// ParentVNI was either unmapped or already complex mapped. Either way, just
@ -777,7 +776,7 @@ SlotIndex SplitEditor::leaveIntvAfter(SlotIndex Idx) {
// the source live range. The spiller also won't try to hoist this copy.
if (SpillMode && !SlotIndex::isSameInstr(ParentVNI->def, Idx) &&
MI->readsVirtualRegister(Edit->getReg())) {
forceRecompute(0, ParentVNI);
forceRecompute(0, *ParentVNI);
defFromParent(0, ParentVNI, Idx, *MI->getParent(), MI);
return Idx;
}
@ -835,7 +834,7 @@ void SplitEditor::overlapIntv(SlotIndex Start, SlotIndex End) {
// The complement interval will be extended as needed by LRCalc.extend().
if (ParentVNI)
forceRecompute(0, ParentVNI);
forceRecompute(0, *ParentVNI);
DEBUG(dbgs() << " overlapIntv [" << Start << ';' << End << "):");
RegAssign.insert(Start, End, OpenIdx);
DEBUG(dump());
@ -878,7 +877,7 @@ void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
unsigned RegIdx = AssignI.value();
if (AtBegin || !MBBI->readsVirtualRegister(Edit->getReg())) {
DEBUG(dbgs() << " cannot find simple kill of RegIdx " << RegIdx << '\n');
forceRecompute(RegIdx, Edit->getParent().getVNInfoAt(Def));
forceRecompute(RegIdx, *Edit->getParent().getVNInfoAt(Def));
} else {
SlotIndex Kill = LIS.getInstructionIndex(*MBBI).getRegSlot();
DEBUG(dbgs() << " move kill to " << Kill << '\t' << *MBBI);
@ -982,7 +981,7 @@ void SplitEditor::computeRedundantBackCopies(
}
}
if (!DominatedVNIs.empty()) {
forceRecompute(0, ParentVNI);
forceRecompute(0, *ParentVNI);
for (auto VNI : DominatedVNIs) {
BackCopies.push_back(VNI);
}
@ -1102,7 +1101,7 @@ void SplitEditor::hoistCopies() {
NotToHoistSet.count(ParentVNI->id))
continue;
BackCopies.push_back(VNI);
forceRecompute(0, ParentVNI);
forceRecompute(0, *ParentVNI);
}
// If it is not beneficial to hoist all the BackCopies, simply remove
@ -1428,6 +1427,41 @@ void SplitEditor::deleteRematVictims() {
Edit->eliminateDeadDefs(Dead, None, &AA);
}
void SplitEditor::forceRecomputeVNI(const VNInfo &ParentVNI) {
// Fast-path for common case.
if (!ParentVNI.isPHIDef()) {
for (unsigned I = 0, E = Edit->size(); I != E; ++I)
forceRecompute(I, ParentVNI);
return;
}
// Trace value through phis.
SmallPtrSet<const VNInfo *, 8> Visited; ///< whether VNI was/is in worklist.
SmallVector<const VNInfo *, 4> WorkList;
Visited.insert(&ParentVNI);
WorkList.push_back(&ParentVNI);
const LiveInterval &ParentLI = Edit->getParent();
const SlotIndexes &Indexes = *LIS.getSlotIndexes();
do {
const VNInfo &VNI = *WorkList.back();
WorkList.pop_back();
for (unsigned I = 0, E = Edit->size(); I != E; ++I)
forceRecompute(I, VNI);
if (!VNI.isPHIDef())
continue;
MachineBasicBlock &MBB = *Indexes.getMBBFromIndex(VNI.def);
for (const MachineBasicBlock *Pred : MBB.predecessors()) {
SlotIndex PredEnd = Indexes.getMBBEndIdx(Pred);
VNInfo *PredVNI = ParentLI.getVNInfoBefore(PredEnd);
assert(PredVNI && "Value available in PhiVNI predecessor");
if (Visited.insert(PredVNI).second)
WorkList.push_back(PredVNI);
}
} while(!WorkList.empty());
}
void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
++NumFinished;
@ -1444,8 +1478,7 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
// Force rematted values to be recomputed everywhere.
// The new live ranges may be truncated.
if (Edit->didRematerialize(ParentVNI))
for (unsigned i = 0, e = Edit->size(); i != e; ++i)
forceRecompute(i, ParentVNI);
forceRecomputeVNI(*ParentVNI);
}
// Hoist back-copies to the complement interval when in spill mode.

View File

@ -357,7 +357,11 @@ class LLVM_LIBRARY_VISIBILITY SplitEditor {
/// recomputed by LiveRangeCalc::extend regardless of the number of defs.
/// This is used for values whose live range doesn't match RegAssign exactly.
/// They could have rematerialized, or back-copies may have been moved.
void forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI);
void forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI);
/// Calls forceRecompute() on any affected regidx and on ParentVNI
/// predecessors in case of a phi definition.
void forceRecomputeVNI(const VNInfo &ParentVNI);
/// defFromParent - Define Reg from ParentVNI at UseIdx using either
/// rematerialization or a COPY from parent. Return the new value.

View File

@ -75,7 +75,6 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name=="ssse3.pabs.d.128" || // Added in 6.0
Name.startswith("avx512.mask.shuf.i") || // Added in 6.0
Name.startswith("avx512.mask.shuf.f") || // Added in 6.0
Name.startswith("avx512.kunpck") || //added in 6.0
Name.startswith("avx2.pabs.") || // Added in 6.0
Name.startswith("avx512.mask.pabs.") || // Added in 6.0
Name.startswith("avx512.broadcastm") || // Added in 6.0
@ -1063,12 +1062,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0));
Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
CI->getArgOperand(1));
} else if (IsX86 && (Name.startswith("avx512.kunpck"))) {
uint64_t Shift = CI->getType()->getScalarSizeInBits() / 2;
uint64_t And = (1ULL << Shift) - 1;
Value* LowBits = Builder.CreateAnd(CI->getArgOperand(0), And);
Value* HighBits = Builder.CreateShl(CI->getArgOperand(1), Shift);
Rep = Builder.CreateOr(LowBits, HighBits);
} else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) {
Type *I32Ty = Type::getInt32Ty(C);
Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0),
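The deleted auto-upgrade snippet above spells out the scalar semantics of
these unpacks; a standalone model of the 16-bit (bw) case, derived directly
from that snippet:

    #include <cstdint>

    // Shift = scalar bits / 2; the result keeps the low half of the first
    // operand and moves the second operand into the high half.
    uint16_t kunpck_bw_model(uint16_t A, uint16_t B) {
      const unsigned Shift = 8;
      const uint16_t Low = A & ((1u << Shift) - 1);
      return static_cast<uint16_t>(Low | (B << Shift));
    }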

View File

@ -83,27 +83,6 @@ namespace {
typedef std::vector<AsmToken> MCAsmMacroArgument;
typedef std::vector<MCAsmMacroArgument> MCAsmMacroArguments;
struct MCAsmMacroParameter {
StringRef Name;
MCAsmMacroArgument Value;
bool Required = false;
bool Vararg = false;
MCAsmMacroParameter() = default;
};
typedef std::vector<MCAsmMacroParameter> MCAsmMacroParameters;
struct MCAsmMacro {
StringRef Name;
StringRef Body;
MCAsmMacroParameters Parameters;
public:
MCAsmMacro(StringRef N, StringRef B, MCAsmMacroParameters P)
: Name(N), Body(B), Parameters(std::move(P)) {}
};
/// \brief Helper class for storing information about an active macro
/// instantiation.
struct MacroInstantiation {
@ -164,9 +143,6 @@ class AsmParser : public MCAsmParser {
/// addDirectiveHandler.
StringMap<ExtensionDirectiveHandler> ExtensionDirectiveMap;
/// \brief Map of currently defined macros.
StringMap<MCAsmMacro> MacroMap;
/// \brief Stack of active macro instantiations.
std::vector<MacroInstantiation*> ActiveMacros;
@ -308,17 +284,6 @@ class AsmParser : public MCAsmParser {
/// \brief Control a flag in the parser that enables or disables macros.
void setMacrosEnabled(bool Flag) {MacrosEnabledFlag = Flag;}
/// \brief Lookup a previously defined macro.
/// \param Name Macro name.
/// \returns Pointer to macro. NULL if no such macro was defined.
const MCAsmMacro* lookupMacro(StringRef Name);
/// \brief Define a new macro with the given name and information.
void defineMacro(StringRef Name, MCAsmMacro Macro);
/// \brief Undefine a macro. If no such macro was defined, it's a no-op.
void undefineMacro(StringRef Name);
/// \brief Are we inside a macro instantiation?
bool isInsideMacroInstantiation() {return !ActiveMacros.empty();}
@ -1841,7 +1806,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
// If macros are enabled, check to see if this is a macro instantiation.
if (areMacrosEnabled())
if (const MCAsmMacro *M = lookupMacro(IDVal)) {
if (const MCAsmMacro *M = getContext().lookupMacro(IDVal)) {
return handleMacroEntry(M, IDLoc);
}
@ -2720,17 +2685,6 @@ bool AsmParser::parseMacroArguments(const MCAsmMacro *M,
return TokError("too many positional arguments");
}
const MCAsmMacro *AsmParser::lookupMacro(StringRef Name) {
StringMap<MCAsmMacro>::iterator I = MacroMap.find(Name);
return (I == MacroMap.end()) ? nullptr : &I->getValue();
}
void AsmParser::defineMacro(StringRef Name, MCAsmMacro Macro) {
MacroMap.insert(std::make_pair(Name, std::move(Macro)));
}
void AsmParser::undefineMacro(StringRef Name) { MacroMap.erase(Name); }
bool AsmParser::handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
// Arbitrarily limit macro nesting depth (default matches 'as'). We can
// eliminate this, although we should protect against infinite loops.
@ -4249,7 +4203,7 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
eatToEndOfStatement();
}
if (lookupMacro(Name)) {
if (getContext().lookupMacro(Name)) {
return Error(DirectiveLoc, "macro '" + Name + "' is already defined");
}
@ -4257,7 +4211,7 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
const char *BodyEnd = EndToken.getLoc().getPointer();
StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
checkForBadMacro(DirectiveLoc, Name, Body, Parameters);
defineMacro(Name, MCAsmMacro(Name, Body, std::move(Parameters)));
getContext().defineMacro(Name, MCAsmMacro(Name, Body, std::move(Parameters)));
return false;
}
@ -4416,10 +4370,10 @@ bool AsmParser::parseDirectivePurgeMacro(SMLoc DirectiveLoc) {
"unexpected token in '.purgem' directive"))
return true;
if (!lookupMacro(Name))
if (!getContext().lookupMacro(Name))
return Error(DirectiveLoc, "macro '" + Name + "' is not defined");
undefineMacro(Name);
getContext().undefineMacro(Name);
return false;
}

View File

@ -3957,6 +3957,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_F32_UBYTE2)
NODE_NAME_CASE(CVT_F32_UBYTE3)
NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
NODE_NAME_CASE(CVT_PKNORM_I16_F32)
NODE_NAME_CASE(CVT_PKNORM_U16_F32)
NODE_NAME_CASE(CVT_PK_I16_I32)
NODE_NAME_CASE(CVT_PK_U16_U32)
NODE_NAME_CASE(FP_TO_FP16)
NODE_NAME_CASE(FP16_ZEXT)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)

View File

@ -417,6 +417,10 @@ enum NodeType : unsigned {
// Convert two float 32 numbers into a single register holding two packed f16
// with round to zero.
CVT_PKRTZ_F16_F32,
CVT_PKNORM_I16_F32,
CVT_PKNORM_U16_F32,
CVT_PK_I16_I32,
CVT_PK_U16_U32,
// Same as the standard node, except the high bits of the resulting integer
// are known 0.

View File

@ -108,3 +108,21 @@ int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
return MCOp;
}
// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
bool AMDGPUInstrInfo::isUniformMMO(const MachineMemOperand *MMO) {
const Value *Ptr = MMO->getValue();
// UndefValue means this is a load of a kernel input. These are uniform.
// Sometimes LDS instructions have constant pointers.
// If Ptr is null, then that means this mem operand contains a
// PseudoSourceValue like GOT.
if (!Ptr || isa<UndefValue>(Ptr) ||
isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
return true;
if (const Argument *Arg = dyn_cast<Argument>(Ptr))
return AMDGPU::isArgPassedInSGPR(Arg);
const Instruction *I = dyn_cast<Instruction>(Ptr);
return I && I->getMetadata("amdgpu.uniform");
}

View File

@ -50,6 +50,8 @@ class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.
int pseudoToMCOpcode(int Opcode) const;
static bool isUniformMMO(const MachineMemOperand *MMO);
};
} // End llvm namespace

View File

@ -35,6 +35,10 @@ def AMDGPUFPPackOp : SDTypeProfile<1, 2,
[SDTCisFP<1>, SDTCisSameAs<1, 2>]
>;
def AMDGPUIntPackOp : SDTypeProfile<1, 2,
[SDTCisInt<1>, SDTCisSameAs<1, 2>]
>;
def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
[SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
>;
@ -142,6 +146,10 @@ def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
def AMDGPUpknorm_i16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>;
def AMDGPUpknorm_u16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>;
def AMDGPUpk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
def AMDGPUpk_u16_u32 : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;

View File

@ -120,7 +120,7 @@ static bool isInstrUniform(const MachineInstr &MI) {
return false;
const MachineMemOperand *MMO = *MI.memoperands_begin();
return AMDGPU::isUniformMMO(MMO);
return AMDGPUInstrInfo::isUniformMMO(MMO);
}
const RegisterBankInfo::InstructionMapping &

View File

@ -205,6 +205,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@ -1085,7 +1086,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
return AMDGPU::isUniformMMO(MemNode->getMemOperand());
return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
}
TargetLoweringBase::LegalizeTypeAction
@ -3517,7 +3518,8 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
if (IID == Intrinsic::amdgcn_cvt_pkrtz) {
switch (IID) {
case Intrinsic::amdgcn_cvt_pkrtz: {
SDValue Src0 = N->getOperand(1);
SDValue Src1 = N->getOperand(2);
SDLoc SL(N);
@ -3526,6 +3528,29 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
return;
}
case Intrinsic::amdgcn_cvt_pknorm_i16:
case Intrinsic::amdgcn_cvt_pknorm_u16:
case Intrinsic::amdgcn_cvt_pk_i16:
case Intrinsic::amdgcn_cvt_pk_u16: {
SDValue Src0 = N->getOperand(1);
SDValue Src1 = N->getOperand(2);
SDLoc SL(N);
unsigned Opcode;
if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
Opcode = AMDGPUISD::CVT_PK_I16_I32;
else
Opcode = AMDGPUISD::CVT_PK_U16_U32;
SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
return;
}
}
break;
}
case ISD::SELECT: {
@ -4424,10 +4449,27 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_ubfe:
return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_cvt_pkrtz: {
// FIXME: Stop adding cast if v2f16 legal.
case Intrinsic::amdgcn_cvt_pkrtz:
case Intrinsic::amdgcn_cvt_pknorm_i16:
case Intrinsic::amdgcn_cvt_pknorm_u16:
case Intrinsic::amdgcn_cvt_pk_i16:
case Intrinsic::amdgcn_cvt_pk_u16: {
// FIXME: Stop adding cast if v2f16/v2i16 are legal.
EVT VT = Op.getValueType();
SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
unsigned Opcode;
if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
Opcode = AMDGPUISD::CVT_PK_I16_I32;
else
Opcode = AMDGPUISD::CVT_PK_U16_U32;
SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
Op.getOperand(1), Op.getOperand(2));
return DAG.getNode(ISD::BITCAST, DL, VT, Node);
}

View File

@ -871,24 +871,6 @@ bool isArgPassedInSGPR(const Argument *A) {
}
}
// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
bool isUniformMMO(const MachineMemOperand *MMO) {
const Value *Ptr = MMO->getValue();
// UndefValue means this is a load of a kernel input. These are uniform.
// Sometimes LDS instructions have constant pointers.
// If Ptr is null, then that means this mem operand contains a
// PseudoSourceValue like GOT.
if (!Ptr || isa<UndefValue>(Ptr) ||
isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
return true;
if (const Argument *Arg = dyn_cast<Argument>(Ptr))
return isArgPassedInSGPR(Arg);
const Instruction *I = dyn_cast<Instruction>(Ptr);
return I && I->getMetadata("amdgpu.uniform");
}
int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
if (isGCN3Encoding(ST))
return ByteOffset;

View File

@ -363,7 +363,6 @@ LLVM_READNONE
bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);
bool isArgPassedInSGPR(const Argument *Arg);
bool isUniformMMO(const MachineMemOperand *MMO);
/// \returns The encoding that will be used for \p ByteOffset in the SMRD
/// offset field.

View File

@ -407,11 +407,11 @@ defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32
defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_i16_f32>;
defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_u16_f32>;
defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>;
defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>>;
defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>>;
defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_u16_u32>;
defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_i16_i32>;
} // End SubtargetPredicate = isGCN

View File

@ -396,10 +396,14 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// rip-relative addressing is actually relative to the *next* instruction.
// Since an immediate can follow the mod/rm byte for an instruction, this
// means that we need to bias the immediate field of the instruction with
// the size of the immediate field. If we have this case, add it into the
// means that we need to bias the displacement field of the instruction with
// the size of the immediate field. If we have this case, add it into the
// expression to emit.
int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0;
// Note: rip-relative addressing using immediate displacement values should
// not be adjusted, assuming it was the user's intent.
int ImmSize = !Disp.isImm() && X86II::hasImm(TSFlags)
? X86II::getSizeOfImm(TSFlags)
: 0;
EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind),
CurByte, OS, Fixups, -ImmSize);
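The adjusted bias can be read as a small predicate; an illustrative
restatement (names are hypothetical, not LLVM API):

    // A symbolic RIP-relative displacement followed by an immediate must be
    // biased by the immediate's size, because RIP is relative to the end of
    // the instruction; a literal displacement is emitted exactly as written,
    // preserving the user's intent.
    int ripDisplacementBias(bool DispIsSymbolic, bool HasImm, int ImmSizeBytes) {
      return (DispIsSymbolic && HasImm) ? -ImmSizeBytes : 0;
    }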

View File

@ -370,6 +370,8 @@ static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
char Mode, raw_ostream &O) {
unsigned Reg = MO.getReg();
bool EmitPercent = true;
switch (Mode) {
default: return true; // Unknown mode.
case 'b': // Print QImode register
@ -384,6 +386,9 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
case 'k': // Print SImode register
Reg = getX86SubSuperRegister(Reg, 32);
break;
case 'V':
EmitPercent = false;
LLVM_FALLTHROUGH;
case 'q':
// Print 64-bit register names if 64-bit integer registers are available.
// Otherwise, print 32-bit register names.
@ -391,7 +396,10 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
break;
}
O << '%' << X86ATTInstPrinter::getRegisterName(Reg);
if (EmitPercent)
O << '%';
O << X86ATTInstPrinter::getRegisterName(Reg);
return false;
}
@ -464,6 +472,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case 'w': // Print HImode register
case 'k': // Print SImode register
case 'q': // Print DImode register
case 'V': // Print native register without '%'
if (MO.isReg())
return printAsmMRegister(*this, MO, ExtraCode[0], O);
printOperand(*this, MI, OpNo, O);
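A hypothetical use of the new 'V' modifier from GCC-style inline asm, in the
spirit of the Linux kernel's retpoline thunk calls, where the bare register
name is spliced into a symbol (clobbers omitted for brevity):

    // Emits e.g. "call __x86_indirect_thunk_rax": %V0 prints "rax", not "%rax".
    static void callThroughThunk(void (*Fn)(void)) {
      asm volatile("call __x86_indirect_thunk_%V0" : : "r"(Fn));
    }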

View File

@ -663,8 +663,10 @@ void X86DomainReassignment::initConverters() {
createReplacer(X86::XOR32rr, X86::KXORDrr);
createReplacer(X86::XOR64rr, X86::KXORQrr);
createReplacer(X86::TEST32rr, X86::KTESTDrr);
createReplacer(X86::TEST64rr, X86::KTESTQrr);
// TODO: KTEST is not a replacement for TEST due to flag differences. Need
// to prove only Z flag is used.
//createReplacer(X86::TEST32rr, X86::KTESTDrr);
//createReplacer(X86::TEST64rr, X86::KTESTQrr);
}
if (STI->hasDQI()) {
@ -684,8 +686,10 @@ void X86DomainReassignment::initConverters() {
createReplacer(X86::SHR8ri, X86::KSHIFTRBri);
createReplacer(X86::SHL8ri, X86::KSHIFTLBri);
createReplacer(X86::TEST8rr, X86::KTESTBrr);
createReplacer(X86::TEST16rr, X86::KTESTWrr);
// TODO: KTEST is not a replacement for TEST due to flag differences. Need
// to prove only Z flag is used.
//createReplacer(X86::TEST8rr, X86::KTESTBrr);
//createReplacer(X86::TEST16rr, X86::KTESTWrr);
createReplacer(X86::XOR8rr, X86::KXORBrr);
}
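The flag mismatch behind the disabled conversions, modeled as plain C++ (the
KTEST CF formula follows the Intel SDM pseudocode; the operand order is an
assumption here):

    struct Flags { bool ZF, CF; };

    // TEST always clears CF, while KTEST sets CF when B & ~A is zero, so the
    // rewrite is only sound once we can prove that just the Z flag is used.
    Flags x86Test(unsigned A, unsigned B)  { return {(A & B) == 0, false}; }
    Flags x86Ktest(unsigned A, unsigned B) { return {(A & B) == 0, (B & ~A) == 0}; }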

View File

@ -17017,24 +17017,6 @@ static bool hasNonFlagsUse(SDValue Op) {
return false;
}
// Emit KTEST instruction for bit vectors on AVX-512
static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (Op.getOpcode() == ISD::BITCAST) {
auto hasKTEST = [&](MVT VT) {
unsigned SizeInBits = VT.getSizeInBits();
return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
(Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
};
SDValue Op0 = Op.getOperand(0);
MVT Op0VT = Op0.getValueType().getSimpleVT();
if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
hasKTEST(Op0VT))
return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
}
return SDValue();
}
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
@ -17079,9 +17061,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
// doing a separate TEST. TEST always sets OF and CF to 0, so unless
// we prove that the arithmetic won't overflow, we can't use OF or CF.
if (Op.getResNo() != 0 || NeedOF || NeedCF) {
// Emit KTEST for bit vectors
if (auto Node = EmitKTEST(Op, DAG, Subtarget))
return Node;
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
@ -17310,10 +17289,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
}
if (Opcode == 0) {
// Emit KTEST for bit vectors
if (auto Node = EmitKTEST(Op, DAG, Subtarget))
return Node;
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
@ -18093,6 +18068,34 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
return Result;
}
// Try to select this as a KTEST+SETCC if possible.
static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Only support equality comparisons.
if (CC != ISD::SETEQ && CC != ISD::SETNE)
return SDValue();
// Must be a bitcast from vXi1.
if (Op0.getOpcode() != ISD::BITCAST)
return SDValue();
Op0 = Op0.getOperand(0);
MVT VT = Op0.getSimpleValueType();
if (!(Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1)) &&
!(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
return SDValue();
X86::CondCode X86CC;
if (isNullConstant(Op1)) {
X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
} else
return SDValue();
SDValue KTEST = DAG.getNode(X86ISD::KTEST, dl, MVT::i32, Op0, Op0);
return getSETCC(X86CC, KTEST, dl, DAG);
}
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
@ -18115,6 +18118,10 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return NewSetCC;
}
// Try to lower using KTEST.
if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
return NewSetCC;
// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
// these.
if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
@ -20525,6 +20532,18 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Mask = DAG.getBitcast(MaskVT, Mask);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
}
case KUNPCK: {
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
// Arguments should be swapped.
SDValue Res = DAG.getNode(IntrData->Opc0, dl,
MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
Src2, Src1);
return DAG.getBitcast(VT, Res);
}
case MASK_BINOP: {
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
@ -27094,28 +27113,57 @@ static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
unsigned Reg) {
if (Subtarget.useRetpolineExternalThunk()) {
// When using an external thunk for retpolines, we pick names that match the
// names GCC happens to use as well. This helps simplify the implementation
// of the thunks for kernels where they have no easy ability to create
// aliases and are doing non-trivial configuration of the thunk's body. For
// example, the Linux kernel will do boot-time hot patching of the thunk
// bodies and cannot easily export aliases of these to loaded modules.
//
// Note that at any point in the future, we may need to change the semantics
// of how we implement retpolines and at that time will likely change the
// name of the called thunk. Essentially, there is no hard guarantee that
// LLVM will generate calls to specific thunks, we merely make a best-effort
// attempt to help out kernels and other systems where duplicating the
// thunks is costly.
switch (Reg) {
case X86::EAX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__x86_indirect_thunk_eax";
case X86::ECX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__x86_indirect_thunk_ecx";
case X86::EDX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__x86_indirect_thunk_edx";
case X86::EDI:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__x86_indirect_thunk_edi";
case X86::R11:
assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
return "__x86_indirect_thunk_r11";
}
llvm_unreachable("unexpected reg for retpoline");
}
// When targeting an internal COMDAT thunk use an LLVM-specific name.
switch (Reg) {
case 0:
assert(!Subtarget.is64Bit() && "R11 should always be available on x64");
return Subtarget.useRetpolineExternalThunk()
? "__llvm_external_retpoline_push"
: "__llvm_retpoline_push";
case X86::EAX:
return Subtarget.useRetpolineExternalThunk()
? "__llvm_external_retpoline_eax"
: "__llvm_retpoline_eax";
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_eax";
case X86::ECX:
return Subtarget.useRetpolineExternalThunk()
? "__llvm_external_retpoline_ecx"
: "__llvm_retpoline_ecx";
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_ecx";
case X86::EDX:
return Subtarget.useRetpolineExternalThunk()
? "__llvm_external_retpoline_edx"
: "__llvm_retpoline_edx";
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_edx";
case X86::EDI:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_edi";
case X86::R11:
return Subtarget.useRetpolineExternalThunk()
? "__llvm_external_retpoline_r11"
: "__llvm_retpoline_r11";
assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
return "__llvm_retpoline_r11";
}
llvm_unreachable("unexpected reg for retpoline");
}
@ -27134,15 +27182,13 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
// just use R11, but we scan for uses anyway to ensure we don't generate
// incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
// already a register use operand to the call to hold the callee. If none
// are available, push the callee instead. This is less efficient, but is
// necessary for functions using 3 regparms. Such function calls are
// (currently) not eligible for tail call optimization, because there is no
// scratch register available to hold the address of the callee.
// are available, use EDI instead. EDI is chosen because EBX is the PIC base
// register and ESI is the base pointer to realigned stack frames with VLAs.
SmallVector<unsigned, 3> AvailableRegs;
if (Subtarget.is64Bit())
AvailableRegs.push_back(X86::R11);
else
AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX});
AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
// Zero out any registers that are already used.
for (const auto &MO : MI.operands()) {
@ -27160,30 +27206,18 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
break;
}
}
if (!AvailableReg)
report_fatal_error("calling convention incompatible with retpoline, no "
"available registers");
const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
if (AvailableReg == 0) {
// No register available. Use PUSH. This must not be a tailcall, and this
// must not be x64.
if (Subtarget.is64Bit())
report_fatal_error(
"Cannot make an indirect call on x86-64 using both retpoline and a "
"calling convention that preservers r11");
if (Opc != X86::CALLpcrel32)
report_fatal_error("Cannot make an indirect tail call on x86 using "
"retpoline without a preserved register");
BuildMI(*BB, MI, DL, TII->get(X86::PUSH32r)).addReg(CalleeVReg);
MI.getOperand(0).ChangeToES(Symbol);
MI.setDesc(TII->get(Opc));
} else {
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
.addReg(CalleeVReg);
MI.getOperand(0).ChangeToES(Symbol);
MI.setDesc(TII->get(Opc));
MachineInstrBuilder(*BB->getParent(), &MI)
.addReg(AvailableReg, RegState::Implicit | RegState::Kill);
}
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
.addReg(CalleeVReg);
MI.getOperand(0).ChangeToES(Symbol);
MI.setDesc(TII->get(Opc));
MachineInstrBuilder(*BB->getParent(), &MI)
.addReg(AvailableReg, RegState::Implicit | RegState::Kill);
return BB;
}
@ -30432,53 +30466,6 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
SDValue N0 = BitCast.getOperand(0);
EVT VecVT = N0->getValueType(0);
if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() &&
N0->getOpcode() == ISD::OR) {
SDValue Op0 = N0->getOperand(0);
SDValue Op1 = N0->getOperand(1);
MVT TrunckVT;
MVT BitcastVT;
switch (VT.getSimpleVT().SimpleTy) {
default:
return SDValue();
case MVT::v16i1:
TrunckVT = MVT::i8;
BitcastVT = MVT::v8i1;
break;
case MVT::v32i1:
TrunckVT = MVT::i16;
BitcastVT = MVT::v16i1;
break;
case MVT::v64i1:
TrunckVT = MVT::i32;
BitcastVT = MVT::v32i1;
break;
}
bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL;
bool isArg0UndefLeft =
Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND;
bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL;
bool isArg1UndefLeft =
Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND;
SDValue OpLeft;
SDValue OpRight;
if (isArg0UndefRight && isArg1UndefLeft) {
OpLeft = Op0;
OpRight = Op1;
} else if (isArg1UndefRight && isArg0UndefLeft) {
OpLeft = Op1;
OpRight = Op0;
} else
return SDValue();
SDLoc DL(BitCast);
SDValue Shr = OpLeft->getOperand(0);
SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr);
SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1);
SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight);
SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2);
}
if (!VT.isScalarInteger() || !VecVT.isSimple())
return SDValue();

View File

@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t {
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
EXPAND_FROM_MEM,
TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
ROUNDP, ROUNDS
};
@ -479,6 +479,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0),
X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0),
X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0),
X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
X86ISD::FADD_RND),

View File

@ -43,7 +43,7 @@ static const char R11ThunkName[] = "__llvm_retpoline_r11";
static const char EAXThunkName[] = "__llvm_retpoline_eax";
static const char ECXThunkName[] = "__llvm_retpoline_ecx";
static const char EDXThunkName[] = "__llvm_retpoline_edx";
static const char PushThunkName[] = "__llvm_retpoline_push";
static const char EDIThunkName[] = "__llvm_retpoline_edi";
namespace {
class X86RetpolineThunks : public MachineFunctionPass {
@ -74,7 +74,6 @@ class X86RetpolineThunks : public MachineFunctionPass {
void createThunkFunction(Module &M, StringRef Name);
void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg);
void insert32BitPushReturnAddrClobber(MachineBasicBlock &MBB);
void populateThunk(MachineFunction &MF, Optional<unsigned> Reg = None);
};
@ -127,7 +126,7 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
createThunkFunction(M, R11ThunkName);
else
for (StringRef Name :
{EAXThunkName, ECXThunkName, EDXThunkName, PushThunkName})
{EAXThunkName, ECXThunkName, EDXThunkName, EDIThunkName})
createThunkFunction(M, Name);
InsertedThunks = true;
return true;
@ -151,9 +150,8 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
populateThunk(MF, X86::R11);
} else {
// For 32-bit targets we need to emit a collection of thunks for various
// possible scratch registers as well as a fallback that is used when
// there are no scratch registers and assumes the retpoline target has
// been pushed.
// possible scratch registers as well as a fallback that uses EDI, which is
// normally callee saved.
// __llvm_retpoline_eax:
// calll .Leax_call_target
// .Leax_capture_spec:
@ -174,32 +172,18 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
// movl %edx, (%esp)
// retl
//
// This last one is a bit more special and so needs a little extra
// handling.
// __llvm_retpoline_push:
// calll .Lpush_call_target
// .Lpush_capture_spec:
// pause
// lfence
// jmp .Lpush_capture_spec
// .align 16
// .Lpush_call_target:
// # Clear pause_loop return address.
// addl $4, %esp
// # Top of stack words are: Callee, RA. Exchange Callee and RA.
// pushl 4(%esp) # Push callee
// pushl 4(%esp) # Push RA
// popl 8(%esp) # Pop RA to final RA
// popl (%esp) # Pop callee to next top of stack
// retl # Ret to callee
// __llvm_retpoline_edi:
// ... # Same setup
// movl %edi, (%esp)
// retl
if (MF.getName() == EAXThunkName)
populateThunk(MF, X86::EAX);
else if (MF.getName() == ECXThunkName)
populateThunk(MF, X86::ECX);
else if (MF.getName() == EDXThunkName)
populateThunk(MF, X86::EDX);
else if (MF.getName() == PushThunkName)
populateThunk(MF);
else if (MF.getName() == EDIThunkName)
populateThunk(MF, X86::EDI);
else
llvm_unreachable("Invalid thunk name on x86-32!");
}
@ -240,31 +224,6 @@ void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB,
.addReg(Reg);
}
void X86RetpolineThunks::insert32BitPushReturnAddrClobber(
MachineBasicBlock &MBB) {
// The instruction sequence we use to replace the return address without
// a scratch register is somewhat complicated:
// # Clear capture_spec from return address.
// addl $4, %esp
// # Top of stack words are: Callee, RA. Exchange Callee and RA.
// pushl 4(%esp) # Push callee
// pushl 4(%esp) # Push RA
// popl 8(%esp) # Pop RA to final RA
// popl (%esp) # Pop callee to next top of stack
// retl # Ret to callee
BuildMI(&MBB, DebugLoc(), TII->get(X86::ADD32ri), X86::ESP)
.addReg(X86::ESP)
.addImm(4);
addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP,
false, 4);
addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP,
false, 4);
addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP,
false, 8);
addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP,
false, 0);
}
void X86RetpolineThunks::populateThunk(MachineFunction &MF,
Optional<unsigned> Reg) {
// Set MF properties. We never use vregs...
@ -301,11 +260,6 @@ void X86RetpolineThunks::populateThunk(MachineFunction &MF,
CaptureSpec->addSuccessor(CaptureSpec);
CallTarget->setAlignment(4);
if (Reg) {
insertRegReturnAddrClobber(*CallTarget, *Reg);
} else {
assert(!Is64Bit && "We only support non-reg thunks on 32-bit x86!");
insert32BitPushReturnAddrClobber(*CallTarget);
}
insertRegReturnAddrClobber(*CallTarget, *Reg);
BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
}

View File

@ -3264,6 +3264,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
case Intrinsic::amdgcn_cvt_pknorm_i16:
case Intrinsic::amdgcn_cvt_pknorm_u16:
case Intrinsic::amdgcn_cvt_pk_i16:
case Intrinsic::amdgcn_cvt_pk_u16: {
Value *Src0 = II->getArgOperand(0);
Value *Src1 = II->getArgOperand(1);
if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
break;
}
case Intrinsic::amdgcn_ubfe:
case Intrinsic::amdgcn_sbfe: {
// Decompose simple cases into standard shifts.

View File

@ -0,0 +1,84 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}s_cvt_pk_i16_i32:
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[X]], [[VY]]
; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[X]], [[VY]]
define amdgpu_kernel void @s_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %y)
%r = bitcast <2 x i16> %result to i32
store i32 %r, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}s_cvt_pk_i16_samereg_i32:
; GCN: s_load_dword [[X:s[0-9]+]]
; GCN: v_cvt_pk_i16_i32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
define amdgpu_kernel void @s_cvt_pk_i16_samereg_i32(i32 addrspace(1)* %out, i32 %x) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %x)
%r = bitcast <2 x i16> %result to i32
store i32 %r, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}v_cvt_pk_i16_i32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[A]], [[B]]
; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @v_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile i32, i32 addrspace(1)* %a.gep
%b = load volatile i32, i32 addrspace(1)* %b.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %a, i32 %b)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pk_i16_i32_reg_imm:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_pk_i16_i32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1
define amdgpu_kernel void @v_cvt_pk_i16_i32_reg_imm(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile i32, i32 addrspace(1)* %a.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %a, i32 1)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pk_i16_i32_imm_reg:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, 1, [[A]]
; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, 1, [[A]]
define amdgpu_kernel void @v_cvt_pk_i16_i32_imm_reg(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile i32, i32 addrspace(1)* %a.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 1, i32 %a)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }

View File

@ -0,0 +1,84 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}s_cvt_pk_u16_u32:
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[X]], [[VY]]
; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[X]], [[VY]]
define amdgpu_kernel void @s_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %y)
%r = bitcast <2 x i16> %result to i32
store i32 %r, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}s_cvt_pk_u16_samereg_i32:
; GCN: s_load_dword [[X:s[0-9]+]]
; GCN: v_cvt_pk_u16_u32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
define amdgpu_kernel void @s_cvt_pk_u16_samereg_i32(i32 addrspace(1)* %out, i32 %x) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %x)
%r = bitcast <2 x i16> %result to i32
store i32 %r, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}v_cvt_pk_u16_u32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[A]], [[B]]
; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @v_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile i32, i32 addrspace(1)* %a.gep
%b = load volatile i32, i32 addrspace(1)* %b.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %a, i32 %b)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pk_u16_u32_reg_imm:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_pk_u16_u32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1
define amdgpu_kernel void @v_cvt_pk_u16_u32_reg_imm(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile i32, i32 addrspace(1)* %a.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %a, i32 1)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pk_u16_u32_imm_reg:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, 1, [[A]]
; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, 1, [[A]]
define amdgpu_kernel void @v_cvt_pk_u16_u32_imm_reg(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile i32, i32 addrspace(1)* %a.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 1, i32 %a)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }


@ -0,0 +1,164 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32:
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[X]], [[VY]]
define amdgpu_kernel void @s_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %y)
%r = bitcast <2 x i16> %result to i32
store i32 %r, i32 addrspace(1)* %out
ret void
}
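; Semantics sketch (an assumption based on the pknorm name): each float
; operand is clamped to [-1.0, 1.0] and scaled to a signed 16-bit value
; (snorm) before packing, e.g.
;   %pk = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float 1.0, float -1.0)
; would be expected to produce roughly <i16 32767, i16 -32767>.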
; GCN-LABEL: {{^}}s_cvt_pknorm_i16_samereg_f32:
; GCN: s_load_dword [[X:s[0-9]+]]
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
define amdgpu_kernel void @s_cvt_pknorm_i16_samereg_f32(i32 addrspace(1)* %out, float %x) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %x)
%r = bitcast <2 x i16> %result to i32
store i32 %r, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @v_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%b = load volatile float, float addrspace(1)* %b.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %b)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_reg_imm:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0
define amdgpu_kernel void @v_cvt_pknorm_i16_f32_reg_imm(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float 1.0)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_imm_reg:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, 1.0, [[A]]
define amdgpu_kernel void @v_cvt_pknorm_i16_f32_imm_reg(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float 1.0, float %a)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_lo:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]]
define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%b = load volatile float, float addrspace(1)* %b.gep
%neg.a = fsub float -0.0, %a
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.a, float %b)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_hi:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]]
define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%b = load volatile float, float addrspace(1)* %b.gep
%neg.b = fsub float -0.0, %b
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %neg.b)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_lo_hi:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]]
define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%b = load volatile float, float addrspace(1)* %b.gep
%neg.a = fsub float -0.0, %a
%neg.b = fsub float -0.0, %b
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.a, float %neg.b)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_fabs_lo_fneg_hi:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]]
define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_fabs_lo_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%b = load volatile float, float addrspace(1)* %b.gep
%fabs.a = call float @llvm.fabs.f32(float %a)
%neg.fabs.a = fsub float -0.0, %fabs.a
%neg.b = fsub float -0.0, %b
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.fabs.a, float %neg.b)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
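; Note on the fneg/fabs tests above: 'fsub float -0.0, %x' is the
; canonical IR spelling of fneg, and '-0.0 - fabs(x)' of fneg(fabs).
; The -[[A]] and -|[[A]]| patterns verify that these are folded into
; VOP source modifiers instead of being emitted as separate v_xor/v_and
; instructions.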
declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) #1
declare float @llvm.fabs.f32(float) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }


@ -0,0 +1,164 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32:
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[X]], [[VY]]
define amdgpu_kernel void @s_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %y)
%r = bitcast <2 x i16> %result to i32
store i32 %r, i32 addrspace(1)* %out
ret void
}
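; Semantics sketch (assumed, by analogy with the i16 variant): pknorm_u16
; clamps each float to [0.0, 1.0] and scales it to an unsigned 16-bit
; value (unorm) before packing, so e.g. float 1.0 would be expected to
; map to 0xffff.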
; GCN-LABEL: {{^}}s_cvt_pknorm_u16_samereg_f32:
; GCN: s_load_dword [[X:s[0-9]+]]
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
define amdgpu_kernel void @s_cvt_pknorm_u16_samereg_f32(i32 addrspace(1)* %out, float %x) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %x)
%r = bitcast <2 x i16> %result to i32
store i32 %r, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @v_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%b = load volatile float, float addrspace(1)* %b.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float %b)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_reg_imm:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0
define amdgpu_kernel void @v_cvt_pknorm_u16_f32_reg_imm(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float 1.0)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_imm_reg:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, 1.0, [[A]]
define amdgpu_kernel void @v_cvt_pknorm_u16_f32_imm_reg(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float 1.0, float %a)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_lo:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]]
define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%b = load volatile float, float addrspace(1)* %b.gep
%neg.a = fsub float -0.0, %a
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.a, float %b)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_hi:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]]
define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%b = load volatile float, float addrspace(1)* %b.gep
%neg.b = fsub float -0.0, %b
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float %neg.b)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_lo_hi:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]]
define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%b = load volatile float, float addrspace(1)* %b.gep
%neg.a = fsub float -0.0, %a
%neg.b = fsub float -0.0, %b
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.a, float %neg.b)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_fabs_lo_fneg_hi:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]]
define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_fabs_lo_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
%a = load volatile float, float addrspace(1)* %a.gep
%b = load volatile float, float addrspace(1)* %b.gep
%fabs.a = call float @llvm.fabs.f32(float %a)
%neg.fabs.a = fsub float -0.0, %fabs.a
%neg.b = fsub float -0.0, %b
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.fabs.a, float %neg.b)
%r = bitcast <2 x i16> %cvt to i32
store i32 %r, i32 addrspace(1)* %out.gep
ret void
}
declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) #1
declare float @llvm.fabs.f32(float) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }


@ -1,4 +1,4 @@
; RUN: llc < %s
; RUN: llc -verify-machineinstrs < %s
; PR25838
target triple = "armv7--linux-android"


@ -0,0 +1,245 @@
; RUN: llc -o - %s | FileCheck %s
; Make sure RegAllocGreedy/SplitKit do not produce invalid liveness information
; and crash when splitting a liverange twice and rematerializing each time.
; (Sorry for the testcase; this was run through bugpoint and then manually
; reduced for several hours but is still big...)
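; (Background, for readers of this test: rematerialization recomputes a
; cheap value, such as a constant or a global address, at its use point
; instead of spilling and reloading it; SplitKit must keep the live
; intervals it creates for such values consistent across repeated splits.)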
target triple = "thumbv7-apple-ios"
%struct.ham = type { %struct.wombat.0 }
%struct.wombat.0 = type { %struct.barney }
%struct.barney = type { %struct.snork.1 }
%struct.snork.1 = type { %struct.wobble.2 }
%struct.wobble.2 = type { %struct.blam }
%struct.blam = type { i32, i32, i8* }
%struct.ham.3 = type { %struct.pluto }
%struct.pluto = type { %struct.zot*, %struct.snork.5, %struct.wibble }
%struct.zot = type { %struct.blam.4* }
%struct.blam.4 = type <{ %struct.zot, %struct.blam.4*, %struct.zot*, i8, [3 x i8] }>
%struct.snork.5 = type { %struct.quux }
%struct.quux = type { %struct.zot }
%struct.wibble = type { %struct.widget }
%struct.widget = type { i32 }
%struct.bar = type { %struct.spam }
%struct.spam = type { %struct.zot*, %struct.wobble, %struct.zot.7 }
%struct.wobble = type { %struct.wibble.6 }
%struct.wibble.6 = type { %struct.zot }
%struct.zot.7 = type { %struct.ham.8 }
%struct.ham.8 = type { i32 }
%struct.hoge = type { %struct.ham, %struct.foo }
%struct.foo = type { float, float }
%struct.wombat = type { %struct.ham, float }
%struct.snork = type { %struct.ham.9, [11 x i8] }
%struct.ham.9 = type { i8 }
@global = external global i8
@global.1 = private constant [20 x i8] c"aaaaaaaaaaaaaaaaaa0\00"
@global.2 = external constant [27 x i8]
@global.3 = external global %struct.ham
@global.4 = external constant [47 x i8]
@global.5 = external constant [61 x i8]
@global.6 = external constant [40 x i8]
@global.7 = external constant [24 x i8]
@global.8 = external constant [20 x i8]
@global.9 = external global %struct.ham
@global.10 = external global %struct.ham
@global.11 = external global %struct.ham
@global.12 = external global %struct.ham
@global.13 = external global %struct.ham
@global.14 = external global %struct.ham
@global.15 = external global %struct.ham
@global.16 = external global %struct.ham
@global.17 = external global %struct.ham
@global.18 = external constant [35 x i8]
@global.19 = external global %struct.ham
@global.20 = external constant [53 x i8]
@global.21 = external global %struct.ham
@global.22 = external global %struct.ham
@global.23 = external global %struct.ham
@global.24 = external constant [32 x i8]
@global.25 = external global %struct.ham
@global.26 = external constant [47 x i8]
@global.27 = external global %struct.ham
@global.28 = external constant [45 x i8]
@global.29 = external global %struct.ham
@global.30 = external global %struct.ham
@global.31 = external constant [24 x i8]
@global.32 = external global %struct.ham
@global.33 = external global %struct.ham
@global.34 = external global %struct.ham
@global.35 = external global %struct.ham
@global.36 = external constant [27 x i8]
@global.37 = external global %struct.ham
@global.38 = external constant [10 x i8]
@global.39 = external global %struct.ham
@global.40 = external global %struct.ham
@global.41 = external global %struct.ham
@global.42 = external global %struct.ham
@global.43 = external global %struct.ham
@global.44 = external constant [41 x i8]
@global.45 = external global %struct.ham
@global.46 = external global %struct.ham
@global.47 = external global %struct.ham
@global.48 = external global %struct.ham
@global.49 = external constant [52 x i8]
@global.50 = external constant [47 x i8]
@global.51 = external global %struct.ham
@global.52 = external global %struct.ham
@global.53 = external global %struct.ham
@global.54 = external global %struct.ham
@global.55 = external global %struct.ham.3
@global.56 = external global %struct.bar
@global.57 = external global i8
declare %struct.ham* @bar(%struct.ham* returned)
declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*)
declare %struct.ham* @wobble(%struct.ham* returned, %struct.ham*)
declare i32 @quux(...)
declare i8* @_Znwm(i32)
declare i32 @wobble.58(%struct.pluto*, [1 x i32], %struct.ham*, %struct.hoge*)
declare i32 @widget(%struct.spam*, [1 x i32], %struct.ham*, %struct.wombat*)
; Just check that we didn't crash and produced some output...
; CHECK-LABEL: func:
; CHECK: trap
define internal void @func() section "__TEXT,__StaticInit,regular,pure_instructions" personality i32 (...)* @quux {
%tmp = tail call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.3 to i8*), i8* @global) #0
%tmp2 = invoke %struct.ham* @wobble(%struct.ham* undef, %struct.ham* @global.9)
to label %bb14 unwind label %bbunwind
bb14:
%tmp15 = getelementptr i8, i8* undef, i32 12
store i8 0, i8* %tmp15
%tmp16 = icmp eq i8 undef, 0
br i1 %tmp16, label %bb28, label %bb18
bb18:
br i1 undef, label %bb21, label %bb29
bb21:
%tmp22 = call i8* @_Znwm(i32 16)
store i32 17, i32* getelementptr (%struct.ham, %struct.ham* @global.10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%tmp23 = call i8* @_Znwm(i32 32)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([27 x i8], [27 x i8]* @global.2, i32 0, i32 0), i32 26, i32 1, i1 false)
store i32 33, i32* getelementptr (%struct.ham, %struct.ham* @global.11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
store i32 23, i32* getelementptr (%struct.ham, %struct.ham* @global.11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([24 x i8], [24 x i8]* @global.7, i32 0, i32 0), i32 23, i32 1, i1 false)
%tmp24 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.11 to i8*), i8* @global) #0
store i32 49, i32* getelementptr (%struct.ham, %struct.ham* @global.12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
store i32 37, i32* getelementptr (%struct.ham, %struct.ham* @global.13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.14 to i8*), i8 0, i32 12, i32 1, i1 false)
%tmp25 = call i8* @_Znwm(i32 48)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp25, i8* align 1 getelementptr ([40 x i8], [40 x i8]* @global.6, i32 0, i32 0), i32 39, i32 1, i1 false)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([47 x i8], [47 x i8]* @global.4, i32 0, i32 0), i32 46, i32 1, i1 false)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([61 x i8], [61 x i8]* @global.5, i32 0, i32 0), i32 60, i32 1, i1 false)
%tmp26 = call i8* @_Znwm(i32 48)
store i32 65, i32* getelementptr (%struct.ham, %struct.ham* @global.15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%tmp27 = icmp eq i8 undef, 0
br i1 %tmp27, label %bb30, label %bb33
bb28:
call void @llvm.trap()
unreachable
bb29:
call void @llvm.trap()
unreachable
bb30:
%tmp31 = icmp eq i32 undef, 37
br i1 %tmp31, label %bb32, label %bb30
bb32:
store i8 1, i8* @global.57
br label %bb33
bb33:
%tmp34 = call i8* @_Znwm(i32 32)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([20 x i8], [20 x i8]* @global.1, i32 0, i32 0), i32 19, i32 1, i1 false)
store i32 17, i32* getelementptr (%struct.ham, %struct.ham* @global.16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
store i32 65, i32* getelementptr (%struct.ham, %struct.ham* @global.17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([35 x i8], [35 x i8]* @global.18, i32 0, i32 0), i32 34, i32 1, i1 false)
store i32 65, i32* getelementptr (%struct.ham, %struct.ham* @global.19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([53 x i8], [53 x i8]* @global.20, i32 0, i32 0), i32 52, i32 1, i1 false)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([20 x i8], [20 x i8]* @global.8, i32 0, i32 0), i32 19, i32 1, i1 false)
store i32 37, i32* getelementptr (%struct.ham, %struct.ham* @global.21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
%tmp35 = call i8* @_Znwm(i32 32)
store i8 16, i8* bitcast (%struct.ham* @global.22 to i8*)
%tmp36 = call i8* @_Znwm(i32 32)
store i32 31, i32* getelementptr (%struct.ham, %struct.ham* @global.23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp36, i8* align 1 getelementptr ([32 x i8], [32 x i8]* @global.24, i32 0, i32 0), i32 31, i32 1, i1 false)
%tmp37 = getelementptr i8, i8* %tmp36, i32 31
store i8 0, i8* %tmp37
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([47 x i8], [47 x i8]* @global.26, i32 0, i32 0), i32 46, i32 1, i1 false)
%tmp38 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.25 to i8*), i8* @global) #0
%tmp39 = call i8* @_Znwm(i32 48)
store i32 44, i32* getelementptr (%struct.ham, %struct.ham* @global.27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp39, i8* align 1 getelementptr ([45 x i8], [45 x i8]* @global.28, i32 0, i32 0), i32 44, i32 1, i1 false)
%tmp40 = getelementptr i8, i8* %tmp39, i32 44
store i8 0, i8* %tmp40
call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.29 to i8*), i8 0, i32 12, i32 1, i1 false)
%tmp41 = call i8* @_Znwm(i32 32)
store i32 23, i32* getelementptr (%struct.ham, %struct.ham* @global.30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp41, i8* align 1 getelementptr ([24 x i8], [24 x i8]* @global.31, i32 0, i32 0), i32 23, i32 1, i1 false)
%tmp42 = getelementptr i8, i8* %tmp41, i32 23
store i8 0, i8* %tmp42
call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.32 to i8*), i8 0, i32 12, i32 1, i1 false)
store i8 16, i8* bitcast (%struct.ham* @global.32 to i8*)
%tmp43 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.33 to i8*), i8* @global) #0
%tmp44 = call i8* @_Znwm(i32 16)
call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.34 to i8*), i8 0, i32 12, i32 1, i1 false)
call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.9 to i8*), i8 0, i32 12, i32 1, i1 false)
%tmp45 = call i8* @_Znwm(i32 32)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp45, i8* align 1 getelementptr ([27 x i8], [27 x i8]* @global.36, i32 0, i32 0), i32 26, i32 1, i1 false)
call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (%struct.ham* @global.37 to i8*), i8 0, i32 12, i32 1, i1 false)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 getelementptr (%struct.snork, %struct.snork* bitcast (%struct.ham* @global.37 to %struct.snork*), i32 0, i32 1, i32 0), i8* align 1 getelementptr ([10 x i8], [10 x i8]* @global.38, i32 0, i32 0), i32 9, i32 1, i1 false)
store i32 17, i32* getelementptr (%struct.ham, %struct.ham* @global.39, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%tmp46 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.40 to i8*), i8* @global) #0
%tmp47 = call i8* @_Znwm(i32 32)
%tmp48 = getelementptr i8, i8* %tmp47, i32 21
store i8 0, i8* %tmp48
store i32 33, i32* getelementptr (%struct.ham, %struct.ham* @global.41, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
store i32 15, i32* getelementptr (%struct.ham, %struct.ham* @global.42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
%tmp49 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.43 to i8*), i8* @global) #0
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([41 x i8], [41 x i8]* @global.44, i32 0, i32 0), i32 40, i32 1, i1 false)
%tmp50 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.45 to i8*), i8* @global) #0
%tmp51 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.46 to i8*), i8* @global) #0
%tmp52 = call i8* @_Znwm(i32 32)
store i8* %tmp52, i8** getelementptr (%struct.ham, %struct.ham* @global.47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([52 x i8], [52 x i8]* @global.49, i32 0, i32 0), i32 51, i32 1, i1 false)
%tmp53 = call i32 @__cxa_atexit(void (i8*)* bitcast (%struct.ham* (%struct.ham*)* @bar to void (i8*)*), i8* bitcast (%struct.ham* @global.48 to i8*), i8* @global) #0
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 undef, i8* align 1 getelementptr ([47 x i8], [47 x i8]* @global.50, i32 0, i32 0), i32 46, i32 1, i1 false)
store i32 33, i32* getelementptr (%struct.ham, %struct.ham* @global.51, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
store i32 37, i32* getelementptr (%struct.ham, %struct.ham* @global.52, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
%tmp54 = invoke %struct.ham* @wobble(%struct.ham* undef, %struct.ham* @global.54)
to label %bb58 unwind label %bbunwind
bb58:
%tmp59 = invoke i32 @wobble.58(%struct.pluto* getelementptr (%struct.ham.3, %struct.ham.3* @global.55, i32 0, i32 0), [1 x i32] [i32 ptrtoint (%struct.zot* getelementptr (%struct.ham.3, %struct.ham.3* @global.55, i32 0, i32 0, i32 1, i32 0, i32 0) to i32)], %struct.ham* undef, %struct.hoge* undef)
to label %bb71 unwind label %bbunwind
bb71:
%tmp72 = invoke i32 @widget(%struct.spam* getelementptr (%struct.bar, %struct.bar* @global.56, i32 0, i32 0), [1 x i32] [i32 ptrtoint (%struct.zot* getelementptr (%struct.bar, %struct.bar* @global.56, i32 0, i32 0, i32 1, i32 0, i32 0) to i32)], %struct.ham* undef, %struct.wombat* undef)
to label %bb73 unwind label %bbunwind
bb73:
ret void
bbunwind:
%tmp75 = landingpad { i8*, i32 }
cleanup
resume { i8*, i32 } undef
}
declare void @llvm.trap()
declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i32, i1)
attributes #0 = { nounwind }


@ -0,0 +1,46 @@
; RUN: llc < %s | FileCheck %s
target triple = "thumbv6---gnueabi"
; Use STM to save the three registers
; CHECK-LABEL: use_stm:
; CHECK: .save {r7, lr}
; CHECK: .setfp r7, sp
; CHECK: stm r3!, {r0, r1, r2}
; CHECK: bl throws_1
define void @use_stm(i32 %a, i32 %b, i32 %c, i32* %d) local_unnamed_addr noreturn "no-frame-pointer-elim"="true" {
entry:
%arrayidx = getelementptr inbounds i32, i32* %d, i32 2
store i32 %a, i32* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds i32, i32* %d, i32 3
store i32 %b, i32* %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %d, i32 4
store i32 %c, i32* %arrayidx2, align 4
tail call void @throws_1(i32 %a, i32 %b, i32 %c) noreturn
unreachable
}
; Don't use STM: there is no available register to store
; the address. We could transform this with some extra math, but
; that currently isn't implemented.
; CHECK-LABEL: no_stm:
; CHECK: .save {r7, lr}
; CHECK: .setfp r7, sp
; CHECK: str r0,
; CHECK: str r1,
; CHECK: str r2,
; CHECK: bl throws_2
define void @no_stm(i32 %a, i32 %b, i32 %c, i32* %d) local_unnamed_addr noreturn "no-frame-pointer-elim"="true" {
entry:
%arrayidx = getelementptr inbounds i32, i32* %d, i32 2
store i32 %a, i32* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds i32, i32* %d, i32 3
store i32 %b, i32* %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %d, i32 4
store i32 %c, i32* %arrayidx2, align 4
tail call void @throws_2(i32 %a, i32 %b, i32 %c, i32* %d) noreturn
unreachable
}
declare void @throws_1(i32, i32, i32) noreturn
declare void @throws_2(i32, i32, i32, i32*) noreturn
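; Sketch of the merge being tested (illustrative only; the exact offsets
; and register choice are up to the allocator):
;   str r0, [r3, #8]
;   str r1, [r3, #12]   ==>   adds r3, #8
;   str r2, [r3, #16]         stm  r3!, {r0, r1, r2}
; The writeback form needs a clobberable register for the running base
; address; in no_stm, r3 (%d) is still live in the tail call, so no such
; register is free and the separate strs remain.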


@ -5,59 +5,6 @@
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c
define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 {
; X32-LABEL: test_mm512_kunpackb:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebp
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: .cfi_offset %ebp, -8
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: .cfi_def_cfa_register %ebp
; X32-NEXT: andl $-64, %esp
; X32-NEXT: subl $64, %esp
; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
; X32-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X32-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
; X32-NEXT: kunpckbw %k0, %k1, %k1
; X32-NEXT: vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1}
; X32-NEXT: kmovw %k0, %eax
; X32-NEXT: movzwl %ax, %eax
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_kunpackb:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT: kunpckbw %k0, %k1, %k1
; X64-NEXT: vpcmpneqd %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = icmp ne <16 x i32> %0, %1
%3 = bitcast <16 x i1> %2 to i16
%4 = bitcast <8 x i64> %__C to <16 x i32>
%5 = bitcast <8 x i64> %__D to <16 x i32>
%6 = icmp ne <16 x i32> %4, %5
%7 = bitcast <16 x i1> %6 to i16
%8 = and i16 %7, 255
%shl.i = shl i16 %3, 8
%or.i = or i16 %8, %shl.i
%9 = bitcast <8 x i64> %__E to <16 x i32>
%10 = bitcast <8 x i64> %__F to <16 x i32>
%11 = icmp ne <16 x i32> %9, %10
%12 = bitcast i16 %or.i to <16 x i1>
%13 = and <16 x i1> %11, %12
%14 = bitcast <16 x i1> %13 to i16
ret i16 %14
}
define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
; X32-LABEL: test_mm512_shuffle_f32x4:
; X32: # %bb.0: # %entry


@ -1,20 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
define i16 @unpckbw_test(i16 %a0, i16 %a1) {
; CHECK-LABEL: unpckbw_test:
; CHECK: ## %bb.0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: shll $8, %esi
; CHECK-NEXT: orl %esi, %eax
; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
ret i16 %res
}
define <16 x i32> @test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
; CHECK: ## %bb.0:


@ -96,6 +96,21 @@ define i16 @test_kor(i16 %a0, i16 %a1) {
ret i16 %t2
}
declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
define i16 @unpckbw_test(i16 %a0, i16 %a1) {
; CHECK-LABEL: unpckbw_test:
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: kunpckbw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
ret i16 %res
}
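; For reference, the scalar expansion removed above computed (a sketch of
; the intrinsic's apparent semantics):
;   kunpck.bw(%a0, %a1) == (%a1 << 8) | (%a0 & 0xff)
; i.e. the low bytes of the two mask words are concatenated; the test now
; checks that this is selected to the kunpckbw instruction instead.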
declare i16 @llvm.x86.avx512.kxnor.w(i16, i16) nounwind readnone
; TODO: the two kxnor instructions here are a no-op and should be eliminated,
; probably by FoldConstantArithmetic in SelectionDAG.


@ -2775,3 +2775,99 @@ define i8 @test_v8i1_mul(i8 %x, i8 %y) {
%ret = bitcast <8 x i1> %m2 to i8
ret i8 %ret
}
; Make sure we don't emit a ktest for signed comparisons.
define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
; KNL-LABEL: ktest_signed:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rax
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0
; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testw %ax, %ax
; KNL-NEXT: jle LBB63_1
; KNL-NEXT: ## %bb.2: ## %bb.2
; KNL-NEXT: popq %rax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
; KNL-NEXT: LBB63_1: ## %bb.1
; KNL-NEXT: vzeroupper
; KNL-NEXT: callq _foo
; KNL-NEXT: popq %rax
; KNL-NEXT: retq
;
; SKX-LABEL: ktest_signed:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rax
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testw %ax, %ax
; SKX-NEXT: jle LBB63_1
; SKX-NEXT: ## %bb.2: ## %bb.2
; SKX-NEXT: popq %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB63_1: ## %bb.1
; SKX-NEXT: vzeroupper
; SKX-NEXT: callq _foo
; SKX-NEXT: popq %rax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: ktest_signed:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: pushq %rax
; AVX512BW-NEXT: .cfi_def_cfa_offset 16
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: testw %ax, %ax
; AVX512BW-NEXT: jle LBB63_1
; AVX512BW-NEXT: ## %bb.2: ## %bb.2
; AVX512BW-NEXT: popq %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512BW-NEXT: LBB63_1: ## %bb.1
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: callq _foo
; AVX512BW-NEXT: popq %rax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: ktest_signed:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: pushq %rax
; AVX512DQ-NEXT: .cfi_def_cfa_offset 16
; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: testw %ax, %ax
; AVX512DQ-NEXT: jle LBB63_1
; AVX512DQ-NEXT: ## %bb.2: ## %bb.2
; AVX512DQ-NEXT: popq %rax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512DQ-NEXT: LBB63_1: ## %bb.1
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: callq _foo
; AVX512DQ-NEXT: popq %rax
; AVX512DQ-NEXT: retq
%a = icmp eq <16 x i32> %x, zeroinitializer
%b = icmp eq <16 x i32> %y, zeroinitializer
%c = and <16 x i1> %a, %b
%d = bitcast <16 x i1> %c to i16
%e = icmp sgt i16 %d, 0
br i1 %e, label %bb.2, label %bb.1
bb.1:
call void @foo()
br label %bb.2
bb.2:
ret void
}
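; The branch above tests 'icmp sgt i16 %d, 0', which needs the sign flag;
; kortest/ktest are believed to set only ZF and CF, so the mask has to be
; moved to a GPR and checked with testw/jle, which is what the patterns
; above verify.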
declare void @foo()


@ -4,117 +4,6 @@
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c
define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
; X32-LABEL: test_mm512_kunpackd:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebp
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: .cfi_offset %ebp, -8
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: .cfi_def_cfa_register %ebp
; X32-NEXT: andl $-64, %esp
; X32-NEXT: subl $64, %esp
; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
; X32-NEXT: vmovdqa64 72(%ebp), %zmm4
; X32-NEXT: vmovdqa64 8(%ebp), %zmm5
; X32-NEXT: vpcmpneqb %zmm0, %zmm1, %k0
; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; X32-NEXT: vpcmpneqb %zmm5, %zmm2, %k0
; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X32-NEXT: kunpckdq %k0, %k1, %k1
; X32-NEXT: vpcmpneqb %zmm3, %zmm4, %k0 {%k1}
; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_kunpackd:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqb %zmm0, %zmm1, %k0
; X64-NEXT: vpcmpneqb %zmm3, %zmm2, %k1
; X64-NEXT: kunpckdq %k0, %k1, %k1
; X64-NEXT: vpcmpneqb %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT: kmovq %k0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__B to <64 x i8>
%1 = bitcast <8 x i64> %__A to <64 x i8>
%2 = icmp ne <64 x i8> %0, %1
%3 = bitcast <64 x i1> %2 to i64
%4 = bitcast <8 x i64> %__C to <64 x i8>
%5 = bitcast <8 x i64> %__D to <64 x i8>
%6 = icmp ne <64 x i8> %4, %5
%7 = bitcast <64 x i1> %6 to i64
%and.i = and i64 %7, 4294967295
%shl.i = shl i64 %3, 32
%or.i = or i64 %and.i, %shl.i
%8 = bitcast <8 x i64> %__E to <64 x i8>
%9 = bitcast <8 x i64> %__F to <64 x i8>
%10 = icmp ne <64 x i8> %8, %9
%11 = bitcast i64 %or.i to <64 x i1>
%12 = and <64 x i1> %10, %11
%13 = bitcast <64 x i1> %12 to i64
ret i64 %13
}
define i32 @test_mm512_kunpackw(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
; X32-LABEL: test_mm512_kunpackw:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebp
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: .cfi_offset %ebp, -8
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: .cfi_def_cfa_register %ebp
; X32-NEXT: andl $-64, %esp
; X32-NEXT: subl $64, %esp
; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
; X32-NEXT: vpcmpneqw %zmm0, %zmm1, %k0
; X32-NEXT: vpcmpneqw 8(%ebp), %zmm2, %k1
; X32-NEXT: kunpckwd %k0, %k1, %k1
; X32-NEXT: vpcmpneqw 72(%ebp), %zmm3, %k0 {%k1}
; X32-NEXT: kmovd %k0, %eax
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_kunpackw:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqw %zmm0, %zmm1, %k0
; X64-NEXT: vpcmpneqw %zmm3, %zmm2, %k1
; X64-NEXT: kunpckwd %k0, %k1, %k1
; X64-NEXT: vpcmpneqw %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT: kmovd %k0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__B to <32 x i16>
%1 = bitcast <8 x i64> %__A to <32 x i16>
%2 = icmp ne <32 x i16> %0, %1
%3 = bitcast <32 x i1> %2 to i32
%4 = bitcast <8 x i64> %__C to <32 x i16>
%5 = bitcast <8 x i64> %__D to <32 x i16>
%6 = icmp ne <32 x i16> %4, %5
%7 = bitcast <32 x i1> %6 to i32
%and.i = and i32 %7, 65535
%shl.i = shl i32 %3, 16
%or.i = or i32 %and.i, %shl.i
%8 = bitcast <8 x i64> %__E to <32 x i16>
%9 = bitcast <8 x i64> %__F to <32 x i16>
%10 = icmp ne <32 x i16> %8, %9
%11 = bitcast i32 %or.i to <32 x i1>
%12 = and <32 x i1> %10, %11
%13 = bitcast <32 x i1> %12 to i32
ret i32 %13
}
define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A) {
; X32-LABEL: test_mm512_mask_set1_epi8:
; X32: # %bb.0: # %entry
@ -189,46 +78,19 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: movb %ch, %al
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $55, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $9, %k0, %k1
; X32-NEXT: andb $2, %al
; X32-NEXT: shrb %al
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $54, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $10, %k0, %k1
; X32-NEXT: movb %ch, %al
; X32-NEXT: andb $15, %al
; X32-NEXT: movl %eax, %edx
; X32-NEXT: shrb $2, %dl
; X32-NEXT: kmovd %edx, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $53, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $11, %k0, %k1
; X32-NEXT: kmovd %edx, %k3
; X32-NEXT: shrb $3, %al
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $12, %eax
; X32-NEXT: andl $15, %eax
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kmovd %eax, %k4
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $13, %eax
; X32-NEXT: andb $1, %al
; X32-NEXT: kmovd %eax, %k3
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $14, %eax
; X32-NEXT: andl $3, %eax
; X32-NEXT: kmovd %eax, %k4
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $15, %eax
; X32-NEXT: andl $1, %eax
; X32-NEXT: kmovd %eax, %k5
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrl $16, %edx
@ -243,25 +105,52 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kmovd %eax, %k7
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $55, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $9, %k0, %k1
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $54, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $10, %k0, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $53, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $11, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $52, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $12, %k0, %k1
; X32-NEXT: movl %ecx, %esi
; X32-NEXT: shrl $12, %esi
; X32-NEXT: andl $15, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $51, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $13, %k0, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kxorq %k5, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $50, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $14, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: movl %ecx, %esi
; X32-NEXT: shrl $14, %esi
; X32-NEXT: andl $3, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $49, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $15, %k0, %k1
; X32-NEXT: kxorq %k5, %k1, %k1
; X32-NEXT: movl %ecx, %esi
; X32-NEXT: shrl $15, %esi
; X32-NEXT: andl $1, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $48, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@ -494,22 +383,14 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $43, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $12, %esi
; X32-NEXT: andl $15, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $14, %esi
; X32-NEXT: andl $3, %esi
; X32-NEXT: kmovd %esi, %k3
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $15, %esi
; X32-NEXT: andl $1, %esi
; X32-NEXT: kmovd %esi, %k4
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $20, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $44, %k0, %k1
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $12, %esi
; X32-NEXT: andl $15, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $19, %k1, %k1
@ -520,12 +401,20 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kshiftrq $18, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $46, %k0, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $14, %esi
; X32-NEXT: andl $3, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $17, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $47, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $15, %esi
; X32-NEXT: andl $1, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $16, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@ -551,8 +440,8 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $12, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k4
; X32-NEXT: kshiftrq $52, %k4, %k0
; X32-NEXT: kxorq %k0, %k1, %k3
; X32-NEXT: kshiftrq $52, %k3, %k0
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $4, %dl
; X32-NEXT: kmovd %edx, %k1
@ -576,19 +465,19 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: andb $15, %cl
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $2, %dl
; X32-NEXT: kmovd %edx, %k3
; X32-NEXT: kmovd %edx, %k4
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $11, %k5, %k5
; X32-NEXT: kxorq %k4, %k5, %k4
; X32-NEXT: kshiftrq $53, %k4, %k5
; X32-NEXT: kxorq %k3, %k5, %k3
; X32-NEXT: kshiftrq $53, %k3, %k5
; X32-NEXT: kxorq %k6, %k5, %k5
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $10, %k5, %k5
; X32-NEXT: kxorq %k4, %k5, %k5
; X32-NEXT: kshiftrq $54, %k5, %k4
; X32-NEXT: kxorq %k7, %k4, %k6
; X32-NEXT: kxorq %k3, %k5, %k5
; X32-NEXT: kshiftrq $54, %k5, %k3
; X32-NEXT: kxorq %k7, %k3, %k6
; X32-NEXT: shrb $3, %cl
; X32-NEXT: kmovd %ecx, %k4
; X32-NEXT: kmovd %ecx, %k3
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $29, %ecx
; X32-NEXT: andb $1, %cl
@ -603,12 +492,6 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kxorq %k5, %k0, %k0
; X32-NEXT: kshiftrq $56, %k0, %k5
; X32-NEXT: kxorq %k1, %k5, %k1
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $28, %ecx
; X32-NEXT: kmovd %ecx, %k5
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $30, %ecx
; X32-NEXT: kmovd %ecx, %k6
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $7, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@ -618,17 +501,20 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kshiftrq $6, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $58, %k0, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $5, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $59, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $4, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $60, %k0, %k1
; X32-NEXT: kxorq %k5, %k1, %k1
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $28, %ecx
; X32-NEXT: kmovd %ecx, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $3, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@ -638,7 +524,10 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kshiftrq $2, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $62, %k0, %k1
; X32-NEXT: kxorq %k6, %k1, %k1
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $30, %ecx
; X32-NEXT: kmovd %ecx, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: shrl $31, %eax
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
@ -743,46 +632,19 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: movb %ch, %al
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $55, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $9, %k0, %k1
; X32-NEXT: andb $2, %al
; X32-NEXT: shrb %al
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $54, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $10, %k0, %k1
; X32-NEXT: movb %ch, %al
; X32-NEXT: andb $15, %al
; X32-NEXT: movl %eax, %edx
; X32-NEXT: shrb $2, %dl
; X32-NEXT: kmovd %edx, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $53, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $11, %k0, %k1
; X32-NEXT: kmovd %edx, %k3
; X32-NEXT: shrb $3, %al
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $12, %eax
; X32-NEXT: andl $15, %eax
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kmovd %eax, %k4
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $13, %eax
; X32-NEXT: andb $1, %al
; X32-NEXT: kmovd %eax, %k3
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $14, %eax
; X32-NEXT: andl $3, %eax
; X32-NEXT: kmovd %eax, %k4
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $15, %eax
; X32-NEXT: andl $1, %eax
; X32-NEXT: kmovd %eax, %k5
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrl $16, %edx
@ -797,25 +659,52 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kmovd %eax, %k7
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $55, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $9, %k0, %k1
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $54, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $10, %k0, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $53, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $11, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $52, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $12, %k0, %k1
; X32-NEXT: movl %ecx, %esi
; X32-NEXT: shrl $12, %esi
; X32-NEXT: andl $15, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $51, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $13, %k0, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kxorq %k5, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $50, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $14, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: movl %ecx, %esi
; X32-NEXT: shrl $14, %esi
; X32-NEXT: andl $3, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $49, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $15, %k0, %k1
; X32-NEXT: kxorq %k5, %k1, %k1
; X32-NEXT: movl %ecx, %esi
; X32-NEXT: shrl $15, %esi
; X32-NEXT: andl $1, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $48, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@ -1048,22 +937,14 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $43, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $12, %esi
; X32-NEXT: andl $15, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $14, %esi
; X32-NEXT: andl $3, %esi
; X32-NEXT: kmovd %esi, %k3
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $15, %esi
; X32-NEXT: andl $1, %esi
; X32-NEXT: kmovd %esi, %k4
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $20, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $44, %k0, %k1
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $12, %esi
; X32-NEXT: andl $15, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $19, %k1, %k1
@ -1074,12 +955,20 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kshiftrq $18, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $46, %k0, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $14, %esi
; X32-NEXT: andl $3, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $17, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $47, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: movl %eax, %esi
; X32-NEXT: shrl $15, %esi
; X32-NEXT: andl $1, %esi
; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $16, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@ -1105,8 +994,8 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $12, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k4
; X32-NEXT: kshiftrq $52, %k4, %k0
; X32-NEXT: kxorq %k0, %k1, %k3
; X32-NEXT: kshiftrq $52, %k3, %k0
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $4, %dl
; X32-NEXT: kmovd %edx, %k1
@ -1130,19 +1019,19 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: andb $15, %cl
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $2, %dl
; X32-NEXT: kmovd %edx, %k3
; X32-NEXT: kmovd %edx, %k4
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $11, %k5, %k5
; X32-NEXT: kxorq %k4, %k5, %k4
; X32-NEXT: kshiftrq $53, %k4, %k5
; X32-NEXT: kxorq %k3, %k5, %k3
; X32-NEXT: kshiftrq $53, %k3, %k5
; X32-NEXT: kxorq %k6, %k5, %k5
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $10, %k5, %k5
; X32-NEXT: kxorq %k4, %k5, %k5
; X32-NEXT: kshiftrq $54, %k5, %k4
; X32-NEXT: kxorq %k7, %k4, %k6
; X32-NEXT: kxorq %k3, %k5, %k5
; X32-NEXT: kshiftrq $54, %k5, %k3
; X32-NEXT: kxorq %k7, %k3, %k6
; X32-NEXT: shrb $3, %cl
; X32-NEXT: kmovd %ecx, %k4
; X32-NEXT: kmovd %ecx, %k3
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $29, %ecx
; X32-NEXT: andb $1, %cl
@ -1157,12 +1046,6 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kxorq %k5, %k0, %k0
; X32-NEXT: kshiftrq $56, %k0, %k5
; X32-NEXT: kxorq %k1, %k5, %k1
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $28, %ecx
; X32-NEXT: kmovd %ecx, %k5
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $30, %ecx
; X32-NEXT: kmovd %ecx, %k6
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $7, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@ -1172,17 +1055,20 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kshiftrq $6, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $58, %k0, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $5, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $59, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $4, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $60, %k0, %k1
; X32-NEXT: kxorq %k5, %k1, %k1
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $28, %ecx
; X32-NEXT: kmovd %ecx, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $3, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@ -1192,7 +1078,10 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kshiftrq $2, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $62, %k0, %k1
; X32-NEXT: kxorq %k6, %k1, %k1
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $30, %ecx
; X32-NEXT: kmovd %ecx, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: shrl $31, %eax
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: movb {{[0-9]+}}(%esp), %al


@ -2,46 +2,6 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32)
define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) {
; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: movzwl %di, %eax
; AVX512BW-NEXT: shll $16, %esi
; AVX512BW-NEXT: orl %esi, %eax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: shll $16, %eax
; AVX512F-32-NEXT: orl %ecx, %eax
; AVX512F-32-NEXT: retl
%res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
ret i32 %res
}
declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64)
define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: movl %edi, %eax
; AVX512BW-NEXT: shlq $32, %rsi
; AVX512BW-NEXT: orq %rsi, %rax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd:
; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
ret i64 %res
}
declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) {


@ -1455,6 +1455,55 @@ define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8>
ret <8 x i64> %res2
}
declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32)
define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) {
; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: kunpckwd %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckwd %k0, %k1, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: retl
%res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
ret i32 %res
}
declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64)
define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k0
; AVX512BW-NEXT: kmovq %rsi, %k1
; AVX512BW-NEXT: kunpckdq %k1, %k0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd:
; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: subl $12, %esp
; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
ret i64 %res
}
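; ---- Annotation, not part of the vendor import ----
; The two tests above pin the new k-register lowering; the checks deleted
; from the old file show the scalar form the intrinsic used to expand to.
; A minimal standalone sketch of that equivalence for kunpck.wd, assuming
; plain shift-and-or semantics:
define i32 @kunpck_wd_scalar(i32 %x0, i32 %x1) {
  %lo = and i32 %x0, 65535   ; low word of the first mask
  %hi = shl i32 %x1, 16      ; second mask becomes the high word
  %res = or i32 %hi, %lo     ; matches the deleted "movzwl + shll $16 + orl"
  ret i32 %res
}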
declare i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8>)
define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) {


@ -1,22 +1,23 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -run-pass x86-domain-reassignment -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq -o - %s | FileCheck %s
--- |
; ModuleID = '../test/CodeGen/X86/gpr-to-mask.ll'
source_filename = "../test/CodeGen/X86/gpr-to-mask.ll"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
define void @test_fcmp_storefloat(i1 %cond, float* %fptr, float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) #0 {
entry:
br i1 %cond, label %if, label %else
if: ; preds = %entry
%cmp1 = fcmp oeq float %f3, %f4
br label %exit
else: ; preds = %entry
%cmp2 = fcmp oeq float %f5, %f6
br label %exit
exit: ; preds = %else, %if
%val = phi i1 [ %cmp1, %if ], [ %cmp2, %else ]
%selected = select i1 %val, float %f1, float %f2
@ -48,14 +49,13 @@
...
---
name: test_fcmp_storefloat
# CHECK-LABEL: name: test_fcmp_storefloat
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: gr8, preferred-register: '' }
- { id: 1, class: gr8, preferred-register: '' }
- { id: 2, class: gr8, preferred-register: '' }
@ -79,7 +79,7 @@ registers:
- { id: 20, class: fr128, preferred-register: '' }
- { id: 21, class: fr128, preferred-register: '' }
- { id: 22, class: fr32x, preferred-register: '' }
liveins:
- { reg: '%edi', virtual-reg: '%3' }
- { reg: '%rsi', virtual-reg: '%4' }
- { reg: '%xmm0', virtual-reg: '%5' }
@ -88,7 +88,7 @@ liveins:
- { reg: '%xmm3', virtual-reg: '%8' }
- { reg: '%xmm4', virtual-reg: '%9' }
- { reg: '%xmm5', virtual-reg: '%10' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
@ -105,14 +105,51 @@ frameInfo:
hasMustTailInVarArgFunc: false
savePoint: ''
restorePoint: ''
fixedStack:
stack:
constants:
body: |
; CHECK-LABEL: name: test_fcmp_storefloat
; CHECK: bb.0.entry:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: liveins: %edi, %rsi, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
; CHECK: [[COPY:%[0-9]+]]:fr32x = COPY %xmm5
; CHECK: [[COPY1:%[0-9]+]]:fr32x = COPY %xmm4
; CHECK: [[COPY2:%[0-9]+]]:fr32x = COPY %xmm3
; CHECK: [[COPY3:%[0-9]+]]:fr32x = COPY %xmm2
; CHECK: [[COPY4:%[0-9]+]]:fr32x = COPY %xmm1
; CHECK: [[COPY5:%[0-9]+]]:vr128x = COPY %xmm0
; CHECK: [[COPY6:%[0-9]+]]:gr64 = COPY %rsi
; CHECK: [[COPY7:%[0-9]+]]:gr32 = COPY %edi
; CHECK: [[COPY8:%[0-9]+]]:gr8 = COPY [[COPY7]].sub_8bit
; CHECK: TEST8ri killed [[COPY8]], 1, implicit-def %eflags
; CHECK: JE_1 %bb.2, implicit %eflags
; CHECK: JMP_1 %bb.1
; CHECK: bb.1.if:
; CHECK: successors: %bb.3(0x80000000)
; CHECK: [[VCMPSSZrr:%[0-9]+]]:vk1 = VCMPSSZrr [[COPY3]], [[COPY2]], 0
; CHECK: [[COPY9:%[0-9]+]]:vk32 = COPY [[VCMPSSZrr]]
; CHECK: [[COPY10:%[0-9]+]]:vk8 = COPY [[COPY9]]
; CHECK: JMP_1 %bb.3
; CHECK: bb.2.else:
; CHECK: successors: %bb.3(0x80000000)
; CHECK: [[VCMPSSZrr1:%[0-9]+]]:vk1 = VCMPSSZrr [[COPY1]], [[COPY]], 0
; CHECK: [[COPY11:%[0-9]+]]:vk32 = COPY [[VCMPSSZrr1]]
; CHECK: [[COPY12:%[0-9]+]]:vk8 = COPY [[COPY11]]
; CHECK: bb.3.exit:
; CHECK: [[PHI:%[0-9]+]]:vk8 = PHI [[COPY12]], %bb.2, [[COPY10]], %bb.1
; CHECK: [[COPY13:%[0-9]+]]:vk32 = COPY [[PHI]]
; CHECK: [[COPY14:%[0-9]+]]:vk1wm = COPY [[COPY13]]
; CHECK: [[COPY15:%[0-9]+]]:vr128x = COPY [[COPY4]]
; CHECK: [[DEF:%[0-9]+]]:fr128 = IMPLICIT_DEF
; CHECK: [[VMOVSSZrrk:%[0-9]+]]:fr128 = VMOVSSZrrk [[COPY15]], killed [[COPY14]], killed [[DEF]], [[COPY5]]
; CHECK: [[COPY16:%[0-9]+]]:fr32x = COPY [[VMOVSSZrrk]]
; CHECK: VMOVSSZmr [[COPY6]], 1, %noreg, 0, %noreg, killed [[COPY16]] :: (store 4 into %ir.fptr)
; CHECK: RET 0
bb.0.entry:
successors: %bb.1(0x40000000), %bb.2(0x40000000)
liveins: %edi, %rsi, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
%10 = COPY %xmm5
%9 = COPY %xmm4
%8 = COPY %xmm3
@ -125,38 +162,31 @@ body: |
TEST8ri killed %11, 1, implicit-def %eflags
JE_1 %bb.2, implicit %eflags
JMP_1 %bb.1
bb.1.if:
successors: %bb.3(0x80000000)
%14 = VCMPSSZrr %7, %8, 0
; check that cross domain copies are replaced with same domain copies.
; CHECK: %15:vk32 = COPY %14
; CHECK: %0:vk8 = COPY %15
%15 = COPY %14
%0 = COPY %15.sub_8bit
JMP_1 %bb.3
bb.2.else:
successors: %bb.3(0x80000000)
%12 = VCMPSSZrr %9, %10, 0
; check that cross domain copies are replaced with same domain copies.
; CHECK: %13:vk32 = COPY %12
; CHECK: %1:vk8 = COPY %13
%13 = COPY %12
%1 = COPY %13.sub_8bit
bb.3.exit:
; check PHI, IMPLICIT_DEF, and INSERT_SUBREG replacers.
; CHECK: %2:vk8 = PHI %1, %bb.2, %0, %bb.1
; CHECK: %16:vk32 = COPY %2
; CHECK: %18:vk1wm = COPY %16
%2 = PHI %1, %bb.2, %0, %bb.1
%17 = IMPLICIT_DEF
%16 = INSERT_SUBREG %17, %2, 1
@ -171,14 +201,13 @@ body: |
...
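# ---- Annotation, not part of the vendor import ----
# What the regenerated checks above express: the x86-domain-reassignment
# pass moves GPR computations whose values only feed mask uses into the
# k-register domain, so the gr8 copies of the VCMPSSZrr results become
# vk32/vk8 copies and no kmov round-trip through a GPR remains.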
---
name: test_8bitops
# CHECK-LABEL: name: test_8bitops
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: gr64, preferred-register: '' }
- { id: 1, class: vr512, preferred-register: '' }
- { id: 2, class: vr512, preferred-register: '' }
@ -198,13 +227,13 @@ registers:
- { id: 16, class: gr8, preferred-register: '' }
- { id: 17, class: gr8, preferred-register: '' }
- { id: 18, class: gr8, preferred-register: '' }
liveins:
- { reg: '%rdi', virtual-reg: '%0' }
- { reg: '%zmm0', virtual-reg: '%1' }
- { reg: '%zmm1', virtual-reg: '%2' }
- { reg: '%zmm2', virtual-reg: '%3' }
- { reg: '%zmm3', virtual-reg: '%4' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
@ -221,32 +250,50 @@ frameInfo:
hasMustTailInVarArgFunc: false
savePoint: ''
restorePoint: ''
fixedStack:
stack:
constants:
body: |
; CHECK-LABEL: name: test_8bitops
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x80000000)
; CHECK: liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3
; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
; CHECK: [[COPY3:%[0-9]+]]:vr512 = COPY %zmm2
; CHECK: [[COPY4:%[0-9]+]]:vr512 = COPY %zmm3
; CHECK: [[VCMPPDZrri:%[0-9]+]]:vk8 = VCMPPDZrri [[COPY3]], [[COPY4]], 0
; CHECK: [[COPY5:%[0-9]+]]:vk32 = COPY [[VCMPPDZrri]]
; CHECK: [[COPY6:%[0-9]+]]:vk8 = COPY [[COPY5]]
; CHECK: [[KSHIFTRBri:%[0-9]+]]:vk8 = KSHIFTRBri [[COPY6]], 2
; CHECK: [[KSHIFTLBri:%[0-9]+]]:vk8 = KSHIFTLBri [[KSHIFTRBri]], 1
; CHECK: [[KNOTBrr:%[0-9]+]]:vk8 = KNOTBrr [[KSHIFTLBri]]
; CHECK: [[KORBrr:%[0-9]+]]:vk8 = KORBrr [[KNOTBrr]], [[KSHIFTRBri]]
; CHECK: [[KANDBrr:%[0-9]+]]:vk8 = KANDBrr [[KORBrr]], [[KSHIFTLBri]]
; CHECK: [[KXORBrr:%[0-9]+]]:vk8 = KXORBrr [[KANDBrr]], [[KSHIFTRBri]]
; CHECK: [[KADDBrr:%[0-9]+]]:vk8 = KADDBrr [[KXORBrr]], [[KNOTBrr]]
; CHECK: [[COPY7:%[0-9]+]]:vk32 = COPY [[KADDBrr]]
; CHECK: [[COPY8:%[0-9]+]]:vk8wm = COPY [[COPY7]]
; CHECK: [[VMOVAPDZrrk:%[0-9]+]]:vr512 = VMOVAPDZrrk [[COPY2]], killed [[COPY8]], [[COPY1]]
; CHECK: VMOVAPDZmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVAPDZrrk]]
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: bb.2:
; CHECK: RET 0
bb.0:
liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3
%0 = COPY %rdi
%1 = COPY %zmm0
%2 = COPY %zmm1
%3 = COPY %zmm2
%4 = COPY %zmm3
%5 = VCMPPDZrri %3, %4, 0
; CHECK: %6:vk32 = COPY %5
; CHECK: %7:vk8 = COPY %6
%6 = COPY %5
%7 = COPY %6.sub_8bit
; CHECK: %12:vk8 = KSHIFTRBri %7, 2
; CHECK: %13:vk8 = KSHIFTLBri %12, 1
; CHECK: %14:vk8 = KNOTBrr %13
; CHECK: %15:vk8 = KORBrr %14, %12
; CHECK: %16:vk8 = KANDBrr %15, %13
; CHECK: %17:vk8 = KXORBrr %16, %12
; CHECK: %18:vk8 = KADDBrr %17, %14
%12 = SHR8ri %7, 2, implicit-def dead %eflags
%13 = SHL8ri %12, 1, implicit-def dead %eflags
%14 = NOT8r %13
@ -254,19 +301,17 @@ body: |
%16 = AND8rr %15, %13, implicit-def dead %eflags
%17 = XOR8rr %16, %12, implicit-def dead %eflags
%18 = ADD8rr %17, %14, implicit-def dead %eflags
; CHECK: %9:vk32 = COPY %18
; CHECK: %10:vk8wm = COPY %9
%8 = IMPLICIT_DEF
%9 = INSERT_SUBREG %8, %18, 1
%10 = COPY %9
%11 = VMOVAPDZrrk %2, killed %10, %1
VMOVAPDZmr %0, 1, %noreg, 0, %noreg, killed %11
; CHECK: KTESTBrr %18, %18, implicit-def %eflags
TEST8rr %18, %18, implicit-def %eflags
JE_1 %bb.1, implicit %eflags
JMP_1 %bb.2
; FIXME We can't replace TEST with KTEST due to flag differences
; TEST8rr %18, %18, implicit-def %eflags
; JE_1 %bb.1, implicit %eflags
; JMP_1 %bb.2
bb.1:
@ -276,14 +321,13 @@ body: |
...
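# ---- Annotation, not part of the vendor import ----
# On the FIXME above: the JE only consumes ZF, and TEST/KTEST agree on
# ZF, but they define the remaining flags differently -- TEST forces
# CF to 0 and derives SF/PF from the result, while KTEST derives CF
# from an AND-NOT and zeroes SF/PF -- so the pass would first have to
# prove that nothing reads those other flags.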
---
name: test_16bitops
# CHECK-LABEL: name: test_16bitops
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: gr64, preferred-register: '' }
- { id: 1, class: vr512, preferred-register: '' }
- { id: 2, class: vr512, preferred-register: '' }
@ -302,13 +346,13 @@ registers:
- { id: 15, class: gr16, preferred-register: '' }
- { id: 16, class: gr16, preferred-register: '' }
- { id: 17, class: gr16, preferred-register: '' }
liveins:
- { reg: '%rdi', virtual-reg: '%0' }
- { reg: '%zmm0', virtual-reg: '%1' }
- { reg: '%zmm1', virtual-reg: '%2' }
- { reg: '%zmm2', virtual-reg: '%3' }
- { reg: '%zmm3', virtual-reg: '%4' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
@ -325,50 +369,66 @@ frameInfo:
hasMustTailInVarArgFunc: false
savePoint: ''
restorePoint: ''
fixedStack:
stack:
constants:
body: |
; CHECK-LABEL: name: test_16bitops
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x80000000)
; CHECK: liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3
; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
; CHECK: [[COPY3:%[0-9]+]]:vr512 = COPY %zmm2
; CHECK: [[COPY4:%[0-9]+]]:vr512 = COPY %zmm3
; CHECK: [[VCMPPSZrri:%[0-9]+]]:vk16 = VCMPPSZrri [[COPY3]], [[COPY4]], 0
; CHECK: [[COPY5:%[0-9]+]]:vk32 = COPY [[VCMPPSZrri]]
; CHECK: [[COPY6:%[0-9]+]]:vk16 = COPY [[COPY5]]
; CHECK: [[KSHIFTRWri:%[0-9]+]]:vk16 = KSHIFTRWri [[COPY6]], 2
; CHECK: [[KSHIFTLWri:%[0-9]+]]:vk16 = KSHIFTLWri [[KSHIFTRWri]], 1
; CHECK: [[KNOTWrr:%[0-9]+]]:vk16 = KNOTWrr [[KSHIFTLWri]]
; CHECK: [[KORWrr:%[0-9]+]]:vk16 = KORWrr [[KNOTWrr]], [[KSHIFTRWri]]
; CHECK: [[KANDWrr:%[0-9]+]]:vk16 = KANDWrr [[KORWrr]], [[KSHIFTLWri]]
; CHECK: [[KXORWrr:%[0-9]+]]:vk16 = KXORWrr [[KANDWrr]], [[KSHIFTRWri]]
; CHECK: [[COPY7:%[0-9]+]]:vk32 = COPY [[KXORWrr]]
; CHECK: [[COPY8:%[0-9]+]]:vk16wm = COPY [[COPY7]]
; CHECK: [[VMOVAPSZrrk:%[0-9]+]]:vr512 = VMOVAPSZrrk [[COPY2]], killed [[COPY8]], [[COPY1]]
; CHECK: VMOVAPSZmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVAPSZrrk]]
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: bb.2:
; CHECK: RET 0
bb.0:
liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3
%0 = COPY %rdi
%1 = COPY %zmm0
%2 = COPY %zmm1
%3 = COPY %zmm2
%4 = COPY %zmm3
%5 = VCMPPSZrri %3, %4, 0
; CHECK: %6:vk32 = COPY %5
; CHECK: %7:vk16 = COPY %6
%6 = COPY %5
%7 = COPY %6.sub_16bit
; CHECK: %12:vk16 = KSHIFTRWri %7, 2
; CHECK: %13:vk16 = KSHIFTLWri %12, 1
; CHECK: %14:vk16 = KNOTWrr %13
; CHECK: %15:vk16 = KORWrr %14, %12
; CHECK: %16:vk16 = KANDWrr %15, %13
; CHECK: %17:vk16 = KXORWrr %16, %12
%12 = SHR16ri %7, 2, implicit-def dead %eflags
%13 = SHL16ri %12, 1, implicit-def dead %eflags
%14 = NOT16r %13
%15 = OR16rr %14, %12, implicit-def dead %eflags
%16 = AND16rr %15, %13, implicit-def dead %eflags
%17 = XOR16rr %16, %12, implicit-def dead %eflags
; CHECK: %9:vk32 = COPY %17
; CHECK: %10:vk16wm = COPY %9
%8 = IMPLICIT_DEF
%9 = INSERT_SUBREG %8, %17, 3
%10 = COPY %9
%11 = VMOVAPSZrrk %2, killed %10, %1
VMOVAPSZmr %0, 1, %noreg, 0, %noreg, killed %11
; CHECK: KTESTWrr %17, %17, implicit-def %eflags
TEST16rr %17, %17, implicit-def %eflags
JE_1 %bb.1, implicit %eflags
JMP_1 %bb.2
; FIXME We can't replace TEST with KTEST due to flag differences
; TEST16rr %17, %17, implicit-def %eflags
; JE_1 %bb.1, implicit %eflags
; JMP_1 %bb.2
bb.1:
@ -378,14 +438,13 @@ body: |
...
---
name: test_32bitops
# CHECK-LABEL: name: test_32bitops
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: gr64, preferred-register: '' }
- { id: 1, class: vr512, preferred-register: '' }
- { id: 2, class: vr512, preferred-register: '' }
@ -400,11 +459,11 @@ registers:
- { id: 11, class: gr32, preferred-register: '' }
- { id: 12, class: gr32, preferred-register: '' }
- { id: 13, class: gr32, preferred-register: '' }
liveins:
- { reg: '%rdi', virtual-reg: '%0' }
- { reg: '%zmm0', virtual-reg: '%1' }
- { reg: '%zmm1', virtual-reg: '%2' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
@ -421,26 +480,40 @@ frameInfo:
hasMustTailInVarArgFunc: false
savePoint: ''
restorePoint: ''
fixedStack:
stack:
constants:
body: |
; CHECK-LABEL: name: test_32bitops
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x80000000)
; CHECK: liveins: %rdi, %zmm0, %zmm1
; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
; CHECK: [[KMOVDkm:%[0-9]+]]:vk32 = KMOVDkm [[COPY]], 1, %noreg, 0, %noreg
; CHECK: [[KSHIFTRDri:%[0-9]+]]:vk32 = KSHIFTRDri [[KMOVDkm]], 2
; CHECK: [[KSHIFTLDri:%[0-9]+]]:vk32 = KSHIFTLDri [[KSHIFTRDri]], 1
; CHECK: [[KNOTDrr:%[0-9]+]]:vk32 = KNOTDrr [[KSHIFTLDri]]
; CHECK: [[KORDrr:%[0-9]+]]:vk32 = KORDrr [[KNOTDrr]], [[KSHIFTRDri]]
; CHECK: [[KANDDrr:%[0-9]+]]:vk32 = KANDDrr [[KORDrr]], [[KSHIFTLDri]]
; CHECK: [[KXORDrr:%[0-9]+]]:vk32 = KXORDrr [[KANDDrr]], [[KSHIFTRDri]]
; CHECK: [[KANDNDrr:%[0-9]+]]:vk32 = KANDNDrr [[KXORDrr]], [[KORDrr]]
; CHECK: [[KADDDrr:%[0-9]+]]:vk32 = KADDDrr [[KANDNDrr]], [[KXORDrr]]
; CHECK: [[COPY3:%[0-9]+]]:vk32wm = COPY [[KADDDrr]]
; CHECK: [[VMOVDQU16Zrrk:%[0-9]+]]:vr512 = VMOVDQU16Zrrk [[COPY2]], killed [[COPY3]], [[COPY1]]
; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU16Zrrk]]
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: bb.2:
; CHECK: RET 0
bb.0:
liveins: %rdi, %zmm0, %zmm1
%0 = COPY %rdi
%1 = COPY %zmm0
%2 = COPY %zmm1
; CHECK: %5:vk32 = KMOVDkm %0, 1, %noreg, 0, %noreg
; CHECK: %6:vk32 = KSHIFTRDri %5, 2
; CHECK: %7:vk32 = KSHIFTLDri %6, 1
; CHECK: %8:vk32 = KNOTDrr %7
; CHECK: %9:vk32 = KORDrr %8, %6
; CHECK: %10:vk32 = KANDDrr %9, %7
; CHECK: %11:vk32 = KXORDrr %10, %6
; CHECK: %12:vk32 = KANDNDrr %11, %9
; CHECK: %13:vk32 = KADDDrr %12, %11
%5 = MOV32rm %0, 1, %noreg, 0, %noreg
%6 = SHR32ri %5, 2, implicit-def dead %eflags
%7 = SHL32ri %6, 1, implicit-def dead %eflags
@ -450,16 +523,15 @@ body: |
%11 = XOR32rr %10, %6, implicit-def dead %eflags
%12 = ANDN32rr %11, %9, implicit-def dead %eflags
%13 = ADD32rr %12, %11, implicit-def dead %eflags
; CHECK: %3:vk32wm = COPY %13
%3 = COPY %13
%4 = VMOVDQU16Zrrk %2, killed %3, %1
VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4
; CHECK: KTESTDrr %13, %13, implicit-def %eflags
TEST32rr %13, %13, implicit-def %eflags
JE_1 %bb.1, implicit %eflags
JMP_1 %bb.2
; FIXME We can't replace TEST with KTEST due to flag differences
; TEST32rr %13, %13, implicit-def %eflags
; JE_1 %bb.1, implicit %eflags
; JMP_1 %bb.2
bb.1:
@ -469,14 +541,13 @@ body: |
...
---
name: test_64bitops
# CHECK-LABEL: name: test_64bitops
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: gr64, preferred-register: '' }
- { id: 1, class: vr512, preferred-register: '' }
- { id: 2, class: vr512, preferred-register: '' }
@ -491,11 +562,11 @@ registers:
- { id: 11, class: gr64, preferred-register: '' }
- { id: 12, class: gr64, preferred-register: '' }
- { id: 13, class: gr64, preferred-register: '' }
liveins:
- { reg: '%rdi', virtual-reg: '%0' }
- { reg: '%zmm0', virtual-reg: '%1' }
- { reg: '%zmm1', virtual-reg: '%2' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
@ -512,26 +583,40 @@ frameInfo:
hasMustTailInVarArgFunc: false
savePoint: ''
restorePoint: ''
fixedStack:
stack:
constants:
body: |
; CHECK-LABEL: name: test_64bitops
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x80000000)
; CHECK: liveins: %rdi, %zmm0, %zmm1
; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
; CHECK: [[KMOVQkm:%[0-9]+]]:vk64 = KMOVQkm [[COPY]], 1, %noreg, 0, %noreg
; CHECK: [[KSHIFTRQri:%[0-9]+]]:vk64 = KSHIFTRQri [[KMOVQkm]], 2
; CHECK: [[KSHIFTLQri:%[0-9]+]]:vk64 = KSHIFTLQri [[KSHIFTRQri]], 1
; CHECK: [[KNOTQrr:%[0-9]+]]:vk64 = KNOTQrr [[KSHIFTLQri]]
; CHECK: [[KORQrr:%[0-9]+]]:vk64 = KORQrr [[KNOTQrr]], [[KSHIFTRQri]]
; CHECK: [[KANDQrr:%[0-9]+]]:vk64 = KANDQrr [[KORQrr]], [[KSHIFTLQri]]
; CHECK: [[KXORQrr:%[0-9]+]]:vk64 = KXORQrr [[KANDQrr]], [[KSHIFTRQri]]
; CHECK: [[KANDNQrr:%[0-9]+]]:vk64 = KANDNQrr [[KXORQrr]], [[KORQrr]]
; CHECK: [[KADDQrr:%[0-9]+]]:vk64 = KADDQrr [[KANDNQrr]], [[KXORQrr]]
; CHECK: [[COPY3:%[0-9]+]]:vk64wm = COPY [[KADDQrr]]
; CHECK: [[VMOVDQU8Zrrk:%[0-9]+]]:vr512 = VMOVDQU8Zrrk [[COPY2]], killed [[COPY3]], [[COPY1]]
; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU8Zrrk]]
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: bb.2:
; CHECK: RET 0
bb.0:
liveins: %rdi, %zmm0, %zmm1
%0 = COPY %rdi
%1 = COPY %zmm0
%2 = COPY %zmm1
; CHECK: %5:vk64 = KMOVQkm %0, 1, %noreg, 0, %noreg
; CHECK: %6:vk64 = KSHIFTRQri %5, 2
; CHECK: %7:vk64 = KSHIFTLQri %6, 1
; CHECK: %8:vk64 = KNOTQrr %7
; CHECK: %9:vk64 = KORQrr %8, %6
; CHECK: %10:vk64 = KANDQrr %9, %7
; CHECK: %11:vk64 = KXORQrr %10, %6
; CHECK: %12:vk64 = KANDNQrr %11, %9
; CHECK: %13:vk64 = KADDQrr %12, %11
%5 = MOV64rm %0, 1, %noreg, 0, %noreg
%6 = SHR64ri %5, 2, implicit-def dead %eflags
%7 = SHL64ri %6, 1, implicit-def dead %eflags
@ -541,16 +626,15 @@ body: |
%11 = XOR64rr %10, %6, implicit-def dead %eflags
%12 = ANDN64rr %11, %9, implicit-def dead %eflags
%13 = ADD64rr %12, %11, implicit-def dead %eflags
; CHECK: %3:vk64wm = COPY %13
%3 = COPY %13
%4 = VMOVDQU8Zrrk %2, killed %3, %1
VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4
; CHECK: KTESTQrr %13, %13, implicit-def %eflags
TEST64rr %13, %13, implicit-def %eflags
JE_1 %bb.1, implicit %eflags
JMP_1 %bb.2
; FIXME We can't replace TEST with KTEST due to flag differences
; TEST64rr %13, %13, implicit-def %eflags
; JE_1 %bb.1, implicit %eflags
; JMP_1 %bb.2
bb.1:
@ -560,14 +644,13 @@ body: |
...
---
name: test_16bitext
# CHECK-LABEL: name: test_16bitext
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: gr64, preferred-register: '' }
- { id: 1, class: vr512, preferred-register: '' }
- { id: 2, class: vr512, preferred-register: '' }
@ -575,11 +658,11 @@ registers:
- { id: 4, class: vr512, preferred-register: '' }
- { id: 5, class: gr16, preferred-register: '' }
- { id: 6, class: gr16, preferred-register: '' }
liveins:
- { reg: '%rdi', virtual-reg: '%0' }
- { reg: '%zmm0', virtual-reg: '%1' }
- { reg: '%zmm1', virtual-reg: '%2' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
@ -596,24 +679,32 @@ frameInfo:
hasMustTailInVarArgFunc: false
savePoint: ''
restorePoint: ''
fixedStack:
stack:
constants:
body: |
bb.0:
liveins: %rdi, %zmm0, %zmm1
; CHECK-LABEL: name: test_16bitext
; CHECK: liveins: %rdi, %zmm0, %zmm1
; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, %noreg, 0, %noreg
; CHECK: [[COPY3:%[0-9]+]]:vk16 = COPY [[KMOVBkm]]
; CHECK: [[KNOTWrr:%[0-9]+]]:vk16 = KNOTWrr [[COPY3]]
; CHECK: [[COPY4:%[0-9]+]]:vk16wm = COPY [[KNOTWrr]]
; CHECK: [[VMOVAPSZrrk:%[0-9]+]]:vr512 = VMOVAPSZrrk [[COPY2]], killed [[COPY4]], [[COPY1]]
; CHECK: VMOVAPSZmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVAPSZrrk]]
; CHECK: RET 0
%0 = COPY %rdi
%1 = COPY %zmm0
%2 = COPY %zmm1
; CHECK: %7:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg
; CHECK: %5:vk16 = COPY %7
; CHECK: %6:vk16 = KNOTWrr %5
%5 = MOVZX16rm8 %0, 1, %noreg, 0, %noreg
%6 = NOT16r %5
; CHECK: %3:vk16wm = COPY %6
%3 = COPY %6
%4 = VMOVAPSZrrk %2, killed %3, %1
VMOVAPSZmr %0, 1, %noreg, 0, %noreg, killed %4
@ -622,14 +713,13 @@ body: |
...
---
name: test_32bitext
# CHECK-LABEL: name: test_32bitext
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: gr64, preferred-register: '' }
- { id: 1, class: vr512, preferred-register: '' }
- { id: 2, class: vr512, preferred-register: '' }
@ -638,11 +728,11 @@ registers:
- { id: 5, class: gr32, preferred-register: '' }
- { id: 6, class: gr32, preferred-register: '' }
- { id: 7, class: gr32, preferred-register: '' }
liveins:
- { reg: '%rdi', virtual-reg: '%0' }
- { reg: '%zmm0', virtual-reg: '%1' }
- { reg: '%zmm1', virtual-reg: '%2' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
@ -659,27 +749,35 @@ frameInfo:
hasMustTailInVarArgFunc: false
savePoint: ''
restorePoint: ''
fixedStack:
stack:
constants:
body: |
bb.0:
liveins: %rdi, %zmm0, %zmm1
; CHECK-LABEL: name: test_32bitext
; CHECK: liveins: %rdi, %zmm0, %zmm1
; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, %noreg, 0, %noreg
; CHECK: [[COPY3:%[0-9]+]]:vk32 = COPY [[KMOVBkm]]
; CHECK: [[KMOVWkm:%[0-9]+]]:vk16 = KMOVWkm [[COPY]], 1, %noreg, 0, %noreg
; CHECK: [[COPY4:%[0-9]+]]:vk32 = COPY [[KMOVWkm]]
; CHECK: [[KADDDrr:%[0-9]+]]:vk32 = KADDDrr [[COPY3]], [[COPY4]]
; CHECK: [[COPY5:%[0-9]+]]:vk64wm = COPY [[KADDDrr]]
; CHECK: [[VMOVDQU16Zrrk:%[0-9]+]]:vr512 = VMOVDQU16Zrrk [[COPY2]], killed [[COPY5]], [[COPY1]]
; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU16Zrrk]]
; CHECK: RET 0
%0 = COPY %rdi
%1 = COPY %zmm0
%2 = COPY %zmm1
; CHECK: %8:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg
; CHECK: %5:vk32 = COPY %8
; CHECK: %9:vk16 = KMOVWkm %0, 1, %noreg, 0, %noreg
; CHECK: %6:vk32 = COPY %9
; CHECK: %7:vk32 = KADDDrr %5, %6
%5 = MOVZX32rm8 %0, 1, %noreg, 0, %noreg
%6 = MOVZX32rm16 %0, 1, %noreg, 0, %noreg
%7 = ADD32rr %5, %6, implicit-def dead %eflags
; CHECK: %3:vk64wm = COPY %7
%3 = COPY %7
%4 = VMOVDQU16Zrrk %2, killed %3, %1
VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4
@ -688,14 +786,13 @@ body: |
...
---
name: test_64bitext
# CHECK-LABEL: name: test_64bitext
alignment: 4
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: gr64, preferred-register: '' }
- { id: 1, class: vr512, preferred-register: '' }
- { id: 2, class: vr512, preferred-register: '' }
@ -704,11 +801,11 @@ registers:
- { id: 5, class: gr64, preferred-register: '' }
- { id: 6, class: gr64, preferred-register: '' }
- { id: 7, class: gr64, preferred-register: '' }
liveins:
- { reg: '%rdi', virtual-reg: '%0' }
- { reg: '%zmm0', virtual-reg: '%1' }
- { reg: '%zmm1', virtual-reg: '%2' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
@ -725,27 +822,35 @@ frameInfo:
hasMustTailInVarArgFunc: false
savePoint: ''
restorePoint: ''
fixedStack:
stack:
constants:
body: |
bb.0:
liveins: %rdi, %zmm0, %zmm1
; CHECK-LABEL: name: test_64bitext
; CHECK: liveins: %rdi, %zmm0, %zmm1
; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0
; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1
; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, %noreg, 0, %noreg
; CHECK: [[COPY3:%[0-9]+]]:vk64 = COPY [[KMOVBkm]]
; CHECK: [[KMOVWkm:%[0-9]+]]:vk16 = KMOVWkm [[COPY]], 1, %noreg, 0, %noreg
; CHECK: [[COPY4:%[0-9]+]]:vk64 = COPY [[KMOVWkm]]
; CHECK: [[KADDQrr:%[0-9]+]]:vk64 = KADDQrr [[COPY3]], [[COPY4]]
; CHECK: [[COPY5:%[0-9]+]]:vk64wm = COPY [[KADDQrr]]
; CHECK: [[VMOVDQU8Zrrk:%[0-9]+]]:vr512 = VMOVDQU8Zrrk [[COPY2]], killed [[COPY5]], [[COPY1]]
; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU8Zrrk]]
; CHECK: RET 0
%0 = COPY %rdi
%1 = COPY %zmm0
%2 = COPY %zmm1
; CHECK: %8:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg
; CHECK: %5:vk64 = COPY %8
; CHECK: %9:vk16 = KMOVWkm %0, 1, %noreg, 0, %noreg
; CHECK: %6:vk64 = COPY %9
; CHECK: %7:vk64 = KADDQrr %5, %6
%5 = MOVZX64rm8 %0, 1, %noreg, 0, %noreg
%6 = MOVZX64rm16 %0, 1, %noreg, 0, %noreg
%7 = ADD64rr %5, %6, implicit-def dead %eflags
; CHECK: %3:vk64wm = COPY %7
%3 = COPY %7
%4 = VMOVDQU8Zrrk %2, killed %3, %1
VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4


@ -0,0 +1,14 @@
; RUN: llc < %s -mtriple=i686-- -no-integrated-as | FileCheck -check-prefix=X86 %s
; RUN: llc < %s -mtriple=x86_64-- -no-integrated-as | FileCheck -check-prefix=X64 %s
; If the target does not have 64-bit integer registers, emit 32-bit register
; names.
; X86: call __x86_indirect_thunk_e{{[abcd]}}x
; X64: call __x86_indirect_thunk_r
define void @q_modifier(i32* %p) {
entry:
tail call void asm sideeffect "call __x86_indirect_thunk_${0:V}", "r,~{dirflag},~{fpsr},~{flags}"(i32* %p)
ret void
}
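; ---- Annotation, not part of the vendor import ----
; A small sketch of the ${0:V} modifier under the same -no-integrated-as
; setup: it prints the bare register name (eax, r11, ...) without the
; usual % prefix, which is what lets the thunk symbol name be spliced
; together inside the asm string.
define void @v_modifier_sketch(i32* %p) {
entry:
  tail call void asm sideeffect "# selected reg: ${0:V}", "r,~{dirflag},~{fpsr},~{flags}"(i32* %p)
  ret void
}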


@ -0,0 +1,22 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s
define void @foo() unnamed_addr #0 {
; CHECK-LABEL: foo:
; CHECK: # %bb.0:
; CHECK-NEXT: vaddps %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; CHECK-NEXT: vmovups %zmm0, (%rax)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%1 = fadd <16 x float> undef, undef
%bc256 = bitcast <16 x float> %1 to <4 x i128>
%2 = extractelement <4 x i128> %bc256, i32 0
%3 = bitcast i128 %2 to <4 x float>
%4 = shufflevector <4 x float> %3, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
store <16 x float> %4, <16 x float>* undef, align 4
ret void
}


@ -23,18 +23,18 @@ entry:
; X64: callq bar
; X64-DAG: movl %[[x]], %edi
; X64-DAG: movq %[[fp]], %r11
; X64: callq __llvm_external_retpoline_r11
; X64: callq __x86_indirect_thunk_r11
; X64: movl %[[x]], %edi
; X64: callq bar
; X64-DAG: movl %[[x]], %edi
; X64-DAG: movq %[[fp]], %r11
; X64: jmp __llvm_external_retpoline_r11 # TAILCALL
; X64: jmp __x86_indirect_thunk_r11 # TAILCALL
; X64FAST-LABEL: icall_reg:
; X64FAST: callq bar
; X64FAST: callq __llvm_external_retpoline_r11
; X64FAST: callq __x86_indirect_thunk_r11
; X64FAST: callq bar
; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL
; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL
; X86-LABEL: icall_reg:
; X86-DAG: movl 12(%esp), %[[fp:[^ ]*]]
@ -43,19 +43,19 @@ entry:
; X86: calll bar
; X86: movl %[[fp]], %eax
; X86: pushl %[[x]]
; X86: calll __llvm_external_retpoline_eax
; X86: calll __x86_indirect_thunk_eax
; X86: pushl %[[x]]
; X86: calll bar
; X86: movl %[[fp]], %eax
; X86: pushl %[[x]]
; X86: calll __llvm_external_retpoline_eax
; X86: calll __x86_indirect_thunk_eax
; X86-NOT: # TAILCALL
; X86FAST-LABEL: icall_reg:
; X86FAST: calll bar
; X86FAST: calll __llvm_external_retpoline_eax
; X86FAST: calll __x86_indirect_thunk_eax
; X86FAST: calll bar
; X86FAST: calll __llvm_external_retpoline_eax
; X86FAST: calll __x86_indirect_thunk_eax
@global_fp = external global void (i32)*
@ -72,28 +72,28 @@ define void @icall_global_fp(i32 %x, void (i32)** %fpp) #0 {
; X64-LABEL: icall_global_fp:
; X64-DAG: movl %edi, %[[x:[^ ]*]]
; X64-DAG: movq global_fp(%rip), %r11
; X64: callq __llvm_external_retpoline_r11
; X64: callq __x86_indirect_thunk_r11
; X64-DAG: movl %[[x]], %edi
; X64-DAG: movq global_fp(%rip), %r11
; X64: jmp __llvm_external_retpoline_r11 # TAILCALL
; X64: jmp __x86_indirect_thunk_r11 # TAILCALL
; X64FAST-LABEL: icall_global_fp:
; X64FAST: movq global_fp(%rip), %r11
; X64FAST: callq __llvm_external_retpoline_r11
; X64FAST: callq __x86_indirect_thunk_r11
; X64FAST: movq global_fp(%rip), %r11
; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL
; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL
; X86-LABEL: icall_global_fp:
; X86: movl global_fp, %eax
; X86: pushl 4(%esp)
; X86: calll __llvm_external_retpoline_eax
; X86: calll __x86_indirect_thunk_eax
; X86: addl $4, %esp
; X86: movl global_fp, %eax
; X86: jmp __llvm_external_retpoline_eax # TAILCALL
; X86: jmp __x86_indirect_thunk_eax # TAILCALL
; X86FAST-LABEL: icall_global_fp:
; X86FAST: calll __llvm_external_retpoline_eax
; X86FAST: jmp __llvm_external_retpoline_eax # TAILCALL
; X86FAST: calll __x86_indirect_thunk_eax
; X86FAST: jmp __x86_indirect_thunk_eax # TAILCALL
%struct.Foo = type { void (%struct.Foo*)** }
@ -114,14 +114,14 @@ define void @vcall(%struct.Foo* %obj) #0 {
; X64: movq (%[[obj]]), %[[vptr:[^ ]*]]
; X64: movq 8(%[[vptr]]), %[[fp:[^ ]*]]
; X64: movq %[[fp]], %r11
; X64: callq __llvm_external_retpoline_r11
; X64: callq __x86_indirect_thunk_r11
; X64-DAG: movq %[[obj]], %rdi
; X64-DAG: movq %[[fp]], %r11
; X64: jmp __llvm_external_retpoline_r11 # TAILCALL
; X64: jmp __x86_indirect_thunk_r11 # TAILCALL
; X64FAST-LABEL: vcall:
; X64FAST: callq __llvm_external_retpoline_r11
; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL
; X64FAST: callq __x86_indirect_thunk_r11
; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL
; X86-LABEL: vcall:
; X86: movl 8(%esp), %[[obj:[^ ]*]]
@ -129,14 +129,14 @@ define void @vcall(%struct.Foo* %obj) #0 {
; X86: movl 4(%[[vptr]]), %[[fp:[^ ]*]]
; X86: movl %[[fp]], %eax
; X86: pushl %[[obj]]
; X86: calll __llvm_external_retpoline_eax
; X86: calll __x86_indirect_thunk_eax
; X86: addl $4, %esp
; X86: movl %[[fp]], %eax
; X86: jmp __llvm_external_retpoline_eax # TAILCALL
; X86: jmp __x86_indirect_thunk_eax # TAILCALL
; X86FAST-LABEL: vcall:
; X86FAST: calll __llvm_external_retpoline_eax
; X86FAST: jmp __llvm_external_retpoline_eax # TAILCALL
; X86FAST: calll __x86_indirect_thunk_eax
; X86FAST: jmp __x86_indirect_thunk_eax # TAILCALL
declare void @direct_callee()


@ -0,0 +1,42 @@
; RUN: llc -mtriple=i686-linux < %s | FileCheck --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" %s
; Test 32-bit retpoline when -mregparm=3 is used. This case is interesting
; because there are no available scratch registers. The Linux kernel builds
; with -mregparm=3, so we need to support it. TCO should fail because we need
; to restore EDI.
define void @call_edi(void (i32, i32, i32)* %fp) #0 {
entry:
tail call void %fp(i32 inreg 0, i32 inreg 0, i32 inreg 0)
ret void
}
; CHECK-LABEL: call_edi:
; EDI is used, so it must be saved.
; CHECK: pushl %edi
; CHECK-DAG: xorl %eax, %eax
; CHECK-DAG: xorl %edx, %edx
; CHECK-DAG: xorl %ecx, %ecx
; CHECK-DAG: movl {{.*}}, %edi
; CHECK: calll __llvm_retpoline_edi
; CHECK: popl %edi
; CHECK: retl
define void @edi_external(void (i32, i32, i32)* %fp) #1 {
entry:
tail call void %fp(i32 inreg 0, i32 inreg 0, i32 inreg 0)
ret void
}
; CHECK-LABEL: edi_external:
; CHECK: pushl %edi
; CHECK-DAG: xorl %eax, %eax
; CHECK-DAG: xorl %edx, %edx
; CHECK-DAG: xorl %ecx, %ecx
; CHECK-DAG: movl {{.*}}, %edi
; CHECK: calll __x86_indirect_thunk_edi
; CHECK: popl %edi
; CHECK: retl
attributes #0 = { "target-features"="+retpoline" }
attributes #1 = { "target-features"="+retpoline-external-thunk" }


@ -340,10 +340,10 @@ latch:
; X86-NEXT: movl %edx, (%esp)
; X86-NEXT: retl
;
; X86-LABEL: .section .text.__llvm_retpoline_push,{{.*}},__llvm_retpoline_push,comdat
; X86-NEXT: .hidden __llvm_retpoline_push
; X86-NEXT: .weak __llvm_retpoline_push
; X86: __llvm_retpoline_push:
; X86-LABEL: .section .text.__llvm_retpoline_edi,{{.*}},__llvm_retpoline_edi,comdat
; X86-NEXT: .hidden __llvm_retpoline_edi
; X86-NEXT: .weak __llvm_retpoline_edi
; X86: __llvm_retpoline_edi:
; X86-NEXT: # {{.*}} # %entry
; X86-NEXT: calll [[CALL_TARGET:.*]]
; X86-NEXT: [[CAPTURE_SPEC:.*]]: # Block address taken
@ -355,11 +355,7 @@ latch:
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: [[CALL_TARGET]]: # Block address taken
; X86-NEXT: # %entry
; X86-NEXT: addl $4, %esp
; X86-NEXT: pushl 4(%esp)
; X86-NEXT: pushl 4(%esp)
; X86-NEXT: popl 8(%esp)
; X86-NEXT: popl (%esp)
; X86-NEXT: movl %edi, (%esp)
; X86-NEXT: retl


@ -0,0 +1,88 @@
; Choosing CodeView generates debug metadata for class-scope typedefs that
; Dwarf would normally omit. Choosing both CodeView and Dwarf triggered
; assertion failures and crashes because the Dwarf handler wasn't prepared for
; those records (in particular, ones with the void type represented by a
; null pointer).
;
; This test was generated with:
; clang++ -cc1 -emit-llvm -debug-info-kind=limited -dwarf-version=4 -gcodeview -x c++
; on the following source code:
;
; class A {
; typedef void _Nodeptr;
; };
; class B {
; A FailedTestsCache;
; bool m_fn1();
; };
; bool B::m_fn1() {}
;
; CodeView generates a DIDerivedType for the _Nodeptr typedef.
;
; RUN: llc %s -o - 2>&1 | FileCheck %s
; CHECK-NOT: Assertion failed
; ModuleID = 'bug.cpp'
source_filename = "bug.cpp"
target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
target triple = "i686-pc-windows-msvc"
%class.B = type { %class.A }
%class.A = type { i8 }
; Function Attrs: noinline nounwind optnone
define x86_thiscallcc zeroext i1 @"\01?m_fn1@B@@AAE_NXZ"(%class.B* %this) #0 align 2 !dbg !9 {
entry:
%retval = alloca i1, align 1
%this.addr = alloca %class.B*, align 4
store %class.B* %this, %class.B** %this.addr, align 4
call void @llvm.dbg.declare(metadata %class.B** %this.addr, metadata !22, metadata !DIExpression()), !dbg !24
%this1 = load %class.B*, %class.B** %this.addr, align 4
call void @llvm.trap(), !dbg !25
unreachable, !dbg !25
return: ; No predecessors!
%0 = load i1, i1* %retval, align 1, !dbg !25
ret i1 %0, !dbg !25
}
; Function Attrs: nounwind readnone speculatable
declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
; Function Attrs: noreturn nounwind
declare void @llvm.trap() #2
attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { noreturn nounwind }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5, !6, !7}
!llvm.ident = !{!8}
!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "<stdin>", directory: "D:\5Csrc\5Cbug", checksumkind: CSK_MD5, checksum: "2216f11c5ddda8c48a6f92a6079ad4b6")
!2 = !{}
!3 = !{i32 1, !"NumRegisterParameters", i32 0}
!4 = !{i32 2, !"Dwarf Version", i32 4}
!5 = !{i32 2, !"CodeView", i32 1}
!6 = !{i32 2, !"Debug Info Version", i32 3}
!7 = !{i32 1, !"wchar_size", i32 2}
!8 = !{!"clang version 6.0.0 "}
!9 = distinct !DISubprogram(name: "m_fn1", linkageName: "\01?m_fn1@B@@AAE_NXZ", scope: !11, file: !10, line: 8, type: !18, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !17, variables: !2)
!10 = !DIFile(filename: "bug.cpp", directory: "D:\5Csrc\5Cbug", checksumkind: CSK_MD5, checksum: "2216f11c5ddda8c48a6f92a6079ad4b6")
!11 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "B", file: !10, line: 4, size: 8, elements: !12, identifier: ".?AVB@@")
!12 = !{!13, !17}
!13 = !DIDerivedType(tag: DW_TAG_member, name: "FailedTestsCache", scope: !11, file: !10, line: 5, baseType: !14, size: 8)
!14 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "A", file: !10, line: 1, size: 8, elements: !15, identifier: ".?AVA@@")
!15 = !{!16}
!16 = !DIDerivedType(tag: DW_TAG_typedef, name: "_Nodeptr", scope: !14, file: !10, line: 2, baseType: null)
!17 = !DISubprogram(name: "m_fn1", linkageName: "\01?m_fn1@B@@AAE_NXZ", scope: !11, file: !10, line: 6, type: !18, isLocal: false, isDefinition: false, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: false)
!18 = !DISubroutineType(cc: DW_CC_BORLAND_thiscall, types: !19)
!19 = !{!20, !21}
!20 = !DIBasicType(name: "bool", size: 8, encoding: DW_ATE_boolean)
!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 32, flags: DIFlagArtificial | DIFlagObjectPointer)
!22 = !DILocalVariable(name: "this", arg: 1, scope: !9, type: !23, flags: DIFlagArtificial | DIFlagObjectPointer)
!23 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 32)
!24 = !DILocation(line: 0, scope: !9)
!25 = !DILocation(line: 8, scope: !9)


@ -0,0 +1,8 @@
; RUN: not llc < %s 2>&1 | FileCheck %s
define void @test() {
call void asm sideeffect ".macro FOO\0A.endm", "~{dirflag},~{fpsr},~{flags}"() #1
call void asm sideeffect ".macro FOO\0A.endm", "~{dirflag},~{fpsr},~{flags}"() #1
; CHECK: error: macro 'FOO' is already defined
ret void
}


@ -622,6 +622,11 @@ movl $12, foo(%rip)
// CHECK: encoding: [0xc7,0x05,A,A,A,A,0x0c,0x00,0x00,0x00]
// CHECK: fixup A - offset: 2, value: foo-8, kind: reloc_riprel_4byte
// rdar://37247000
movl $12, 1024(%rip)
// CHECK: movl $12, 1024(%rip)
// CHECK: encoding: [0xc7,0x05,0x00,0x04,0x00,0x00,0x0c,0x00,0x00,0x00]
movq $12, foo(%rip)
// CHECK: movq $12, foo(%rip)
// CHECK: encoding: [0x48,0xc7,0x05,A,A,A,A,0x0c,0x00,0x00,0x00]
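// ---- Annotation, not part of the vendor import ----
// Worked addend behind the "foo-8" fixup value checked above: a
// RIP-relative displacement is measured from the end of the
// instruction, and 8 bytes separate the fixup from that end here:
//   disp32 = foo - (fixup address + 4 /* disp32 */ + 4 /* imm32 */)
//          = (foo - 8) - fixup address
// The same -8 applies to the movq form, since it too carries a 4-byte
// immediate after the 4-byte displacement.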


@ -722,6 +722,114 @@ define <2 x half> @constant_rtz_pkrtz() {
ret <2 x half> %cvt
}
; --------------------------------------------------------------------
; llvm.amdgcn.cvt.pknorm.i16
; --------------------------------------------------------------------
declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) nounwind readnone
; CHECK-LABEL: @undef_lhs_cvt_pknorm_i16(
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y)
define <2 x i16> @undef_lhs_cvt_pknorm_i16(float %y) {
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y)
ret <2 x i16> %cvt
}
; CHECK-LABEL: @undef_rhs_cvt_pknorm_i16(
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef)
define <2 x i16> @undef_rhs_cvt_pknorm_i16(float %x) {
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef)
ret <2 x i16> %cvt
}
; CHECK-LABEL: @undef_cvt_pknorm_i16(
; CHECK: ret <2 x i16> undef
define <2 x i16> @undef_cvt_pknorm_i16() {
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float undef)
ret <2 x i16> %cvt
}
; --------------------------------------------------------------------
; llvm.amdgcn.cvt.pknorm.u16
; --------------------------------------------------------------------
declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) nounwind readnone
; CHECK-LABEL: @undef_lhs_cvt_pknorm_u16(
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y)
define <2 x i16> @undef_lhs_cvt_pknorm_u16(float %y) {
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y)
ret <2 x i16> %cvt
}
; CHECK-LABEL: @undef_rhs_cvt_pknorm_u16(
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef)
define <2 x i16> @undef_rhs_cvt_pknorm_u16(float %x) {
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef)
ret <2 x i16> %cvt
}
; CHECK-LABEL: @undef_cvt_pknorm_u16(
; CHECK: ret <2 x i16> undef
define <2 x i16> @undef_cvt_pknorm_u16() {
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float undef)
ret <2 x i16> %cvt
}
; --------------------------------------------------------------------
; llvm.amdgcn.cvt.pk.i16
; --------------------------------------------------------------------
declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) nounwind readnone
; CHECK-LABEL: @undef_lhs_cvt_pk_i16(
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y)
define <2 x i16> @undef_lhs_cvt_pk_i16(i32 %y) {
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y)
ret <2 x i16> %cvt
}
; CHECK-LABEL: @undef_rhs_cvt_pk_i16(
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef)
define <2 x i16> @undef_rhs_cvt_pk_i16(i32 %x) {
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef)
ret <2 x i16> %cvt
}
; CHECK-LABEL: @undef_cvt_pk_i16(
; CHECK: ret <2 x i16> undef
define <2 x i16> @undef_cvt_pk_i16() {
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 undef)
ret <2 x i16> %cvt
}
; --------------------------------------------------------------------
; llvm.amdgcn.cvt.pk.u16
; --------------------------------------------------------------------
declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) nounwind readnone
; CHECK-LABEL: @undef_lhs_cvt_pk_u16(
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y)
define <2 x i16> @undef_lhs_cvt_pk_u16(i32 %y) {
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y)
ret <2 x i16> %cvt
}
; CHECK-LABEL: @undef_rhs_cvt_pk_u16(
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 undef)
define <2 x i16> @undef_rhs_cvt_pk_u16(i32 %x) {
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 undef)
ret <2 x i16> %cvt
}
; CHECK-LABEL: @undef_cvt_pk_u16(
; CHECK: ret <2 x i16> undef
define <2 x i16> @undef_cvt_pk_u16() {
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 undef)
ret <2 x i16> %cvt
}
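; ---- Annotation, not part of the vendor import ----
; The folding rule all four groups above exercise: a call simplifies
; only when every operand is undef, e.g.
;   %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 undef)
;     ==> ret <2 x i16> undef
; while a single undef operand is kept, because the other lane still
; packs a well-defined value.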
; --------------------------------------------------------------------
; llvm.amdgcn.ubfe
; --------------------------------------------------------------------