diff --git a/CMakeLists.txt b/CMakeLists.txt index c100adef258c..2bf2c21a306f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,7 +27,7 @@ if(NOT DEFINED LLVM_VERSION_PATCH) set(LLVM_VERSION_PATCH 0) endif() if(NOT DEFINED LLVM_VERSION_SUFFIX) - set(LLVM_VERSION_SUFFIX svn) + set(LLVM_VERSION_SUFFIX "") endif() if (NOT PACKAGE_VERSION) diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake index c6be957b0e45..aaf22ff474b7 100755 --- a/cmake/config-ix.cmake +++ b/cmake/config-ix.cmake @@ -640,7 +640,8 @@ endif() string(REPLACE " " ";" LLVM_BINDINGS_LIST "${LLVM_BINDINGS}") function(find_python_module module) - string(TOUPPER ${module} module_upper) + string(REPLACE "." "_" module_name ${module}) + string(TOUPPER ${module_name} module_upper) set(FOUND_VAR PY_${module_upper}_FOUND) execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" "import ${module}" @@ -658,13 +659,16 @@ endfunction() set (PYTHON_MODULES pygments + # Some systems still don't have pygments.lexers.c_cpp which was introduced in + # version 2.0 in 2014... + pygments.lexers.c_cpp yaml ) foreach(module ${PYTHON_MODULES}) find_python_module(${module}) endforeach() -if(PY_PYGMENTS_FOUND AND PY_YAML_FOUND) +if(PY_PYGMENTS_FOUND AND PY_PYGMENTS_LEXERS_C_CPP_FOUND AND PY_YAML_FOUND) set (LLVM_HAVE_OPT_VIEWER_MODULES 1) else() set (LLVM_HAVE_OPT_VIEWER_MODULES 0) diff --git a/include/llvm/CodeGen/TargetPassConfig.h b/include/llvm/CodeGen/TargetPassConfig.h index 1aaa85d77a54..a378c7b2fca1 100644 --- a/include/llvm/CodeGen/TargetPassConfig.h +++ b/include/llvm/CodeGen/TargetPassConfig.h @@ -325,9 +325,9 @@ class TargetPassConfig : public ImmutablePass { virtual bool isGlobalISelEnabled() const; /// Check whether or not GlobalISel should abort on error. - /// When this is disable, GlobalISel will fall back on SDISel instead of + /// When this is disabled, GlobalISel will fall back on SDISel instead of /// erroring out. - virtual bool isGlobalISelAbortEnabled() const; + bool isGlobalISelAbortEnabled() const; /// Check whether or not a diagnostic should be emitted when GlobalISel /// uses the fallback path. In other words, it will emit a diagnostic diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h index 79c56abe1c37..def842f5fcee 100644 --- a/include/llvm/IR/Function.h +++ b/include/llvm/IR/Function.h @@ -218,6 +218,7 @@ class Function : public GlobalObject, public ilist_node { Attribute::get(getContext(), Kind, Val)); } + /// @brief Add function attributes to this function. void addFnAttr(Attribute Attr) { addAttribute(AttributeList::FunctionIndex, Attr); } @@ -268,6 +269,8 @@ class Function : public GlobalObject, public ilist_node { bool hasFnAttribute(Attribute::AttrKind Kind) const { return AttributeSets.hasFnAttribute(Kind); } + + /// @brief Return true if the function has the attribute. bool hasFnAttribute(StringRef Kind) const { return AttributeSets.hasFnAttribute(Kind); } @@ -276,6 +279,8 @@ class Function : public GlobalObject, public ilist_node { Attribute getFnAttribute(Attribute::AttrKind Kind) const { return getAttribute(AttributeList::FunctionIndex, Kind); } + + /// @brief Return the attribute for the given attribute kind. Attribute getFnAttribute(StringRef Kind) const { return getAttribute(AttributeList::FunctionIndex, Kind); } @@ -342,10 +347,12 @@ class Function : public GlobalObject, public ilist_node { return getAttributes().hasParamAttribute(ArgNo, Kind); } + /// @brief gets the attribute from the list of attributes. 
Attribute getAttribute(unsigned i, Attribute::AttrKind Kind) const { return AttributeSets.getAttribute(i, Kind); } + /// @brief gets the attribute from the list of attributes. Attribute getAttribute(unsigned i, StringRef Kind) const { return AttributeSets.getAttribute(i, Kind); } diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td index cc08fe683272..22a3a0fe618f 100644 --- a/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/include/llvm/IR/IntrinsicsAMDGPU.td @@ -304,7 +304,8 @@ class AMDGPUImageLoad : Intrinsic < llvm_i1_ty, // slc(imm) llvm_i1_ty, // lwe(imm) llvm_i1_ty], // da(imm) - !if(NoMem, [IntrNoMem], [IntrReadMem])>; + !if(NoMem, [IntrNoMem], [IntrReadMem]), "", + !if(NoMem, [], [SDNPMemOperand])>; def int_amdgcn_image_load : AMDGPUImageLoad; def int_amdgcn_image_load_mip : AMDGPUImageLoad; @@ -320,7 +321,7 @@ class AMDGPUImageStore : Intrinsic < llvm_i1_ty, // slc(imm) llvm_i1_ty, // lwe(imm) llvm_i1_ty], // da(imm) - []>; + [IntrWriteMem], "", [SDNPMemOperand]>; def int_amdgcn_image_store : AMDGPUImageStore; def int_amdgcn_image_store_mip : AMDGPUImageStore; @@ -336,7 +337,8 @@ class AMDGPUImageSample : Intrinsic < llvm_i1_ty, // slc(imm) llvm_i1_ty, // lwe(imm) llvm_i1_ty], // da(imm) - !if(NoMem, [IntrNoMem], [IntrReadMem])>; + !if(NoMem, [IntrNoMem], [IntrReadMem]), "", + !if(NoMem, [], [SDNPMemOperand])>; // Basic sample def int_amdgcn_image_sample : AMDGPUImageSample; @@ -428,7 +430,7 @@ class AMDGPUImageAtomic : Intrinsic < llvm_i1_ty, // r128(imm) llvm_i1_ty, // da(imm) llvm_i1_ty], // slc(imm) - []>; + [], "", [SDNPMemOperand]>; def int_amdgcn_image_atomic_swap : AMDGPUImageAtomic; def int_amdgcn_image_atomic_add : AMDGPUImageAtomic; @@ -451,7 +453,7 @@ def int_amdgcn_image_atomic_cmpswap : Intrinsic < llvm_i1_ty, // r128(imm) llvm_i1_ty, // da(imm) llvm_i1_ty], // slc(imm) - []>; + [], "", [SDNPMemOperand]>; class AMDGPUBufferLoad : Intrinsic < [llvm_anyfloat_ty], @@ -460,7 +462,7 @@ class AMDGPUBufferLoad : Intrinsic < llvm_i32_ty, // offset(SGPR/VGPR/imm) llvm_i1_ty, // glc(imm) llvm_i1_ty], // slc(imm) - [IntrReadMem]>; + [IntrReadMem], "", [SDNPMemOperand]>; def int_amdgcn_buffer_load_format : AMDGPUBufferLoad; def int_amdgcn_buffer_load : AMDGPUBufferLoad; @@ -472,7 +474,7 @@ class AMDGPUBufferStore : Intrinsic < llvm_i32_ty, // offset(SGPR/VGPR/imm) llvm_i1_ty, // glc(imm) llvm_i1_ty], // slc(imm) - [IntrWriteMem]>; + [IntrWriteMem], "", [SDNPMemOperand]>; def int_amdgcn_buffer_store_format : AMDGPUBufferStore; def int_amdgcn_buffer_store : AMDGPUBufferStore; @@ -487,7 +489,7 @@ def int_amdgcn_tbuffer_load : Intrinsic < llvm_i32_ty, // nfmt(imm) llvm_i1_ty, // glc(imm) llvm_i1_ty], // slc(imm) - []>; + [IntrReadMem], "", [SDNPMemOperand]>; def int_amdgcn_tbuffer_store : Intrinsic < [], @@ -501,7 +503,7 @@ def int_amdgcn_tbuffer_store : Intrinsic < llvm_i32_ty, // nfmt(imm) llvm_i1_ty, // glc(imm) llvm_i1_ty], // slc(imm) - []>; + [IntrWriteMem], "", [SDNPMemOperand]>; class AMDGPUBufferAtomic : Intrinsic < [llvm_i32_ty], @@ -510,7 +512,7 @@ class AMDGPUBufferAtomic : Intrinsic < llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(SGPR/VGPR/imm) llvm_i1_ty], // slc(imm) - []>; + [], "", [SDNPMemOperand]>; def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic; def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic; def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic; @@ -529,7 +531,7 @@ def int_amdgcn_buffer_atomic_cmpswap : Intrinsic< llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(SGPR/VGPR/imm) llvm_i1_ty], // slc(imm) - []>; + 
[], "", [SDNPMemOperand]>; // Uses that do not set the done bit should set IntrWriteMem on the // call site. diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h index d1901db7c68e..f043c112861b 100644 --- a/include/llvm/Support/CommandLine.h +++ b/include/llvm/Support/CommandLine.h @@ -1862,6 +1862,33 @@ using TokenizerCallback = void (*)(StringRef Source, StringSaver &Saver, SmallVectorImpl &NewArgv, bool MarkEOLs); +/// Tokenizes content of configuration file. +/// +/// \param [in] Source The string representing content of config file. +/// \param [in] Saver Delegates back to the caller for saving parsed strings. +/// \param [out] NewArgv All parsed strings are appended to NewArgv. +/// \param [in] MarkEOLs Added for compatibility with TokenizerCallback. +/// +/// It works like TokenizeGNUCommandLine with ability to skip comment lines. +/// +void tokenizeConfigFile(StringRef Source, StringSaver &Saver, + SmallVectorImpl &NewArgv, + bool MarkEOLs = false); + +/// Reads command line options from the given configuration file. +/// +/// \param [in] CfgFileName Path to configuration file. +/// \param [in] Saver Objects that saves allocated strings. +/// \param [out] Argv Array to which the read options are added. +/// \return true if the file was successfully read. +/// +/// It reads content of the specified file, tokenizes it and expands "@file" +/// commands resolving file names in them relative to the directory where +/// CfgFilename resides. +/// +bool readConfigFile(StringRef CfgFileName, StringSaver &Saver, + SmallVectorImpl &Argv); + /// \brief Expand response files on a command line recursively using the given /// StringSaver and tokenization strategy. Argv should contain the command line /// before expansion and will be modified in place. If requested, Argv will diff --git a/include/llvm/Support/TargetRegistry.h b/include/llvm/Support/TargetRegistry.h index bd096e2f74f6..8a429ab728ed 100644 --- a/include/llvm/Support/TargetRegistry.h +++ b/include/llvm/Support/TargetRegistry.h @@ -123,8 +123,8 @@ class Target { using AsmPrinterCtorTy = AsmPrinter *(*)( TargetMachine &TM, std::unique_ptr &&Streamer); using MCAsmBackendCtorTy = MCAsmBackend *(*)(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options); using MCAsmParserCtorTy = MCTargetAsmParser *(*)( const MCSubtargetInfo &STI, MCAsmParser &P, const MCInstrInfo &MII, @@ -381,15 +381,12 @@ class Target { } /// createMCAsmBackend - Create a target specific assembly parser. - /// - /// \param TheTriple The target triple string. - MCAsmBackend *createMCAsmBackend(const MCRegisterInfo &MRI, - StringRef TheTriple, StringRef CPU, - const MCTargetOptions &Options) - const { + MCAsmBackend *createMCAsmBackend(const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, + const MCTargetOptions &Options) const { if (!MCAsmBackendCtorFn) return nullptr; - return MCAsmBackendCtorFn(*this, MRI, Triple(TheTriple), CPU, Options); + return MCAsmBackendCtorFn(*this, STI, MRI, Options); } /// createMCAsmParser - Create a target specific assembly parser. 
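For orientation, a minimal usage sketch of the new cl::readConfigFile entry point; the tool.cfg name, the surrounding main(), and the header set are illustrative assumptions, not part of the patch:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/StringSaver.h"

int main(int argc, const char **argv) {
  llvm::BumpPtrAllocator Alloc;
  llvm::StringSaver Saver(Alloc);
  // Start from the real command line, then append options read from the
  // config file; readConfigFile skips '#' comment lines and expands nested
  // @file references relative to the config file's directory.
  llvm::SmallVector<const char *, 16> Args(argv, argv + argc);
  if (!llvm::cl::readConfigFile("tool.cfg", Saver, Args))
    return 1;
  llvm::cl::ParseCommandLineOptions(static_cast<int>(Args.size()), Args.data());
  return 0;
}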
@@ -1106,10 +1103,10 @@ template struct RegisterMCAsmBackend { } private: - static MCAsmBackend *Allocator(const Target &T, const MCRegisterInfo &MRI, - const Triple &TheTriple, StringRef CPU, + static MCAsmBackend *Allocator(const Target &T, const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, const MCTargetOptions &Options) { - return new MCAsmBackendImpl(T, MRI, TheTriple, CPU); + return new MCAsmBackendImpl(T, STI, MRI); } }; diff --git a/include/llvm/Transforms/Scalar/LoopPassManager.h b/include/llvm/Transforms/Scalar/LoopPassManager.h index 473b97dc7e8d..56a45ed34178 100644 --- a/include/llvm/Transforms/Scalar/LoopPassManager.h +++ b/include/llvm/Transforms/Scalar/LoopPassManager.h @@ -264,7 +264,8 @@ template class FunctionToLoopPassAdaptor : public PassInfoMixin> { public: - explicit FunctionToLoopPassAdaptor(LoopPassT Pass) : Pass(std::move(Pass)) { + explicit FunctionToLoopPassAdaptor(LoopPassT Pass, bool DebugLogging = false) + : Pass(std::move(Pass)), LoopCanonicalizationFPM(DebugLogging) { LoopCanonicalizationFPM.addPass(LoopSimplifyPass()); LoopCanonicalizationFPM.addPass(LCSSAPass()); } @@ -384,8 +385,8 @@ class FunctionToLoopPassAdaptor /// adaptor. template FunctionToLoopPassAdaptor -createFunctionToLoopPassAdaptor(LoopPassT Pass) { - return FunctionToLoopPassAdaptor(std::move(Pass)); +createFunctionToLoopPassAdaptor(LoopPassT Pass, bool DebugLogging = false) { + return FunctionToLoopPassAdaptor(std::move(Pass), DebugLogging); } /// \brief Pass for printing a loop's contents as textual IR. diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 93fb1143e505..f382a1f50188 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -826,7 +826,7 @@ static Value *SimplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, MaxRecurse)) return V; - // Mul distributes over Add. Try some generic simplifications based on this. + // Mul distributes over Add. Try some generic simplifications based on this. if (Value *V = ExpandBinOp(Instruction::Mul, Op0, Op1, Instruction::Add, Q, MaxRecurse)) return V; @@ -3838,12 +3838,13 @@ Value *llvm::SimplifyInsertElementInst(Value *Vec, Value *Val, Value *Idx, // Fold into undef if index is out of bounds. if (auto *CI = dyn_cast(Idx)) { uint64_t NumElements = cast(Vec->getType())->getNumElements(); - if (CI->uge(NumElements)) return UndefValue::get(Vec->getType()); } - // TODO: We should also fold if index is iteslf an undef. + // If index is undef, it might be out of bounds (see above case) + if (isa(Idx)) + return UndefValue::get(Vec->getType()); return nullptr; } @@ -3896,10 +3897,13 @@ static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQ // If extracting a specified index from the vector, see if we can recursively // find a previously computed scalar that was inserted into the vector. - if (auto *IdxC = dyn_cast(Idx)) - if (IdxC->getValue().ule(Vec->getType()->getVectorNumElements())) - if (Value *Elt = findScalarElement(Vec, IdxC->getZExtValue())) - return Elt; + if (auto *IdxC = dyn_cast(Idx)) { + if (IdxC->getValue().uge(Vec->getType()->getVectorNumElements())) + // definitely out of bounds, thus undefined result + return UndefValue::get(Vec->getType()->getVectorElementType()); + if (Value *Elt = findScalarElement(Vec, IdxC->getZExtValue())) + return Elt; + } // An undef extract index can be arbitrarily chosen to be an out-of-range // index value, which would result in the instruction being undef. 
@@ -4489,28 +4493,55 @@ static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd, } } + Value *IIOperand = *ArgBegin; + Value *X; switch (IID) { case Intrinsic::fabs: { - if (SignBitMustBeZero(*ArgBegin, Q.TLI)) - return *ArgBegin; + if (SignBitMustBeZero(IIOperand, Q.TLI)) + return IIOperand; return nullptr; } case Intrinsic::bswap: { - Value *IIOperand = *ArgBegin; - Value *X = nullptr; // bswap(bswap(x)) -> x if (match(IIOperand, m_BSwap(m_Value(X)))) return X; return nullptr; } case Intrinsic::bitreverse: { - Value *IIOperand = *ArgBegin; - Value *X = nullptr; // bitreverse(bitreverse(x)) -> x if (match(IIOperand, m_BitReverse(m_Value(X)))) return X; return nullptr; } + case Intrinsic::exp: { + // exp(log(x)) -> x + if (Q.CxtI->isFast() && + match(IIOperand, m_Intrinsic(m_Value(X)))) + return X; + return nullptr; + } + case Intrinsic::exp2: { + // exp2(log2(x)) -> x + if (Q.CxtI->isFast() && + match(IIOperand, m_Intrinsic(m_Value(X)))) + return X; + return nullptr; + } + case Intrinsic::log: { + // log(exp(x)) -> x + if (Q.CxtI->isFast() && + match(IIOperand, m_Intrinsic(m_Value(X)))) + return X; + return nullptr; + } + case Intrinsic::log2: { + // log2(exp2(x)) -> x + if (Q.CxtI->isFast() && + match(IIOperand, m_Intrinsic(m_Value(X)))) { + return X; + } + return nullptr; + } default: return nullptr; } diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index f34549ae52b4..10b5c74e378b 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -2358,7 +2358,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, FoundMatch = true; } if (FoundMatch) - return getAddExpr(Ops, Flags); + return getAddExpr(Ops, Flags, Depth + 1); // Check for truncates. If all the operands are truncated from the same // type, see if factoring out the truncate would permit the result to be @@ -6402,9 +6402,8 @@ PushLoopPHIs(const Loop *L, SmallVectorImpl &Worklist) { BasicBlock *Header = L->getHeader(); // Push all Loop-header PHIs onto the Worklist stack. 
- for (BasicBlock::iterator I = Header->begin(); - PHINode *PN = dyn_cast(I); ++I) - Worklist.push_back(PN); + for (PHINode &PN : Header->phis()) + Worklist.push_back(&PN); } const ScalarEvolution::BackedgeTakenInfo & @@ -7638,12 +7637,9 @@ ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, if (!Latch) return nullptr; - for (auto &I : *Header) { - PHINode *PHI = dyn_cast(&I); - if (!PHI) break; - auto *StartCST = getOtherIncomingValue(PHI, Latch); - if (!StartCST) continue; - CurrentIterVals[PHI] = StartCST; + for (PHINode &PHI : Header->phis()) { + if (auto *StartCST = getOtherIncomingValue(&PHI, Latch)) + CurrentIterVals[&PHI] = StartCST; } if (!CurrentIterVals.count(PN)) return RetVal = nullptr; @@ -7720,13 +7716,9 @@ const SCEV *ScalarEvolution::computeExitCountExhaustively(const Loop *L, BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Should follow from NumIncomingValues == 2!"); - for (auto &I : *Header) { - PHINode *PHI = dyn_cast(&I); - if (!PHI) - break; - auto *StartCST = getOtherIncomingValue(PHI, Latch); - if (!StartCST) continue; - CurrentIterVals[PHI] = StartCST; + for (PHINode &PHI : Header->phis()) { + if (auto *StartCST = getOtherIncomingValue(&PHI, Latch)) + CurrentIterVals[&PHI] = StartCST; } if (!CurrentIterVals.count(PN)) return getCouldNotCompute(); diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index 3ceda677ba61..53ce33bacbe9 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -1154,16 +1154,11 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, IVIncInsertLoop && SE.DT.properlyDominates(LatchBlock, IVIncInsertLoop->getHeader()); - for (auto &I : *L->getHeader()) { - auto *PN = dyn_cast(&I); - // Found first non-phi, the rest of instructions are also not Phis. - if (!PN) - break; - - if (!SE.isSCEVable(PN->getType())) + for (PHINode &PN : L->getHeader()->phis()) { + if (!SE.isSCEVable(PN.getType())) continue; - const SCEVAddRecExpr *PhiSCEV = dyn_cast(SE.getSCEV(PN)); + const SCEVAddRecExpr *PhiSCEV = dyn_cast(SE.getSCEV(&PN)); if (!PhiSCEV) continue; @@ -1175,16 +1170,16 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, continue; Instruction *TempIncV = - cast(PN->getIncomingValueForBlock(LatchBlock)); + cast(PN.getIncomingValueForBlock(LatchBlock)); // Check whether we can reuse this PHI node. if (LSRMode) { - if (!isExpandedAddRecExprPHI(PN, TempIncV, L)) + if (!isExpandedAddRecExprPHI(&PN, TempIncV, L)) continue; if (L == IVIncInsertLoop && !hoistIVInc(TempIncV, IVIncInsertPos)) continue; } else { - if (!isNormalAddRecExprPHI(PN, TempIncV, L)) + if (!isNormalAddRecExprPHI(&PN, TempIncV, L)) continue; } @@ -1193,7 +1188,7 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, IncV = TempIncV; TruncTy = nullptr; InvertStep = false; - AddRecPhiMatch = PN; + AddRecPhiMatch = &PN; break; } @@ -1203,7 +1198,7 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, canBeCheaplyTransformed(SE, PhiSCEV, Normalized, InvertStep)) { // Record the phi node. But don't stop we might find an exact match // later. - AddRecPhiMatch = PN; + AddRecPhiMatch = &PN; IncV = TempIncV; TruncTy = SE.getEffectiveSCEVType(Normalized->getType()); } @@ -1863,12 +1858,8 @@ SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT, const TargetTransformInfo *TTI) { // Find integer phis in order of increasing width. 
SmallVector Phis; - for (auto &I : *L->getHeader()) { - if (auto *PN = dyn_cast(&I)) - Phis.push_back(PN); - else - break; - } + for (PHINode &PN : L->getHeader()->phis()) + Phis.push_back(&PN); if (TTI) std::sort(Phis.begin(), Phis.end(), [](Value *LHS, Value *RHS) { diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index cd4cee631568..a0032f99ec20 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -2264,9 +2264,9 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth, // ashr X, C -> adds C sign bits. Vectors too. const APInt *ShAmt; if (match(U->getOperand(1), m_APInt(ShAmt))) { - unsigned ShAmtLimited = ShAmt->getZExtValue(); - if (ShAmtLimited >= TyBits) + if (ShAmt->uge(TyBits)) break; // Bad shift. + unsigned ShAmtLimited = ShAmt->getZExtValue(); Tmp += ShAmtLimited; if (Tmp > TyBits) Tmp = TyBits; } @@ -2277,9 +2277,9 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth, if (match(U->getOperand(1), m_APInt(ShAmt))) { // shl destroys sign bits. Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); + if (ShAmt->uge(TyBits) || // Bad shift. + ShAmt->uge(Tmp)) break; // Shifted all sign bits out. Tmp2 = ShAmt->getZExtValue(); - if (Tmp2 >= TyBits || // Bad shift. - Tmp2 >= Tmp) break; // Shifted all sign bits out. return Tmp - Tmp2; } break; @@ -4161,6 +4161,81 @@ static SelectPatternResult matchClamp(CmpInst::Predicate Pred, return {SPF_UNKNOWN, SPNB_NA, false}; } +/// Recognize variations of: +/// a < c ? min(a,b) : min(b,c) ==> min(min(a,b),min(b,c)) +static SelectPatternResult matchMinMaxOfMinMax(CmpInst::Predicate Pred, + Value *CmpLHS, Value *CmpRHS, + Value *TrueVal, Value *FalseVal) { + // TODO: Allow FP min/max with nnan/nsz. + assert(CmpInst::isIntPredicate(Pred) && "Expected integer comparison"); + + Value *A, *B; + SelectPatternResult L = matchSelectPattern(TrueVal, A, B); + if (!SelectPatternResult::isMinOrMax(L.Flavor)) + return {SPF_UNKNOWN, SPNB_NA, false}; + + Value *C, *D; + SelectPatternResult R = matchSelectPattern(FalseVal, C, D); + if (L.Flavor != R.Flavor) + return {SPF_UNKNOWN, SPNB_NA, false}; + + // Match the compare to the min/max operations of the select operands. + switch (L.Flavor) { + case SPF_SMIN: + if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE) { + Pred = ICmpInst::getSwappedPredicate(Pred); + std::swap(CmpLHS, CmpRHS); + } + if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE) + break; + return {SPF_UNKNOWN, SPNB_NA, false}; + case SPF_SMAX: + if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE) { + Pred = ICmpInst::getSwappedPredicate(Pred); + std::swap(CmpLHS, CmpRHS); + } + if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE) + break; + return {SPF_UNKNOWN, SPNB_NA, false}; + case SPF_UMIN: + if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE) { + Pred = ICmpInst::getSwappedPredicate(Pred); + std::swap(CmpLHS, CmpRHS); + } + if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) + break; + return {SPF_UNKNOWN, SPNB_NA, false}; + case SPF_UMAX: + if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) { + Pred = ICmpInst::getSwappedPredicate(Pred); + std::swap(CmpLHS, CmpRHS); + } + if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE) + break; + return {SPF_UNKNOWN, SPNB_NA, false}; + default: + llvm_unreachable("Bad flavor while matching min/max"); + } + + // a pred c ? 
m(a, b) : m(c, b) --> m(m(a, b), m(c, b)) + if (CmpLHS == A && CmpRHS == C && D == B) + return {L.Flavor, SPNB_NA, false}; + + // a pred d ? m(a, b) : m(b, d) --> m(m(a, b), m(b, d)) + if (CmpLHS == A && CmpRHS == D && C == B) + return {L.Flavor, SPNB_NA, false}; + + // b pred c ? m(a, b) : m(c, a) --> m(m(a, b), m(c, a)) + if (CmpLHS == B && CmpRHS == C && D == A) + return {L.Flavor, SPNB_NA, false}; + + // b pred d ? m(a, b) : m(a, d) --> m(m(a, b), m(a, d)) + if (CmpLHS == B && CmpRHS == D && C == A) + return {L.Flavor, SPNB_NA, false}; + + return {SPF_UNKNOWN, SPNB_NA, false}; +} + /// Match non-obvious integer minimum and maximum sequences. static SelectPatternResult matchMinMax(CmpInst::Predicate Pred, Value *CmpLHS, Value *CmpRHS, @@ -4174,6 +4249,10 @@ static SelectPatternResult matchMinMax(CmpInst::Predicate Pred, if (SPR.Flavor != SelectPatternFlavor::SPF_UNKNOWN) return SPR; + SPR = matchMinMaxOfMinMax(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal); + if (SPR.Flavor != SelectPatternFlavor::SPF_UNKNOWN) + return SPR; + if (Pred != CmpInst::ICMP_SGT && Pred != CmpInst::ICMP_SLT) return {SPF_UNKNOWN, SPNB_NA, false}; diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index d6f55bba716f..9dc1ab4e6bb5 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -633,16 +633,10 @@ bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB, if (DestBBPred == BB) continue; - bool HasAllSameValue = true; - BasicBlock::const_iterator DestBBI = DestBB->begin(); - while (const PHINode *DestPN = dyn_cast(DestBBI++)) { - if (DestPN->getIncomingValueForBlock(BB) != - DestPN->getIncomingValueForBlock(DestBBPred)) { - HasAllSameValue = false; - break; - } - } - if (HasAllSameValue) + if (llvm::all_of(DestBB->phis(), [&](const PHINode &DestPN) { + return DestPN.getIncomingValueForBlock(BB) == + DestPN.getIncomingValueForBlock(DestBBPred); + })) SameIncomingValueBBs.insert(DestBBPred); } @@ -672,9 +666,8 @@ bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB, // We only want to eliminate blocks whose phi nodes are used by phi nodes in // the successor. If there are more complex condition (e.g. preheaders), // don't mess around with them. - BasicBlock::const_iterator BBI = BB->begin(); - while (const PHINode *PN = dyn_cast(BBI++)) { - for (const User *U : PN->users()) { + for (const PHINode &PN : BB->phis()) { + for (const User *U : PN.users()) { const Instruction *UI = cast(U); if (UI->getParent() != DestBB || !isa(UI)) return false; @@ -713,10 +706,9 @@ bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB, for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) { BasicBlock *Pred = DestBBPN->getIncomingBlock(i); if (BBPreds.count(Pred)) { // Common predecessor? - BBI = DestBB->begin(); - while (const PHINode *PN = dyn_cast(BBI++)) { - const Value *V1 = PN->getIncomingValueForBlock(Pred); - const Value *V2 = PN->getIncomingValueForBlock(BB); + for (const PHINode &PN : DestBB->phis()) { + const Value *V1 = PN.getIncomingValueForBlock(Pred); + const Value *V2 = PN.getIncomingValueForBlock(BB); // If V2 is a phi node in BB, look up what the mapped value will be. if (const PHINode *V2PN = dyn_cast(V2)) @@ -759,11 +751,9 @@ void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) { // Otherwise, we have multiple predecessors of BB. Update the PHIs in DestBB // to handle the new incoming edges it is about to have. 
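As a plain C++ analogue of the select pattern that matchMinMaxOfMinMax now recognizes (an assumed illustration, not code from the patch), both forms below compute min(a, b, c), so the select can be classified as the same min/max flavor as its arms:

#include <algorithm>

// a < c ? min(a, b) : min(c, b)  ==  min(min(a, b), min(c, b))
int selectOfMins(int a, int b, int c) {
  return a < c ? std::min(a, b) : std::min(c, b);
}

int foldedForm(int a, int b, int c) {
  return std::min(std::min(a, b), std::min(c, b));
}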
- PHINode *PN; - for (BasicBlock::iterator BBI = DestBB->begin(); - (PN = dyn_cast(BBI)); ++BBI) { + for (PHINode &PN : DestBB->phis()) { // Remove the incoming value for BB, and remember it. - Value *InVal = PN->removeIncomingValue(BB, false); + Value *InVal = PN.removeIncomingValue(BB, false); // Two options: either the InVal is a phi node defined in BB or it is some // value that dominates BB. @@ -771,17 +761,17 @@ void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) { if (InValPhi && InValPhi->getParent() == BB) { // Add all of the input values of the input PHI as inputs of this phi. for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i) - PN->addIncoming(InValPhi->getIncomingValue(i), - InValPhi->getIncomingBlock(i)); + PN.addIncoming(InValPhi->getIncomingValue(i), + InValPhi->getIncomingBlock(i)); } else { // Otherwise, add one instance of the dominating value for each edge that // we will be adding. if (PHINode *BBPN = dyn_cast(BB->begin())) { for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i) - PN->addIncoming(InVal, BBPN->getIncomingBlock(i)); + PN.addIncoming(InVal, BBPN->getIncomingBlock(i)); } else { for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) - PN->addIncoming(InVal, *PI); + PN.addIncoming(InVal, *PI); } } } @@ -6497,22 +6487,16 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) { std::swap(TBB, FBB); // Replace the old BB with the new BB. - for (auto &I : *TBB) { - PHINode *PN = dyn_cast(&I); - if (!PN) - break; + for (PHINode &PN : TBB->phis()) { int i; - while ((i = PN->getBasicBlockIndex(&BB)) >= 0) - PN->setIncomingBlock(i, TmpBB); + while ((i = PN.getBasicBlockIndex(&BB)) >= 0) + PN.setIncomingBlock(i, TmpBB); } // Add another incoming edge form the new BB. - for (auto &I : *FBB) { - PHINode *PN = dyn_cast(&I); - if (!PN) - break; - auto *Val = PN->getIncomingValueForBlock(&BB); - PN->addIncoming(Val, TmpBB); + for (PHINode &PN : FBB->phis()) { + auto *Val = PN.getIncomingValueForBlock(&BB); + PN.addIncoming(Val, TmpBB); } // Update the branch weights (from SelectionDAGBuilder:: diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp index 433f99b0113b..705d4ded5b56 100644 --- a/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -815,7 +815,14 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { if (CI.isInlineAsm()) return translateInlineAsm(CI, MIRBuilder); - if (!F || !F->isIntrinsic()) { + Intrinsic::ID ID = Intrinsic::not_intrinsic; + if (F && F->isIntrinsic()) { + ID = F->getIntrinsicID(); + if (TII && ID == Intrinsic::not_intrinsic) + ID = static_cast(TII->getIntrinsicID(F)); + } + + if (!F || !F->isIntrinsic() || ID == Intrinsic::not_intrinsic) { unsigned Res = CI.getType()->isVoidTy() ? 
0 : getOrCreateVReg(CI); SmallVector Args; for (auto &Arg: CI.arg_operands()) @@ -827,10 +834,6 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { }); } - Intrinsic::ID ID = F->getIntrinsicID(); - if (TII && ID == Intrinsic::not_intrinsic) - ID = static_cast(TII->getIntrinsicID(F)); - assert(ID != Intrinsic::not_intrinsic && "unknown intrinsic"); if (translateKnownIntrinsic(CI, ID, MIRBuilder)) diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index a3b43c92a7fc..c7118201b753 100644 --- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -813,7 +813,21 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { unsigned Zero = MRI.createGenericVirtualRegister(Ty); MIRBuilder.buildConstant(Zero, 0); - MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero); + + // For *signed* multiply, overflow is detected by checking: + // (hi != (lo >> bitwidth-1)) + if (Opcode == TargetOpcode::G_SMULH) { + unsigned Shifted = MRI.createGenericVirtualRegister(Ty); + unsigned ShiftAmt = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildConstant(ShiftAmt, Ty.getSizeInBits() - 1); + MIRBuilder.buildInstr(TargetOpcode::G_ASHR) + .addDef(Shifted) + .addUse(Res) + .addUse(ShiftAmt); + MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted); + } else { + MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero); + } MI.eraseFromParent(); return Legalized; } diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index 77a7aaa95732..4c6e21ab315a 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -136,8 +136,7 @@ bool LLVMTargetMachine::addAsmPrinter(PassManagerBase &PM, MCE = getTarget().createMCCodeEmitter(MII, MRI, Context); MCAsmBackend *MAB = - getTarget().createMCAsmBackend(MRI, getTargetTriple().str(), TargetCPU, - Options.MCOptions); + getTarget().createMCAsmBackend(STI, MRI, Options.MCOptions); auto FOut = llvm::make_unique(Out); MCStreamer *S = getTarget().createAsmStreamer( Context, std::move(FOut), Options.MCOptions.AsmVerbose, @@ -151,8 +150,7 @@ bool LLVMTargetMachine::addAsmPrinter(PassManagerBase &PM, // emission fails. MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(MII, MRI, Context); MCAsmBackend *MAB = - getTarget().createMCAsmBackend(MRI, getTargetTriple().str(), TargetCPU, - Options.MCOptions); + getTarget().createMCAsmBackend(STI, MRI, Options.MCOptions); if (!MCE || !MAB) return true; @@ -225,17 +223,16 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx, // Create the code emitter for the target if it exists. If not, .o file // emission fails. 
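A scalar analogue (assumed example) of the signed-overflow check that the LegalizerHelper change introduces for G_SMULO: the multiply is done in the wider type, and overflow occurred exactly when the high half is not the sign-extension of the low half, i.e. hi != (lo >> (bitwidth - 1)):

#include <cstdint>

bool smulo32(int32_t A, int32_t B, int32_t &Lo) {
  int64_t Wide = static_cast<int64_t>(A) * static_cast<int64_t>(B);
  Lo = static_cast<int32_t>(Wide);                // low half of the product
  int32_t Hi = static_cast<int32_t>(Wide >> 32);  // high half
  // Arithmetic shift assumed: Lo >> 31 replicates Lo's sign bit.
  return Hi != (Lo >> 31);
}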
+ const MCSubtargetInfo &STI = *getMCSubtargetInfo(); const MCRegisterInfo &MRI = *getMCRegisterInfo(); MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getMCInstrInfo(), MRI, *Ctx); MCAsmBackend *MAB = - getTarget().createMCAsmBackend(MRI, getTargetTriple().str(), TargetCPU, - Options.MCOptions); + getTarget().createMCAsmBackend(STI, MRI, Options.MCOptions); if (!MCE || !MAB) return true; const Triple &T = getTargetTriple(); - const MCSubtargetInfo &STI = *getMCSubtargetInfo(); std::unique_ptr AsmStreamer(getTarget().createMCObjectStreamer( T, *Ctx, std::unique_ptr(MAB), Out, std::unique_ptr(MCE), STI, Options.MCOptions.MCRelaxAll, diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp index 34572f24c181..75e3d35169cf 100644 --- a/lib/CodeGen/LiveDebugVariables.cpp +++ b/lib/CodeGen/LiveDebugVariables.cpp @@ -242,8 +242,11 @@ class UserValue { // We are storing a MachineOperand outside a MachineInstr. locations.back().clearParent(); // Don't store def operands. - if (locations.back().isReg()) + if (locations.back().isReg()) { + if (locations.back().isDef()) + locations.back().setIsDead(false); locations.back().setIsUse(); + } return locations.size() - 1; } diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 3218dce8f575..81bff4d7eefa 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3850,7 +3850,6 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N, return false; } case ISD::ZERO_EXTEND: - case ISD::ANY_EXTEND: case ISD::AssertZext: { unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes(); EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); @@ -13783,30 +13782,30 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { } } - // Deal with elidable overlapping chained stores. - if (StoreSDNode *ST1 = dyn_cast(Chain)) - if (OptLevel != CodeGenOpt::None && ST->isUnindexed() && - ST1->isUnindexed() && !ST1->isVolatile() && ST1->hasOneUse() && - !ST1->getBasePtr().isUndef() && !ST->isVolatile()) { - BaseIndexOffset STBasePtr = BaseIndexOffset::match(ST->getBasePtr(), DAG); - BaseIndexOffset ST1BasePtr = - BaseIndexOffset::match(ST1->getBasePtr(), DAG); - unsigned STBytes = ST->getMemoryVT().getStoreSize(); - unsigned ST1Bytes = ST1->getMemoryVT().getStoreSize(); - int64_t PtrDiff; - // If this is a store who's preceeding store to a subset of the same - // memory and no one other node is chained to that store we can - // effectively drop the store. Do not remove stores to undef as they may - // be used as data sinks. + if (StoreSDNode *ST1 = dyn_cast(Chain)) { + if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() && + !ST1->isVolatile() && ST1->getBasePtr() == Ptr && + ST->getMemoryVT() == ST1->getMemoryVT()) { + // If this is a store followed by a store with the same value to the same + // location, then the store is dead/noop. + if (ST1->getValue() == Value) { + // The store is dead, remove it. + return Chain; + } - if (((ST->getBasePtr() == ST1->getBasePtr()) && - (ST->getValue() == ST1->getValue())) || - (STBasePtr.equalBaseIndex(ST1BasePtr, DAG, PtrDiff) && - (0 <= PtrDiff) && (PtrDiff + ST1Bytes <= STBytes))) { + // If this is a store who's preceeding store to the same location + // and no one other node is chained to that store we can effectively + // drop the store. Do not remove stores to undef as they may be used as + // data sinks. 
+ if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() && + !ST1->getBasePtr().isUndef()) { + // ST1 is fully overwritten and can be elided. Combine with it's chain + // value. CombineTo(ST1, ST1->getChain()); - return SDValue(N, 0); + return SDValue(); } } + } // If this is an FP_ROUND or TRUNC followed by a store, fold this into a // truncating store. We can do this even if this is already a truncstore. diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index d3c94b5f9e6b..3c856914053b 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -2051,11 +2051,9 @@ bool FastISel::handlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { // At this point we know that there is a 1-1 correspondence between LLVM PHI // nodes and Machine PHI nodes, but the incoming operands have not been // emitted yet. - for (BasicBlock::const_iterator I = SuccBB->begin(); - const auto *PN = dyn_cast(I); ++I) { - + for (const PHINode &PN : SuccBB->phis()) { // Ignore dead phi's. - if (PN->use_empty()) + if (PN.use_empty()) continue; // Only handle legal types. Two interesting things to note here. First, @@ -2064,7 +2062,7 @@ bool FastISel::handlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { // own moves. Second, this check is necessary because FastISel doesn't // use CreateRegs to create registers, so it always creates // exactly one register for each non-void instruction. - EVT VT = TLI.getValueType(DL, PN->getType(), /*AllowUnknown=*/true); + EVT VT = TLI.getValueType(DL, PN.getType(), /*AllowUnknown=*/true); if (VT == MVT::Other || !TLI.isTypeLegal(VT)) { // Handle integer promotions, though, because they're common and easy. if (!(VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)) { @@ -2073,11 +2071,11 @@ bool FastISel::handlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { } } - const Value *PHIOp = PN->getIncomingValueForBlock(LLVMBB); + const Value *PHIOp = PN.getIncomingValueForBlock(LLVMBB); // Set the DebugLoc for the copy. Prefer the location of the operand // if there is one; use the location of the PHI otherwise. - DbgLoc = PN->getDebugLoc(); + DbgLoc = PN.getDebugLoc(); if (const auto *Inst = dyn_cast(PHIOp)) DbgLoc = Inst->getDebugLoc(); diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index c7cdb49203b1..81347fa4bd46 100644 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -257,20 +257,20 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, // Create Machine PHI nodes for LLVM PHI nodes, lowering them as // appropriate. 
- for (BasicBlock::const_iterator I = BB.begin(); - const PHINode *PN = dyn_cast(I); ++I) { - if (PN->use_empty()) continue; - - // Skip empty types - if (PN->getType()->isEmptyTy()) + for (const PHINode &PN : BB.phis()) { + if (PN.use_empty()) continue; - DebugLoc DL = PN->getDebugLoc(); - unsigned PHIReg = ValueMap[PN]; + // Skip empty types + if (PN.getType()->isEmptyTy()) + continue; + + DebugLoc DL = PN.getDebugLoc(); + unsigned PHIReg = ValueMap[&PN]; assert(PHIReg && "PHI node does not have an assigned virtual register!"); SmallVector ValueVTs; - ComputeValueVTs(*TLI, MF->getDataLayout(), PN->getType(), ValueVTs); + ComputeValueVTs(*TLI, MF->getDataLayout(), PN.getType(), ValueVTs); for (EVT VT : ValueVTs) { unsigned NumRegisters = TLI->getNumRegisters(Fn->getContext(), VT); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 7643790df350..6a141818bb6d 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -139,14 +139,14 @@ class VectorLegalizer { /// \brief Implements [SU]INT_TO_FP vector promotion. /// - /// This is a [zs]ext of the input operand to the next size up. + /// This is a [zs]ext of the input operand to a larger integer type. SDValue PromoteINT_TO_FP(SDValue Op); /// \brief Implements FP_TO_[SU]INT vector promotion of the result type. /// - /// It is promoted to the next size up integer type. The result is then + /// It is promoted to a larger integer type. The result is then /// truncated back to the original type. - SDValue PromoteFP_TO_INT(SDValue Op, bool isSigned); + SDValue PromoteFP_TO_INT(SDValue Op); public: VectorLegalizer(SelectionDAG& dag) : @@ -431,7 +431,7 @@ SDValue VectorLegalizer::Promote(SDValue Op) { case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: // Promote the operation by extending the operand. - return PromoteFP_TO_INT(Op, Op->getOpcode() == ISD::FP_TO_SINT); + return PromoteFP_TO_INT(Op); } // There are currently two cases of vector promotion: @@ -472,20 +472,11 @@ SDValue VectorLegalizer::Promote(SDValue Op) { SDValue VectorLegalizer::PromoteINT_TO_FP(SDValue Op) { // INT_TO_FP operations may require the input operand be promoted even // when the type is otherwise legal. - EVT VT = Op.getOperand(0).getValueType(); - assert(Op.getNode()->getNumValues() == 1 && - "Can't promote a vector with multiple results!"); + MVT VT = Op.getOperand(0).getSimpleValueType(); + MVT NVT = TLI.getTypeToPromoteTo(Op.getOpcode(), VT); + assert(NVT.getVectorNumElements() == VT.getVectorNumElements() && + "Vectors have different number of elements!"); - // Normal getTypeToPromoteTo() doesn't work here, as that will promote - // by widening the vector w/ the same element width and twice the number - // of elements. We want the other way around, the same number of elements, - // each twice the width. - // - // Increase the bitwidth of the element to the next pow-of-two - // (which is greater than 8 bits). - - EVT NVT = VT.widenIntegerVectorElementType(*DAG.getContext()); - assert(NVT.isSimple() && "Promoting to a non-simple vector type!"); SDLoc dl(Op); SmallVector Operands(Op.getNumOperands()); @@ -505,35 +496,28 @@ SDValue VectorLegalizer::PromoteINT_TO_FP(SDValue Op) { // elements and then truncate the result. This is different from the default // PromoteVector which uses bitcast to promote thus assumning that the // promoted vector type has the same overall size. 
-SDValue VectorLegalizer::PromoteFP_TO_INT(SDValue Op, bool isSigned) { - assert(Op.getNode()->getNumValues() == 1 && - "Can't promote a vector with multiple results!"); - EVT VT = Op.getValueType(); +SDValue VectorLegalizer::PromoteFP_TO_INT(SDValue Op) { + MVT VT = Op.getSimpleValueType(); + MVT NVT = TLI.getTypeToPromoteTo(Op.getOpcode(), VT); + assert(NVT.getVectorNumElements() == VT.getVectorNumElements() && + "Vectors have different number of elements!"); - EVT NewVT = VT; - unsigned NewOpc; - while (true) { - NewVT = NewVT.widenIntegerVectorElementType(*DAG.getContext()); - assert(NewVT.isSimple() && "Promoting to a non-simple vector type!"); - if (TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NewVT)) { - NewOpc = ISD::FP_TO_SINT; - break; - } - if (!isSigned && TLI.isOperationLegalOrCustom(ISD::FP_TO_UINT, NewVT)) { - NewOpc = ISD::FP_TO_UINT; - break; - } - } + unsigned NewOpc = Op->getOpcode(); + // Change FP_TO_UINT to FP_TO_SINT if possible. + // TODO: Should we only do this if FP_TO_UINT itself isn't legal? + if (NewOpc == ISD::FP_TO_UINT && + TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NVT)) + NewOpc = ISD::FP_TO_SINT; SDLoc dl(Op); - SDValue Promoted = DAG.getNode(NewOpc, dl, NewVT, Op.getOperand(0)); + SDValue Promoted = DAG.getNode(NewOpc, dl, NVT, Op.getOperand(0)); // Assert that the converted value fits in the original type. If it doesn't // (eg: because the value being converted is too big), then the result of the // original operation was undefined anyway, so the assert is still correct. Promoted = DAG.getNode(Op->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext : ISD::AssertSext, - dl, NewVT, Promoted, + dl, NVT, Promoted, DAG.getValueType(VT.getScalarType())); return DAG.getNode(ISD::TRUNCATE, dl, VT, Promoted); } diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index ce1c01b621f0..df1cbeb92740 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -3374,11 +3374,9 @@ SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) { EVT VT = N->getValueType(0); SDValue InOp = N->getOperand(0); - // If some legalization strategy other than widening is used on the operand, - // we can't safely assume that just extending the low lanes is the correct - // transformation. - if (getTypeAction(InOp.getValueType()) != TargetLowering::TypeWidenVector) - return WidenVecOp_Convert(N); + assert(getTypeAction(InOp.getValueType()) == + TargetLowering::TypeWidenVector && + "Unexpected type action"); InOp = GetWidenedVector(InOp); assert(VT.getVectorNumElements() < InOp.getValueType().getVectorNumElements() && @@ -3440,20 +3438,31 @@ SDValue DAGTypeLegalizer::WidenVecOp_FCOPYSIGN(SDNode *N) { } SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) { - // Since the result is legal and the input is illegal, it is unlikely that we - // can fix the input to a legal type so unroll the convert into some scalar - // code and create a nasty build vector. + // Since the result is legal and the input is illegal. 
EVT VT = N->getValueType(0); EVT EltVT = VT.getVectorElementType(); SDLoc dl(N); unsigned NumElts = VT.getVectorNumElements(); SDValue InOp = N->getOperand(0); - if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector) - InOp = GetWidenedVector(InOp); + assert(getTypeAction(InOp.getValueType()) == + TargetLowering::TypeWidenVector && + "Unexpected type action"); + InOp = GetWidenedVector(InOp); EVT InVT = InOp.getValueType(); + unsigned Opcode = N->getOpcode(); + + // See if a widened result type would be legal, if so widen the node. + EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, + InVT.getVectorNumElements()); + if (TLI.isTypeLegal(WideVT)) { + SDValue Res = DAG.getNode(Opcode, dl, WideVT, InOp); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, + DAG.getIntPtrConstant(0, dl)); + } + EVT InEltVT = InVT.getVectorElementType(); - unsigned Opcode = N->getOpcode(); + // Unroll the convert into some scalar code and create a nasty build vector. SmallVector Ops(NumElts); for (unsigned i=0; i < NumElts; ++i) Ops[i] = DAG.getNode( @@ -3506,8 +3515,10 @@ SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) { unsigned NumOperands = N->getNumOperands(); for (unsigned i=0; i < NumOperands; ++i) { SDValue InOp = N->getOperand(i); - if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector) - InOp = GetWidenedVector(InOp); + assert(getTypeAction(InOp.getValueType()) == + TargetLowering::TypeWidenVector && + "Unexpected type action"); + InOp = GetWidenedVector(InOp); for (unsigned j=0; j < NumInElts; ++j) Ops[Idx++] = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 71cb8cb78f6d..68bbd62e1321 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8940,17 +8940,17 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { // At this point we know that there is a 1-1 correspondence between LLVM PHI // nodes and Machine PHI nodes, but the incoming operands have not been // emitted yet. - for (BasicBlock::const_iterator I = SuccBB->begin(); - const PHINode *PN = dyn_cast(I); ++I) { + for (const PHINode &PN : SuccBB->phis()) { // Ignore dead phi's. - if (PN->use_empty()) continue; + if (PN.use_empty()) + continue; // Skip empty types - if (PN->getType()->isEmptyTy()) + if (PN.getType()->isEmptyTy()) continue; unsigned Reg; - const Value *PHIOp = PN->getIncomingValueForBlock(LLVMBB); + const Value *PHIOp = PN.getIncomingValueForBlock(LLVMBB); if (const Constant *C = dyn_cast(PHIOp)) { unsigned &RegOut = ConstantsOut[C]; @@ -8977,7 +8977,7 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { // the input for this MBB. 
SmallVector ValueVTs; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - ComputeValueVTs(TLI, DAG.getDataLayout(), PN->getType(), ValueVTs); + ComputeValueVTs(TLI, DAG.getDataLayout(), PN.getType(), ValueVTs); for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) { EVT VT = ValueVTs[vti]; unsigned NumRegisters = TLI.getNumRegisters(*DAG.getContext(), VT); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index d13ccc263718..befd797e75b4 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -1445,13 +1445,11 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { } if (AllPredsVisited) { - for (BasicBlock::const_iterator I = LLVMBB->begin(); - const PHINode *PN = dyn_cast(I); ++I) - FuncInfo->ComputePHILiveOutRegInfo(PN); + for (const PHINode &PN : LLVMBB->phis()) + FuncInfo->ComputePHILiveOutRegInfo(&PN); } else { - for (BasicBlock::const_iterator I = LLVMBB->begin(); - const PHINode *PN = dyn_cast(I); ++I) - FuncInfo->InvalidatePHILiveOutRegInfo(PN); + for (const PHINode &PN : LLVMBB->phis()) + FuncInfo->InvalidatePHILiveOutRegInfo(&PN); } FuncInfo->VisitedBBs.insert(LLVMBB); diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp index 121bed5a79cb..c90a93d7e247 100644 --- a/lib/CodeGen/TargetPassConfig.cpp +++ b/lib/CodeGen/TargetPassConfig.cpp @@ -712,8 +712,11 @@ bool TargetPassConfig::addCoreISelPasses() { // Ask the target for an isel. // Enable GlobalISel if the target wants to, but allow that to be overriden. + // Explicitly enabling fast-isel should override implicitly enabled + // global-isel. if (EnableGlobalISel == cl::BOU_TRUE || - (EnableGlobalISel == cl::BOU_UNSET && isGlobalISelEnabled())) { + (EnableGlobalISel == cl::BOU_UNSET && isGlobalISelEnabled() && + EnableFastISelOption != cl::BOU_TRUE)) { if (addIRTranslator()) return true; @@ -1133,7 +1136,12 @@ bool TargetPassConfig::isGlobalISelEnabled() const { } bool TargetPassConfig::isGlobalISelAbortEnabled() const { - return EnableGlobalISelAbort == 1; + if (EnableGlobalISelAbort.getNumOccurrences() > 0) + return EnableGlobalISelAbort == 1; + + // When no abort behaviour is specified, we don't abort if the target says + // that GISel is enabled. 
+ return !isGlobalISelEnabled(); } bool TargetPassConfig::reportDiagnosticWhenGlobalISelFallback() const { diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp index 7ad84734203d..0b16a113640d 100644 --- a/lib/CodeGen/WinEHPrepare.cpp +++ b/lib/CodeGen/WinEHPrepare.cpp @@ -838,17 +838,11 @@ void WinEHPrepare::cloneCommonBlocks(Function &F) { for (auto &BBMapping : Orig2Clone) { BasicBlock *OldBlock = BBMapping.first; BasicBlock *NewBlock = BBMapping.second; - for (Instruction &OldI : *OldBlock) { - auto *OldPN = dyn_cast(&OldI); - if (!OldPN) - break; - UpdatePHIOnClonedBlock(OldPN, /*IsForOldBlock=*/true); + for (PHINode &OldPN : OldBlock->phis()) { + UpdatePHIOnClonedBlock(&OldPN, /*IsForOldBlock=*/true); } - for (Instruction &NewI : *NewBlock) { - auto *NewPN = dyn_cast(&NewI); - if (!NewPN) - break; - UpdatePHIOnClonedBlock(NewPN, /*IsForOldBlock=*/false); + for (PHINode &NewPN : NewBlock->phis()) { + UpdatePHIOnClonedBlock(&NewPN, /*IsForOldBlock=*/false); } } @@ -858,17 +852,13 @@ void WinEHPrepare::cloneCommonBlocks(Function &F) { BasicBlock *OldBlock = BBMapping.first; BasicBlock *NewBlock = BBMapping.second; for (BasicBlock *SuccBB : successors(NewBlock)) { - for (Instruction &SuccI : *SuccBB) { - auto *SuccPN = dyn_cast(&SuccI); - if (!SuccPN) - break; - + for (PHINode &SuccPN : SuccBB->phis()) { // Ok, we have a PHI node. Figure out what the incoming value was for // the OldBlock. - int OldBlockIdx = SuccPN->getBasicBlockIndex(OldBlock); + int OldBlockIdx = SuccPN.getBasicBlockIndex(OldBlock); if (OldBlockIdx == -1) break; - Value *IV = SuccPN->getIncomingValue(OldBlockIdx); + Value *IV = SuccPN.getIncomingValue(OldBlockIdx); // Remap the value if necessary. if (auto *Inst = dyn_cast(IV)) { @@ -877,7 +867,7 @@ void WinEHPrepare::cloneCommonBlocks(Function &F) { IV = I->second; } - SuccPN->addIncoming(IV, NewBlock); + SuccPN.addIncoming(IV, NewBlock); } } } diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp index 22513924a96d..938c40182b92 100644 --- a/lib/IR/BasicBlock.cpp +++ b/lib/IR/BasicBlock.cpp @@ -264,7 +264,8 @@ const BasicBlock *BasicBlock::getUniqueSuccessor() const { } iterator_range BasicBlock::phis() { - return make_range(dyn_cast(&front()), nullptr); + PHINode *P = empty() ? nullptr : dyn_cast(&*begin()); + return make_range(P, nullptr); } /// This method is used to notify a BasicBlock that the diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 534104686d81..1754f7d45011 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -2210,24 +2210,23 @@ void Verifier::visitBasicBlock(BasicBlock &BB) { SmallVector Preds(pred_begin(&BB), pred_end(&BB)); SmallVector, 8> Values; std::sort(Preds.begin(), Preds.end()); - PHINode *PN; - for (BasicBlock::iterator I = BB.begin(); (PN = dyn_cast(I));++I) { + for (const PHINode &PN : BB.phis()) { // Ensure that PHI nodes have at least one entry! - Assert(PN->getNumIncomingValues() != 0, + Assert(PN.getNumIncomingValues() != 0, "PHI nodes must have at least one entry. If the block is dead, " "the PHI should be removed!", - PN); - Assert(PN->getNumIncomingValues() == Preds.size(), + &PN); + Assert(PN.getNumIncomingValues() == Preds.size(), "PHINode should have one entry for each predecessor of its " "parent basic block!", - PN); + &PN); // Get and sort all incoming values in the PHI node... 
Values.clear(); - Values.reserve(PN->getNumIncomingValues()); - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - Values.push_back(std::make_pair(PN->getIncomingBlock(i), - PN->getIncomingValue(i))); + Values.reserve(PN.getNumIncomingValues()); + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) + Values.push_back( + std::make_pair(PN.getIncomingBlock(i), PN.getIncomingValue(i))); std::sort(Values.begin(), Values.end()); for (unsigned i = 0, e = Values.size(); i != e; ++i) { @@ -2239,12 +2238,12 @@ void Verifier::visitBasicBlock(BasicBlock &BB) { Values[i].second == Values[i - 1].second, "PHI node has multiple entries for the same basic block with " "different incoming values!", - PN, Values[i].first, Values[i].second, Values[i - 1].second); + &PN, Values[i].first, Values[i].second, Values[i - 1].second); // Check to make sure that the predecessors and PHI node entries are // matched up. Assert(Values[i].first == Preds[i], - "PHI node entries do not match predecessors!", PN, + "PHI node entries do not match predecessors!", &PN, Values[i].first, Preds[i]); } } diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp index 3e2150a451e0..c634df99a115 100644 --- a/lib/MC/MCParser/ELFAsmParser.cpp +++ b/lib/MC/MCParser/ELFAsmParser.cpp @@ -447,7 +447,7 @@ bool ELFAsmParser::parseMetadataSym(MCSymbolELF *&Associated) { Lex(); StringRef Name; if (getParser().parseIdentifier(Name)) - return true; + return TokError("invalid metadata symbol"); Associated = dyn_cast_or_null(getContext().lookupSymbol(Name)); if (!Associated || !Associated->isInSection()) return TokError("symbol is not in a section: " + Name); diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index cbae16a04ca6..21003c0be7e1 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -412,10 +412,10 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, // We provide the opt remark emitter pass for LICM to use. We only need to do // this once as it is immutable. FPM.addPass(RequireAnalysisPass()); - FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1))); + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), DebugLogging)); FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); - FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2))); + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), DebugLogging)); // Eliminate redundancies. if (Level != O1) { @@ -450,7 +450,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(JumpThreadingPass()); FPM.addPass(CorrelatedValuePropagationPass()); FPM.addPass(DSEPass()); - FPM.addPass(createFunctionToLoopPassAdaptor(LICMPass())); + FPM.addPass(createFunctionToLoopPassAdaptor(LICMPass(), DebugLogging)); for (auto &C : ScalarOptimizerLateEPCallbacks) C(FPM, Level); @@ -510,7 +510,8 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging, MPM.addPass(PGOInstrumentationGen()); FunctionPassManager FPM; - FPM.addPass(createFunctionToLoopPassAdaptor(LoopRotatePass())); + FPM.addPass( + createFunctionToLoopPassAdaptor(LoopRotatePass(), DebugLogging)); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); // Add the profile lowering pass. @@ -730,7 +731,8 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, C(OptimizePM, Level); // First rotate loops that may have been un-rotated by prior passes. 
- OptimizePM.addPass(createFunctionToLoopPassAdaptor(LoopRotatePass())); + OptimizePM.addPass( + createFunctionToLoopPassAdaptor(LoopRotatePass(), DebugLogging)); // Distribute loops to allow partial vectorization. I.e. isolate dependences // into separate loop that would otherwise inhibit vectorization. This is @@ -777,7 +779,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, OptimizePM.addPass(LoopUnrollPass(Level)); OptimizePM.addPass(InstCombinePass()); OptimizePM.addPass(RequireAnalysisPass()); - OptimizePM.addPass(createFunctionToLoopPassAdaptor(LICMPass())); + OptimizePM.addPass(createFunctionToLoopPassAdaptor(LICMPass(), DebugLogging)); // Now that we've vectorized and unrolled loops, we may have more refined // alignment information, try to re-derive it here. @@ -1533,7 +1535,8 @@ bool PassBuilder::parseFunctionPass(FunctionPassManager &FPM, DebugLogging)) return false; // Add the nested pass manager with the appropriate adaptor. - FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM))); + FPM.addPass( + createFunctionToLoopPassAdaptor(std::move(LPM), DebugLogging)); return true; } if (auto Count = parseRepeatPassName(Name)) { diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp index 4caf4a4fdce0..d95b791972c8 100644 --- a/lib/Support/CommandLine.cpp +++ b/lib/Support/CommandLine.cpp @@ -873,6 +873,45 @@ void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver, NewArgv.push_back(nullptr); } +void cl::tokenizeConfigFile(StringRef Source, StringSaver &Saver, + SmallVectorImpl &NewArgv, + bool MarkEOLs) { + for (const char *Cur = Source.begin(); Cur != Source.end();) { + SmallString<128> Line; + // Check for comment line. + if (isWhitespace(*Cur)) { + while (Cur != Source.end() && isWhitespace(*Cur)) + ++Cur; + continue; + } + if (*Cur == '#') { + while (Cur != Source.end() && *Cur != '\n') + ++Cur; + continue; + } + // Find end of the current line. + const char *Start = Cur; + for (const char *End = Source.end(); Cur != End; ++Cur) { + if (*Cur == '\\') { + if (Cur + 1 != End) { + ++Cur; + if (*Cur == '\n' || + (*Cur == '\r' && (Cur + 1 != End) && Cur[1] == '\n')) { + Line.append(Start, Cur - 1); + if (*Cur == '\r') + ++Cur; + Start = Cur + 1; + } + } + } else if (*Cur == '\n') + break; + } + // Tokenize line. + Line.append(Start, Cur); + cl::TokenizeGNUCommandLine(Line, Saver, NewArgv, MarkEOLs); + } +} + // It is called byte order marker but the UTF-8 BOM is actually not affected // by the host system's endianness. 
static bool hasUTF8ByteOrderMark(ArrayRef S) { @@ -977,6 +1016,15 @@ bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer, return AllExpanded; } +bool cl::readConfigFile(StringRef CfgFile, StringSaver &Saver, + SmallVectorImpl &Argv) { + if (!ExpandResponseFile(CfgFile, Saver, cl::tokenizeConfigFile, Argv, + /*MarkEOLs*/ false, /*RelativeNames*/ true)) + return false; + return ExpandResponseFiles(Saver, cl::tokenizeConfigFile, Argv, + /*MarkEOLs*/ false, /*RelativeNames*/ true); +} + /// ParseEnvironmentOptions - An alternative entry point to the /// CommandLine library, which allows you to read the program's name /// from the caller (as PROGNAME) and its command-line arguments from diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 6f7b2b6fd5b5..41ed24c329ef 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -632,16 +632,16 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // AArch64 doesn't have a direct vector ->f32 conversion instructions for // elements smaller than i32, so promote the input to i32 first. - setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote); + setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32); + setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32); + setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32); + setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32); // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16 // -> v8f16 conversions. - setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Promote); - setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote); - setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Promote); + setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32); + setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32); + setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32); + setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32); // Similarly, there is no direct i32 -> f64 vector conversion instruction. setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td index 39e3e33b0d27..9023c3dd8c25 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/lib/Target/AArch64/AArch64RegisterInfo.td @@ -756,27 +756,31 @@ class ZPRRegOp { +// SVE predicate register classes. +class PPRClass : RegisterClass< + "AArch64", + [ nxv16i1, nxv8i1, nxv4i1, nxv2i1 ], 16, + (sequence "P%u", 0, lastreg)> { let Size = 16; } -class PPRAsmOperand : AsmOperandClass { +def PPR : PPRClass<15>; +def PPR_3b : PPRClass<7>; // Restricted 3 bit SVE predicate register class. 
+ +class PPRAsmOperand : AsmOperandClass { let Name = "SVE" # name # "Reg"; let PredicateMethod = "isSVEVectorRegOfWidth<" - # Width # ", AArch64::PPRRegClassID>"; + # Width # ", " # "AArch64::" # RegClass # "RegClassID>"; let DiagnosticType = "InvalidSVE" # name # "Reg"; let RenderMethod = "addRegOperands"; let ParserMethod = "tryParseSVEPredicateVector"; } -def PPRAsmOpAny : PPRAsmOperand<"PredicateAny", -1>; -def PPRAsmOp8 : PPRAsmOperand<"PredicateB", 8>; -def PPRAsmOp16 : PPRAsmOperand<"PredicateH", 16>; -def PPRAsmOp32 : PPRAsmOperand<"PredicateS", 32>; -def PPRAsmOp64 : PPRAsmOperand<"PredicateD", 64>; +def PPRAsmOpAny : PPRAsmOperand<"PredicateAny", "PPR", -1>; +def PPRAsmOp8 : PPRAsmOperand<"PredicateB", "PPR", 8>; +def PPRAsmOp16 : PPRAsmOperand<"PredicateH", "PPR", 16>; +def PPRAsmOp32 : PPRAsmOperand<"PredicateS", "PPR", 32>; +def PPRAsmOp64 : PPRAsmOperand<"PredicateD", "PPR", 64>; def PPRAny : PPRRegOp<"", PPRAsmOpAny, PPR>; def PPR8 : PPRRegOp<"b", PPRAsmOp8, PPR>; @@ -784,6 +788,18 @@ def PPR16 : PPRRegOp<"h", PPRAsmOp16, PPR>; def PPR32 : PPRRegOp<"s", PPRAsmOp32, PPR>; def PPR64 : PPRRegOp<"d", PPRAsmOp64, PPR>; +def PPRAsmOp3bAny : PPRAsmOperand<"Predicate3bAny", "PPR_3b", -1>; +def PPRAsmOp3b8 : PPRAsmOperand<"Predicate3bB", "PPR_3b", 8>; +def PPRAsmOp3b16 : PPRAsmOperand<"Predicate3bH", "PPR_3b", 16>; +def PPRAsmOp3b32 : PPRAsmOperand<"Predicate3bS", "PPR_3b", 32>; +def PPRAsmOp3b64 : PPRAsmOperand<"Predicate3bD", "PPR_3b", 64>; + +def PPR3bAny : PPRRegOp<"", PPRAsmOp3bAny, PPR_3b>; +def PPR3b8 : PPRRegOp<"b", PPRAsmOp3b8, PPR_3b>; +def PPR3b16 : PPRRegOp<"h", PPRAsmOp3b16, PPR_3b>; +def PPR3b32 : PPRRegOp<"s", PPRAsmOp3b32, PPR_3b>; +def PPR3b64 : PPRRegOp<"d", PPRAsmOp3b64, PPR_3b>; + //****************************************************************************** // SVE vector register class diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 0e6ad944c141..5d00dc58a5ab 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -136,7 +136,7 @@ static cl::opt static cl::opt EnableGlobalISelAtO( "aarch64-enable-global-isel-at-O", cl::Hidden, cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"), - cl::init(-1)); + cl::init(0)); static cl::opt EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix", cl::init(true), cl::Hidden); diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 6e63783e5646..ac9ff51f69f1 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -819,6 +819,10 @@ class AArch64Operand : public MCParsedAsmOperand { } bool isReg() const override { + return Kind == k_Register; + } + + bool isScalarReg() const { return Kind == k_Register && Reg.Kind == RegKind::Scalar; } @@ -839,6 +843,7 @@ class AArch64Operand : public MCParsedAsmOperand { RK = RegKind::SVEDataVector; break; case AArch64::PPRRegClassID: + case AArch64::PPR_3bRegClassID: RK = RegKind::SVEPredicateVector; break; default: @@ -3148,7 +3153,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, return true; if (Operands.size() < 2 || - !static_cast(*Operands[1]).isReg()) + !static_cast(*Operands[1]).isScalarReg()) return Error(Loc, "Only valid when first operand is register"); bool IsXReg = @@ -3648,6 +3653,12 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, case Match_InvalidSVEPredicateSReg: 
case Match_InvalidSVEPredicateDReg: return Error(Loc, "invalid predicate register."); + case Match_InvalidSVEPredicate3bAnyReg: + case Match_InvalidSVEPredicate3bBReg: + case Match_InvalidSVEPredicate3bHReg: + case Match_InvalidSVEPredicate3bSReg: + case Match_InvalidSVEPredicate3bDReg: + return Error(Loc, "restricted predicate has range [0, 7]."); default: llvm_unreachable("unexpected error code!"); } @@ -3670,7 +3681,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (NumOperands == 4 && Tok == "lsl") { AArch64Operand &Op2 = static_cast(*Operands[2]); AArch64Operand &Op3 = static_cast(*Operands[3]); - if (Op2.isReg() && Op3.isImm()) { + if (Op2.isScalarReg() && Op3.isImm()) { const MCConstantExpr *Op3CE = dyn_cast(Op3.getImm()); if (Op3CE) { uint64_t Op3Val = Op3CE->getValue(); @@ -3702,7 +3713,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, AArch64Operand LSBOp = static_cast(*Operands[2]); AArch64Operand WidthOp = static_cast(*Operands[3]); - if (Op1.isReg() && LSBOp.isImm() && WidthOp.isImm()) { + if (Op1.isScalarReg() && LSBOp.isImm() && WidthOp.isImm()) { const MCConstantExpr *LSBCE = dyn_cast(LSBOp.getImm()); const MCConstantExpr *WidthCE = dyn_cast(WidthOp.getImm()); @@ -3758,7 +3769,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, AArch64Operand &Op3 = static_cast(*Operands[3]); AArch64Operand &Op4 = static_cast(*Operands[4]); - if (Op1.isReg() && Op3.isImm() && Op4.isImm()) { + if (Op1.isScalarReg() && Op3.isImm() && Op4.isImm()) { const MCConstantExpr *Op3CE = dyn_cast(Op3.getImm()); const MCConstantExpr *Op4CE = dyn_cast(Op4.getImm()); @@ -3822,7 +3833,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, AArch64Operand &Op3 = static_cast(*Operands[3]); AArch64Operand &Op4 = static_cast(*Operands[4]); - if (Op1.isReg() && Op3.isImm() && Op4.isImm()) { + if (Op1.isScalarReg() && Op3.isImm() && Op4.isImm()) { const MCConstantExpr *Op3CE = dyn_cast(Op3.getImm()); const MCConstantExpr *Op4CE = dyn_cast(Op4.getImm()); @@ -3901,7 +3912,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // The source register can be Wn here, but the matcher expects a // GPR64. Twiddle it here if necessary. AArch64Operand &Op = static_cast(*Operands[2]); - if (Op.isReg()) { + if (Op.isScalarReg()) { unsigned Reg = getXRegFromWReg(Op.getReg()); Operands[2] = AArch64Operand::CreateReg(Reg, RegKind::Scalar, Op.getStartLoc(), Op.getEndLoc(), @@ -3911,13 +3922,13 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // FIXME: Likewise for sxt[bh] with a Xd dst operand else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) { AArch64Operand &Op = static_cast(*Operands[1]); - if (Op.isReg() && + if (Op.isScalarReg() && AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( Op.getReg())) { // The source register can be Wn here, but the matcher expects a // GPR64. Twiddle it here if necessary. 
AArch64Operand &Op = static_cast(*Operands[2]); - if (Op.isReg()) { + if (Op.isScalarReg()) { unsigned Reg = getXRegFromWReg(Op.getReg()); Operands[2] = AArch64Operand::CreateReg(Reg, RegKind::Scalar, Op.getStartLoc(), @@ -3928,13 +3939,13 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // FIXME: Likewise for uxt[bh] with a Xd dst operand else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) { AArch64Operand &Op = static_cast(*Operands[1]); - if (Op.isReg() && + if (Op.isScalarReg() && AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( Op.getReg())) { // The source register can be Wn here, but the matcher expects a // GPR32. Twiddle it here if necessary. AArch64Operand &Op = static_cast(*Operands[1]); - if (Op.isReg()) { + if (Op.isScalarReg()) { unsigned Reg = getWRegFromXReg(Op.getReg()); Operands[1] = AArch64Operand::CreateReg(Reg, RegKind::Scalar, Op.getStartLoc(), @@ -4077,6 +4088,11 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidSVEPredicateHReg: case Match_InvalidSVEPredicateSReg: case Match_InvalidSVEPredicateDReg: + case Match_InvalidSVEPredicate3bAnyReg: + case Match_InvalidSVEPredicate3bBReg: + case Match_InvalidSVEPredicate3bHReg: + case Match_InvalidSVEPredicate3bSReg: + case Match_InvalidSVEPredicate3bDReg: case Match_MSR: case Match_MRS: { if (ErrorInfo >= Operands.size()) diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index ae278caeda69..30438a159fbc 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -91,6 +91,9 @@ static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decode); +LLVM_ATTRIBUTE_UNUSED static DecodeStatus +DecodePPR_3bRegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, + const void *Decode); static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm, uint64_t Address, @@ -481,6 +484,16 @@ static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo, return Success; } +static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void* Decoder) { + if (RegNo > 7) + return Fail; + + // Just reuse the PPR decode table + return DecodePPRRegisterClass(Inst, RegNo, Addr, Decoder); +} + static const unsigned VectorDecoderTable[] = { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7, AArch64::Q8, AArch64::Q9, diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 7b33b4b5b542..4d1d3fd57353 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -605,10 +605,10 @@ class COFFAArch64AsmBackend : public AArch64AsmBackend { } MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TheTriple, - StringRef CPU, const MCTargetOptions &Options) { + const Triple &TheTriple = STI.getTargetTriple(); if (TheTriple.isOSBinFormatMachO()) return new DarwinAArch64AsmBackend(T, TheTriple, MRI); @@ -624,10 +624,10 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, } MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T, + 
const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TheTriple, - StringRef CPU, const MCTargetOptions &Options) { + const Triple &TheTriple = STI.getTargetTriple(); assert(TheTriple.isOSBinFormatELF() && "Big endian is only supported for ELF targets!"); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index b9e1673b9317..a5720e0e8b87 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -45,12 +45,12 @@ MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createAArch64leAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options); MCAsmBackend *createAArch64beAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options); std::unique_ptr diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index ebf656c549ec..2e3a453f9c75 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -815,6 +815,10 @@ class KernelScopeInfo { class AMDGPUAsmParser : public MCTargetAsmParser { MCAsmParser &Parser; + // Number of extra operands parsed after the first optional operand. + // This may be necessary to skip hardcoded mandatory operands. + static const unsigned MAX_OPR_LOOKAHEAD = 1; + unsigned ForcedEncodingSize = 0; bool ForcedDPP = false; bool ForcedSDWA = false; @@ -1037,6 +1041,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { public: OperandMatchResultTy parseOptionalOperand(OperandVector &Operands); + OperandMatchResultTy parseOptionalOpr(OperandVector &Operands); OperandMatchResultTy parseExpTgt(OperandVector &Operands); OperandMatchResultTy parseSendMsgOp(OperandVector &Operands); @@ -3859,7 +3864,7 @@ AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) { } else { // Swizzle "offset" operand is optional. // If it is omitted, try parsing other optional operands. - return parseOptionalOperand(Operands); + return parseOptionalOpr(Operands); } } @@ -4179,6 +4184,39 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { }; OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) { + unsigned size = Operands.size(); + assert(size > 0); + + OperandMatchResultTy res = parseOptionalOpr(Operands); + + // This is a hack to enable hardcoded mandatory operands which follow + // optional operands. + // + // Current design assumes that all operands after the first optional operand + // are also optional. However implementation of some instructions violates + // this rule (see e.g. flat/global atomic which have hardcoded 'glc' operands). + // + // To alleviate this problem, we have to (implicitly) parse extra operands + // to make sure autogenerated parser of custom operands never hit hardcoded + // mandatory operands. + + if (size == 1 || ((AMDGPUOperand &)*Operands[size - 1]).isRegKind()) { + + // We have parsed the first optional operand. + // Parse as many operands as necessary to skip all mandatory operands. 
+ + for (unsigned i = 0; i < MAX_OPR_LOOKAHEAD; ++i) { + if (res != MatchOperand_Success || + getLexer().is(AsmToken::EndOfStatement)) break; + if (getLexer().is(AsmToken::Comma)) Parser.Lex(); + res = parseOptionalOpr(Operands); + } + } + + return res; +} + +OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands) { OperandMatchResultTy res; for (const OptionalOperand &Op : AMDGPUOptionalOperandTable) { // try to parse any optional operand here diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 778d4a7ba9d0..d700acc34bc9 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -198,9 +198,9 @@ class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { } // end anonymous namespace MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options) { // Use 64-bit ELF for amdgcn - return new ELFAMDGPUAsmBackend(T, TT); + return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple()); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index 0b3563303ad0..1173dfd437ca 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -45,8 +45,9 @@ MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); -MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, +MCAsmBackend *createAMDGPUAsmBackend(const Target &T, + const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, const MCTargetOptions &Options); std::unique_ptr diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td index 30a2df510386..651265fc54d5 100644 --- a/lib/Target/AMDGPU/MIMGInstructions.td +++ b/lib/Target/AMDGPU/MIMGInstructions.td @@ -71,9 +71,9 @@ class MIMG_Store_Helper op, string asm, r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da", dns>, MIMGe { let ssamp = 0; - let mayLoad = 1; // TableGen requires this for matching with the intrinsics + let mayLoad = 0; let mayStore = 1; - let hasSideEffects = 1; + let hasSideEffects = 0; let hasPostISelHook = 0; let DisableWQM = 1; } @@ -103,10 +103,10 @@ class MIMG_Atomic_Helper { + asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"> { + let mayLoad = 1; let mayStore = 1; - let hasSideEffects = 1; + let hasSideEffects = 1; // FIXME: Remove this let hasPostISelHook = 0; let DisableWQM = 1; let Constraints = "$vdst = $vdata"; diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 50ee88fa635a..415d8a512aa8 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -575,6 +575,221 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } + + // Image load. + case Intrinsic::amdgcn_image_load: + case Intrinsic::amdgcn_image_load_mip: + + // Sample. 
+ case Intrinsic::amdgcn_image_sample: + case Intrinsic::amdgcn_image_sample_cl: + case Intrinsic::amdgcn_image_sample_d: + case Intrinsic::amdgcn_image_sample_d_cl: + case Intrinsic::amdgcn_image_sample_l: + case Intrinsic::amdgcn_image_sample_b: + case Intrinsic::amdgcn_image_sample_b_cl: + case Intrinsic::amdgcn_image_sample_lz: + case Intrinsic::amdgcn_image_sample_cd: + case Intrinsic::amdgcn_image_sample_cd_cl: + + // Sample with comparison. + case Intrinsic::amdgcn_image_sample_c: + case Intrinsic::amdgcn_image_sample_c_cl: + case Intrinsic::amdgcn_image_sample_c_d: + case Intrinsic::amdgcn_image_sample_c_d_cl: + case Intrinsic::amdgcn_image_sample_c_l: + case Intrinsic::amdgcn_image_sample_c_b: + case Intrinsic::amdgcn_image_sample_c_b_cl: + case Intrinsic::amdgcn_image_sample_c_lz: + case Intrinsic::amdgcn_image_sample_c_cd: + case Intrinsic::amdgcn_image_sample_c_cd_cl: + + // Sample with offsets. + case Intrinsic::amdgcn_image_sample_o: + case Intrinsic::amdgcn_image_sample_cl_o: + case Intrinsic::amdgcn_image_sample_d_o: + case Intrinsic::amdgcn_image_sample_d_cl_o: + case Intrinsic::amdgcn_image_sample_l_o: + case Intrinsic::amdgcn_image_sample_b_o: + case Intrinsic::amdgcn_image_sample_b_cl_o: + case Intrinsic::amdgcn_image_sample_lz_o: + case Intrinsic::amdgcn_image_sample_cd_o: + case Intrinsic::amdgcn_image_sample_cd_cl_o: + + // Sample with comparison and offsets. + case Intrinsic::amdgcn_image_sample_c_o: + case Intrinsic::amdgcn_image_sample_c_cl_o: + case Intrinsic::amdgcn_image_sample_c_d_o: + case Intrinsic::amdgcn_image_sample_c_d_cl_o: + case Intrinsic::amdgcn_image_sample_c_l_o: + case Intrinsic::amdgcn_image_sample_c_b_o: + case Intrinsic::amdgcn_image_sample_c_b_cl_o: + case Intrinsic::amdgcn_image_sample_c_lz_o: + case Intrinsic::amdgcn_image_sample_c_cd_o: + case Intrinsic::amdgcn_image_sample_c_cd_cl_o: + + // Basic gather4 + case Intrinsic::amdgcn_image_gather4: + case Intrinsic::amdgcn_image_gather4_cl: + case Intrinsic::amdgcn_image_gather4_l: + case Intrinsic::amdgcn_image_gather4_b: + case Intrinsic::amdgcn_image_gather4_b_cl: + case Intrinsic::amdgcn_image_gather4_lz: + + // Gather4 with comparison + case Intrinsic::amdgcn_image_gather4_c: + case Intrinsic::amdgcn_image_gather4_c_cl: + case Intrinsic::amdgcn_image_gather4_c_l: + case Intrinsic::amdgcn_image_gather4_c_b: + case Intrinsic::amdgcn_image_gather4_c_b_cl: + case Intrinsic::amdgcn_image_gather4_c_lz: + + // Gather4 with offsets + case Intrinsic::amdgcn_image_gather4_o: + case Intrinsic::amdgcn_image_gather4_cl_o: + case Intrinsic::amdgcn_image_gather4_l_o: + case Intrinsic::amdgcn_image_gather4_b_o: + case Intrinsic::amdgcn_image_gather4_b_cl_o: + case Intrinsic::amdgcn_image_gather4_lz_o: + + // Gather4 with comparison and offsets + case Intrinsic::amdgcn_image_gather4_c_o: + case Intrinsic::amdgcn_image_gather4_c_cl_o: + case Intrinsic::amdgcn_image_gather4_c_l_o: + case Intrinsic::amdgcn_image_gather4_c_b_o: + case Intrinsic::amdgcn_image_gather4_c_b_cl_o: + case Intrinsic::amdgcn_image_gather4_c_lz_o: { + SIMachineFunctionInfo *MFI = MF.getInfo(); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = MFI->getImagePSV( + *MF.getSubtarget().getInstrInfo(), + CI.getArgOperand(1)); + Info.align = 0; + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MODereferenceable; + return true; + } + case Intrinsic::amdgcn_image_store: + case Intrinsic::amdgcn_image_store_mip: { + SIMachineFunctionInfo *MFI = MF.getInfo(); + Info.opc = 
ISD::INTRINSIC_VOID; + Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType()); + Info.ptrVal = MFI->getImagePSV( + *MF.getSubtarget().getInstrInfo(), + CI.getArgOperand(2)); + Info.flags = MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable; + Info.align = 0; + return true; + } + case Intrinsic::amdgcn_image_atomic_swap: + case Intrinsic::amdgcn_image_atomic_add: + case Intrinsic::amdgcn_image_atomic_sub: + case Intrinsic::amdgcn_image_atomic_smin: + case Intrinsic::amdgcn_image_atomic_umin: + case Intrinsic::amdgcn_image_atomic_smax: + case Intrinsic::amdgcn_image_atomic_umax: + case Intrinsic::amdgcn_image_atomic_and: + case Intrinsic::amdgcn_image_atomic_or: + case Intrinsic::amdgcn_image_atomic_xor: + case Intrinsic::amdgcn_image_atomic_inc: + case Intrinsic::amdgcn_image_atomic_dec: { + SIMachineFunctionInfo *MFI = MF.getInfo(); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = MFI->getImagePSV( + *MF.getSubtarget().getInstrInfo(), + CI.getArgOperand(2)); + + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable; + + // XXX - Should this be volatile without known ordering? + Info.flags |= MachineMemOperand::MOVolatile; + return true; + } + case Intrinsic::amdgcn_image_atomic_cmpswap: { + SIMachineFunctionInfo *MFI = MF.getInfo(); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = MFI->getImagePSV( + *MF.getSubtarget().getInstrInfo(), + CI.getArgOperand(3)); + + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable; + + // XXX - Should this be volatile without known ordering? + Info.flags |= MachineMemOperand::MOVolatile; + return true; + } + case Intrinsic::amdgcn_tbuffer_load: + case Intrinsic::amdgcn_buffer_load: + case Intrinsic::amdgcn_buffer_load_format: { + SIMachineFunctionInfo *MFI = MF.getInfo(); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = MFI->getBufferPSV( + *MF.getSubtarget().getInstrInfo(), + CI.getArgOperand(0)); + Info.memVT = MVT::getVT(CI.getType()); + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MODereferenceable; + + // There is a constant offset component, but there are additional register + // offsets which could break AA if we set the offset to anything non-0. 
+ return true; + } + case Intrinsic::amdgcn_tbuffer_store: + case Intrinsic::amdgcn_buffer_store: + case Intrinsic::amdgcn_buffer_store_format: { + SIMachineFunctionInfo *MFI = MF.getInfo(); + Info.opc = ISD::INTRINSIC_VOID; + Info.ptrVal = MFI->getBufferPSV( + *MF.getSubtarget().getInstrInfo(), + CI.getArgOperand(1)); + Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType()); + Info.flags = MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable; + return true; + } + case Intrinsic::amdgcn_buffer_atomic_swap: + case Intrinsic::amdgcn_buffer_atomic_add: + case Intrinsic::amdgcn_buffer_atomic_sub: + case Intrinsic::amdgcn_buffer_atomic_smin: + case Intrinsic::amdgcn_buffer_atomic_umin: + case Intrinsic::amdgcn_buffer_atomic_smax: + case Intrinsic::amdgcn_buffer_atomic_umax: + case Intrinsic::amdgcn_buffer_atomic_and: + case Intrinsic::amdgcn_buffer_atomic_or: + case Intrinsic::amdgcn_buffer_atomic_xor: { + SIMachineFunctionInfo *MFI = MF.getInfo(); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = MFI->getBufferPSV( + *MF.getSubtarget().getInstrInfo(), + CI.getArgOperand(1)); + Info.memVT = MVT::getVT(CI.getType()); + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOVolatile; + return true; + } + case Intrinsic::amdgcn_buffer_atomic_cmpswap: { + SIMachineFunctionInfo *MFI = MF.getInfo(); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = MFI->getBufferPSV( + *MF.getSubtarget().getInstrInfo(), + CI.getArgOperand(2)); + Info.memVT = MVT::getVT(CI.getType()); + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOVolatile; + return true; + } default: return false; } @@ -2946,24 +3161,12 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( SIMachineFunctionInfo *MFI = MF->getInfo(); if (TII->isMIMG(MI)) { - if (!MI.memoperands_empty()) - return BB; + if (MI.memoperands_empty() && MI.mayLoadOrStore()) { + report_fatal_error("missing mem operand from MIMG instruction"); + } // Add a memoperand for mimg instructions so that they aren't assumed to // be ordered memory instuctions. - MachinePointerInfo PtrInfo(MFI->getImagePSV()); - MachineMemOperand::Flags Flags = MachineMemOperand::MODereferenceable; - if (MI.mayStore()) - Flags |= MachineMemOperand::MOStore; - - if (MI.mayLoad()) - Flags |= MachineMemOperand::MOLoad; - - if (Flags != MachineMemOperand::MODereferenceable) { - auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0); - MI.addMemOperand(*MF, MMO); - } - return BB; } @@ -4257,7 +4460,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntrID = cast(Op.getOperand(1))->getZExtValue(); SDLoc DL(Op); - MachineFunction &MF = DAG.getMachineFunction(); switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: @@ -4284,21 +4486,18 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // glc Op.getOperand(6) // slc }; - SIMachineFunctionInfo *MFI = MF.getInfo(); unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? 
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; EVT VT = Op.getValueType(); EVT IntVT = VT.changeTypeToInteger(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(MFI->getBufferPSV()), - MachineMemOperand::MOLoad, - VT.getStoreSize(), VT.getStoreSize()); - - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO); + auto *M = cast(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, + M->getMemOperand()); } case Intrinsic::amdgcn_tbuffer_load: { + MemSDNode *M = cast(Op); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // rsrc @@ -4312,14 +4511,10 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(10) // slc }; - EVT VT = Op.getOperand(2).getValueType(); + EVT VT = Op.getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad, - VT.getStoreSize(), VT.getStoreSize()); return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); + Op->getVTList(), Ops, VT, M->getMemOperand()); } case Intrinsic::amdgcn_buffer_atomic_swap: case Intrinsic::amdgcn_buffer_atomic_add: @@ -4339,14 +4534,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // offset Op.getOperand(6) // slc }; - EVT VT = Op.getOperand(3).getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOVolatile, - VT.getStoreSize(), 4); + EVT VT = Op.getValueType(); + + auto *M = cast(Op); unsigned Opcode = 0; switch (IntrID) { @@ -4384,7 +4574,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, llvm_unreachable("unhandled atomic opcode"); } - return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO); + return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, + M->getMemOperand()); } case Intrinsic::amdgcn_buffer_atomic_cmpswap: { @@ -4397,17 +4588,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(6), // offset Op.getOperand(7) // slc }; - EVT VT = Op.getOperand(4).getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOVolatile, - VT.getStoreSize(), 4); + EVT VT = Op.getValueType(); + auto *M = cast(Op); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, - Op->getVTList(), Ops, VT, MMO); + Op->getVTList(), Ops, VT, M->getMemOperand()); } // Basic sample. 
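A minimal sketch of the pattern the SIISelLowering hunks above converge on (illustration only; the identifiers are taken from those hunks, nothing below is new code in the patch): getTgtMemIntrinsic() now describes each image/buffer intrinsic so SelectionDAGBuilder attaches a MachineMemOperand when the node is built, and LowerINTRINSIC_W_CHAIN reuses that operand instead of fabricating one.
// Sketch of the per-intrinsic case added to getTgtMemIntrinsic():
//   Info.opc    = ISD::INTRINSIC_W_CHAIN;              // or INTRINSIC_VOID for stores
//   Info.memVT  = MVT::getVT(CI.getType());            // value actually read/written
//   Info.ptrVal = MFI->getBufferPSV(TII, RsrcOperand);  // one pseudo source value per rsrc
//   Info.flags  = MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
//   return true;
// Sketch of how the custom lowering then picks the operand back up:
//   auto *M = cast<MemSDNode>(Op);
//   return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
//                                  Op.getValueType(), M->getMemOperand());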
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 6013ebc81d9f..888d8f978aff 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -28,8 +28,6 @@ using namespace llvm; SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), - BufferPSV(*(MF.getSubtarget().getInstrInfo())), - ImagePSV(*(MF.getSubtarget().getInstrInfo())), PrivateSegmentBuffer(false), DispatchPtr(false), QueuePtr(false), diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 5dde72910ee3..02e63f0258e6 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -34,12 +34,14 @@ namespace llvm { class MachineFrameInfo; class MachineFunction; +class SIInstrInfo; class TargetRegisterClass; class AMDGPUImagePseudoSourceValue : public PseudoSourceValue { public: + // TODO: Is the img rsrc useful? explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII) : - PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) { } + PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) {} bool isConstant(const MachineFrameInfo *) const override { // This should probably be true for most images, but we will start by being @@ -135,8 +137,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { // Stack object indices for work item IDs. std::array DebuggerWorkItemIDStackObjectIndices = {{0, 0, 0}}; - AMDGPUBufferPseudoSourceValue BufferPSV; - AMDGPUImagePseudoSourceValue ImagePSV; + DenseMap> BufferPSVs; + DenseMap> ImagePSVs; private: unsigned LDSWaveSpillSize = 0; @@ -629,12 +633,22 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { return LDSWaveSpillSize; } - const AMDGPUBufferPseudoSourceValue *getBufferPSV() const { - return &BufferPSV; + const AMDGPUBufferPseudoSourceValue *getBufferPSV(const SIInstrInfo &TII, + const Value *BufferRsrc) { + assert(BufferRsrc); + auto PSV = BufferPSVs.try_emplace( + BufferRsrc, + llvm::make_unique(TII)); + return PSV.first->second.get(); } - const AMDGPUImagePseudoSourceValue *getImagePSV() const { - return &ImagePSV; + const AMDGPUImagePseudoSourceValue *getImagePSV(const SIInstrInfo &TII, + const Value *ImgRsrc) { + assert(ImgRsrc); + auto PSV = ImagePSVs.try_emplace( + ImgRsrc, + llvm::make_unique(TII)); + return PSV.first->second.get(); } }; diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 1cb9dd44f789..ff507ab7162f 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -172,8 +172,8 @@ void ARMAsmBackend::handleAssemblerFlag(MCAssemblerFlag Flag) { } unsigned ARMAsmBackend::getRelaxedOpcode(unsigned Op) const { - bool HasThumb2 = STI->getFeatureBits()[ARM::FeatureThumb2]; - bool HasV8MBaselineOps = STI->getFeatureBits()[ARM::HasV8MBaselineOps]; + bool HasThumb2 = STI.getFeatureBits()[ARM::FeatureThumb2]; + bool HasV8MBaselineOps = STI.getFeatureBits()[ARM::HasV8MBaselineOps]; switch (Op) { default: @@ -389,7 +389,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, case FK_SecRel_4: return Value; case ARM::fixup_arm_movt_hi16: - if (IsResolved || !STI->getTargetTriple().isOSBinFormatELF()) + if (IsResolved || !STI.getTargetTriple().isOSBinFormatELF()) Value >>= 16; LLVM_FALLTHROUGH; case ARM::fixup_arm_movw_lo16: { @@ -401,7 +401,7 @@ unsigned 
ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, return Value; } case ARM::fixup_t2_movt_hi16: - if (IsResolved || !STI->getTargetTriple().isOSBinFormatELF()) + if (IsResolved || !STI.getTargetTriple().isOSBinFormatELF()) Value >>= 16; LLVM_FALLTHROUGH; case ARM::fixup_t2_movw_lo16: { @@ -591,7 +591,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, case ARM::fixup_arm_thumb_cp: // On CPUs supporting Thumb2, this will be relaxed to an ldr.w, otherwise we // could have an error on our hands. - if (!STI->getFeatureBits()[ARM::FeatureThumb2] && IsResolved) { + if (!STI.getFeatureBits()[ARM::FeatureThumb2] && IsResolved) { const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value); if (FixupDiagnostic) { Ctx.reportError(Fixup.getLoc(), FixupDiagnostic); @@ -615,8 +615,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, } case ARM::fixup_arm_thumb_br: // Offset by 4 and don't encode the lower bit, which is always 0. - if (!STI->getFeatureBits()[ARM::FeatureThumb2] && - !STI->getFeatureBits()[ARM::HasV8MBaselineOps]) { + if (!STI.getFeatureBits()[ARM::FeatureThumb2] && + !STI.getFeatureBits()[ARM::HasV8MBaselineOps]) { const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value); if (FixupDiagnostic) { Ctx.reportError(Fixup.getLoc(), FixupDiagnostic); @@ -626,7 +626,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, return ((Value - 4) >> 1) & 0x7ff; case ARM::fixup_arm_thumb_bcc: // Offset by 4 and don't encode the lower bit, which is always 0. - if (!STI->getFeatureBits()[ARM::FeatureThumb2]) { + if (!STI.getFeatureBits()[ARM::FeatureThumb2]) { const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value); if (FixupDiagnostic) { Ctx.reportError(Fixup.getLoc(), FixupDiagnostic); @@ -1154,51 +1154,52 @@ static MachO::CPUSubTypeARM getMachOSubTypeFromArch(StringRef Arch) { } MCAsmBackend *llvm::createARMAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TheTriple, StringRef CPU, const MCTargetOptions &Options, bool isLittle) { + const Triple &TheTriple = STI.getTargetTriple(); switch (TheTriple.getObjectFormat()) { default: llvm_unreachable("unsupported object format"); case Triple::MachO: { MachO::CPUSubTypeARM CS = getMachOSubTypeFromArch(TheTriple.getArchName()); - return new ARMAsmBackendDarwin(T, TheTriple, MRI, CS); + return new ARMAsmBackendDarwin(T, STI, MRI, CS); } case Triple::COFF: assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported"); - return new ARMAsmBackendWinCOFF(T, TheTriple); + return new ARMAsmBackendWinCOFF(T, STI); case Triple::ELF: assert(TheTriple.isOSBinFormatELF() && "using ELF for non-ELF target"); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); - return new ARMAsmBackendELF(T, TheTriple, OSABI, isLittle); + return new ARMAsmBackendELF(T, STI, OSABI, isLittle); } } MCAsmBackend *llvm::createARMLEAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options) { - return createARMAsmBackend(T, MRI, TT, CPU, Options, true); + return createARMAsmBackend(T, STI, MRI, Options, true); } MCAsmBackend *llvm::createARMBEAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options) { - return createARMAsmBackend(T, MRI, TT, CPU, Options, false); + return createARMAsmBackend(T, STI, MRI, Options, false); } MCAsmBackend 
*llvm::createThumbLEAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options) { - return createARMAsmBackend(T, MRI, TT, CPU, Options, true); + return createARMAsmBackend(T, STI, MRI, Options, true); } MCAsmBackend *llvm::createThumbBEAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options) { - return createARMAsmBackend(T, MRI, TT, CPU, Options, false); + return createARMAsmBackend(T, STI, MRI, Options, false); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index 02374966dafe..c8527e5cca20 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -19,22 +19,20 @@ namespace llvm { class ARMAsmBackend : public MCAsmBackend { - const MCSubtargetInfo *STI; + const MCSubtargetInfo &STI; bool isThumbMode; // Currently emitting Thumb code. bool IsLittleEndian; // Big or little endian. public: - ARMAsmBackend(const Target &T, const Triple &TT, bool IsLittle) - : MCAsmBackend(), STI(ARM_MC::createARMMCSubtargetInfo(TT, "", "")), - isThumbMode(TT.getArchName().startswith("thumb")), + ARMAsmBackend(const Target &T, const MCSubtargetInfo &STI, bool IsLittle) + : MCAsmBackend(), STI(STI), + isThumbMode(STI.getTargetTriple().isThumb()), IsLittleEndian(IsLittle) {} - ~ARMAsmBackend() override { delete STI; } - unsigned getNumFixupKinds() const override { return ARM::NumTargetFixupKinds; } - bool hasNOP() const { return STI->getFeatureBits()[ARM::HasV6T2Ops]; } + bool hasNOP() const { return STI.getFeatureBits()[ARM::HasV6T2Ops]; } const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h index f05e3a6f1160..19e3fdb72046 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h @@ -19,10 +19,10 @@ class ARMAsmBackendDarwin : public ARMAsmBackend { const MCRegisterInfo &MRI; public: const MachO::CPUSubTypeARM Subtype; - ARMAsmBackendDarwin(const Target &T, const Triple &TT, + ARMAsmBackendDarwin(const Target &T, const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, MachO::CPUSubTypeARM st) - : ARMAsmBackend(T, TT, /* IsLittleEndian */ true), MRI(MRI), Subtype(st) { - } + : ARMAsmBackend(T, STI, /* IsLittleEndian */ true), MRI(MRI), + Subtype(st) {} std::unique_ptr createObjectWriter(raw_pwrite_stream &OS) const override { diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h index d0f5419a1b0f..361ea3040847 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h @@ -20,9 +20,9 @@ namespace { class ARMAsmBackendELF : public ARMAsmBackend { public: uint8_t OSABI; - ARMAsmBackendELF(const Target &T, const Triple &TT, uint8_t OSABI, + ARMAsmBackendELF(const Target &T, const MCSubtargetInfo &STI, uint8_t OSABI, bool IsLittle) - : ARMAsmBackend(T, TT, IsLittle), OSABI(OSABI) {} + : ARMAsmBackend(T, STI, IsLittle), OSABI(OSABI) {} std::unique_ptr createObjectWriter(raw_pwrite_stream &OS) const override { diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h index 53b9c29446a3..0ac6d4270aac 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h +++ 
b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h @@ -17,8 +17,8 @@ using namespace llvm; namespace { class ARMAsmBackendWinCOFF : public ARMAsmBackend { public: - ARMAsmBackendWinCOFF(const Target &T, const Triple &TheTriple) - : ARMAsmBackend(T, TheTriple, true) {} + ARMAsmBackendWinCOFF(const Target &T, const MCSubtargetInfo &STI) + : ARMAsmBackend(T, STI, true) {} std::unique_ptr createObjectWriter(raw_pwrite_stream &OS) const override { return createARMWinCOFFObjectWriter(OS, /*Is64Bit=*/false); diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h index 0fb97e5fee97..df9874c78d07 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -68,27 +68,27 @@ MCCodeEmitter *createARMBEMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); -MCAsmBackend *createARMAsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, +MCAsmBackend *createARMAsmBackend(const Target &T, const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, const MCTargetOptions &Options, bool IsLittleEndian); -MCAsmBackend *createARMLEAsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, +MCAsmBackend *createARMLEAsmBackend(const Target &T, const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, const MCTargetOptions &Options); -MCAsmBackend *createARMBEAsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, +MCAsmBackend *createARMBEAsmBackend(const Target &T, const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, const MCTargetOptions &Options); MCAsmBackend *createThumbLEAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options); MCAsmBackend *createThumbBEAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options); // Construct a PE/COFF machine code streamer which will generate a PE/COFF diff --git a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp index 2d9dd4f8f83f..2f5e9f02e53c 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp +++ b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp @@ -476,10 +476,10 @@ bool AVRAsmBackend::shouldForceRelocation(const MCAssembler &Asm, } } -MCAsmBackend *createAVRAsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, +MCAsmBackend *createAVRAsmBackend(const Target &T, const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, const llvm::MCTargetOptions &TO) { - return new AVRAsmBackend(TT.getOS()); + return new AVRAsmBackend(STI.getTargetTriple().getOS()); } } // end of namespace llvm diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h b/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h index 5615fd72e456..fcfd8cf82292 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h +++ b/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h @@ -26,6 +26,7 @@ class MCContext; class MCInstrInfo; class MCObjectWriter; class MCRegisterInfo; +class MCSubtargetInfo; class MCTargetOptions; class StringRef; class Target; @@ -42,8 +43,8 @@ MCCodeEmitter *createAVRMCCodeEmitter(const MCInstrInfo &MCII, MCContext &Ctx); /// Creates an assembly backend for AVR. 
-MCAsmBackend *createAVRAsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, +MCAsmBackend *createAVRAsmBackend(const Target &T, const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, const llvm::MCTargetOptions &TO); /// Creates an ELF object writer for AVR. diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp index e6ea92e08364..6593d9d018fd 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp @@ -104,15 +104,15 @@ BPFAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const { } MCAsmBackend *llvm::createBPFAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, - const MCTargetOptions&) { + const MCTargetOptions &) { return new BPFAsmBackend(/*IsLittleEndian=*/true); } MCAsmBackend *llvm::createBPFbeAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, - const MCTargetOptions&) { + const MCTargetOptions &) { return new BPFAsmBackend(/*IsLittleEndian=*/false); } diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h index 6466042f6929..a6dac3abca02 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h +++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h @@ -45,11 +45,11 @@ MCCodeEmitter *createBPFbeMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); -MCAsmBackend *createBPFAsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, +MCAsmBackend *createBPFAsmBackend(const Target &T, const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, const MCTargetOptions &Options); -MCAsmBackend *createBPFbeAsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, +MCAsmBackend *createBPFbeAsmBackend(const Target &T, const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, const MCTargetOptions &Options); std::unique_ptr createBPFELFObjectWriter(raw_pwrite_stream &OS, diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index fd602257934a..2646d0bcbf47 100644 --- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -1050,14 +1050,11 @@ bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB, // Check if the exit values have types that are no wider than the type // that we want to promote to. 
unsigned DestBW = DestTy->getBitWidth(); - for (Instruction &In : *ExitB) { - PHINode *P = dyn_cast(&In); - if (!P) - break; - if (P->getNumIncomingValues() != 1) + for (PHINode &P : ExitB->phis()) { + if (P.getNumIncomingValues() != 1) return false; - assert(P->getIncomingBlock(0) == LoopB); - IntegerType *T = dyn_cast(P->getType()); + assert(P.getIncomingBlock(0) == LoopB); + IntegerType *T = dyn_cast(P.getType()); if (!T || T->getBitWidth() > DestBW) return false; } diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index cdc2085986a5..98229f4fa64a 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -2925,6 +2925,23 @@ let Predicates = [UseHVX] in { def HexagonVZERO: SDNode<"HexagonISD::VZERO", SDTVecLeaf>; def vzero: PatFrag<(ops), (HexagonVZERO)>; +def VSxtb: OutPatFrag<(ops node:$Vs), + (V6_vshuffvdd (HiVec (V6_vsb $Vs)), + (LoVec (V6_vsb $Vs)), + (A2_tfrsi -2))>; +def VSxth: OutPatFrag<(ops node:$Vs), + (V6_vshuffvdd (HiVec (V6_vsh $Vs)), + (LoVec (V6_vsh $Vs)), + (A2_tfrsi -4))>; +def VZxtb: OutPatFrag<(ops node:$Vs), + (V6_vshuffvdd (HiVec (V6_vzb $Vs)), + (LoVec (V6_vzb $Vs)), + (A2_tfrsi -2))>; +def VZxth: OutPatFrag<(ops node:$Vs), + (V6_vshuffvdd (HiVec (V6_vzh $Vs)), + (LoVec (V6_vzh $Vs)), + (A2_tfrsi -4))>; + let Predicates = [UseHVX] in { def: Pat<(VecI8 vzero), (V6_vd0)>; def: Pat<(VecI16 vzero), (V6_vd0)>; @@ -2970,25 +2987,18 @@ let Predicates = [UseHVX] in { def: Pat<(vselect HQ32:$Qu, HVI32:$Vs, HVI32:$Vt), (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>; - def: Pat<(VecPI16 (sext HVI8:$Vs)), (V6_vsb HvxVR:$Vs)>; - def: Pat<(VecPI32 (sext HVI16:$Vs)), (V6_vsh HvxVR:$Vs)>; - def: Pat<(VecPI16 (zext HVI8:$Vs)), (V6_vzb HvxVR:$Vs)>; - def: Pat<(VecPI32 (zext HVI16:$Vs)), (V6_vzh HvxVR:$Vs)>; + def: Pat<(VecPI16 (sext HVI8:$Vs)), (VSxtb $Vs)>; + def: Pat<(VecPI32 (sext HVI16:$Vs)), (VSxth $Vs)>; + def: Pat<(VecPI16 (zext HVI8:$Vs)), (VZxtb $Vs)>; + def: Pat<(VecPI32 (zext HVI16:$Vs)), (VZxth $Vs)>; - def: Pat<(sext_inreg HVI32:$Vs, v16i16), - (V6_vpackeb (LoVec (V6_vsh HvxVR:$Vs)), - (HiVec (V6_vsh HvxVR:$Vs)))>; - def: Pat<(sext_inreg HVI32:$Vs, v32i16), - (V6_vpackeb (LoVec (V6_vsh HvxVR:$Vs)), - (HiVec (V6_vsh HvxVR:$Vs)))>; - - def: Pat<(VecI16 (sext_invec HVI8:$Vs)), (LoVec (V6_vsb HvxVR:$Vs))>; - def: Pat<(VecI32 (sext_invec HVI16:$Vs)), (LoVec (V6_vsh HvxVR:$Vs))>; + def: Pat<(VecI16 (sext_invec HVI8:$Vs)), (LoVec (VSxtb $Vs))>; + def: Pat<(VecI32 (sext_invec HVI16:$Vs)), (LoVec (VSxth $Vs))>; def: Pat<(VecI32 (sext_invec HVI8:$Vs)), - (LoVec (V6_vsh (LoVec (V6_vsb HvxVR:$Vs))))>; + (LoVec (VSxth (LoVec (VSxtb $Vs))))>; - def: Pat<(VecI16 (zext_invec HVI8:$Vs)), (LoVec (V6_vzb HvxVR:$Vs))>; - def: Pat<(VecI32 (zext_invec HVI16:$Vs)), (LoVec (V6_vzh HvxVR:$Vs))>; + def: Pat<(VecI16 (zext_invec HVI8:$Vs)), (LoVec (VZxtb $Vs))>; + def: Pat<(VecI32 (zext_invec HVI16:$Vs)), (LoVec (VZxth $Vs))>; def: Pat<(VecI32 (zext_invec HVI8:$Vs)), - (LoVec (V6_vzh (LoVec (V6_vzb HvxVR:$Vs))))>; + (LoVec (VZxth (LoVec (VZxtb $Vs))))>; } diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index b3ab6763281c..fe54c19370b3 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -765,11 +765,12 @@ class HexagonAsmBackend : public MCAsmBackend { // MCAsmBackend MCAsmBackend *llvm::createHexagonAsmBackend(Target const &T, - MCRegisterInfo const & /*MRI*/, - const 
Triple &TT, StringRef CPU, - const MCTargetOptions &Options) { + const MCSubtargetInfo &STI, + MCRegisterInfo const & /*MRI*/, + const MCTargetOptions &Options) { + const Triple &TT = STI.getTargetTriple(); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS()); - StringRef CPUString = Hexagon_MC::selectHexagonCPU(CPU); + StringRef CPUString = Hexagon_MC::selectHexagonCPU(STI.getCPU()); return new HexagonAsmBackend(T, TT, OSABI, CPUString); } diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h index 05d17c368dcc..71545a5c02c9 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h @@ -61,8 +61,8 @@ MCCodeEmitter *createHexagonMCCodeEmitter(const MCInstrInfo &MCII, MCContext &MCT); MCAsmBackend *createHexagonAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options); std::unique_ptr diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp index c4935746f5ad..e3eaa4d30a90 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp @@ -165,9 +165,10 @@ LanaiAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { } // namespace MCAsmBackend *llvm::createLanaiAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo & /*MRI*/, - const Triple &TT, StringRef /*CPU*/, const MCTargetOptions & /*Options*/) { + const Triple &TT = STI.getTargetTriple(); if (!TT.isOSBinFormatELF()) llvm_unreachable("OS not supported"); diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h index 5bc84ad83870..ddb4e9b0d728 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h +++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h @@ -38,8 +38,8 @@ MCCodeEmitter *createLanaiMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); -MCAsmBackend *createLanaiAsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TheTriple, StringRef CPU, +MCAsmBackend *createLanaiAsmBackend(const Target &T, const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, const MCTargetOptions &Options); std::unique_ptr diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 1ad524c06969..acbc6d37e24b 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -476,8 +476,9 @@ bool MipsAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { } MCAsmBackend *llvm::createMipsAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options) { - return new MipsAsmBackend(T, MRI, TT, CPU, Options.ABIName == "n32"); + return new MipsAsmBackend(T, MRI, STI.getTargetTriple(), STI.getCPU(), + Options.ABIName == "n32"); } diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h index abbf08ed212f..5dab6c3e81d6 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h @@ -45,8 +45,8 @@ MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); -MCAsmBackend *createMipsAsmBackend(const Target &T, const 
MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, +MCAsmBackend *createMipsAsmBackend(const Target &T, const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, const MCTargetOptions &Options); std::unique_ptr diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 6448fd917560..79ca9cc6b800 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -3863,13 +3863,17 @@ MipsTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case 'c': // register suitable for indirect jump if (VT == MVT::i32) return std::make_pair((unsigned)Mips::T9, &Mips::GPR32RegClass); - assert(VT == MVT::i64 && "Unexpected type."); - return std::make_pair((unsigned)Mips::T9_64, &Mips::GPR64RegClass); - case 'l': // register suitable for indirect jump + if (VT == MVT::i64) + return std::make_pair((unsigned)Mips::T9_64, &Mips::GPR64RegClass); + // This will generate an error message + return std::make_pair(0U, nullptr); + case 'l': // use the `lo` register to store values + // that are no bigger than a word if (VT == MVT::i32) return std::make_pair((unsigned)Mips::LO0, &Mips::LO32RegClass); return std::make_pair((unsigned)Mips::LO0_64, &Mips::LO64RegClass); - case 'x': // register suitable for indirect jump + case 'x': // use the concatenated `hi` and `lo` registers + // to store doubleword values // Fixme: Not triggering the use of both hi and low // This will generate an error message return std::make_pair(0U, nullptr); diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 2a1de244da92..728e7757fd28 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -18,6 +18,7 @@ #include "llvm/MC/MCMachObjectWriter.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" @@ -231,9 +232,10 @@ namespace { } // end anonymous namespace MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options) { + const Triple &TT = STI.getTargetTriple(); if (TT.isOSDarwin()) return new DarwinPPCAsmBackend(T); diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index 80a74c09a598..d47b9a6e452c 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -29,6 +29,7 @@ class MCContext; class MCInstrInfo; class MCObjectWriter; class MCRegisterInfo; +class MCSubtargetInfo; class MCTargetOptions; class Target; class Triple; @@ -43,8 +44,8 @@ MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); -MCAsmBackend *createPPCAsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, +MCAsmBackend *createPPCAsmBackend(const Target &T, const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, const MCTargetOptions &Options); /// Construct an PPC ELF object writer. 
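A minimal sketch of the MCAsmBackend creator shape that the MCTargetDesc changes above (AArch64, AMDGPU, ARM, AVR, BPF, Hexagon, Lanai, Mips, PPC) converge on, assuming a hypothetical target Foo; FooAsmBackend and createFooAsmBackend are placeholders, not symbols from the patch:
static MCAsmBackend *createFooAsmBackend(const Target &T,
                                         const MCSubtargetInfo &STI,
                                         const MCRegisterInfo &MRI,
                                         const MCTargetOptions &Options) {
  // The triple and CPU are no longer passed explicitly; recover them from STI.
  const Triple &TT = STI.getTargetTriple();
  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
  return new FooAsmBackend(T, TT, OSABI); // hypothetical backend class
}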
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index cea59de3e8a9..f9de65fcb1df 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -4397,13 +4397,18 @@ hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) { static bool areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC, CallingConv::ID CalleeCC) { - // Tail or Sibling call optimization (TCO/SCO) needs callee and caller to - // have the same calling convention. - if (CallerCC != CalleeCC) + // Tail calls are possible with fastcc and ccc. + auto isTailCallableCC = [] (CallingConv::ID CC){ + return CC == CallingConv::C || CC == CallingConv::Fast; + }; + if (!isTailCallableCC(CallerCC) || !isTailCallableCC(CalleeCC)) return false; - // Tail or Sibling calls can be done with fastcc/ccc. - return (CallerCC == CallingConv::Fast || CallerCC == CallingConv::C); + // We can safely tail call both fastcc and ccc callees from a c calling + // convention caller. If the caller is fastcc, we may have less stack space + // than a non-fastcc caller with the same signature so disable tail-calls in + // that case. + return CallerCC == CallingConv::C || CallerCC == CalleeCC; } bool @@ -4434,10 +4439,28 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( // Callee contains any byval parameter is not supported, too. // Note: This is a quick work around, because in some cases, e.g. // caller's stack size > callee's stack size, we are still able to apply - // sibling call optimization. See: https://reviews.llvm.org/D23441#513574 + // sibling call optimization. For example, gcc is able to do SCO for caller1 + // in the following example, but not for caller2. + // struct test { + // long int a; + // char ary[56]; + // } gTest; + // __attribute__((noinline)) int callee(struct test v, struct test *b) { + // b->a = v.a; + // return 0; + // } + // void caller1(struct test a, struct test c, struct test *b) { + // callee(gTest, b); } + // void caller2(struct test *b) { callee(gTest, b); } if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); })) return false; + // If callee and caller use different calling conventions, we cannot pass + // parameters on stack since offsets for the parameter area may be different. + if (Caller.getCallingConv() != CalleeCC && + needStackSlotPassParameters(Subtarget, Outs)) + return false; + // No TCO/SCO on indirect call because Caller have to restore its TOC if (!isFunctionGlobalAddress(Callee) && !isa(Callee)) diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index ffb5cc8757f2..fb16700a5e17 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -2433,7 +2433,7 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, int64_t MB = MI.getOperand(3).getImm(); APInt InVal(Opc == PPC::RLDICL ? 64 : 32, SExtImm, true); InVal = InVal.rotl(SH); - uint64_t Mask = (1LU << (63 - MB + 1)) - 1; + uint64_t Mask = (1LLU << (63 - MB + 1)) - 1; InVal &= Mask; // Can't replace negative values with an LI as that will sign-extend // and not clear the left bits. If we're setting the CR bit, we will use @@ -2457,8 +2457,8 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, int64_t ME = MI.getOperand(4).getImm(); APInt InVal(32, SExtImm, true); InVal = InVal.rotl(SH); - // Set the bits ( MB + 32 ) to ( ME + 32 ). 
- uint64_t Mask = ((1 << (32 - MB)) - 1) & ~((1 << (31 - ME)) - 1); + // Set the bits ( MB + 32 ) to ( ME + 32 ). + uint64_t Mask = ((1LLU << (32 - MB)) - 1) & ~((1LLU << (31 - ME)) - 1); InVal &= Mask; // Can't replace negative values with an LI as that will sign-extend // and not clear the left bits. If we're setting the CR bit, we will use @@ -2527,6 +2527,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, III.ConstantOpNo = 2; III.ImmWidth = 16; III.ImmMustBeMultipleOf = 1; + III.TruncateImmTo = 0; switch (Opc) { default: return false; case PPC::ADD4: @@ -2600,10 +2601,6 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, case PPC::RLWNM8: case PPC::RLWNMo: case PPC::RLWNM8o: - case PPC::RLDCL: - case PPC::RLDCLo: - case PPC::RLDCR: - case PPC::RLDCRo: case PPC::SLW: case PPC::SLW8: case PPC::SLWo: @@ -2614,6 +2611,50 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, case PPC::SRW8o: case PPC::SRAW: case PPC::SRAWo: + III.SignedImm = false; + III.ZeroIsSpecialOrig = 0; + III.ZeroIsSpecialNew = 0; + III.IsCommutative = false; + // This isn't actually true, but the instructions ignore any of the + // upper bits, so any immediate loaded with an LI is acceptable. + // This does not apply to shift right algebraic because a value + // out of range will produce a -1/0. + III.ImmWidth = 16; + if (Opc == PPC::RLWNM || Opc == PPC::RLWNM8 || + Opc == PPC::RLWNMo || Opc == PPC::RLWNM8o) + III.TruncateImmTo = 5; + else + III.TruncateImmTo = 6; + switch(Opc) { + default: llvm_unreachable("Unknown opcode"); + case PPC::RLWNM: III.ImmOpcode = PPC::RLWINM; break; + case PPC::RLWNM8: III.ImmOpcode = PPC::RLWINM8; break; + case PPC::RLWNMo: III.ImmOpcode = PPC::RLWINMo; break; + case PPC::RLWNM8o: III.ImmOpcode = PPC::RLWINM8o; break; + case PPC::SLW: III.ImmOpcode = PPC::RLWINM; break; + case PPC::SLW8: III.ImmOpcode = PPC::RLWINM8; break; + case PPC::SLWo: III.ImmOpcode = PPC::RLWINMo; break; + case PPC::SLW8o: III.ImmOpcode = PPC::RLWINM8o; break; + case PPC::SRW: III.ImmOpcode = PPC::RLWINM; break; + case PPC::SRW8: III.ImmOpcode = PPC::RLWINM8; break; + case PPC::SRWo: III.ImmOpcode = PPC::RLWINMo; break; + case PPC::SRW8o: III.ImmOpcode = PPC::RLWINM8o; break; + case PPC::SRAW: + III.ImmWidth = 5; + III.TruncateImmTo = 0; + III.ImmOpcode = PPC::SRAWI; + break; + case PPC::SRAWo: + III.ImmWidth = 5; + III.TruncateImmTo = 0; + III.ImmOpcode = PPC::SRAWIo; + break; + } + break; + case PPC::RLDCL: + case PPC::RLDCLo: + case PPC::RLDCR: + case PPC::RLDCRo: case PPC::SLD: case PPC::SLDo: case PPC::SRD: @@ -2626,33 +2667,34 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, III.IsCommutative = false; // This isn't actually true, but the instructions ignore any of the // upper bits, so any immediate loaded with an LI is acceptable. + // This does not apply to shift right algebraic because a value + // out of range will produce a -1/0. 
III.ImmWidth = 16; + if (Opc == PPC::RLDCL || Opc == PPC::RLDCLo || + Opc == PPC::RLDCR || Opc == PPC::RLDCRo) + III.TruncateImmTo = 6; + else + III.TruncateImmTo = 7; switch(Opc) { default: llvm_unreachable("Unknown opcode"); - case PPC::RLWNM: III.ImmOpcode = PPC::RLWINM; break; - case PPC::RLWNM8: III.ImmOpcode = PPC::RLWINM8; break; - case PPC::RLWNMo: III.ImmOpcode = PPC::RLWINMo; break; - case PPC::RLWNM8o: III.ImmOpcode = PPC::RLWINM8o; break; case PPC::RLDCL: III.ImmOpcode = PPC::RLDICL; break; case PPC::RLDCLo: III.ImmOpcode = PPC::RLDICLo; break; case PPC::RLDCR: III.ImmOpcode = PPC::RLDICR; break; case PPC::RLDCRo: III.ImmOpcode = PPC::RLDICRo; break; - case PPC::SLW: III.ImmOpcode = PPC::RLWINM; break; - case PPC::SLW8: III.ImmOpcode = PPC::RLWINM8; break; - case PPC::SLWo: III.ImmOpcode = PPC::RLWINMo; break; - case PPC::SLW8o: III.ImmOpcode = PPC::RLWINM8o; break; - case PPC::SRW: III.ImmOpcode = PPC::RLWINM; break; - case PPC::SRW8: III.ImmOpcode = PPC::RLWINM8; break; - case PPC::SRWo: III.ImmOpcode = PPC::RLWINMo; break; - case PPC::SRW8o: III.ImmOpcode = PPC::RLWINM8o; break; - case PPC::SRAW: III.ImmOpcode = PPC::SRAWI; break; - case PPC::SRAWo: III.ImmOpcode = PPC::SRAWIo; break; case PPC::SLD: III.ImmOpcode = PPC::RLDICR; break; case PPC::SLDo: III.ImmOpcode = PPC::RLDICRo; break; case PPC::SRD: III.ImmOpcode = PPC::RLDICL; break; case PPC::SRDo: III.ImmOpcode = PPC::RLDICLo; break; - case PPC::SRAD: III.ImmOpcode = PPC::SRADI; break; - case PPC::SRADo: III.ImmOpcode = PPC::SRADIo; break; + case PPC::SRAD: + III.ImmWidth = 6; + III.TruncateImmTo = 0; + III.ImmOpcode = PPC::SRADI; + break; + case PPC::SRADo: + III.ImmWidth = 6; + III.TruncateImmTo = 0; + III.ImmOpcode = PPC::SRADIo; + break; } break; // Loads and stores: @@ -2866,6 +2908,8 @@ bool PPCInstrInfo::transformToImmForm(MachineInstr &MI, const ImmInstrInfo &III, return false; if (Imm % III.ImmMustBeMultipleOf) return false; + if (III.TruncateImmTo) + Imm &= ((1 << III.TruncateImmTo) - 1); if (III.SignedImm) { APInt ActualValue(64, Imm, true); if (!ActualValue.isSignedIntN(III.ImmWidth)) diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index 4271c50127a1..8bfb8bc88097 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -97,6 +97,8 @@ struct ImmInstrInfo { uint64_t ImmOpcode : 16; // The size of the immediate. uint64_t ImmWidth : 5; + // The immediate should be truncated to N bits. 
+ uint64_t TruncateImmTo : 5; }; // Information required to convert an instruction to just a materialized diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp index 474661aaaee8..a4c7a030389b 100644 --- a/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -55,7 +55,7 @@ FixedPointRegToImm("ppc-reg-to-imm-fixed-point", cl::Hidden, cl::init(true), "convert reg-reg instructions to reg-imm")); static cl::opt -ConvertRegReg("ppc-convert-rr-to-ri", cl::Hidden, cl::init(false), +ConvertRegReg("ppc-convert-rr-to-ri", cl::Hidden, cl::init(true), cl::desc("Convert eligible reg+reg instructions to reg+imm")); static cl::opt diff --git a/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp index 9501f0f89b81..d524c354ed35 100644 --- a/lib/Target/PowerPC/PPCPreEmitPeephole.cpp +++ b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp @@ -35,7 +35,7 @@ STATISTIC(NumRemovedInPreEmit, "Number of instructions deleted in pre-emit peephole"); static cl::opt -RunPreEmitPeephole("ppc-late-peephole", cl::Hidden, cl::init(false), +RunPreEmitPeephole("ppc-late-peephole", cl::Hidden, cl::init(true), cl::desc("Run pre-emit peephole optimizations.")); namespace { diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index b91467fe1455..6e06a4975e2a 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -230,9 +230,10 @@ RISCVAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const { } // end anonymous namespace MCAsmBackend *llvm::createRISCVAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options) { + const Triple &TT = STI.getTargetTriple(); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS()); return new RISCVAsmBackend(OSABI, TT.isArch64Bit()); } diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h index bea2f8800fa6..ef58a6b8cbca 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h +++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h @@ -40,8 +40,8 @@ MCCodeEmitter *createRISCVMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); -MCAsmBackend *createRISCVAsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, +MCAsmBackend *createRISCVAsmBackend(const Target &T, const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, const MCTargetOptions &Options); std::unique_ptr diff --git a/lib/Target/RISCV/RISCVISelLowering.cpp b/lib/Target/RISCV/RISCVISelLowering.cpp index 7d32954936be..805ca7dd956e 100644 --- a/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/lib/Target/RISCV/RISCVISelLowering.cpp @@ -580,7 +580,6 @@ SDValue RISCVTargetLowering::LowerFormalArguments( } MachineFunction &MF = DAG.getMachineFunction(); - MVT XLenVT = Subtarget.getXLenVT(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); if (IsVarArg) @@ -593,7 +592,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments( for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; - assert(VA.getLocVT() == XLenVT && "Unhandled argument type"); + assert(VA.getLocVT() == Subtarget.getXLenVT() && "Unhandled argument type"); SDValue ArgValue; if (VA.isRegLoc()) ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL); diff --git a/lib/Target/RISCV/RISCVInstrInfoC.td 
b/lib/Target/RISCV/RISCVInstrInfoC.td index 4ca52652086b..661d2a78eeef 100644 --- a/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/lib/Target/RISCV/RISCVInstrInfoC.td @@ -177,7 +177,7 @@ class CS_ALU funct2, string OpcodeStr, RegisterClass cls, let Predicates = [HasStdExtC] in { -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [X2] in def C_ADDI4SPN : RVInst16CIW<0b000, 0b00, (outs GPRC:$rd), (ins SP:$rs1, uimm10_lsb00nonzero:$imm), "c.addi4spn", "$rd, $rs1, $imm"> { @@ -260,7 +260,7 @@ def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb), } let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCall = 1, - DecoderNamespace = "RISCV32Only_" in + DecoderNamespace = "RISCV32Only_", Defs = [X1] in def C_JAL : RVInst16CJ<0b001, 0b01, (outs), (ins simm12_lsb0:$offset), "c.jal", "$offset">, Requires<[IsRV32]>; diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index a38545ecf430..f2438ee43075 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -14,6 +14,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/TargetRegistry.h" @@ -301,8 +302,8 @@ namespace { } // end anonymous namespace MCAsmBackend *llvm::createSparcAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options) { - return new ELFSparcAsmBackend(T, TT.getOS()); + return new ELFSparcAsmBackend(T, STI.getTargetTriple().getOS()); } diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h index 563e6f4efbe6..8390198479ba 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h @@ -40,8 +40,8 @@ Target &getTheSparcelTarget(); MCCodeEmitter *createSparcMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); -MCAsmBackend *createSparcAsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, +MCAsmBackend *createSparcAsmBackend(const Target &T, const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, const MCTargetOptions &Options); std::unique_ptr createSparcELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp index e035c3b87a40..5cd4a7daf0fa 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp @@ -14,6 +14,7 @@ #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSubtargetInfo.h" using namespace llvm; @@ -122,9 +123,10 @@ bool SystemZMCAsmBackend::writeNopData(uint64_t Count, } MCAsmBackend *llvm::createSystemZMCAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options) { - uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS()); + uint8_t OSABI = + MCELFObjectTargetWriter::getOSABI(STI.getTargetTriple().getOS()); return new SystemZMCAsmBackend(OSABI); } diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h 
index 99b157e37275..ed1b1b95b8f3 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h @@ -89,8 +89,8 @@ MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII, MCContext &Ctx); MCAsmBackend *createSystemZMCAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options); std::unique_ptr createSystemZObjectWriter(raw_pwrite_stream &OS, diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index 18de4273d1d0..e7c8809de70e 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -69,10 +69,10 @@ static MCCodeEmitter *createCodeEmitter(const MCInstrInfo &MCII, } static MCAsmBackend *createAsmBackend(const Target & /*T*/, + const MCSubtargetInfo &STI, const MCRegisterInfo & /*MRI*/, - const Triple &TT, StringRef /*CPU*/, const MCTargetOptions & /*Options*/) { - return createWebAssemblyAsmBackend(TT); + return createWebAssemblyAsmBackend(STI.getTargetTriple()); } static MCSubtargetInfo *createMCSubtargetInfo(const Triple &TT, StringRef CPU, diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 239db2a74b24..34db5918926b 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -843,10 +843,11 @@ class DarwinX86_64AsmBackend : public DarwinX86AsmBackend { } // end anonymous namespace MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TheTriple, - StringRef CPU, const MCTargetOptions &Options) { + const Triple &TheTriple = STI.getTargetTriple(); + StringRef CPU = STI.getCPU(); if (TheTriple.isOSBinFormatMachO()) return new DarwinX86_32AsmBackend(T, MRI, CPU); @@ -862,10 +863,11 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, } MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TheTriple, - StringRef CPU, const MCTargetOptions &Options) { + const Triple &TheTriple = STI.getTargetTriple(); + StringRef CPU = STI.getCPU(); if (TheTriple.isOSBinFormatMachO()) { MachO::CPUSubTypeX86 CS = StringSwitch(TheTriple.getArchName()) diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index c5859b600ad2..d758c0588cb1 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -70,11 +70,13 @@ MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); -MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, +MCAsmBackend *createX86_32AsmBackend(const Target &T, + const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, const MCTargetOptions &Options); -MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, +MCAsmBackend *createX86_64AsmBackend(const Target &T, + const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, const MCTargetOptions &Options); /// Implements X86-only directives for assembly emission. 
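Note: the WebAssembly and X86 hunks complete the same createMCAsmBackend signature change. Target registration itself should not need to change, since only the constructor callback's parameter list is updated; a minimal sketch, reusing the hypothetical "Foo" target from the earlier note (getTheFooTarget is an assumed accessor, not part of this patch):

extern "C" void LLVMInitializeFooTargetMC() {
  // The registry stores the callback and later invokes it with
  // (Target, MCSubtargetInfo, MCRegisterInfo, MCTargetOptions).
  TargetRegistry::RegisterMCAsmBackend(getTheFooTarget(), createFooAsmBackend);
}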
diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp index 01d10fe4cae4..855ea683a8af 100644 --- a/lib/Target/X86/X86FixupBWInsts.cpp +++ b/lib/Target/X86/X86FixupBWInsts.cpp @@ -166,81 +166,6 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) { return true; } -/// Check if register \p Reg is live after the \p MI. -/// -/// \p LiveRegs should be in a state describing liveness information in -/// that exact place as this function tries to precise analysis made -/// by \p LiveRegs by exploiting the information about particular -/// instruction \p MI. \p MI is expected to be one of the MOVs handled -/// by the x86FixupBWInsts pass. -/// Note: similar to LivePhysRegs::contains this would state that -/// super-register is not used if only some part of it is used. -/// -/// X86 backend does not have subregister liveness tracking enabled, -/// so liveness information might be overly conservative. However, for -/// some specific instructions (this pass only cares about MOVs) we can -/// produce more precise results by analysing that MOV's operands. -/// -/// Indeed, if super-register is not live before the mov it means that it -/// was originally and so we are free to modify these -/// undef upper bits. That may happen in case where the use is in another MBB -/// and the vreg/physreg corresponding to the move has higher width than -/// necessary (e.g. due to register coalescing with a "truncate" copy). -/// So, it handles pattern like this: -/// -/// %bb.2: derived from LLVM BB %if.then -/// Live Ins: %rdi -/// Predecessors according to CFG: %bb.0 -/// %ax = MOV16rm killed %rdi, 1, %noreg, 0, %noreg, implicit-def %eax; -/// mem:LD2[%p] -/// No implicit %eax -/// Successors according to CFG: %bb.3(?%) -/// -/// %bb.3: derived from LLVM BB %if.end -/// Live Ins: %eax Only %ax is actually live -/// Predecessors according to CFG: %bb.2 %bb.1 -/// %ax = KILL %ax, implicit killed %eax -/// RET 0, %ax -static bool isLive(const MachineInstr &MI, - const LivePhysRegs &LiveRegs, - const TargetRegisterInfo *TRI, - unsigned Reg) { - if (!LiveRegs.contains(Reg)) - return false; - - unsigned Opc = MI.getOpcode(); (void)Opc; - // These are the opcodes currently handled by the pass, if something - // else will be added we need to ensure that new opcode has the same - // properties. - assert((Opc == X86::MOV8rm || Opc == X86::MOV16rm || Opc == X86::MOV8rr || - Opc == X86::MOV16rr) && - "Unexpected opcode."); - - bool IsDefined = false; - for (auto &MO: MI.implicit_operands()) { - if (!MO.isReg()) - continue; - - assert((MO.isDef() || MO.isUse()) && "Expected Def or Use only!"); - - for (MCSuperRegIterator Supers(Reg, TRI, true); Supers.isValid(); ++Supers) { - if (*Supers == MO.getReg()) { - if (MO.isDef()) - IsDefined = true; - else - return true; // SuperReg Imp-used' -> live before the MI - } - } - } - // Reg is not Imp-def'ed -> it's live both before/after the instruction. - if (!IsDefined) - return true; - - // Otherwise, the Reg is not live before the MI and the MOV can't - // make it really live, so it's in fact dead even after the MI. - return false; -} - /// \brief Check if after \p OrigMI the only portion of super register /// of the destination register of \p OrigMI that is alive is that /// destination register. 
@@ -262,20 +187,84 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI, if (SubRegIdx == X86::sub_8bit_hi) return false; - if (isLive(*OrigMI, LiveRegs, TRI, SuperDestReg)) - return false; - - if (SubRegIdx == X86::sub_8bit) { - // In the case of byte registers, we also have to check that the upper - // byte register is also dead. That is considered to be independent of - // whether the super-register is dead. - unsigned UpperByteReg = - getX86SubSuperRegister(SuperDestReg, 8, /*High=*/true); - - if (isLive(*OrigMI, LiveRegs, TRI, UpperByteReg)) - return false; + // If neither the destination-super register nor any applicable subregisters + // are live after this instruction, then the super register is safe to use. + if (!LiveRegs.contains(SuperDestReg)) { + // If the original destination register was not the low 8-bit subregister + // then the super register check is sufficient. + if (SubRegIdx != X86::sub_8bit) + return true; + // If the original destination register was the low 8-bit subregister and + // we also need to check the 16-bit subregister and the high 8-bit + // subregister. + if (!LiveRegs.contains(getX86SubSuperRegister(OrigDestReg, 16)) && + !LiveRegs.contains(getX86SubSuperRegister(SuperDestReg, 8, + /*High=*/true))) + return true; + // Otherwise, we have a little more checking to do. } + // If we get here, the super-register destination (or some part of it) is + // marked as live after the original instruction. + // + // The X86 backend does not have subregister liveness tracking enabled, + // so liveness information might be overly conservative. Specifically, the + // super register might be marked as live because it is implicitly defined + // by the instruction we are examining. + // + // However, for some specific instructions (this pass only cares about MOVs) + // we can produce more precise results by analysing that MOV's operands. + // + // Indeed, if super-register is not live before the mov it means that it + // was originally and so we are free to modify these + // undef upper bits. That may happen in case where the use is in another MBB + // and the vreg/physreg corresponding to the move has higher width than + // necessary (e.g. due to register coalescing with a "truncate" copy). + // So, we would like to handle patterns like this: + // + // %bb.2: derived from LLVM BB %if.then + // Live Ins: %rdi + // Predecessors according to CFG: %bb.0 + // %ax = MOV16rm killed %rdi, 1, %noreg, 0, %noreg, implicit-def %eax + // ; No implicit %eax + // Successors according to CFG: %bb.3(?%) + // + // %bb.3: derived from LLVM BB %if.end + // Live Ins: %eax Only %ax is actually live + // Predecessors according to CFG: %bb.2 %bb.1 + // %ax = KILL %ax, implicit killed %eax + // RET 0, %ax + unsigned Opc = OrigMI->getOpcode(); (void)Opc; + // These are the opcodes currently handled by the pass, if something + // else will be added we need to ensure that new opcode has the same + // properties. 
+ assert((Opc == X86::MOV8rm || Opc == X86::MOV16rm || Opc == X86::MOV8rr || + Opc == X86::MOV16rr) && + "Unexpected opcode."); + + bool IsDefined = false; + for (auto &MO: OrigMI->implicit_operands()) { + if (!MO.isReg()) + continue; + + assert((MO.isDef() || MO.isUse()) && "Expected Def or Use only!"); + + for (MCSuperRegIterator Supers(OrigDestReg, TRI, true); Supers.isValid(); + ++Supers) { + if (*Supers == MO.getReg()) { + if (MO.isDef()) + IsDefined = true; + else + return false; // SuperReg Imp-used' -> live before the MI + } + } + } + // Reg is not Imp-def'ed -> it's live both before/after the instruction. + if (!IsDefined) + return false; + + // Otherwise, the Reg is not live before the MI and the MOV can't + // make it really live, so it's in fact dead even after the MI. return true; } diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 9edd799779c7..a6f56877bd64 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -996,8 +996,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted // even though v8i16 is a legal type. - setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote); - setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); @@ -1151,15 +1151,26 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom); + setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i1, MVT::v16i32); + setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i1, MVT::v16i32); + setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i1, MVT::v4i32); + setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i1, MVT::v4i32); setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32); + if (Subtarget.hasVLX()) { + setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); + } + // Extends of v16i1/v8i1 to 128-bit vectors. 
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i8, Custom); @@ -1186,9 +1197,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); - for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, - MVT::v16i1, MVT::v32i1, MVT::v64i1 }) - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); + for (auto VT : { MVT::v1i1, MVT::v8i1 }) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); @@ -1219,11 +1229,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::v16i16, Promote); - setOperationAction(ISD::FP_TO_SINT, MVT::v16i8, Promote); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::v16i8, Promote); - setOperationAction(ISD::FP_TO_UINT, MVT::v16i16, Promote); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); @@ -1428,6 +1438,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); + for (auto VT : { MVT::v16i1, MVT::v32i1 }) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); // Extends from v32i1 masks to 256-bit vectors. setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); @@ -1540,6 +1552,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); + for (auto VT : { MVT::v2i1, MVT::v4i1 }) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); // Extends from v2i1/v4i1 masks to 128-bit vectors. 
setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom); @@ -2140,6 +2154,10 @@ static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc, const SDLoc &Dl, SelectionDAG &DAG) { EVT ValVT = ValArg.getValueType(); + if (ValVT == MVT::v1i1) + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg, + DAG.getIntPtrConstant(0, Dl)); + if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) || (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) { // Two stage lowering might be required @@ -4625,6 +4643,14 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const { return Subtarget.hasLZCNT(); } +bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, + EVT BitcastVT) const { + if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1) + return false; + + return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT); +} + bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const SelectionDAG &DAG) const { // Do not merge to float value size (128 bytes) if no implicit @@ -7471,7 +7497,7 @@ static bool isAddSub(const BuildVectorSDNode *BV, } /// Returns true if is possible to fold MUL and an idiom that has already been -/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into +/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2. /// @@ -7708,6 +7734,10 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, case ISD::AND: case ISD::XOR: case ISD::OR: + // Don't do this if the buildvector is a splat - we'd replace one + // constant with an entire vector. + if (Op->getSplatValue()) + return SDValue(); if (!TLI.isOperationLegalOrPromote(Opcode, VT)) return SDValue(); break; @@ -11261,6 +11291,20 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( MutableArrayRef LoMask = Mask.slice(0, 4); MutableArrayRef HiMask = Mask.slice(4, 4); + // Attempt to directly match PSHUFLW or PSHUFHW. + if (isUndefOrInRange(LoMask, 0, 4) && + isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { + return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, + getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); + } + if (isUndefOrInRange(HiMask, 4, 8) && + isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { + for (int i = 0; i != 4; ++i) + HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4)); + return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, + getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); + } + SmallVector LoInputs; copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; }); std::sort(LoInputs.begin(), LoInputs.end()); @@ -11280,13 +11324,11 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( MutableArrayRef HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef HToHInputs(HiInputs.data() + NumLToH, NumHToH); - // If we are splatting two values from one half - one to each half, then - // we can shuffle that half so each is splatted to a dword, then splat those - // to their respective halves. - auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp, - int DOffset) { - int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4}; - int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1}; + // If we are shuffling values from one half - check how many different DWORD + // pairs we need to create. 
If only 1 or 2 then we can perform this as a + // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below. + auto ShuffleDWordPairs = [&](ArrayRef PSHUFHalfMask, + ArrayRef PSHUFDMask, unsigned ShufWOp) { V = DAG.getNode(ShufWOp, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); V = DAG.getBitcast(PSHUFDVT, V); @@ -11295,10 +11337,48 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( return DAG.getBitcast(VT, V); }; - if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0) - return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0); - if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0) - return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2); + if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) { + int PSHUFDMask[4] = { -1, -1, -1, -1 }; + SmallVector, 4> DWordPairs; + int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2); + + // Collect the different DWORD pairs. + for (int DWord = 0; DWord != 4; ++DWord) { + int M0 = Mask[2 * DWord + 0]; + int M1 = Mask[2 * DWord + 1]; + M0 = (M0 >= 0 ? M0 % 4 : M0); + M1 = (M1 >= 0 ? M1 % 4 : M1); + if (M0 < 0 && M1 < 0) + continue; + + bool Match = false; + for (int j = 0, e = DWordPairs.size(); j < e; ++j) { + auto &DWordPair = DWordPairs[j]; + if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) && + (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) { + DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first); + DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second); + PSHUFDMask[DWord] = DOffset + j; + Match = true; + break; + } + } + if (!Match) { + PSHUFDMask[DWord] = DOffset + DWordPairs.size(); + DWordPairs.push_back(std::make_pair(M0, M1)); + } + } + + if (DWordPairs.size() <= 2) { + DWordPairs.resize(2, std::make_pair(-1, -1)); + int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second, + DWordPairs[1].first, DWordPairs[1].second}; + if ((NumHToL + NumHToH) == 0) + return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW); + if ((NumLToL + NumLToH) == 0) + return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW); + } + } // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up @@ -15020,6 +15100,42 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, return insert1BitVector(Op, DAG, Subtarget); } +static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 && + "Only vXi1 extract_subvectors need custom lowering"); + + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + SDValue Idx = Op.getOperand(1); + + if (!isa(Idx)) + return SDValue(); + + unsigned IdxVal = cast(Idx)->getZExtValue(); + if (IdxVal == 0) // the operation is legal + return Op; + + MVT VecVT = Vec.getSimpleValueType(); + unsigned NumElems = VecVT.getVectorNumElements(); + + // Extend to natively supported kshift. + MVT WideVecVT = VecVT; + if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) { + WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT, + DAG.getUNDEF(WideVecVT), Vec, + DAG.getIntPtrConstant(0, dl)); + } + + // Shift to the LSB. 
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec, + DAG.getIntPtrConstant(0, dl)); +} + // Returns the appropriate wrapper opcode for a global reference. unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const { // References to absolute symbols are never PC-relative. @@ -15545,19 +15661,13 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, DAG.getUNDEF(SrcVT))); } - if (SrcVT.getVectorElementType() == MVT::i1) { - if (SrcVT == MVT::v2i1) { - // For v2i1, we need to widen to v4i1 first. - assert(VT == MVT::v2f64 && "Unexpected type"); - Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Src, - DAG.getUNDEF(MVT::v2i1)); - return DAG.getNode(X86ISD::CVTSI2P, dl, Op.getValueType(), - DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Src)); - } - - MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); - return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), - DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src)); + if (SrcVT == MVT::v2i1) { + // For v2i1, we need to widen to v4i1 first. + assert(VT == MVT::v2f64 && "Unexpected type"); + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Src, + DAG.getUNDEF(MVT::v2i1)); + return DAG.getNode(X86ISD::CVTSI2P, dl, Op.getValueType(), + DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Src)); } return SDValue(); } @@ -15894,19 +16004,13 @@ static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG, MVT SrcVT = N0.getSimpleValueType(); SDLoc dl(Op); - if (SrcVT.getVectorElementType() == MVT::i1) { - if (SrcVT == MVT::v2i1) { - // For v2i1, we need to widen to v4i1 first. - assert(Op.getValueType() == MVT::v2f64 && "Unexpected type"); - N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, N0, - DAG.getUNDEF(MVT::v2i1)); - return DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v2f64, - DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0)); - } - - MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); - return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), - DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0)); + if (SrcVT == MVT::v2i1) { + // For v2i1, we need to widen to v4i1 first. + assert(Op.getValueType() == MVT::v2f64 && "Unexpected type"); + N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, N0, + DAG.getUNDEF(MVT::v2i1)); + return DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v2f64, + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0)); } switch (SrcVT.SimpleTy) { @@ -16418,13 +16522,16 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, if (InVT.getScalarSizeInBits() <= 16) { if (Subtarget.hasBWI()) { // legal, will go to VPMOVB2M, VPMOVW2M - // Shift packed bytes not supported natively, bitcast to word - MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); - SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT, - DAG.getBitcast(ExtVT, In), - DAG.getConstant(ShiftInx, DL, ExtVT)); - ShiftNode = DAG.getBitcast(InVT, ShiftNode); - return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); + if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) { + // We need to shift to get the lsb into sign position. 
+ // Shift packed bytes not supported natively, bitcast to word + MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); + In = DAG.getNode(ISD::SHL, DL, ExtVT, + DAG.getBitcast(ExtVT, In), + DAG.getConstant(ShiftInx, DL, ExtVT)); + In = DAG.getBitcast(InVT, In); + } + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, In); } // Use TESTD/Q, extended vector to packed dword/qword. assert((InVT.is256BitVector() || InVT.is128BitVector()) && @@ -16437,9 +16544,12 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, ShiftInx = InVT.getScalarSizeInBits() - 1; } - SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, - DAG.getConstant(ShiftInx, DL, InVT)); - return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode); + if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) { + // We need to shift to get the lsb into sign position. + In = DAG.getNode(ISD::SHL, DL, InVT, In, + DAG.getConstant(ShiftInx, DL, InVT)); + } + return DAG.getNode(X86ISD::TESTM, DL, VT, In, In); } SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { @@ -16572,9 +16682,29 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); if (VT.isVector()) { - assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); SDValue Src = Op.getOperand(0); SDLoc dl(Op); + + if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) { + MVT ResVT = MVT::v4i32; + MVT TruncVT = MVT::v4i1; + unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; + if (!IsSigned && !Subtarget.hasVLX()) { + // Widen to 512-bits. + ResVT = MVT::v8i32; + TruncVT = MVT::v8i1; + Opc = ISD::FP_TO_UINT; + Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, + DAG.getUNDEF(MVT::v8f64), + Src, DAG.getIntPtrConstant(0, dl)); + } + SDValue Res = DAG.getNode(Opc, dl, ResVT, Src); + Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res, + DAG.getIntPtrConstant(0, dl)); + } + + assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) { return DAG.getNode(IsSigned ? 
X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, @@ -18629,6 +18759,7 @@ static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget, DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl)); } Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op); + Op = DAG.getBitcast(MVT::i8, Op); return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(), St->getMemOperand()); } @@ -18645,12 +18776,12 @@ static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget, DAG.getIntPtrConstant(16, dl)); Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi); - SDValue BasePtrHi = - DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, - DAG.getConstant(2, dl, BasePtr.getValueType())); + SDValue BasePtrHi = DAG.getMemBasePlusOffset(BasePtr, 2, dl); SDValue StHi = DAG.getStore(St->getChain(), dl, Hi, - BasePtrHi, St->getMemOperand()); + BasePtrHi, St->getPointerInfo().getWithOffset(2), + MinAlign(St->getAlignment(), 2U), + St->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi); } @@ -24545,6 +24676,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); + case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); @@ -29735,7 +29867,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, /// by this operation to try to flow through the rest of the combiner /// the fact that they're unused. static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget, - SDValue &Opnd0, SDValue &Opnd1, + SDValue &Opnd0, SDValue &Opnd1, bool matchSubAdd = false) { EVT VT = N->getValueType(0); @@ -30309,9 +30441,35 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, // (i16 movmsk (16i8 sext (v16i1 x))) // before the setcc result is scalarized on subtargets that don't have legal // vxi1 types. - if (DCI.isBeforeLegalize()) + if (DCI.isBeforeLegalize()) { if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget)) return V; + + // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer + // type, widen both sides to avoid a trip through memory. + if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() && + Subtarget.hasVLX()) { + SDLoc dl(N); + N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0); + N0 = DAG.getBitcast(MVT::v8i1, N0); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0, + DAG.getIntPtrConstant(0, dl)); + } + + // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer + // type, widen both sides to avoid a trip through memory. 
+ if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() && + Subtarget.hasVLX()) { + SDLoc dl(N); + unsigned NumConcats = 8 / SrcVT.getVectorNumElements(); + SmallVector Ops(NumConcats, DAG.getUNDEF(SrcVT)); + Ops[0] = N0; + N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops); + N0 = DAG.getBitcast(MVT::i8, N0); + return DAG.getNode(ISD::TRUNCATE, dl, VT, N0); + } + } + // Since MMX types are special and don't usually play with other vector types, // it's better to handle them early to be sure we emit efficient code by // avoiding store-load conversions. @@ -30791,6 +30949,11 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, if (SrcSVT == MVT::i1 || !isa(Idx)) return SDValue(); + // Handle extract(broadcast(scalar_value)), it doesn't matter what index is. + if (X86ISD::VBROADCAST == Src.getOpcode() && + Src.getOperand(0).getValueType() == VT) + return Src.getOperand(0); + // Resolve the target shuffle inputs and mask. SmallVector Mask; SmallVector Ops; @@ -36153,13 +36316,23 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, ISD::CondCode CC = cast(SetCC->getOperand(2))->get(); assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate"); - // We're looking for an oversized integer equality comparison, but ignore a - // comparison with zero because that gets special treatment in EmitTest(). + // We're looking for an oversized integer equality comparison. SDValue X = SetCC->getOperand(0); SDValue Y = SetCC->getOperand(1); EVT OpVT = X.getValueType(); unsigned OpSize = OpVT.getSizeInBits(); - if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y)) + if (!OpVT.isScalarInteger() || OpSize < 128) + return SDValue(); + + // Ignore a comparison with zero because that gets special treatment in + // EmitTest(). But make an exception for the special case of a pair of + // logically-combined vector-sized operands compared to zero. This pattern may + // be generated by the memcmp expansion pass with oversized integer compares + // (see PR33325). + bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR && + X.getOperand(0).getOpcode() == ISD::XOR && + X.getOperand(1).getOpcode() == ISD::XOR; + if (isNullConstant(Y) && !IsOrXorXorCCZero) return SDValue(); // Bail out if we know that this is not really just an oversized integer. @@ -36174,15 +36347,29 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, if ((OpSize == 128 && Subtarget.hasSSE2()) || (OpSize == 256 && Subtarget.hasAVX2())) { EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8; - SDValue VecX = DAG.getBitcast(VecVT, X); - SDValue VecY = DAG.getBitcast(VecVT, Y); - + SDValue Cmp; + if (IsOrXorXorCCZero) { + // This is a bitwise-combined equality comparison of 2 pairs of vectors: + // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne + // Use 2 vector equality compares and 'and' the results before doing a + // MOVMSK. 
+ SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0)); + SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1)); + SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0)); + SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1)); + SDValue Cmp1 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, A, B); + SDValue Cmp2 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, C, D); + Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2); + } else { + SDValue VecX = DAG.getBitcast(VecVT, X); + SDValue VecY = DAG.getBitcast(VecVT, Y); + Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY); + } // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne - SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY); SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp); SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL, MVT::i32); diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 7708f577ba70..1fb7c7ed4e98 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -1023,6 +1023,8 @@ namespace llvm { return NumElem > 2; } + bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT) const override; + /// Intel processors have a unified instruction and data cache const char * getClearCacheBuiltinName() const override { return nullptr; // nothing to do, move along. diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index dcd84930741b..458f68072d6c 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -2701,11 +2701,6 @@ def : Pat<(i64 (bitconvert (v64i1 VK64:$src))), // Load/store kreg let Predicates = [HasDQI] in { - def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), - (KMOVBmk addr:$dst, VK8:$src)>; - def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), - (KMOVBkm addr:$src)>; - def : Pat<(store VK4:$src, addr:$dst), (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK4:$src, VK8))>; def : Pat<(store VK2:$src, addr:$dst), @@ -2745,22 +2740,10 @@ let Predicates = [HasAVX512, NoDQI] in { } let Predicates = [HasAVX512] in { - def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst), - (KMOVWmk addr:$dst, VK16:$src)>; def : Pat<(v1i1 (load addr:$src)), - (COPY_TO_REGCLASS (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), VK1)>; - def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))), - (KMOVWkm addr:$src)>; -} -let Predicates = [HasBWI] in { - def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst), - (KMOVDmk addr:$dst, VK32:$src)>; - def : Pat<(v32i1 (bitconvert (i32 (load addr:$src)))), - (KMOVDkm addr:$src)>; - def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst), - (KMOVQmk addr:$dst, VK64:$src)>; - def : Pat<(v64i1 (bitconvert (i64 (load addr:$src)))), - (KMOVQkm addr:$src)>; + (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK1)>; + def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), + (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>; } let Predicates = [HasAVX512] in { @@ -3087,66 +3070,6 @@ defm : operation_subvector_mask_lowering; defm : operation_subvector_mask_lowering; - -multiclass vextract_for_mask_to_mask { -let Predicates = [prd] in - def : - Pat<(To.KVT(extract_subvector(From.KVT 
From.KRC:$src), (iPTR imm:$imm8))), - (To.KVT(COPY_TO_REGCLASS - (!cast(InstrStr#"ri") From.KVT:$src, - (i8 imm:$imm8)), To.KRC))>; -} - -multiclass vextract_for_mask_to_mask_legal_w { -def : - Pat<(To.KVT(extract_subvector(From.KVT From.KRC:$src), (iPTR imm:$imm8))), - (To.KVT(COPY_TO_REGCLASS - (KSHIFTRWri(COPY_TO_REGCLASS From.KRC:$src, VK16), - (i8 imm:$imm8)), To.KRC))>; -} - -defm : vextract_for_mask_to_mask_legal_w; -defm : vextract_for_mask_to_mask_legal_w; -defm : vextract_for_mask_to_mask_legal_w; -defm : vextract_for_mask_to_mask_legal_w; -defm : vextract_for_mask_to_mask_legal_w; -defm : vextract_for_mask_to_mask_legal_w; - -defm : vextract_for_mask_to_mask<"KSHIFTRW", v16i1_info, v1i1_info, HasAVX512>; -defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v1i1_info, HasBWI>; -defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v1i1_info, HasBWI>; -defm : vextract_for_mask_to_mask<"KSHIFTRW", v16i1_info, v2i1_info, HasAVX512>; -defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v2i1_info, HasBWI>; -defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v2i1_info, HasBWI>; -defm : vextract_for_mask_to_mask<"KSHIFTRW", v16i1_info, v4i1_info, HasAVX512>; -defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v4i1_info, HasBWI>; -defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v4i1_info, HasBWI>; -defm : vextract_for_mask_to_mask<"KSHIFTRW", v16i1_info, v8i1_info, HasAVX512>; -defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v8i1_info, HasBWI>; -defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v8i1_info, HasBWI>; -defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v16i1_info, HasBWI>; -defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v16i1_info, HasBWI>; -defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v32i1_info, HasBWI>; - -// Patterns for kmask shift -multiclass mask_shift_lowering { - def : Pat<(VT (X86kshiftl RC:$src, (i8 imm:$imm))), - (VT (COPY_TO_REGCLASS - (KSHIFTLWri (COPY_TO_REGCLASS RC:$src, VK16), - (I8Imm $imm)), - RC))>; - def : Pat<(VT (X86kshiftr RC:$src, (i8 imm:$imm))), - (VT (COPY_TO_REGCLASS - (KSHIFTRWri (COPY_TO_REGCLASS RC:$src, VK16), - (I8Imm $imm)), - RC))>; -} - -defm : mask_shift_lowering, Requires<[HasAVX512, NoDQI]>; -defm : mask_shift_lowering, Requires<[HasAVX512]>; -defm : mask_shift_lowering, Requires<[HasAVX512]>; //===----------------------------------------------------------------------===// // AVX-512 - Aligned and unaligned load and store // @@ -3428,28 +3351,33 @@ def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)), (v16i32 VR512:$src))), (VMOVDQA32Zrrkz VK16WM:$mask, VR512:$src)>; +multiclass mask_move_lowering { + def : Pat<(Narrow.VT (vselect (Narrow.KVT Narrow.KRCWM:$mask), + Narrow.RC:$src1, Narrow.RC:$src0)), + (EXTRACT_SUBREG + (Wide.VT + (!cast(InstrStr#"rrk") + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src0, Narrow.SubRegIdx)), + (COPY_TO_REGCLASS Narrow.KRCWM:$mask, Wide.KRCWM), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)))), + Narrow.SubRegIdx)>; + + def : Pat<(Narrow.VT (vselect (Narrow.KVT Narrow.KRCWM:$mask), + Narrow.RC:$src1, Narrow.ImmAllZerosV)), + (EXTRACT_SUBREG + (Wide.VT + (!cast(InstrStr#"rrkz") + (COPY_TO_REGCLASS Narrow.KRCWM:$mask, Wide.KRCWM), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)))), + Narrow.SubRegIdx)>; +} + // Patterns for handling v8i1 selects of 256-bit vectors when VLX isn't // available. Use a 512-bit operation and extract. 
let Predicates = [HasAVX512, NoVLX] in { -def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1), - (v8f32 VR256X:$src0))), - (EXTRACT_SUBREG - (v16f32 - (VMOVAPSZrrk - (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)), - (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), - (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), - sub_ymm)>; - -def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), - (v8i32 VR256X:$src0))), - (EXTRACT_SUBREG - (v16i32 - (VMOVDQA32Zrrk - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)), - (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), - sub_ymm)>; + defm : mask_move_lowering<"VMOVAPSZ", v8f32x_info, v16f32_info>; + defm : mask_move_lowering<"VMOVDQA32Z", v8i32x_info, v16i32_info>; } let Predicates = [HasAVX512] in { @@ -4633,7 +4561,7 @@ multiclass avx512_min_max_lowering { sub_xmm)>; } -let Predicates = [HasAVX512] in { +let Predicates = [HasAVX512, NoVLX] in { defm : avx512_min_max_lowering; defm : avx512_min_max_lowering; defm : avx512_min_max_lowering; diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 039b4a248544..a481644efdd6 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -94,7 +94,8 @@ let Constraints = "$src1 = $dst" in { // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic. // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp. multiclass MMXI_binop_rm_int opc, string OpcodeStr, Intrinsic IntId, - OpndItins itins, bit Commutable = 0> { + OpndItins itins, bit Commutable = 0, + X86MemOperand OType = i64mem> { def irr : MMXI; defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw", int_x86_mmx_punpcklbw, - MMX_UNPCK_L_ITINS>; + MMX_UNPCK_L_ITINS, + 0, i32mem>; defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd", int_x86_mmx_punpcklwd, - MMX_UNPCK_L_ITINS>; + MMX_UNPCK_L_ITINS, + 0, i32mem>; defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq", int_x86_mmx_punpckldq, - MMX_UNPCK_L_ITINS>; + MMX_UNPCK_L_ITINS, + 0, i32mem>; // -- Pack Instructions defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb, diff --git a/lib/Transforms/Coroutines/CoroSplit.cpp b/lib/Transforms/Coroutines/CoroSplit.cpp index 8712ca4823c6..122f51a0d214 100644 --- a/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/lib/Transforms/Coroutines/CoroSplit.cpp @@ -440,16 +440,14 @@ static void scanPHIsAndUpdateValueMap(Instruction *Prev, BasicBlock *NewBlock, DenseMap &ResolvedValues) { auto *PrevBB = Prev->getParent(); - auto *I = &*NewBlock->begin(); - while (auto PN = dyn_cast(I)) { - auto V = PN->getIncomingValueForBlock(PrevBB); + for (PHINode &PN : NewBlock->phis()) { + auto V = PN.getIncomingValueForBlock(PrevBB); // See if we already resolved it. auto VI = ResolvedValues.find(V); if (VI != ResolvedValues.end()) V = VI->second; // Remember the value. 
- ResolvedValues[PN] = V; - I = I->getNextNode(); + ResolvedValues[&PN] = V; } } diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 541dde6c47d2..38604830b885 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -728,6 +728,23 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { } } + // sqrt(a) * sqrt(b) -> sqrt(a * b) + if (AllowReassociate && + Op0->hasOneUse() && Op1->hasOneUse()) { + Value *Opnd0 = nullptr; + Value *Opnd1 = nullptr; + if (match(Op0, m_Intrinsic(m_Value(Opnd0))) && + match(Op1, m_Intrinsic(m_Value(Opnd1)))) { + BuilderTy::FastMathFlagGuard Guard(Builder); + Builder.setFastMathFlags(I.getFastMathFlags()); + Value *FMulVal = Builder.CreateFMul(Opnd0, Opnd1); + Value *Sqrt = Intrinsic::getDeclaration(I.getModule(), + Intrinsic::sqrt, I.getType()); + Value *SqrtCall = Builder.CreateCall(Sqrt, FMulVal); + return replaceInstUsesWith(I, SqrtCall); + } + } + // Handle symmetric situation in a 2-iteration loop Value *Opnd0 = Op0; Value *Opnd1 = Op1; diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp index 207243231aad..caa73b2ff01c 100644 --- a/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -265,15 +265,12 @@ static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2, CallSite CS2(CallInst2); // Handle PHIs used as arguments in the call-site. - for (auto &PI : *TailBB) { - PHINode *PN = dyn_cast(&PI); - if (!PN) - break; + for (PHINode &PN : TailBB->phis()) { unsigned ArgNo = 0; for (auto &CI : CS.args()) { - if (&*CI == PN) { - CS1.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock1)); - CS2.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock2)); + if (&*CI == &PN) { + CS1.setArgument(ArgNo, PN.getIncomingValueForBlock(SplitBlock1)); + CS2.setArgument(ArgNo, PN.getIncomingValueForBlock(SplitBlock2)); } ++ArgNo; } diff --git a/lib/Transforms/Scalar/GVNSink.cpp b/lib/Transforms/Scalar/GVNSink.cpp index bf92e43c4715..5594c29bbd9f 100644 --- a/lib/Transforms/Scalar/GVNSink.cpp +++ b/lib/Transforms/Scalar/GVNSink.cpp @@ -592,12 +592,8 @@ class GVNSink { /// Create a ModelledPHI for each PHI in BB, adding to PHIs. 
void analyzeInitialPHIs(BasicBlock *BB, ModelledPHISet &PHIs, SmallPtrSetImpl &PHIContents) { - for (auto &I : *BB) { - auto *PN = dyn_cast(&I); - if (!PN) - return; - - auto MPHI = ModelledPHI(PN); + for (PHINode &PN : BB->phis()) { + auto MPHI = ModelledPHI(&PN); PHIs.insert(MPHI); for (auto *V : MPHI.getValues()) PHIContents.insert(V); diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 74d6014d3e3d..221fe57581ca 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -485,9 +485,8 @@ void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) { BasicBlock *Header = L->getHeader(); SmallVector PHIs; - for (BasicBlock::iterator I = Header->begin(); - PHINode *PN = dyn_cast(I); ++I) - PHIs.push_back(PN); + for (PHINode &PN : Header->phis()) + PHIs.push_back(&PN); for (unsigned i = 0, e = PHIs.size(); i != e; ++i) if (PHINode *PN = dyn_cast_or_null(&*PHIs[i])) @@ -724,13 +723,12 @@ void IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) { assert(LoopHeader && "Invalid loop"); for (auto *ExitBB : ExitBlocks) { - BasicBlock::iterator BBI = ExitBB->begin(); // If there are no more PHI nodes in this exit block, then no more // values defined inside the loop are used on this path. - while (auto *PN = dyn_cast(BBI++)) { - for (unsigned IncomingValIdx = 0, E = PN->getNumIncomingValues(); - IncomingValIdx != E; ++IncomingValIdx) { - auto *IncomingBB = PN->getIncomingBlock(IncomingValIdx); + for (PHINode &PN : ExitBB->phis()) { + for (unsigned IncomingValIdx = 0, E = PN.getNumIncomingValues(); + IncomingValIdx != E; ++IncomingValIdx) { + auto *IncomingBB = PN.getIncomingBlock(IncomingValIdx); // We currently only support loop exits from loop header. If the // incoming block is not loop header, we need to recursively check @@ -755,8 +753,7 @@ void IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) { if (!L->isLoopInvariant(Cond)) continue; - auto *ExitVal = - dyn_cast(PN->getIncomingValue(IncomingValIdx)); + auto *ExitVal = dyn_cast(PN.getIncomingValue(IncomingValIdx)); // Only deal with PHIs. if (!ExitVal) @@ -771,8 +768,8 @@ void IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) { if (PreheaderIdx != -1) { assert(ExitVal->getParent() == LoopHeader && "ExitVal must be in loop header"); - PN->setIncomingValue(IncomingValIdx, - ExitVal->getIncomingValue(PreheaderIdx)); + PN.setIncomingValue(IncomingValIdx, + ExitVal->getIncomingValue(PreheaderIdx)); } } } diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 5c4d55bfbb2b..cf98088111be 100644 --- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -1174,13 +1174,9 @@ void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result, if (OriginalLoop.contains(SBB)) continue; // not an exit block - for (Instruction &I : *SBB) { - auto *PN = dyn_cast(&I); - if (!PN) - break; - - Value *OldIncoming = PN->getIncomingValueForBlock(OriginalBB); - PN->addIncoming(GetClonedValue(OldIncoming), ClonedBB); + for (PHINode &PN : SBB->phis()) { + Value *OldIncoming = PN.getIncomingValueForBlock(OriginalBB); + PN.addIncoming(GetClonedValue(OldIncoming), ClonedBB); } } } @@ -1327,16 +1323,12 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( // We emit PHI nodes into `RRI.PseudoExit' that compute the "latest" value of // each of the PHI nodes in the loop header. 
This feeds into the initial // value of the same PHI nodes if/when we continue execution. - for (Instruction &I : *LS.Header) { - auto *PN = dyn_cast(&I); - if (!PN) - break; - - PHINode *NewPHI = PHINode::Create(PN->getType(), 2, PN->getName() + ".copy", + for (PHINode &PN : LS.Header->phis()) { + PHINode *NewPHI = PHINode::Create(PN.getType(), 2, PN.getName() + ".copy", BranchToContinuation); - NewPHI->addIncoming(PN->getIncomingValueForBlock(Preheader), Preheader); - NewPHI->addIncoming(PN->getIncomingValueForBlock(LS.Latch), + NewPHI->addIncoming(PN.getIncomingValueForBlock(Preheader), Preheader); + NewPHI->addIncoming(PN.getIncomingValueForBlock(LS.Latch), RRI.ExitSelector); RRI.PHIValuesAtPseudoExit.push_back(NewPHI); } @@ -1348,12 +1340,8 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( // The latch exit now has a branch from `RRI.ExitSelector' instead of // `LS.Latch'. The PHI nodes need to be updated to reflect that. - for (Instruction &I : *LS.LatchExit) { - if (PHINode *PN = dyn_cast(&I)) - replacePHIBlock(PN, LS.Latch, RRI.ExitSelector); - else - break; - } + for (PHINode &PN : LS.LatchExit->phis()) + replacePHIBlock(&PN, LS.Latch, RRI.ExitSelector); return RRI; } @@ -1362,15 +1350,10 @@ void LoopConstrainer::rewriteIncomingValuesForPHIs( LoopStructure &LS, BasicBlock *ContinuationBlock, const LoopConstrainer::RewrittenRangeInfo &RRI) const { unsigned PHIIndex = 0; - for (Instruction &I : *LS.Header) { - auto *PN = dyn_cast(&I); - if (!PN) - break; - - for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i) - if (PN->getIncomingBlock(i) == ContinuationBlock) - PN->setIncomingValue(i, RRI.PHIValuesAtPseudoExit[PHIIndex++]); - } + for (PHINode &PN : LS.Header->phis()) + for (unsigned i = 0, e = PN.getNumIncomingValues(); i < e; ++i) + if (PN.getIncomingBlock(i) == ContinuationBlock) + PN.setIncomingValue(i, RRI.PHIValuesAtPseudoExit[PHIIndex++]); LS.IndVarStart = RRI.IndVarEnd; } @@ -1381,14 +1364,9 @@ BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS, BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header); BranchInst::Create(LS.Header, Preheader); - for (Instruction &I : *LS.Header) { - auto *PN = dyn_cast(&I); - if (!PN) - break; - - for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i) - replacePHIBlock(PN, OldPreheader, Preheader); - } + for (PHINode &PN : LS.Header->phis()) + for (unsigned i = 0, e = PN.getNumIncomingValues(); i < e; ++i) + replacePHIBlock(&PN, OldPreheader, Preheader); return Preheader; } diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 1476f7850cf0..141c9938bf8b 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -1800,11 +1800,10 @@ static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB, BasicBlock *OldPred, BasicBlock *NewPred, DenseMap &ValueMap) { - for (BasicBlock::iterator PNI = PHIBB->begin(); - PHINode *PN = dyn_cast(PNI); ++PNI) { + for (PHINode &PN : PHIBB->phis()) { // Ok, we have a PHI node. Figure out what the incoming value was for the // DestBlock. - Value *IV = PN->getIncomingValueForBlock(OldPred); + Value *IV = PN.getIncomingValueForBlock(OldPred); // Remap the value if necessary. 
if (Instruction *Inst = dyn_cast(IV)) { @@ -1813,7 +1812,7 @@ static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB, IV = I->second; } - PN->addIncoming(IV, NewPred); + PN.addIncoming(IV, NewPred); } } diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index 82604a8842bf..15cd1086f209 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -49,11 +49,10 @@ static bool isLoopDead(Loop *L, ScalarEvolution &SE, // must pass through a PHI in the exit block, meaning that this check is // sufficient to guarantee that no loop-variant values are used outside // of the loop. - BasicBlock::iterator BI = ExitBlock->begin(); bool AllEntriesInvariant = true; bool AllOutgoingValuesSame = true; - while (PHINode *P = dyn_cast(BI)) { - Value *incoming = P->getIncomingValueForBlock(ExitingBlocks[0]); + for (PHINode &P : ExitBlock->phis()) { + Value *incoming = P.getIncomingValueForBlock(ExitingBlocks[0]); // Make sure all exiting blocks produce the same incoming value for the exit // block. If there are different incoming values for different exiting @@ -61,7 +60,7 @@ static bool isLoopDead(Loop *L, ScalarEvolution &SE, // be used. AllOutgoingValuesSame = all_of(makeArrayRef(ExitingBlocks).slice(1), [&](BasicBlock *BB) { - return incoming == P->getIncomingValueForBlock(BB); + return incoming == P.getIncomingValueForBlock(BB); }); if (!AllOutgoingValuesSame) @@ -72,8 +71,6 @@ static bool isLoopDead(Loop *L, ScalarEvolution &SE, AllEntriesInvariant = false; break; } - - ++BI; } if (Changed) @@ -162,11 +159,9 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, if (ExitBlock && isLoopNeverExecuted(L)) { DEBUG(dbgs() << "Loop is proven to never execute, delete it!"); // Set incoming value to undef for phi nodes in the exit block. - BasicBlock::iterator BI = ExitBlock->begin(); - while (PHINode *P = dyn_cast(BI)) { - for (unsigned i = 0; i < P->getNumIncomingValues(); i++) - P->setIncomingValue(i, UndefValue::get(P->getType())); - BI++; + for (PHINode &P : ExitBlock->phis()) { + std::fill(P.incoming_values().begin(), P.incoming_values().end(), + UndefValue::get(P.getType())); } deleteDeadLoop(L, &DT, &SE, &LI); ++NumDeleted; diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 953854c8b7b7..ff3e9eef16d9 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -857,12 +857,11 @@ static MemAccessTy getAccessType(const TargetTransformInfo &TTI, /// Return true if this AddRec is already a phi in its loop. static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { - for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin(); - PHINode *PN = dyn_cast(I); ++I) { - if (SE.isSCEVable(PN->getType()) && - (SE.getEffectiveSCEVType(PN->getType()) == + for (PHINode &PN : AR->getLoop()->getHeader()->phis()) { + if (SE.isSCEVable(PN.getType()) && + (SE.getEffectiveSCEVType(PN.getType()) == SE.getEffectiveSCEVType(AR->getType())) && - SE.getSCEV(PN) == AR) + SE.getSCEV(&PN) == AR) return true; } return false; @@ -3013,15 +3012,14 @@ void LSRInstance::CollectChains() { } // Continue walking down the instructions. } // Continue walking down the domtree. // Visit phi backedges to determine if the chain can generate the IV postinc. 
- for (BasicBlock::iterator I = L->getHeader()->begin(); - PHINode *PN = dyn_cast(I); ++I) { - if (!SE.isSCEVable(PN->getType())) + for (PHINode &PN : L->getHeader()->phis()) { + if (!SE.isSCEVable(PN.getType())) continue; Instruction *IncV = - dyn_cast(PN->getIncomingValueForBlock(L->getLoopLatch())); + dyn_cast(PN.getIncomingValueForBlock(L->getLoopLatch())); if (IncV) - ChainInstruction(PN, IncV, ChainUsersVec); + ChainInstruction(&PN, IncV, ChainUsersVec); } // Remove any unprofitable chains. unsigned ChainIdx = 0; @@ -3152,12 +3150,11 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, // If LSR created a new, wider phi, we may also replace its postinc. We only // do this if we also found a wide value for the head of the chain. if (isa(Chain.tailUserInst())) { - for (BasicBlock::iterator I = L->getHeader()->begin(); - PHINode *Phi = dyn_cast(I); ++I) { - if (!isCompatibleIVType(Phi, IVSrc)) + for (PHINode &Phi : L->getHeader()->phis()) { + if (!isCompatibleIVType(&Phi, IVSrc)) continue; Instruction *PostIncV = dyn_cast( - Phi->getIncomingValueForBlock(L->getLoopLatch())); + Phi.getIncomingValueForBlock(L->getLoopLatch())); if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc))) continue; Value *IVOper = IVSrc; @@ -3168,7 +3165,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc()); IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain"); } - Phi->replaceUsesOfWith(PostIncV, IVOper); + Phi.replaceUsesOfWith(PostIncV, IVOper); DeadInsts.emplace_back(PostIncV); } } diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index bd468338a1d0..f2405d9b0c03 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -1274,12 +1274,11 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // If the successor of the exit block had PHI nodes, add an entry for // NewExit. - for (BasicBlock::iterator I = ExitSucc->begin(); - PHINode *PN = dyn_cast(I); ++I) { - Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]); + for (PHINode &PN : ExitSucc->phis()) { + Value *V = PN.getIncomingValueForBlock(ExitBlocks[i]); ValueToValueMapTy::iterator It = VMap.find(V); if (It != VMap.end()) V = It->second; - PN->addIncoming(V, NewExit); + PN.addIncoming(V, NewExit); } if (LandingPadInst *LPad = NewExit->getLandingPadInst()) { @@ -1496,10 +1495,9 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, BranchInst::Create(Abort, OldSISucc, ConstantInt::getTrue(Context), NewSISucc); // Release the PHI operands for this edge. - for (BasicBlock::iterator II = NewSISucc->begin(); - PHINode *PN = dyn_cast(II); ++II) - PN->setIncomingValue(PN->getBasicBlockIndex(Switch), - UndefValue::get(PN->getType())); + for (PHINode &PN : NewSISucc->phis()) + PN.setIncomingValue(PN.getBasicBlockIndex(Switch), + UndefValue::get(PN.getType())); // Tell the domtree about the new block. 
We don't fully update the // domtree here -- instead we force it to do a full recomputation // after the pass is complete -- but we do need to inform it of diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 66608ec631f6..9dc550ceaeca 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -523,10 +523,8 @@ class SCCPSolver : public InstVisitor { DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName() << " -> " << Dest->getName() << '\n'); - PHINode *PN; - for (BasicBlock::iterator I = Dest->begin(); - (PN = dyn_cast(I)); ++I) - visitPHINode(*PN); + for (PHINode &PN : Dest->phis()) + visitPHINode(PN); } } diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 3d0fca0bc3a5..aba732bc413f 100644 --- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -271,19 +271,14 @@ static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB, static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB, BasicBlock &OldExitingBB, BasicBlock &OldPH) { - for (Instruction &I : UnswitchedBB) { - auto *PN = dyn_cast(&I); - if (!PN) - // No more PHIs to check. - break; - + for (PHINode &PN : UnswitchedBB.phis()) { // When the loop exit is directly unswitched we just need to update the // incoming basic block. We loop to handle weird cases with repeated // incoming blocks, but expect to typically only have one operand here. - for (auto i : seq(0, PN->getNumOperands())) { - assert(PN->getIncomingBlock(i) == &OldExitingBB && + for (auto i : seq(0, PN.getNumOperands())) { + assert(PN.getIncomingBlock(i) == &OldExitingBB && "Found incoming block different from unique predecessor!"); - PN->setIncomingBlock(i, &OldPH); + PN.setIncomingBlock(i, &OldPH); } } } @@ -302,14 +297,9 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, assert(&ExitBB != &UnswitchedBB && "Must have different loop exit and unswitched blocks!"); Instruction *InsertPt = &*UnswitchedBB.begin(); - for (Instruction &I : ExitBB) { - auto *PN = dyn_cast(&I); - if (!PN) - // No more PHIs to check. - break; - - auto *NewPN = PHINode::Create(PN->getType(), /*NumReservedValues*/ 2, - PN->getName() + ".split", InsertPt); + for (PHINode &PN : ExitBB.phis()) { + auto *NewPN = PHINode::Create(PN.getType(), /*NumReservedValues*/ 2, + PN.getName() + ".split", InsertPt); // Walk backwards over the old PHI node's inputs to minimize the cost of // removing each one. We have to do this weird loop manually so that we @@ -320,18 +310,18 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, // allowed us to create a single entry for a predecessor block without // having separate entries for each "edge" even though these edges are // required to produce identical results. - for (int i = PN->getNumIncomingValues() - 1; i >= 0; --i) { - if (PN->getIncomingBlock(i) != &OldExitingBB) + for (int i = PN.getNumIncomingValues() - 1; i >= 0; --i) { + if (PN.getIncomingBlock(i) != &OldExitingBB) continue; - Value *Incoming = PN->removeIncomingValue(i); + Value *Incoming = PN.removeIncomingValue(i); NewPN->addIncoming(Incoming, &OldPH); } // Now replace the old PHI with the new one and wire the old one in as an // input to the new one. 
- PN->replaceAllUsesWith(NewPN); - NewPN->addIncoming(PN, &ExitBB); + PN.replaceAllUsesWith(NewPN); + NewPN->addIncoming(&PN, &ExitBB); } } diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp index 2972e1cff9a4..b8fb80b6cc26 100644 --- a/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -544,10 +544,7 @@ void StructurizeCFG::insertConditions(bool Loops) { /// them in DeletedPhis void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) { PhiMap &Map = DeletedPhis[To]; - for (Instruction &I : *To) { - if (!isa(I)) - break; - PHINode &Phi = cast(I); + for (PHINode &Phi : To->phis()) { while (Phi.getBasicBlockIndex(From) != -1) { Value *Deleted = Phi.removeIncomingValue(From, false); Map[&Phi].push_back(std::make_pair(From, Deleted)); @@ -557,10 +554,7 @@ void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) { /// \brief Add a dummy PHI value as soon as we knew the new predecessor void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) { - for (Instruction &I : *To) { - if (!isa(I)) - break; - PHINode &Phi = cast(I); + for (PHINode &Phi : To->phis()) { Value *Undef = UndefValue::get(Phi.getType()); Phi.addIncoming(Undef, From); } diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index 606bd8baccaa..8f59913e14bb 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -94,9 +94,8 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) { // Recursively deleting a PHI may cause multiple PHIs to be deleted // or RAUW'd undef, so use an array of WeakTrackingVH for the PHIs to delete. SmallVector PHIs; - for (BasicBlock::iterator I = BB->begin(); - PHINode *PN = dyn_cast(I); ++I) - PHIs.push_back(PN); + for (PHINode &PN : BB->phis()) + PHIs.push_back(&PN); bool Changed = false; for (unsigned i = 0, e = PHIs.size(); i != e; ++i) @@ -134,24 +133,17 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT, if (!OnlySucc) return false; // Can't merge if there is PHI loop. - for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) { - if (PHINode *PN = dyn_cast(BI)) { - for (Value *IncValue : PN->incoming_values()) - if (IncValue == PN) - return false; - } else - break; - } + for (PHINode &PN : BB->phis()) + for (Value *IncValue : PN.incoming_values()) + if (IncValue == &PN) + return false; // Begin by getting rid of unneeded PHIs. SmallVector IncomingValues; if (isa(BB->front())) { - for (auto &I : *BB) - if (PHINode *PN = dyn_cast(&I)) { - if (PN->getIncomingValue(0) != PN) - IncomingValues.push_back(PN->getIncomingValue(0)); - } else - break; + for (PHINode &PN : BB->phis()) + if (PN.getIncomingValue(0) != &PN) + IncomingValues.push_back(PN.getIncomingValue(0)); FoldSingleEntryPHINodes(BB, MemDep); } @@ -331,6 +323,12 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB, bool IsLoopEntry = !!L; bool SplitMakesNewLoopHeader = false; for (BasicBlock *Pred : Preds) { + // Preds that are not reachable from entry should not be used to identify if + // OldBB is a loop entry or if SplitMakesNewLoopHeader. Unreachable blocks + // are not within any loops, so we incorrectly mark SplitMakesNewLoopHeader + // as true and make the NewBB the header of some loop. This breaks LI. + if (!DT->isReachableFromEntry(Pred)) + continue; // If we need to preserve LCSSA, determine if any of the preds is a loop // exit. 
if (PreserveLCSSA) diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp index 3653c307619b..464d1a34f518 100644 --- a/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -106,10 +106,9 @@ static void createPHIsForSplitLoopExit(ArrayRef Preds, SplitBB->isLandingPad()) && "SplitBB has non-PHI nodes!"); // For each PHI in the destination block. - for (BasicBlock::iterator I = DestBB->begin(); - PHINode *PN = dyn_cast(I); ++I) { - unsigned Idx = PN->getBasicBlockIndex(SplitBB); - Value *V = PN->getIncomingValue(Idx); + for (PHINode &PN : DestBB->phis()) { + unsigned Idx = PN.getBasicBlockIndex(SplitBB); + Value *V = PN.getIncomingValue(Idx); // If the input is a PHI which already satisfies LCSSA, don't create // a new one. @@ -119,13 +118,13 @@ static void createPHIsForSplitLoopExit(ArrayRef Preds, // Otherwise a new PHI is needed. Create one and populate it. PHINode *NewPN = PHINode::Create( - PN->getType(), Preds.size(), "split", + PN.getType(), Preds.size(), "split", SplitBB->isLandingPad() ? &SplitBB->front() : SplitBB->getTerminator()); for (unsigned i = 0, e = Preds.size(); i != e; ++i) NewPN->addIncoming(V, Preds[i]); // Update the original PHI. - PN->setIncomingValue(Idx, NewPN); + PN.setIncomingValue(Idx, NewPN); } } diff --git a/lib/Transforms/Utils/CallPromotionUtils.cpp b/lib/Transforms/Utils/CallPromotionUtils.cpp index 8825f77555e7..5dc6068d4a0b 100644 --- a/lib/Transforms/Utils/CallPromotionUtils.cpp +++ b/lib/Transforms/Utils/CallPromotionUtils.cpp @@ -47,14 +47,11 @@ using namespace llvm; /// static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock, BasicBlock *MergeBlock) { - for (auto &I : *Invoke->getNormalDest()) { - auto *Phi = dyn_cast(&I); - if (!Phi) - break; - int Idx = Phi->getBasicBlockIndex(OrigBlock); + for (PHINode &Phi : Invoke->getNormalDest()->phis()) { + int Idx = Phi.getBasicBlockIndex(OrigBlock); if (Idx == -1) continue; - Phi->setIncomingBlock(Idx, MergeBlock); + Phi.setIncomingBlock(Idx, MergeBlock); } } @@ -82,16 +79,13 @@ static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock, static void fixupPHINodeForUnwindDest(InvokeInst *Invoke, BasicBlock *OrigBlock, BasicBlock *ThenBlock, BasicBlock *ElseBlock) { - for (auto &I : *Invoke->getUnwindDest()) { - auto *Phi = dyn_cast(&I); - if (!Phi) - break; - int Idx = Phi->getBasicBlockIndex(OrigBlock); + for (PHINode &Phi : Invoke->getUnwindDest()->phis()) { + int Idx = Phi.getBasicBlockIndex(OrigBlock); if (Idx == -1) continue; - auto *V = Phi->getIncomingValue(Idx); - Phi->setIncomingBlock(Idx, ThenBlock); - Phi->addIncoming(V, ElseBlock); + auto *V = Phi.getIncomingValue(Idx); + Phi.setIncomingBlock(Idx, ThenBlock); + Phi.addIncoming(V, ElseBlock); } } diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 3b19ba1b50f2..16af2c7b808b 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -493,17 +493,13 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // Handle PHI nodes specially, as we have to remove references to dead // blocks. - for (BasicBlock::const_iterator I = BI.begin(), E = BI.end(); I != E; ++I) { + for (const PHINode &PN : BI.phis()) { // PHI nodes may have been remapped to non-PHI nodes by the caller or // during the cloning process. 
- if (const PHINode *PN = dyn_cast(I)) { - if (isa(VMap[PN])) - PHIToResolve.push_back(PN); - else - break; - } else { + if (isa(VMap[&PN])) + PHIToResolve.push_back(&PN); + else break; - } } // Finally, remap the terminator instructions, as those can't be remapped diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index a1961eecb391..acccf7abf808 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -105,21 +105,17 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, IRBuilder<> Builder(T); // Branch - See if we are conditional jumping on constant - if (BranchInst *BI = dyn_cast(T)) { + if (auto *BI = dyn_cast(T)) { if (BI->isUnconditional()) return false; // Can't optimize uncond branch BasicBlock *Dest1 = BI->getSuccessor(0); BasicBlock *Dest2 = BI->getSuccessor(1); - if (ConstantInt *Cond = dyn_cast(BI->getCondition())) { + if (auto *Cond = dyn_cast(BI->getCondition())) { // Are we branching on constant? // YES. Change to unconditional branch... BasicBlock *Destination = Cond->getZExtValue() ? Dest1 : Dest2; BasicBlock *OldDest = Cond->getZExtValue() ? Dest2 : Dest1; - //cerr << "Function: " << T->getParent()->getParent() - // << "\nRemoving branch from " << T->getParent() - // << "\n\nTo: " << OldDest << endl; - // Let the basic block know that we are letting go of it. Based on this, // it will adjust it's PHI nodes. OldDest->removePredecessor(BB); @@ -150,10 +146,10 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, return false; } - if (SwitchInst *SI = dyn_cast(T)) { + if (auto *SI = dyn_cast(T)) { // If we are switching on a constant, we can convert the switch to an // unconditional branch. - ConstantInt *CI = dyn_cast(SI->getCondition()); + auto *CI = dyn_cast(SI->getCondition()); BasicBlock *DefaultDest = SI->getDefaultDest(); BasicBlock *TheOnlyDest = DefaultDest; @@ -280,9 +276,9 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, return false; } - if (IndirectBrInst *IBI = dyn_cast(T)) { + if (auto *IBI = dyn_cast(T)) { // indirectbr blockaddress(@F, @BB) -> br label @BB - if (BlockAddress *BA = + if (auto *BA = dyn_cast(IBI->getAddress()->stripPointerCasts())) { BasicBlock *TheOnlyDest = BA->getBasicBlock(); // Insert the new branch. 
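
The recurring change in the Transforms files above and below this point is the same mechanical rewrite: hand-rolled loops that walk a block's instructions and `dyn_cast` until the first non-PHI are replaced with the `BasicBlock::phis()` range. The sketch below is not part of the patch; it simply puts the two idioms side by side, under hypothetical helper names (`collectIncomingOld`, `collectIncomingNew`), to show what each of these hunks is doing.

```cpp
// Minimal illustrative sketch, not taken from the patch.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Old idiom: iterate instructions from the top of the block and stop at the
// first instruction that is not a PHI (dyn_cast returns null, ending the loop).
static void collectIncomingOld(BasicBlock *BB, BasicBlock *Pred,
                               SmallVectorImpl<Value *> &Vals) {
  for (BasicBlock::iterator I = BB->begin();
       PHINode *PN = dyn_cast<PHINode>(I); ++I)
    Vals.push_back(PN->getIncomingValueForBlock(Pred));
}

// New idiom: BasicBlock::phis() yields exactly the leading PHI nodes, so the
// dyn_cast/break bookkeeping disappears and the element is a PHINode reference.
static void collectIncomingNew(BasicBlock *BB, BasicBlock *Pred,
                               SmallVectorImpl<Value *> &Vals) {
  for (PHINode &PN : BB->phis())
    Vals.push_back(PN.getIncomingValueForBlock(Pred));
}
```

The behavior is identical; the range form only removes the manual cast and early-exit logic, which is why the hunks in CoroSplit, CallSiteSplitting, GVNSink, IndVarSimplify, the loop passes, and the CFG utilities all shrink by a few lines each.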
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index dc98a39adcc5..92dfb1c7204d 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -258,11 +258,8 @@ static bool isEpilogProfitable(Loop *L) { BasicBlock *PreHeader = L->getLoopPreheader(); BasicBlock *Header = L->getHeader(); assert(PreHeader && Header); - for (Instruction &BBI : *Header) { - PHINode *PN = dyn_cast(&BBI); - if (!PN) - break; - if (isa(PN->getIncomingValueForBlock(PreHeader))) + for (const PHINode &PN : Header->phis()) { + if (isa(PN.getIncomingValueForBlock(PreHeader))) return true; } return false; @@ -611,13 +608,12 @@ LoopUnrollResult llvm::UnrollLoop( for (BasicBlock *Succ : successors(*BB)) { if (L->contains(Succ)) continue; - for (BasicBlock::iterator BBI = Succ->begin(); - PHINode *phi = dyn_cast(BBI); ++BBI) { - Value *Incoming = phi->getIncomingValueForBlock(*BB); + for (PHINode &PHI : Succ->phis()) { + Value *Incoming = PHI.getIncomingValueForBlock(*BB); ValueToValueMapTy::iterator It = LastValueMap.find(Incoming); if (It != LastValueMap.end()) Incoming = It->second; - phi->addIncoming(Incoming, New); + PHI.addIncoming(Incoming, New); } } // Keep track of new headers and latches as we create them, so that @@ -721,10 +717,8 @@ LoopUnrollResult llvm::UnrollLoop( for (BasicBlock *Succ: successors(BB)) { if (Succ == Headers[i]) continue; - for (BasicBlock::iterator BBI = Succ->begin(); - PHINode *Phi = dyn_cast(BBI); ++BBI) { - Phi->removeIncomingValue(BB, false); - } + for (PHINode &Phi : Succ->phis()) + Phi.removeIncomingValue(BB, false); } } // Replace the conditional branch with an unconditional one. diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp index e00541d3c812..f79f423ce019 100644 --- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -80,25 +80,21 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, // The new PHI node value is added as an operand of a PHI node in either // the loop header or the loop exit block. for (BasicBlock *Succ : successors(Latch)) { - for (Instruction &BBI : *Succ) { - PHINode *PN = dyn_cast(&BBI); - // Exit when we passed all PHI nodes. - if (!PN) - break; + for (PHINode &PN : Succ->phis()) { // Add a new PHI node to the prolog end block and add the // appropriate incoming values. - PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr", + PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr", PrologExit->getFirstNonPHI()); // Adding a value to the new PHI node from the original loop preheader. // This is the value that skips all the prolog code. - if (L->contains(PN)) { - NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader), + if (L->contains(&PN)) { + NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader); } else { - NewPN->addIncoming(UndefValue::get(PN->getType()), PreHeader); + NewPN->addIncoming(UndefValue::get(PN.getType()), PreHeader); } - Value *V = PN->getIncomingValueForBlock(Latch); + Value *V = PN.getIncomingValueForBlock(Latch); if (Instruction *I = dyn_cast(V)) { if (L->contains(I)) { V = VMap.lookup(I); @@ -111,10 +107,10 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, // Update the existing PHI node operand with the value from the // new PHI node. How this is done depends on if the existing // PHI node is in the original loop block, or the exit block. 
- if (L->contains(PN)) { - PN->setIncomingValue(PN->getBasicBlockIndex(NewPreHeader), NewPN); + if (L->contains(&PN)) { + PN.setIncomingValue(PN.getBasicBlockIndex(NewPreHeader), NewPN); } else { - PN->addIncoming(NewPN, PrologExit); + PN.addIncoming(NewPN, PrologExit); } } } @@ -191,11 +187,7 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, // Exit (EpilogPN) // Update PHI nodes at NewExit and Exit. - for (Instruction &BBI : *NewExit) { - PHINode *PN = dyn_cast(&BBI); - // Exit when we passed all PHI nodes. - if (!PN) - break; + for (PHINode &PN : NewExit->phis()) { // PN should be used in another PHI located in Exit block as // Exit was split by SplitBlockPredecessors into Exit and NewExit // Basicaly it should look like: @@ -207,14 +199,14 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, // // There is EpilogPreHeader incoming block instead of NewExit as // NewExit was spilt 1 more time to get EpilogPreHeader. - assert(PN->hasOneUse() && "The phi should have 1 use"); - PHINode *EpilogPN = cast (PN->use_begin()->getUser()); + assert(PN.hasOneUse() && "The phi should have 1 use"); + PHINode *EpilogPN = cast(PN.use_begin()->getUser()); assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block"); // Add incoming PreHeader from branch around the Loop - PN->addIncoming(UndefValue::get(PN->getType()), PreHeader); + PN.addIncoming(UndefValue::get(PN.getType()), PreHeader); - Value *V = PN->getIncomingValueForBlock(Latch); + Value *V = PN.getIncomingValueForBlock(Latch); Instruction *I = dyn_cast(V); if (I && L->contains(I)) // If value comes from an instruction in the loop add VMap value. @@ -242,23 +234,19 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, // Skip this as we already updated phis in exit blocks. if (!L->contains(Succ)) continue; - for (Instruction &BBI : *Succ) { - PHINode *PN = dyn_cast(&BBI); - // Exit when we passed all PHI nodes. - if (!PN) - break; + for (PHINode &PN : Succ->phis()) { // Add new PHI nodes to the loop exit block and update epilog // PHIs with the new PHI values. - PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr", + PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr", NewExit->getFirstNonPHI()); // Adding a value to the new PHI node from the unrolling loop preheader. - NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader), PreHeader); + NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader); // Adding a value to the new PHI node from the unrolling loop latch. - NewPN->addIncoming(PN->getIncomingValueForBlock(Latch), Latch); + NewPN->addIncoming(PN.getIncomingValueForBlock(Latch), Latch); // Update the existing PHI node operand with the value from the new PHI // node. Corresponding instruction in epilog loop should be PHI. - PHINode *VPN = cast(VMap[&BBI]); + PHINode *VPN = cast(VMap[&PN]); VPN->setIncomingValue(VPN->getBasicBlockIndex(EpilogPreHeader), NewPN); } } diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp index fe106e33bca1..a5a305ef582b 100644 --- a/lib/Transforms/Utils/LoopUtils.cpp +++ b/lib/Transforms/Utils/LoopUtils.cpp @@ -1321,13 +1321,12 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr, // Rewrite phis in the exit block to get their inputs from the Preheader // instead of the exiting block. 
- BasicBlock::iterator BI = ExitBlock->begin(); - while (PHINode *P = dyn_cast(BI)) { + for (PHINode &P : ExitBlock->phis()) { // Set the zero'th element of Phi to be from the preheader and remove all // other incoming values. Given the loop has dedicated exits, all other // incoming values must be from the exiting blocks. int PredIndex = 0; - P->setIncomingBlock(PredIndex, Preheader); + P.setIncomingBlock(PredIndex, Preheader); // Removes all incoming values from all other exiting blocks (including // duplicate values from an exiting block). // Nuke all entries except the zero'th entry which is the preheader entry. @@ -1335,13 +1334,12 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr, // below, to keep the indices valid for deletion (removeIncomingValues // updates getNumIncomingValues and shifts all values down into the operand // being deleted). - for (unsigned i = 0, e = P->getNumIncomingValues() - 1; i != e; ++i) - P->removeIncomingValue(e - i, false); + for (unsigned i = 0, e = P.getNumIncomingValues() - 1; i != e; ++i) + P.removeIncomingValue(e - i, false); - assert((P->getNumIncomingValues() == 1 && - P->getIncomingBlock(PredIndex) == Preheader) && + assert((P.getNumIncomingValues() == 1 && + P.getIncomingBlock(PredIndex) == Preheader) && "Should have exactly one value and that's from the preheader!"); - ++BI; } // Disconnect the loop body by branching directly to its exit. diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp index e4b20b0faa15..b2231d68a301 100644 --- a/lib/Transforms/Utils/SSAUpdater.cpp +++ b/lib/Transforms/Utils/SSAUpdater.cpp @@ -147,11 +147,9 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { if (isa(BB->begin())) { SmallDenseMap ValueMapping(PredValues.begin(), PredValues.end()); - PHINode *SomePHI; - for (BasicBlock::iterator It = BB->begin(); - (SomePHI = dyn_cast(It)); ++It) { - if (IsEquivalentPHI(SomePHI, ValueMapping)) - return SomePHI; + for (PHINode &SomePHI : BB->phis()) { + if (IsEquivalentPHI(&SomePHI, ValueMapping)) + return &SomePHI; } } diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index e7358dbcb624..7c195788e416 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -283,12 +283,8 @@ isProfitableToFoldUnconditional(BranchInst *SI1, BranchInst *SI2, /// of Succ. 
static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred, BasicBlock *ExistPred) { - if (!isa(Succ->begin())) - return; // Quick exit if nothing to do - - PHINode *PN; - for (BasicBlock::iterator I = Succ->begin(); (PN = dyn_cast(I)); ++I) - PN->addIncoming(PN->getIncomingValueForBlock(ExistPred), NewPred); + for (PHINode &PN : Succ->phis()) + PN.addIncoming(PN.getIncomingValueForBlock(ExistPred), NewPred); } /// Compute an abstract "cost" of speculating the given instruction, @@ -1228,11 +1224,9 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2, Instruction *I1, Instruction *I2) { for (BasicBlock *Succ : successors(BB1)) { - PHINode *PN; - for (BasicBlock::iterator BBI = Succ->begin(); - (PN = dyn_cast(BBI)); ++BBI) { - Value *BB1V = PN->getIncomingValueForBlock(BB1); - Value *BB2V = PN->getIncomingValueForBlock(BB2); + for (const PHINode &PN : Succ->phis()) { + Value *BB1V = PN.getIncomingValueForBlock(BB1); + Value *BB2V = PN.getIncomingValueForBlock(BB2); if (BB1V != BB2V && (BB1V == I1 || BB2V == I2)) { return false; } @@ -1282,6 +1276,17 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, if (isa(I1)) goto HoistTerminator; + // If we're going to hoist a call, make sure that the two instructions we're + // commoning/hoisting are both marked with musttail, or neither of them is + // marked as such. Otherwise, we might end up in a situation where we hoist + // from a block where the terminator is a `ret` to a block where the terminator + // is a `br`, and `musttail` calls expect to be followed by a return. + auto *C1 = dyn_cast(I1); + auto *C2 = dyn_cast(I2); + if (C1 && C2) + if (C1->isMustTailCall() != C2->isMustTailCall()) + return Changed; + if (!TTI.isProfitableToHoist(I1) || !TTI.isProfitableToHoist(I2)) return Changed; @@ -1332,18 +1337,16 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, return Changed; for (BasicBlock *Succ : successors(BB1)) { - PHINode *PN; - for (BasicBlock::iterator BBI = Succ->begin(); - (PN = dyn_cast(BBI)); ++BBI) { - Value *BB1V = PN->getIncomingValueForBlock(BB1); - Value *BB2V = PN->getIncomingValueForBlock(BB2); + for (PHINode &PN : Succ->phis()) { + Value *BB1V = PN.getIncomingValueForBlock(BB1); + Value *BB2V = PN.getIncomingValueForBlock(BB2); if (BB1V == BB2V) continue; // Check for passingValueIsAlwaysUndefined here because we would rather // eliminate undefined control flow then converting it to a select. - if (passingValueIsAlwaysUndefined(BB1V, PN) || - passingValueIsAlwaysUndefined(BB2V, PN)) + if (passingValueIsAlwaysUndefined(BB1V, &PN) || + passingValueIsAlwaysUndefined(BB2V, &PN)) return Changed; if (isa(BB1V) && !isSafeToSpeculativelyExecute(BB1V)) @@ -1369,11 +1372,9 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, // nodes, so we insert select instruction to compute the final result. std::map, SelectInst *> InsertedSelects; for (BasicBlock *Succ : successors(BB1)) { - PHINode *PN; - for (BasicBlock::iterator BBI = Succ->begin(); - (PN = dyn_cast(BBI)); ++BBI) { - Value *BB1V = PN->getIncomingValueForBlock(BB1); - Value *BB2V = PN->getIncomingValueForBlock(BB2); + for (PHINode &PN : Succ->phis()) { + Value *BB1V = PN.getIncomingValueForBlock(BB1); + Value *BB2V = PN.getIncomingValueForBlock(BB2); if (BB1V == BB2V) continue; @@ -1386,9 +1387,9 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, BB1V->getName() + "." 
+ BB2V->getName(), BI)); // Make the PHI node use the select for all incoming values for BB1/BB2 - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (PN->getIncomingBlock(i) == BB1 || PN->getIncomingBlock(i) == BB2) - PN->setIncomingValue(i, SI); + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) + if (PN.getIncomingBlock(i) == BB1 || PN.getIncomingBlock(i) == BB2) + PN.setIncomingValue(i, SI); } } @@ -1999,10 +2000,9 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, // Check that the PHI nodes can be converted to selects. bool HaveRewritablePHIs = false; - for (BasicBlock::iterator I = EndBB->begin(); - PHINode *PN = dyn_cast(I); ++I) { - Value *OrigV = PN->getIncomingValueForBlock(BB); - Value *ThenV = PN->getIncomingValueForBlock(ThenBB); + for (PHINode &PN : EndBB->phis()) { + Value *OrigV = PN.getIncomingValueForBlock(BB); + Value *ThenV = PN.getIncomingValueForBlock(ThenBB); // FIXME: Try to remove some of the duplication with HoistThenElseCodeToIf. // Skip PHIs which are trivial. @@ -2010,8 +2010,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, continue; // Don't convert to selects if we could remove undefined behavior instead. - if (passingValueIsAlwaysUndefined(OrigV, PN) || - passingValueIsAlwaysUndefined(ThenV, PN)) + if (passingValueIsAlwaysUndefined(OrigV, &PN) || + passingValueIsAlwaysUndefined(ThenV, &PN)) return false; HaveRewritablePHIs = true; @@ -2072,12 +2072,11 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, // Insert selects and rewrite the PHI operands. IRBuilder Builder(BI); - for (BasicBlock::iterator I = EndBB->begin(); - PHINode *PN = dyn_cast(I); ++I) { - unsigned OrigI = PN->getBasicBlockIndex(BB); - unsigned ThenI = PN->getBasicBlockIndex(ThenBB); - Value *OrigV = PN->getIncomingValue(OrigI); - Value *ThenV = PN->getIncomingValue(ThenI); + for (PHINode &PN : EndBB->phis()) { + unsigned OrigI = PN.getBasicBlockIndex(BB); + unsigned ThenI = PN.getBasicBlockIndex(ThenBB); + Value *OrigV = PN.getIncomingValue(OrigI); + Value *ThenV = PN.getIncomingValue(ThenI); // Skip PHIs which are trivial. if (OrigV == ThenV) @@ -2091,8 +2090,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, std::swap(TrueV, FalseV); Value *V = Builder.CreateSelect( BrCond, TrueV, FalseV, "spec.select", BI); - PN->setIncomingValue(OrigI, V); - PN->setIncomingValue(ThenI, V); + PN.setIncomingValue(OrigI, V); + PN.setIncomingValue(ThenI, V); } // Remove speculated dbg intrinsics. @@ -3335,17 +3334,15 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, // it. If it has PHIs though, the PHIs may have different // entries for BB and PBI's BB. If so, insert a select to make // them agree. - PHINode *PN; - for (BasicBlock::iterator II = CommonDest->begin(); - (PN = dyn_cast(II)); ++II) { - Value *BIV = PN->getIncomingValueForBlock(BB); - unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent()); - Value *PBIV = PN->getIncomingValue(PBBIdx); + for (PHINode &PN : CommonDest->phis()) { + Value *BIV = PN.getIncomingValueForBlock(BB); + unsigned PBBIdx = PN.getBasicBlockIndex(PBI->getParent()); + Value *PBIV = PN.getIncomingValue(PBBIdx); if (BIV != PBIV) { // Insert a select in PBI to pick the right value. 
SelectInst *NV = cast( Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName() + ".mux")); - PN->setIncomingValue(PBBIdx, NV); + PN.setIncomingValue(PBBIdx, NV); // Although the select has the same condition as PBI, the original branch // weights for PBI do not apply to the new select because the select's // 'logical' edges are incoming edges of the phi that is eliminated, not @@ -4451,17 +4448,16 @@ static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue, BasicBlock *Succ = Branch->getSuccessor(0); - BasicBlock::iterator I = Succ->begin(); - while (PHINode *PHI = dyn_cast(I++)) { - int Idx = PHI->getBasicBlockIndex(BB); + for (PHINode &PHI : Succ->phis()) { + int Idx = PHI.getBasicBlockIndex(BB); assert(Idx >= 0 && "PHI has no entry for predecessor?"); - Value *InValue = PHI->getIncomingValue(Idx); + Value *InValue = PHI.getIncomingValue(Idx); if (InValue != CaseValue) continue; *PhiIndex = Idx; - return PHI; + return &PHI; } return nullptr; @@ -4491,19 +4487,16 @@ static bool ForwardSwitchConditionToPHI(SwitchInst *SI) { // --> // %r = phi i32 ... [ %x, %switchbb ] ... - for (Instruction &InstInCaseDest : *CaseDest) { - auto *Phi = dyn_cast(&InstInCaseDest); - if (!Phi) break; - + for (PHINode &Phi : CaseDest->phis()) { // This only works if there is exactly 1 incoming edge from the switch to // a phi. If there is >1, that means multiple cases of the switch map to 1 // value in the phi, and that phi value is not the switch condition. Thus, // this transform would not make sense (the phi would be invalid because // a phi can't have different incoming values from the same block). - int SwitchBBIdx = Phi->getBasicBlockIndex(SwitchBlock); - if (Phi->getIncomingValue(SwitchBBIdx) == CaseValue && - count(Phi->blocks(), SwitchBlock) == 1) { - Phi->setIncomingValue(SwitchBBIdx, SI->getCondition()); + int SwitchBBIdx = Phi.getBasicBlockIndex(SwitchBlock); + if (Phi.getIncomingValue(SwitchBBIdx) == CaseValue && + count(Phi.blocks(), SwitchBlock) == 1) { + Phi.setIncomingValue(SwitchBBIdx, SI->getCondition()); Changed = true; } } @@ -4656,14 +4649,13 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest, return false; // Get the values for this case from phi nodes in the destination block. - BasicBlock::iterator I = (*CommonDest)->begin(); - while (PHINode *PHI = dyn_cast(I++)) { - int Idx = PHI->getBasicBlockIndex(Pred); + for (PHINode &PHI : (*CommonDest)->phis()) { + int Idx = PHI.getBasicBlockIndex(Pred); if (Idx == -1) continue; Constant *ConstVal = - LookupConstant(PHI->getIncomingValue(Idx), ConstantPool); + LookupConstant(PHI.getIncomingValue(Idx), ConstantPool); if (!ConstVal) return false; @@ -4671,7 +4663,7 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest, if (!ValidLookupTableConstant(ConstVal, TTI)) return false; - Res.push_back(std::make_pair(PHI, ConstVal)); + Res.push_back(std::make_pair(&PHI, ConstVal)); } return Res.size() > 0; @@ -5946,14 +5938,13 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) { /// If BB has an incoming value that will always trigger undefined behavior /// (eg. null pointer dereference), remove the branch leading here. 
static bool removeUndefIntroducingPredecessor(BasicBlock *BB) { - for (BasicBlock::iterator i = BB->begin(); - PHINode *PHI = dyn_cast(i); ++i) - for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) - if (passingValueIsAlwaysUndefined(PHI->getIncomingValue(i), PHI)) { - TerminatorInst *T = PHI->getIncomingBlock(i)->getTerminator(); + for (PHINode &PHI : BB->phis()) + for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i) + if (passingValueIsAlwaysUndefined(PHI.getIncomingValue(i), &PHI)) { + TerminatorInst *T = PHI.getIncomingBlock(i)->getTerminator(); IRBuilder<> Builder(T); if (BranchInst *BI = dyn_cast(T)) { - BB->removePredecessor(PHI->getIncomingBlock(i)); + BB->removePredecessor(PHI.getIncomingBlock(i)); // Turn uncoditional branches into unreachables and remove the dead // destination from conditional branches. if (BI->isUnconditional()) diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 52f32cda2609..6ef54385c452 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4164,15 +4164,12 @@ void InnerLoopVectorizer::fixCrossIterationPHIs() { // the currently empty PHI nodes. At this point every instruction in the // original loop is widened to a vector form so we can use them to construct // the incoming edges. - for (Instruction &I : *OrigLoop->getHeader()) { - PHINode *Phi = dyn_cast(&I); - if (!Phi) - break; + for (PHINode &Phi : OrigLoop->getHeader()->phis()) { // Handle first-order recurrences and reductions that need to be fixed. - if (Legal->isFirstOrderRecurrence(Phi)) - fixFirstOrderRecurrence(Phi); - else if (Legal->isReductionVariable(Phi)) - fixReduction(Phi); + if (Legal->isFirstOrderRecurrence(&Phi)) + fixFirstOrderRecurrence(&Phi); + else if (Legal->isReductionVariable(&Phi)) + fixReduction(&Phi); } } @@ -4337,12 +4334,9 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // vector recurrence we extracted in the middle block. Since the loop is in // LCSSA form, we just need to find the phi node for the original scalar // recurrence in the exit block, and then add an edge for the middle block. - for (auto &I : *LoopExitBlock) { - auto *LCSSAPhi = dyn_cast(&I); - if (!LCSSAPhi) - break; - if (LCSSAPhi->getIncomingValue(0) == Phi) { - LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { + if (LCSSAPhi.getIncomingValue(0) == Phi) { + LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); break; } } @@ -4499,21 +4493,15 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // inside and outside of the scalar remainder loop. // We know that the loop is in LCSSA form. We need to update the // PHI nodes in the exit blocks. - for (BasicBlock::iterator LEI = LoopExitBlock->begin(), - LEE = LoopExitBlock->end(); - LEI != LEE; ++LEI) { - PHINode *LCSSAPhi = dyn_cast(LEI); - if (!LCSSAPhi) - break; - + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { // All PHINodes need to have a single entry edge, or two if // we already fixed them. - assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); + assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); // We found a reduction value exit-PHI. Update it with the // incoming bypass edge. 
- if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) - LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); + if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) + LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); } // end of the LCSSA phi scan. // Fix the scalar loop reduction variable with the incoming reduction sum @@ -4528,14 +4516,11 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { } void InnerLoopVectorizer::fixLCSSAPHIs() { - for (Instruction &LEI : *LoopExitBlock) { - auto *LCSSAPhi = dyn_cast(&LEI); - if (!LCSSAPhi) - break; - if (LCSSAPhi->getNumIncomingValues() == 1) { - assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) && + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { + if (LCSSAPhi.getNumIncomingValues() == 1) { + assert(OrigLoop->isLoopInvariant(LCSSAPhi.getIncomingValue(0)) && "Incoming value isn't loop invariant"); - LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock); + LCSSAPhi.addIncoming(LCSSAPhi.getIncomingValue(0), LoopMiddleBlock); } } } @@ -4981,11 +4966,8 @@ void InnerLoopVectorizer::updateAnalysis() { /// Phi nodes with constant expressions that can trap are not safe to if /// convert. static bool canIfConvertPHINodes(BasicBlock *BB) { - for (Instruction &I : *BB) { - auto *Phi = dyn_cast(&I); - if (!Phi) - return true; - for (Value *V : Phi->incoming_values()) + for (PHINode &Phi : BB->phis()) { + for (Value *V : Phi.incoming_values()) if (auto *C = dyn_cast(V)) if (C->canTrap()) return false; diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll index 111aaf88b160..1b90f29407d4 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll +++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll @@ -1,4 +1,4 @@ -; RUN: not llc -O0 -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR +; RUN: not llc -O0 -global-isel -global-isel-abort=1 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR ; RUN: llc -O0 -global-isel -global-isel-abort=0 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefix=FALLBACK ; RUN: llc -O0 -global-isel -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o %t.out 2> %t.err ; RUN: FileCheck %s --check-prefix=FALLBACK-WITH-REPORT-OUT < %t.out diff --git a/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll b/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll index 0972840de47b..0f7f0bdae7d4 100644 --- a/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll +++ b/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll @@ -1,5 +1,8 @@ ; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \ -; RUN: -O0 -aarch64-enable-global-isel-at-O=0 \ +; RUN: -O0 | FileCheck %s --check-prefix ENABLED --check-prefix ENABLED-O0 --check-prefix FALLBACK + +; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \ +; RUN: -O0 -aarch64-enable-global-isel-at-O=0 -global-isel-abort=1 \ ; RUN: | FileCheck %s --check-prefix ENABLED --check-prefix ENABLED-O0 --check-prefix NOFALLBACK ; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \ diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir b/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir index c94d73920ca3..efe9105b90c7 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir @@ -8,7 +8,12 @@ entry: ret void } - define void @test_mul_overflow() { ret void } + define void 
@test_smul_overflow() { + ret void + } + define void @test_umul_overflow() { + ret void + } ... --- @@ -43,18 +48,19 @@ body: | --- -name: test_mul_overflow +name: test_smul_overflow body: | bb.0: liveins: %x0, %x1, %w2, %w3 - ; CHECK-LABEL: name: test_mul_overflow + ; CHECK-LABEL: name: test_smul_overflow ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY %x0 ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %x1 ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[COPY]], [[COPY1]] ; CHECK: [[SMULH:%[0-9]+]]:_(s64) = G_SMULH [[COPY]], [[COPY1]] - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ne), [[SMULH]](s64), [[C]] + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 63 + ; CHECK: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MUL]], [[C]] + ; CHECK: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ne), [[SMULH]](s64), [[ASHR]] ; CHECK: %x0 = COPY [[MUL]](s64) ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ICMP]](s32) ; CHECK: %w0 = COPY [[COPY2]](s32) @@ -66,3 +72,29 @@ body: | %w0 = COPY %4 ... + + +--- +name: test_umul_overflow +body: | + bb.0: + liveins: %x0, %x1, %w2, %w3 + + ; CHECK-LABEL: name: test_umul_overflow + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY %x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %x1 + ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[COPY]], [[COPY1]] + ; CHECK: [[UMULH:%[0-9]+]]:_(s64) = G_UMULH [[COPY]], [[COPY1]] + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ne), [[UMULH]](s64), [[C]] + ; CHECK: %x0 = COPY [[MUL]](s64) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ICMP]](s32) + ; CHECK: %w0 = COPY [[COPY2]](s32) + %0:_(s64) = COPY %x0 + %1:_(s64) = COPY %x1 + %2:_(s64), %3:_(s1) = G_UMULO %0, %1 + %x0 = COPY %2 + %4:_(s32) = G_ANYEXT %3 + %w0 = COPY %4 + +... diff --git a/test/CodeGen/AArch64/GlobalISel/unknown-intrinsic.ll b/test/CodeGen/AArch64/GlobalISel/unknown-intrinsic.ll new file mode 100644 index 000000000000..179dd518d3f0 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/unknown-intrinsic.ll @@ -0,0 +1,10 @@ +; RUN: llc -O0 -mtriple=arm64 < %s + +declare i8* @llvm.invariant.group.barrier(i8*) + +define i8* @barrier(i8* %p) { +; CHECK: bl llvm.invariant.group.barrier + %q = call i8* @llvm.invariant.group.barrier(i8* %p) + ret i8* %q +} + diff --git a/test/CodeGen/AArch64/aarch64_f16_be.ll b/test/CodeGen/AArch64/aarch64_f16_be.ll index 7504439bab80..b51798be1697 100644 --- a/test/CodeGen/AArch64/aarch64_f16_be.ll +++ b/test/CodeGen/AArch64/aarch64_f16_be.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=aarch64-linux-gnuabi -O0 < %s | FileCheck %s -; RUN: llc -mtriple=aarch64_be-linux-gnuabi -O0 < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -mtriple=aarch64-linux-gnuabi -O0 -fast-isel < %s | FileCheck %s +; RUN: llc -mtriple=aarch64_be-linux-gnuabi -O0 -fast-isel < %s | FileCheck %s --check-prefix=CHECK-BE define void @test_bitcast_v8f16_to_v4f32(<8 x half> %a) { ; CHECK-LABEL: test_bitcast_v8f16_to_v4f32: diff --git a/test/CodeGen/AArch64/and-mask-removal.ll b/test/CodeGen/AArch64/and-mask-removal.ll index 8291516d81ea..c02bc881cd33 100644 --- a/test/CodeGen/AArch64/and-mask-removal.ll +++ b/test/CodeGen/AArch64/and-mask-removal.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -fast-isel=false -mtriple=arm64-apple-darwin < %s | FileCheck %s +; RUN: llc -mtriple=arm64-apple-darwin < %s | FileCheck %s @board = common global [400 x i8] zeroinitializer, align 1 @next_string = common global i32 0, align 4 diff --git a/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll b/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll index 
5a1eabc2ee6c..a1002989165c 100644 --- a/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll +++ b/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs < %s | FileCheck %s ; The following 2 test cases test shufflevector with beginning UNDEF mask. define <8 x i16> @test_vext_undef_traverse(<8 x i16> %in) { diff --git a/test/CodeGen/AArch64/arm64-abi.ll b/test/CodeGen/AArch64/arm64-abi.ll index 5be84b7d493b..bfc03c6b9757 100644 --- a/test/CodeGen/AArch64/arm64-abi.ll +++ b/test/CodeGen/AArch64/arm64-abi.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=arm64-apple-darwin -mcpu=cyclone -enable-misched=false < %s | FileCheck %s -; RUN: llc -O0 -mtriple=arm64-apple-darwin < %s | FileCheck --check-prefix=FAST %s +; RUN: llc -O0 -fast-isel -mtriple=arm64-apple-darwin < %s | FileCheck --check-prefix=FAST %s ; rdar://9932559 define i64 @i8i16callee(i64 %a1, i64 %a2, i64 %a3, i8 signext %a4, i16 signext %a5, i64 %a6, i64 %a7, i64 %a8, i8 signext %b1, i16 signext %b2, i8 signext %b3, i8 signext %b4) nounwind readnone noinline { diff --git a/test/CodeGen/AArch64/arm64-abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll index b844aab5628c..56a882a2a15f 100644 --- a/test/CodeGen/AArch64/arm64-abi_align.ll +++ b/test/CodeGen/AArch64/arm64-abi_align.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=arm64-apple-darwin -mcpu=cyclone -enable-misched=false -disable-fp-elim | FileCheck %s -; RUN: llc < %s -mtriple=arm64-apple-darwin -O0 -disable-fp-elim | FileCheck -check-prefix=FAST %s +; RUN: llc < %s -mtriple=arm64-apple-darwin -O0 -disable-fp-elim -fast-isel | FileCheck -check-prefix=FAST %s ; rdar://12648441 ; Generated from arm64-arguments.c with -O2. diff --git a/test/CodeGen/AArch64/arm64-elf-constpool.ll b/test/CodeGen/AArch64/arm64-elf-constpool.ll index 95d334376b76..9f7a885f0087 100644 --- a/test/CodeGen/AArch64/arm64-elf-constpool.ll +++ b/test/CodeGen/AArch64/arm64-elf-constpool.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s -; RUN: llc -mtriple=arm64-linux-gnu -O0 -o - %s | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -O0 -fast-isel -o - %s | FileCheck %s ; O0 checked for fastisel purposes. It has a separate path which ; creates a constpool entry for floating values. 
diff --git a/test/CodeGen/AArch64/arm64-elf-globals.ll b/test/CodeGen/AArch64/arm64-elf-globals.ll index 92dc8179f8ea..6cb72e2e3f4e 100644 --- a/test/CodeGen/AArch64/arm64-elf-globals.ll +++ b/test/CodeGen/AArch64/arm64-elf-globals.ll @@ -1,11 +1,11 @@ ; RUN: llc -mtriple=arm64-linux-gnu -o - %s -mcpu=cyclone | FileCheck %s -; RUN: llc -mtriple=arm64-linux-gnu -o - %s -O0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST +; RUN: llc -mtriple=arm64-linux-gnu -o - %s -O0 -fast-isel -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST ; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-PIC -; RUN: llc -mtriple=arm64-linux-gnu -O0 -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST-PIC +; RUN: llc -mtriple=arm64-linux-gnu -O0 -fast-isel -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST-PIC ; RUN: llc -mtriple=aarch64-fuchsia -code-model=kernel -o - %s -mcpu=cyclone | FileCheck %s -; RUN: llc -mtriple=aarch64-fuchsia -code-model=kernel -o - %s -O0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST +; RUN: llc -mtriple=aarch64-fuchsia -code-model=kernel -o - %s -O0 -fast-isel -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST ; RUN: llc -mtriple=aarch64-fuchsia -code-model=kernel -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-PIC -; RUN: llc -mtriple=aarch64-fuchsia -code-model=kernel -O0 -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST-PIC +; RUN: llc -mtriple=aarch64-fuchsia -code-model=kernel -O0 -fast-isel -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST-PIC @var8 = external global i8, align 1 @var16 = external global i16, align 2 diff --git a/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll index bdc24aea2144..256db180d911 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll @@ -1,5 +1,5 @@ ; This test should cause the TargetMaterializeAlloca to be invoked -; RUN: llc -O0 -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin -disable-fp-elim < %s | FileCheck %s +; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin -disable-fp-elim < %s | FileCheck %s %struct.S1Ty = type { i64 } %struct.S2Ty = type { %struct.S1Ty, %struct.S1Ty } diff --git a/test/CodeGen/AArch64/arm64-fast-isel-br.ll b/test/CodeGen/AArch64/arm64-fast-isel-br.ll index 55c9c6036ed5..87d6811f239e 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-br.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-br.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -fast-isel-abort=1 -mtriple=arm64-apple-darwin -mcpu=cyclone -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -mtriple=arm64-apple-darwin -mcpu=cyclone -verify-machineinstrs < %s | FileCheck %s define void @branch1() nounwind uwtable ssp { %x = alloca i32, align 4 diff --git a/test/CodeGen/AArch64/arm64-fast-isel-call.ll b/test/CodeGen/AArch64/arm64-fast-isel-call.ll index 59c4e38e5467..4cf23545aabc 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-call.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-call.ll @@ -1,6 +1,6 @@ -; RUN: llc -O0 -fast-isel-abort=2 -code-model=small -verify-machineinstrs -disable-fp-elim -mtriple=arm64-apple-darwin < %s | FileCheck %s -; RUN: llc -O0 -fast-isel-abort=2 -code-model=large -verify-machineinstrs -disable-fp-elim 
-mtriple=arm64-apple-darwin < %s | FileCheck %s --check-prefix=LARGE -; RUN: llc -O0 -fast-isel-abort=2 -code-model=small -verify-machineinstrs -disable-fp-elim -mtriple=aarch64_be-linux-gnu < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -O0 -fast-isel -fast-isel-abort=2 -code-model=small -verify-machineinstrs -disable-fp-elim -mtriple=arm64-apple-darwin < %s | FileCheck %s +; RUN: llc -O0 -fast-isel -fast-isel-abort=2 -code-model=large -verify-machineinstrs -disable-fp-elim -mtriple=arm64-apple-darwin < %s | FileCheck %s --check-prefix=LARGE +; RUN: llc -O0 -fast-isel -fast-isel-abort=2 -code-model=small -verify-machineinstrs -disable-fp-elim -mtriple=aarch64_be-linux-gnu < %s | FileCheck %s --check-prefix=CHECK-BE define void @call0() nounwind { entry: diff --git a/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll b/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll index 16a02de79a91..b3e649c3fc33 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -verify-machineinstrs -mtriple=arm64-eabi < %s | FileCheck --enable-var-scope %s +; RUN: llc -O0 -fast-isel -verify-machineinstrs -mtriple=arm64-eabi < %s | FileCheck --enable-var-scope %s ; Test fptosi define i32 @fptosi_wh(half %a) nounwind ssp { diff --git a/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll b/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll index 1b6886523311..7b208cceb5b2 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin -mcpu=cyclone < %s | FileCheck %s +; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin -mcpu=cyclone < %s | FileCheck %s ;; Test various conversions. define zeroext i32 @trunc_(i8 zeroext %a, i16 zeroext %b, i32 %c, i64 %d) nounwind ssp { diff --git a/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll b/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll index c77949f996c3..51ec377ccaf4 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s +; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s define zeroext i1 @fcmp_float1(float %a) { ; CHECK-LABEL: fcmp_float1 diff --git a/test/CodeGen/AArch64/arm64-fast-isel-gv.ll b/test/CodeGen/AArch64/arm64-fast-isel-gv.ll index 85d000b8606b..00e2fab81f98 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-gv.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-gv.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s +; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s ; Test load/store of global value from global offset table. 
@seed = common global i64 0, align 8 diff --git a/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll b/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll index 4bc02ebdd3e1..4288aa1df444 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s +; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s define i32 @icmp_eq_imm(i32 %a) nounwind ssp { entry: diff --git a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll index a8f30ad4777d..1af960a12a19 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -fast-isel-abort=1 -verify-machineinstrs -relocation-model=dynamic-no-pic -mtriple=arm64-apple-ios < %s | FileCheck %s --check-prefix=ARM64 +; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -verify-machineinstrs -relocation-model=dynamic-no-pic -mtriple=arm64-apple-ios < %s | FileCheck %s --check-prefix=ARM64 @message = global [80 x i8] c"The LLVM Compiler Infrastructure\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", align 16 @temp = common global [80 x i8] zeroinitializer, align 16 diff --git a/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll b/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll index b5a08c148930..234731cfa242 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s +; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s ; Materialize using fmov define float @fmov_float1() { diff --git a/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll b/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll index 81daa7c1d5ac..d9997f916955 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -verify-machineinstrs -mtriple=aarch64-apple-ios < %s | FileCheck %s +; RUN: llc -O0 -fast-isel -verify-machineinstrs -mtriple=aarch64-apple-ios < %s | FileCheck %s ; Fast-isel can't do vector conversions yet, but it was emitting some highly ; suspect UCVTFUWDri MachineInstrs. 
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-rem.ll b/test/CodeGen/AArch64/arm64-fast-isel-rem.ll index c26bfa8bcfeb..635e6b92542a 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-rem.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-rem.ll @@ -1,5 +1,5 @@ -; RUN: llc -O0 -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s -; RUN: llc %s -O0 -fast-isel-abort=1 -mtriple=arm64-apple-darwin -print-machineinstrs=expand-isel-pseudos -o /dev/null 2> %t +; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s +; RUN: llc %s -O0 -fast-isel -fast-isel-abort=1 -mtriple=arm64-apple-darwin -print-machineinstrs=expand-isel-pseudos -o /dev/null 2> %t ; RUN: FileCheck %s < %t --check-prefix=CHECK-SSA ; CHECK-SSA-LABEL: Machine code for function t1 diff --git a/test/CodeGen/AArch64/arm64-fast-isel-ret.ll b/test/CodeGen/AArch64/arm64-fast-isel-ret.ll index 1f6a60e77cc3..9a67fff00ac3 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-ret.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-ret.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s +; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s ;; Test returns. define void @t0() nounwind ssp { diff --git a/test/CodeGen/AArch64/arm64-fast-isel.ll b/test/CodeGen/AArch64/arm64-fast-isel.ll index 9f83a9c359a2..39934c4399b4 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s +; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s define void @t0(i32 %a) nounwind { entry: diff --git a/test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll b/test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll index e72c2b7989d2..98851917999b 100644 --- a/test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll +++ b/test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -mcpu=cyclone | FileCheck %s -; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -O0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -O0 -fast-isel -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST define <16 x i8> @foo(<16 x i8> %a) nounwind optsize readnone ssp { ; CHECK: uaddlv.16b h0, v0 diff --git a/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll b/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll index bb9ad46ba63d..9f77d3527d4b 100644 --- a/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll +++ b/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll @@ -1,6 +1,6 @@ -; RUN: llc -O0 -mtriple=arm64-none-linux-gnu -relocation-model=pic \ +; RUN: llc -O0 -fast-isel -mtriple=arm64-none-linux-gnu -relocation-model=pic \ ; RUN: -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=NOEMU %s -; RUN: llc -emulated-tls -O0 -mtriple=arm64-none-linux-gnu -relocation-model=pic \ +; RUN: llc -emulated-tls -O0 -fast-isel -mtriple=arm64-none-linux-gnu -relocation-model=pic \ ; RUN: -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=EMU %s ; If the .tlsdesccall and blr parts are emitted completely separately (even with diff --git a/test/CodeGen/AArch64/arm64-vcvt_f.ll 
b/test/CodeGen/AArch64/arm64-vcvt_f.ll index 254671a3c3c5..90cc2d37882c 100644 --- a/test/CodeGen/AArch64/arm64-vcvt_f.ll +++ b/test/CodeGen/AArch64/arm64-vcvt_f.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s -; RUN: llc < %s -O0 -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -O0 -fast-isel -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s define <2 x double> @test_vcvt_f64_f32(<2 x float> %x) nounwind readnone ssp { ; CHECK-LABEL: test_vcvt_f64_f32: diff --git a/test/CodeGen/AArch64/br-cond-not-merge.ll b/test/CodeGen/AArch64/br-cond-not-merge.ll index bf21ef307905..46532386783f 100644 --- a/test/CodeGen/AArch64/br-cond-not-merge.ll +++ b/test/CodeGen/AArch64/br-cond-not-merge.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=aarch64 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK --check-prefix=OPT %s -; RUN: llc -mtriple=aarch64 -verify-machineinstrs -O0 -fast-isel=0 < %s | FileCheck --check-prefix=CHECK --check-prefix=NOOPT %s +; RUN: llc -mtriple=aarch64 -verify-machineinstrs -O0 -fast-isel=0 -global-isel=false < %s | FileCheck --check-prefix=CHECK --check-prefix=NOOPT %s declare void @foo() diff --git a/test/CodeGen/AArch64/cmpxchg-O0.ll b/test/CodeGen/AArch64/cmpxchg-O0.ll index 1bfbcf851c0e..bd3d328ec119 100644 --- a/test/CodeGen/AArch64/cmpxchg-O0.ll +++ b/test/CodeGen/AArch64/cmpxchg-O0.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -O0 -fast-isel=0 %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -O0 -fast-isel=0 -global-isel=false %s -o - | FileCheck %s define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind { ; CHECK-LABEL: test_cmpxchg_8: diff --git a/test/CodeGen/AArch64/cxx-tlscc.ll b/test/CodeGen/AArch64/cxx-tlscc.ll index a36aad51ca82..d179eab7e8fa 100644 --- a/test/CodeGen/AArch64/cxx-tlscc.ll +++ b/test/CodeGen/AArch64/cxx-tlscc.ll @@ -3,7 +3,7 @@ ; Shrink wrapping currently does not kick in because we have a TLS CALL ; in the entry block and it will clobber the link register. -; RUN: llc < %s -mtriple=aarch64-apple-ios -O0 | FileCheck --check-prefix=CHECK-O0 %s +; RUN: llc < %s -mtriple=aarch64-apple-ios -O0 -fast-isel | FileCheck --check-prefix=CHECK-O0 %s %struct.S = type { i8 } diff --git a/test/CodeGen/AArch64/fast-isel-atomic.ll b/test/CodeGen/AArch64/fast-isel-atomic.ll index ec612616ae2a..452129e49515 100644 --- a/test/CodeGen/AArch64/fast-isel-atomic.ll +++ b/test/CodeGen/AArch64/fast-isel-atomic.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=aarch64-- -O0 -fast-isel -fast-isel-abort=4 -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=aarch64-- -O0 -fast-isel=0 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-- -O0 -fast-isel=0 -global-isel=false -verify-machineinstrs < %s | FileCheck %s ; Note that checking SelectionDAG output isn't strictly necessary, but they ; currently match, so we might as well check both! Feel free to remove SDAG. 
diff --git a/test/CodeGen/AArch64/fast-isel-sp-adjust.ll b/test/CodeGen/AArch64/fast-isel-sp-adjust.ll index 9201d1be6a9c..a17a2564b4fe 100644 --- a/test/CodeGen/AArch64/fast-isel-sp-adjust.ll +++ b/test/CodeGen/AArch64/fast-isel-sp-adjust.ll @@ -1,5 +1,5 @@ -; RUN: llc -O0 -mtriple=aarch64-apple-ios -o - %s | FileCheck %s -; RUN: not llc -O0 -mtriple=aarch64-apple-ios -o /dev/null -fast-isel-abort=3 %s 2> %t +; RUN: llc -O0 -fast-isel -mtriple=aarch64-apple-ios -o - %s | FileCheck %s +; RUN: not llc -O0 -mtriple=aarch64-apple-ios -o /dev/null -fast-isel -fast-isel-abort=3 %s 2> %t ; RUN: FileCheck %s --check-prefix=CHECK-ERRORS < %t ; The issue here is that FastISel cannot emit an ADDrr where one of the inputs diff --git a/test/CodeGen/AArch64/i128-fast-isel-fallback.ll b/test/CodeGen/AArch64/i128-fast-isel-fallback.ll index 1cffbf3de052..80c83bd4823e 100644 --- a/test/CodeGen/AArch64/i128-fast-isel-fallback.ll +++ b/test/CodeGen/AArch64/i128-fast-isel-fallback.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=arm64-apple-ios7.0 -mcpu=generic < %s | FileCheck %s +; RUN: llc -O0 -fast-isel -mtriple=arm64-apple-ios7.0 -mcpu=generic < %s | FileCheck %s ; Function Attrs: nounwind ssp define void @test1() { diff --git a/test/CodeGen/AArch64/ldst-paired-aliasing.ll b/test/CodeGen/AArch64/ldst-paired-aliasing.ll index 9b0b51d369a3..9c698b5fdcc6 100644 --- a/test/CodeGen/AArch64/ldst-paired-aliasing.ll +++ b/test/CodeGen/AArch64/ldst-paired-aliasing.ll @@ -10,10 +10,11 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #3 define i32 @main() local_unnamed_addr #1 { ; Make sure the stores happen in the correct order (the exact instructions could change). ; CHECK-LABEL: main: -; CHECK: str xzr, [sp, #80] +; CHECK: stp xzr, xzr, [sp, #72] ; CHECK: str w9, [sp, #80] -; CHECK: stp q0, q0, [sp, #48] +; CHECK: str q0, [sp, #48] ; CHECK: ldr w8, [sp, #48] +; CHECK: str q0, [sp, #64] for.body.lr.ph.i.i.i.i.i.i63: %b1 = alloca [10 x i32], align 16 diff --git a/test/CodeGen/AArch64/minmax-of-minmax.ll b/test/CodeGen/AArch64/minmax-of-minmax.ll new file mode 100644 index 000000000000..07466f5b92f3 --- /dev/null +++ b/test/CodeGen/AArch64/minmax-of-minmax.ll @@ -0,0 +1,1032 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s + +; There are 4 commuted variants (abbc/abcb/bcab/bcba) * +; 4 predicate variants ([*][lg][te]) * +; 4 min/max flavors (smin/smax/umin/umax) +; = 64 tests + +define <4 x i32> @smin_ab_bc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smin_ab_bc: +; CHECK: // %bb.0: +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smin v1.4s, v1.4s, v2.4s +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp slt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_bc = icmp slt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ac = icmp slt <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_bc + ret <4 x i32> %r +} + +define <4 x i32> @smin_ab_cb(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smin_ab_cb: +; CHECK: // %bb.0: +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smin v1.4s, v2.4s, v1.4s +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp slt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_cb = icmp slt <4 x i32> %c, %b + %min_cb = select <4 x i1> %cmp_cb, <4 x i32> 
%c, <4 x i32> %b + %cmp_ac = icmp slt <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_cb + ret <4 x i32> %r +} + +define <4 x i32> @smin_bc_ab(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smin_bc_ab: +; CHECK: // %bb.0: +; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp slt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ab = icmp slt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_ca = icmp slt <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ab + ret <4 x i32> %r +} + +define <4 x i32> @smin_bc_ba(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smin_bc_ba: +; CHECK: // %bb.0: +; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s +; CHECK-NEXT: smin v0.4s, v1.4s, v0.4s +; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp slt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ba = icmp slt <4 x i32> %b, %a + %min_ba = select <4 x i1> %cmp_ba, <4 x i32> %b, <4 x i32> %a + %cmp_ca = icmp slt <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ba + ret <4 x i32> %r +} + +define <4 x i32> @smin_ab_bc_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smin_ab_bc_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smin v1.4s, v1.4s, v2.4s +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp slt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_bc = icmp slt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ac = icmp sgt <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_bc + ret <4 x i32> %r +} + +define <4 x i32> @smin_ab_cb_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smin_ab_cb_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smin v1.4s, v2.4s, v1.4s +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp slt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_cb = icmp slt <4 x i32> %c, %b + %min_cb = select <4 x i1> %cmp_cb, <4 x i32> %c, <4 x i32> %b + %cmp_ac = icmp sgt <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_cb + ret <4 x i32> %r +} + +define <4 x i32> @smin_bc_ab_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smin_bc_ab_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp slt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ab = icmp slt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_ca = icmp sgt <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ab + ret <4 x i32> %r +} + +define <4 x i32> @smin_bc_ba_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smin_bc_ba_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s +; CHECK-NEXT: smin v0.4s, v1.4s, v0.4s +; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp slt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> 
%b, <4 x i32> %c + %cmp_ba = icmp slt <4 x i32> %b, %a + %min_ba = select <4 x i1> %cmp_ba, <4 x i32> %b, <4 x i32> %a + %cmp_ca = icmp sgt <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ba + ret <4 x i32> %r +} + +define <4 x i32> @smin_ab_bc_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smin_ab_bc_eq_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smin v1.4s, v1.4s, v2.4s +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp slt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_bc = icmp slt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ac = icmp sle <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_bc + ret <4 x i32> %r +} + +define <4 x i32> @smin_ab_cb_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smin_ab_cb_eq_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smin v1.4s, v2.4s, v1.4s +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp slt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_cb = icmp slt <4 x i32> %c, %b + %min_cb = select <4 x i1> %cmp_cb, <4 x i32> %c, <4 x i32> %b + %cmp_ac = icmp sle <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_cb + ret <4 x i32> %r +} + +define <4 x i32> @smin_bc_ab_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smin_bc_ab_eq_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp slt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ab = icmp slt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_ca = icmp sle <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ab + ret <4 x i32> %r +} + +define <4 x i32> @smin_bc_ba_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smin_bc_ba_eq_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s +; CHECK-NEXT: smin v0.4s, v1.4s, v0.4s +; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp slt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ba = icmp slt <4 x i32> %b, %a + %min_ba = select <4 x i1> %cmp_ba, <4 x i32> %b, <4 x i32> %a + %cmp_ca = icmp sle <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ba + ret <4 x i32> %r +} + +define <4 x i32> @smin_ab_bc_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smin_ab_bc_eq_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smin v1.4s, v1.4s, v2.4s +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp slt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_bc = icmp slt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ac = icmp sge <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_bc + ret <4 x i32> %r +} + +define <4 x i32> @smin_ab_cb_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smin_ab_cb_eq_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smin v1.4s, v2.4s, v1.4s +; 
CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp slt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_cb = icmp slt <4 x i32> %c, %b + %min_cb = select <4 x i1> %cmp_cb, <4 x i32> %c, <4 x i32> %b + %cmp_ac = icmp sge <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_cb + ret <4 x i32> %r +} + +define <4 x i32> @smin_bc_ab_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smin_bc_ab_eq_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp slt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ab = icmp slt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_ca = icmp sge <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ab + ret <4 x i32> %r +} + +define <4 x i32> @smin_bc_ba_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smin_bc_ba_eq_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s +; CHECK-NEXT: smin v0.4s, v1.4s, v0.4s +; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp slt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ba = icmp slt <4 x i32> %b, %a + %min_ba = select <4 x i1> %cmp_ba, <4 x i32> %b, <4 x i32> %a + %cmp_ca = icmp sge <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ba + ret <4 x i32> %r +} + +define <4 x i32> @smax_ab_bc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smax_ab_bc: +; CHECK: // %bb.0: +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smax v1.4s, v1.4s, v2.4s +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp sgt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_bc = icmp sgt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ac = icmp sgt <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_bc + ret <4 x i32> %r +} + +define <4 x i32> @smax_ab_cb(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smax_ab_cb: +; CHECK: // %bb.0: +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smax v1.4s, v2.4s, v1.4s +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp sgt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_cb = icmp sgt <4 x i32> %c, %b + %min_cb = select <4 x i1> %cmp_cb, <4 x i32> %c, <4 x i32> %b + %cmp_ac = icmp sgt <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_cb + ret <4 x i32> %r +} + +define <4 x i32> @smax_bc_ab(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smax_bc_ab: +; CHECK: // %bb.0: +; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp sgt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ab = icmp sgt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_ca = icmp sgt <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ab + ret <4 x i32> %r +} + +define <4 x i32> @smax_bc_ba(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smax_bc_ba: +; CHECK: // %bb.0: +; 
CHECK-NEXT: smax v2.4s, v1.4s, v2.4s +; CHECK-NEXT: smax v0.4s, v1.4s, v0.4s +; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp sgt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ba = icmp sgt <4 x i32> %b, %a + %min_ba = select <4 x i1> %cmp_ba, <4 x i32> %b, <4 x i32> %a + %cmp_ca = icmp sgt <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ba + ret <4 x i32> %r +} + +define <4 x i32> @smax_ab_bc_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smax_ab_bc_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smax v1.4s, v1.4s, v2.4s +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp sgt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_bc = icmp sgt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ac = icmp slt <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_bc + ret <4 x i32> %r +} + +define <4 x i32> @smax_ab_cb_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smax_ab_cb_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smax v1.4s, v2.4s, v1.4s +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp sgt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_cb = icmp sgt <4 x i32> %c, %b + %min_cb = select <4 x i1> %cmp_cb, <4 x i32> %c, <4 x i32> %b + %cmp_ac = icmp slt <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_cb + ret <4 x i32> %r +} + +define <4 x i32> @smax_bc_ab_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smax_bc_ab_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp sgt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ab = icmp sgt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_ca = icmp slt <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ab + ret <4 x i32> %r +} + +define <4 x i32> @smax_bc_ba_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smax_bc_ba_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s +; CHECK-NEXT: smax v0.4s, v1.4s, v0.4s +; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp sgt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ba = icmp sgt <4 x i32> %b, %a + %min_ba = select <4 x i1> %cmp_ba, <4 x i32> %b, <4 x i32> %a + %cmp_ca = icmp slt <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ba + ret <4 x i32> %r +} + +define <4 x i32> @smax_ab_bc_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smax_ab_bc_eq_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smax v1.4s, v1.4s, v2.4s +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp sgt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_bc = icmp sgt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ac = icmp sge <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_bc + ret <4 x i32> %r +} + +define 
<4 x i32> @smax_ab_cb_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smax_ab_cb_eq_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smax v1.4s, v2.4s, v1.4s +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp sgt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_cb = icmp sgt <4 x i32> %c, %b + %min_cb = select <4 x i1> %cmp_cb, <4 x i32> %c, <4 x i32> %b + %cmp_ac = icmp sge <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_cb + ret <4 x i32> %r +} + +define <4 x i32> @smax_bc_ab_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smax_bc_ab_eq_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp sgt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ab = icmp sgt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_ca = icmp sge <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ab + ret <4 x i32> %r +} + +define <4 x i32> @smax_bc_ba_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smax_bc_ba_eq_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s +; CHECK-NEXT: smax v0.4s, v1.4s, v0.4s +; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp sgt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ba = icmp sgt <4 x i32> %b, %a + %min_ba = select <4 x i1> %cmp_ba, <4 x i32> %b, <4 x i32> %a + %cmp_ca = icmp sge <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ba + ret <4 x i32> %r +} + +define <4 x i32> @smax_ab_bc_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smax_ab_bc_eq_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smax v1.4s, v1.4s, v2.4s +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp sgt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_bc = icmp sgt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ac = icmp sle <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_bc + ret <4 x i32> %r +} + +define <4 x i32> @smax_ab_cb_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smax_ab_cb_eq_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smax v1.4s, v2.4s, v1.4s +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp sgt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_cb = icmp sgt <4 x i32> %c, %b + %min_cb = select <4 x i1> %cmp_cb, <4 x i32> %c, <4 x i32> %b + %cmp_ac = icmp sle <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_cb + ret <4 x i32> %r +} + +define <4 x i32> @smax_bc_ab_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smax_bc_ab_eq_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp sgt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ab = icmp sgt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x 
i32> %b + %cmp_ca = icmp sle <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ab + ret <4 x i32> %r +} + +define <4 x i32> @smax_bc_ba_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: smax_bc_ba_eq_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s +; CHECK-NEXT: smax v0.4s, v1.4s, v0.4s +; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp sgt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ba = icmp sgt <4 x i32> %b, %a + %min_ba = select <4 x i1> %cmp_ba, <4 x i32> %b, <4 x i32> %a + %cmp_ca = icmp sle <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ba + ret <4 x i32> %r +} + +define <4 x i32> @umin_ab_bc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umin_ab_bc: +; CHECK: // %bb.0: +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umin v1.4s, v1.4s, v2.4s +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp ult <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_bc = icmp ult <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ac = icmp ult <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_bc + ret <4 x i32> %r +} + +define <4 x i32> @umin_ab_cb(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umin_ab_cb: +; CHECK: // %bb.0: +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umin v1.4s, v2.4s, v1.4s +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp ult <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_cb = icmp ult <4 x i32> %c, %b + %min_cb = select <4 x i1> %cmp_cb, <4 x i32> %c, <4 x i32> %b + %cmp_ac = icmp ult <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_cb + ret <4 x i32> %r +} + +define <4 x i32> @umin_bc_ab(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umin_bc_ab: +; CHECK: // %bb.0: +; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp ult <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ab = icmp ult <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_ca = icmp ult <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ab + ret <4 x i32> %r +} + +define <4 x i32> @umin_bc_ba(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umin_bc_ba: +; CHECK: // %bb.0: +; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s +; CHECK-NEXT: umin v0.4s, v1.4s, v0.4s +; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp ult <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ba = icmp ult <4 x i32> %b, %a + %min_ba = select <4 x i1> %cmp_ba, <4 x i32> %b, <4 x i32> %a + %cmp_ca = icmp ult <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ba + ret <4 x i32> %r +} + +define <4 x i32> @umin_ab_bc_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umin_ab_bc_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umin v1.4s, v1.4s, v2.4s +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp ult <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_bc = icmp ult <4 x 
i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ac = icmp ugt <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_bc + ret <4 x i32> %r +} + +define <4 x i32> @umin_ab_cb_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umin_ab_cb_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umin v1.4s, v2.4s, v1.4s +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp ult <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_cb = icmp ult <4 x i32> %c, %b + %min_cb = select <4 x i1> %cmp_cb, <4 x i32> %c, <4 x i32> %b + %cmp_ac = icmp ugt <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_cb + ret <4 x i32> %r +} + +define <4 x i32> @umin_bc_ab_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umin_bc_ab_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp ult <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ab = icmp ult <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_ca = icmp ugt <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ab + ret <4 x i32> %r +} + +define <4 x i32> @umin_bc_ba_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umin_bc_ba_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s +; CHECK-NEXT: umin v0.4s, v1.4s, v0.4s +; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp ult <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ba = icmp ult <4 x i32> %b, %a + %min_ba = select <4 x i1> %cmp_ba, <4 x i32> %b, <4 x i32> %a + %cmp_ca = icmp ugt <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ba + ret <4 x i32> %r +} + +define <4 x i32> @umin_ab_bc_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umin_ab_bc_eq_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umin v1.4s, v1.4s, v2.4s +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp ult <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_bc = icmp ult <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ac = icmp ule <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_bc + ret <4 x i32> %r +} + +define <4 x i32> @umin_ab_cb_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umin_ab_cb_eq_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umin v1.4s, v2.4s, v1.4s +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp ult <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_cb = icmp ult <4 x i32> %c, %b + %min_cb = select <4 x i1> %cmp_cb, <4 x i32> %c, <4 x i32> %b + %cmp_ac = icmp ule <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_cb + ret <4 x i32> %r +} + +define <4 x i32> @umin_bc_ab_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umin_bc_ab_eq_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s +; CHECK-NEXT: 
ret + %cmp_bc = icmp ult <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ab = icmp ult <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_ca = icmp ule <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ab + ret <4 x i32> %r +} + +define <4 x i32> @umin_bc_ba_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umin_bc_ba_eq_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s +; CHECK-NEXT: umin v0.4s, v1.4s, v0.4s +; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp ult <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ba = icmp ult <4 x i32> %b, %a + %min_ba = select <4 x i1> %cmp_ba, <4 x i32> %b, <4 x i32> %a + %cmp_ca = icmp ule <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ba + ret <4 x i32> %r +} + +define <4 x i32> @umin_ab_bc_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umin_ab_bc_eq_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umin v1.4s, v1.4s, v2.4s +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp ult <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_bc = icmp ult <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ac = icmp uge <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_bc + ret <4 x i32> %r +} + +define <4 x i32> @umin_ab_cb_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umin_ab_cb_eq_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umin v1.4s, v2.4s, v1.4s +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp ult <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_cb = icmp ult <4 x i32> %c, %b + %min_cb = select <4 x i1> %cmp_cb, <4 x i32> %c, <4 x i32> %b + %cmp_ac = icmp uge <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_cb + ret <4 x i32> %r +} + +define <4 x i32> @umin_bc_ab_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umin_bc_ab_eq_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s +; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp ult <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ab = icmp ult <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_ca = icmp uge <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ab + ret <4 x i32> %r +} + +define <4 x i32> @umin_bc_ba_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umin_bc_ba_eq_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s +; CHECK-NEXT: umin v0.4s, v1.4s, v0.4s +; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp ult <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ba = icmp ult <4 x i32> %b, %a + %min_ba = select <4 x i1> %cmp_ba, <4 x i32> %b, <4 x i32> %a + %cmp_ca = icmp uge <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ba + ret <4 x i32> %r +} + +define <4 x i32> @umax_ab_bc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umax_ab_bc: +; CHECK: // 
%bb.0: +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umax v1.4s, v1.4s, v2.4s +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp ugt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_bc = icmp ugt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ac = icmp ugt <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_bc + ret <4 x i32> %r +} + +define <4 x i32> @umax_ab_cb(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umax_ab_cb: +; CHECK: // %bb.0: +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umax v1.4s, v2.4s, v1.4s +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp ugt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_cb = icmp ugt <4 x i32> %c, %b + %min_cb = select <4 x i1> %cmp_cb, <4 x i32> %c, <4 x i32> %b + %cmp_ac = icmp ugt <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_cb + ret <4 x i32> %r +} + +define <4 x i32> @umax_bc_ab(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umax_bc_ab: +; CHECK: // %bb.0: +; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp ugt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ab = icmp ugt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_ca = icmp ugt <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ab + ret <4 x i32> %r +} + +define <4 x i32> @umax_bc_ba(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umax_bc_ba: +; CHECK: // %bb.0: +; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s +; CHECK-NEXT: umax v0.4s, v1.4s, v0.4s +; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp ugt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ba = icmp ugt <4 x i32> %b, %a + %min_ba = select <4 x i1> %cmp_ba, <4 x i32> %b, <4 x i32> %a + %cmp_ca = icmp ugt <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ba + ret <4 x i32> %r +} + +define <4 x i32> @umax_ab_bc_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umax_ab_bc_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umax v1.4s, v1.4s, v2.4s +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp ugt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_bc = icmp ugt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ac = icmp ult <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_bc + ret <4 x i32> %r +} + +define <4 x i32> @umax_ab_cb_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umax_ab_cb_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umax v1.4s, v2.4s, v1.4s +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp ugt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_cb = icmp ugt <4 x i32> %c, %b + %min_cb = select <4 x i1> %cmp_cb, <4 x i32> %c, <4 x i32> %b + %cmp_ac = icmp ult <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_cb + ret <4 x i32> %r +} + +define <4 x i32> @umax_bc_ab_swap_pred(<4 x i32> %a, 
<4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umax_bc_ab_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp ugt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ab = icmp ugt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_ca = icmp ult <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ab + ret <4 x i32> %r +} + +define <4 x i32> @umax_bc_ba_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umax_bc_ba_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s +; CHECK-NEXT: umax v0.4s, v1.4s, v0.4s +; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp ugt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ba = icmp ugt <4 x i32> %b, %a + %min_ba = select <4 x i1> %cmp_ba, <4 x i32> %b, <4 x i32> %a + %cmp_ca = icmp ult <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ba + ret <4 x i32> %r +} + +define <4 x i32> @umax_ab_bc_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umax_ab_bc_eq_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umax v1.4s, v1.4s, v2.4s +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp ugt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_bc = icmp ugt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ac = icmp uge <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_bc + ret <4 x i32> %r +} + +define <4 x i32> @umax_ab_cb_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umax_ab_cb_eq_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umax v1.4s, v2.4s, v1.4s +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp ugt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_cb = icmp ugt <4 x i32> %c, %b + %min_cb = select <4 x i1> %cmp_cb, <4 x i32> %c, <4 x i32> %b + %cmp_ac = icmp uge <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_cb + ret <4 x i32> %r +} + +define <4 x i32> @umax_bc_ab_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umax_bc_ab_eq_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp ugt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ab = icmp ugt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_ca = icmp uge <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ab + ret <4 x i32> %r +} + +define <4 x i32> @umax_bc_ba_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umax_bc_ba_eq_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s +; CHECK-NEXT: umax v0.4s, v1.4s, v0.4s +; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp ugt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ba = icmp ugt <4 x i32> %b, %a + %min_ba = select <4 x i1> %cmp_ba, <4 x i32> %b, <4 x i32> %a + %cmp_ca = icmp uge <4 x i32> %c, %a + %r = select <4 x 
i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ba + ret <4 x i32> %r +} + +define <4 x i32> @umax_ab_bc_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umax_ab_bc_eq_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umax v1.4s, v1.4s, v2.4s +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp ugt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_bc = icmp ugt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ac = icmp ule <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_bc + ret <4 x i32> %r +} + +define <4 x i32> @umax_ab_cb_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umax_ab_cb_eq_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umax v1.4s, v2.4s, v1.4s +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp_ab = icmp ugt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_cb = icmp ugt <4 x i32> %c, %b + %min_cb = select <4 x i1> %cmp_cb, <4 x i32> %c, <4 x i32> %b + %cmp_ac = icmp ule <4 x i32> %c, %a + %r = select <4 x i1> %cmp_ac, <4 x i32> %min_ab, <4 x i32> %min_cb + ret <4 x i32> %r +} + +define <4 x i32> @umax_bc_ab_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umax_bc_ab_eq_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp ugt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ab = icmp ugt <4 x i32> %a, %b + %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b + %cmp_ca = icmp ule <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ab + ret <4 x i32> %r +} + +define <4 x i32> @umax_bc_ba_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: umax_bc_ba_eq_swap_pred: +; CHECK: // %bb.0: +; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s +; CHECK-NEXT: umax v0.4s, v1.4s, v0.4s +; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %cmp_bc = icmp ugt <4 x i32> %b, %c + %min_bc = select <4 x i1> %cmp_bc, <4 x i32> %b, <4 x i32> %c + %cmp_ba = icmp ugt <4 x i32> %b, %a + %min_ba = select <4 x i1> %cmp_ba, <4 x i32> %b, <4 x i32> %a + %cmp_ca = icmp ule <4 x i32> %a, %c + %r = select <4 x i1> %cmp_ca, <4 x i32> %min_bc, <4 x i32> %min_ba + ret <4 x i32> %r +} + diff --git a/test/CodeGen/AArch64/preferred-alignment.ll b/test/CodeGen/AArch64/preferred-alignment.ll index c032e83d268f..b39a5e8703d7 100644 --- a/test/CodeGen/AArch64/preferred-alignment.ll +++ b/test/CodeGen/AArch64/preferred-alignment.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=aarch64 -O0 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64 -O0 -fast-isel < %s | FileCheck %s ; Function Attrs: nounwind define i32 @foo() #0 { diff --git a/test/CodeGen/AArch64/swift-return.ll b/test/CodeGen/AArch64/swift-return.ll index 15c19ce36196..b909482dc0bf 100644 --- a/test/CodeGen/AArch64/swift-return.ll +++ b/test/CodeGen/AArch64/swift-return.ll @@ -1,5 +1,5 @@ ; RUN: llc -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck %s -; RUN: llc -O0 -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-O0 +; RUN: llc -O0 -fast-isel -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-O0 ; CHECK-LABEL: 
test1 ; CHECK: bl _gen diff --git a/test/CodeGen/AArch64/swifterror.ll b/test/CodeGen/AArch64/swifterror.ll index bcad19e391d0..c5049e49dabf 100644 --- a/test/CodeGen/AArch64/swifterror.ll +++ b/test/CodeGen/AArch64/swifterror.ll @@ -1,5 +1,5 @@ ; RUN: llc -verify-machineinstrs -disable-fp-elim -enable-shrink-wrap=false < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck --check-prefix=CHECK-APPLE %s -; RUN: llc -verify-machineinstrs -disable-fp-elim -O0 < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck --check-prefix=CHECK-O0 %s +; RUN: llc -verify-machineinstrs -disable-fp-elim -O0 -fast-isel < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck --check-prefix=CHECK-O0 %s declare i8* @malloc(i64) declare void @free(i8*) diff --git a/test/CodeGen/AArch64/swiftself.ll b/test/CodeGen/AArch64/swiftself.ll index 33a49198430e..f19c852cb9b1 100644 --- a/test/CodeGen/AArch64/swiftself.ll +++ b/test/CodeGen/AArch64/swiftself.ll @@ -1,5 +1,5 @@ ; RUN: llc -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck --check-prefix=CHECK --check-prefix=OPT %s -; RUN: llc -O0 -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck %s +; RUN: llc -O0 -fast-isel -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mtriple=aarch64-unknown-linux-gnu -o - %s | FileCheck --check-prefix=CHECK --check-prefix=OPT %s ; Parameter with swiftself should be allocated to x20. diff --git a/test/CodeGen/AArch64/tailcall-fastisel.ll b/test/CodeGen/AArch64/tailcall-fastisel.ll index 3ba639183161..ea173de274ed 100644 --- a/test/CodeGen/AArch64/tailcall-fastisel.ll +++ b/test/CodeGen/AArch64/tailcall-fastisel.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=arm64-apple-darwin -O0 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-apple-darwin -O0 -fast-isel | FileCheck %s ; CHECK: b _foo0 diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 2b4aca019a39..5218c7845861 100644 --- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -146,6 +146,7 @@ entry: } ; GCN-LABEL: {{^}}extract_undef_offset_sgpr: +; undefined behavior, but shouldn't crash compiler define amdgpu_kernel void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { entry: %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in @@ -155,9 +156,7 @@ entry: } ; GCN-LABEL: {{^}}insert_undef_offset_sgpr_vector_src: -; GCN-DAG: buffer_load_dwordx4 -; MOVREL-DAG: s_mov_b32 m0, -; MOVREL: v_movreld_b32 +; undefined behavior, but shouldn't crash compiler define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { entry: %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in @@ -471,32 +470,6 @@ bb2: ; GCN-LABEL: {{^}}insert_adjacent_blocks: -; GCN: s_load_dword [[ARG:s[0-9]+]] -; GCN: s_cmp_lg_u32 -; GCN: s_cbranch_scc0 [[BB4:BB[0-9]+_[0-9]+]] - -; GCN: buffer_load_dwordx4 -; MOVREL: s_mov_b32 m0, -; MOVREL: v_movreld_b32_e32 - -; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, dst -; IDXMODE: v_mov_b32_e32 -; IDXMODE: s_set_gpr_idx_off - -; GCN: s_branch [[ENDBB:BB[0-9]+_[0-9]+]] - -; GCN: [[BB4]]: -; GCN: buffer_load_dwordx4 -; MOVREL: s_mov_b32 m0, -; MOVREL: v_movreld_b32_e32 - -; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, dst -; IDXMODE: v_mov_b32_e32 -; IDXMODE: s_set_gpr_idx_off - -; GCN: [[ENDBB]]: -; GCN: buffer_store_dword -; GCN: s_endpgm define amdgpu_kernel void 
@insert_adjacent_blocks(i32 %arg, float %val0) #0 { bb: %tmp = icmp eq i32 %arg, 0 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll index d9be4a4d0191..fac0d665bf06 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll @@ -157,12 +157,13 @@ main_body: ; Ideally, the register allocator would avoid the wait here ; +; XXX - Is this really allowed? Are the resource descriptors allowed to alias? ; GCN-LABEL: {{^}}image_store_wait: +; GCN: image_load v[5:8], v4, s[8:15] dmask:0xf unorm ; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm -; GCN: s_waitcnt expcnt(0) -; GCN: image_load v[0:3], v4, s[8:15] dmask:0xf unorm -; GCN: s_waitcnt vmcnt(0) -; GCN: image_store v[0:3], v4, s[16:23] dmask:0xf unorm +; GCN: s_waitcnt vmcnt(1) +; GCN: image_store v[5:8], v4, s[16:23] dmask:0xf unorm +; GCN-NEXT: s_endpgm define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg %arg1, <8 x i32> inreg %arg2, <4 x float> %arg3, i32 %arg4) #0 { main_body: call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %arg3, i32 %arg4, <8 x i32> %arg, i32 15, i1 false, i1 false, i1 false, i1 false) @@ -171,6 +172,21 @@ main_body: ret void } +; The same image resource is used so reordering is not OK. +; GCN-LABEL: {{^}}image_store_wait_same_resource: +; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm +; GCN: s_waitcnt expcnt(0) +; GCN: image_load v[0:3], v4, s[0:7] dmask:0xf unorm +; GCN: s_waitcnt vmcnt(0) +; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm +define amdgpu_ps void @image_store_wait_same_resource(<8 x i32> inreg %rsrc, <4 x float> %arg3, i32 %arg4) #0 { +main_body: + call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %arg3, i32 %arg4, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %arg4, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) + call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %arg4, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) + ret void +} + ; SI won't merge ds memory operations, because of the signed offset bug, so ; we only have check lines for VI. 
; VI-LABEL: image_load_mmo diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll index f6c2cb44c993..61c287a896fe 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}test1: @@ -20,6 +20,7 @@ define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <4 x float> %d0, <4 x float> ; CHECK-LABEL: {{^}}test2: ; CHECK-NOT: s_waitcnt ; CHECK: image_load +; CHECK-NEXT: v_lshlrev_b32 ; CHECK-NEXT: s_waitcnt ; CHECK: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: image_store diff --git a/test/CodeGen/Hexagon/autohvx/vext-128b.ll b/test/CodeGen/Hexagon/autohvx/vext-128b.ll index 6ddab1d55933..3a0cd06578e5 100644 --- a/test/CodeGen/Hexagon/autohvx/vext-128b.ll +++ b/test/CodeGen/Hexagon/autohvx/vext-128b.ll @@ -1,36 +1,48 @@ ; RUN: llc -march=hexagon < %s | FileCheck %s ; CHECK-LABEL: test_00: -; CHECK: v1:0.h = vsxt(v0.b) +; CHECK-DAG: v[[H00:[0-9]+]]:[[L00:[0-9]+]].h = vsxt(v0.b) +; CHECK-DAG: r[[R00:[0-9]+]] = #-2 +; CHECK: v1:0 = vshuff(v[[H00]],v[[L00]],r[[R00]]) define <128 x i16> @test_00(<128 x i8> %v0) #0 { %p = sext <128 x i8> %v0 to <128 x i16> ret <128 x i16> %p } ; CHECK-LABEL: test_01: -; CHECK: v1:0.w = vsxt(v0.h) +; CHECK-DAG: v[[H10:[0-9]+]]:[[L10:[0-9]+]].w = vsxt(v0.h) +; CHECK-DAG: r[[R10:[0-9]+]] = #-4 +; CHECK: v1:0 = vshuff(v[[H10]],v[[L10]],r[[R10]]) define <64 x i32> @test_01(<64 x i16> %v0) #0 { %p = sext <64 x i16> %v0 to <64 x i32> ret <64 x i32> %p } ; CHECK-LABEL: test_02: -; CHECK: v1:0.uh = vzxt(v0.ub) +; CHECK-DAG: v[[H20:[0-9]+]]:[[L20:[0-9]+]].uh = vzxt(v0.ub) +; CHECK-DAG: r[[R20:[0-9]+]] = #-2 +; CHECK: v1:0 = vshuff(v[[H20]],v[[L20]],r[[R20]]) define <128 x i16> @test_02(<128 x i8> %v0) #0 { %p = zext <128 x i8> %v0 to <128 x i16> ret <128 x i16> %p } ; CHECK-LABEL: test_03: -; CHECK: v1:0.uw = vzxt(v0.uh) +; CHECK-DAG: v[[H30:[0-9]+]]:[[L30:[0-9]+]].uw = vzxt(v0.uh) +; CHECK-DAG: r[[R30:[0-9]+]] = #-4 +; CHECK: v1:0 = vshuff(v[[H30]],v[[L30]],r[[R30]]) define <64 x i32> @test_03(<64 x i16> %v0) #0 { %p = zext <64 x i16> %v0 to <64 x i32> ret <64 x i32> %p } ; CHECK-LABEL: test_04: -; CHECK: v[[H40:[0-9]+]]:[[L40:[0-9]+]].h = vsxt(v0.b) -; CHECK: v1:0.w = vsxt(v[[L40]].h) +; CHECK-DAG: v[[H40:[0-9]+]]:[[L40:[0-9]+]].h = vsxt(v0.b) +; CHECK-DAG: r[[R40:[0-9]+]] = #-2 +; CHECK-DAG: r[[R41:[0-9]+]] = #-4 +; CHECK: v[[H41:[0-9]+]]:[[L41:[0-9]+]] = vshuff(v[[H40]],v[[L40]],r[[R40]]) +; CHECK: v[[H42:[0-9]+]]:[[L42:[0-9]+]].w = vsxt(v[[L41]].h) +; CHECK: v1:0 = vshuff(v[[H42]],v[[L42]],r[[R41]]) define <32 x i32> @test_04(<128 x i8> %v0) #0 { %x = sext <128 x i8> %v0 to <128 x i32> %p = shufflevector <128 x i32> %x, <128 x i32> undef, <32 x i32> @@ -38,8 +50,12 @@ define <32 x i32> @test_04(<128 x i8> %v0) #0 { } ; CHECK-LABEL: test_05: -; CHECK: v[[H40:[0-9]+]]:[[L40:[0-9]+]].uh = vzxt(v0.ub) -; CHECK: v1:0.uw = vzxt(v[[L40]].uh) +; CHECK-DAG: v[[H50:[0-9]+]]:[[L50:[0-9]+]].uh = vzxt(v0.ub) +; CHECK-DAG: r[[R50:[0-9]+]] = #-2 +; CHECK-DAG: r[[R51:[0-9]+]] = #-4 +; CHECK: v[[H51:[0-9]+]]:[[L51:[0-9]+]] = vshuff(v[[H50]],v[[L50]],r[[R50]]) +; CHECK: v[[H52:[0-9]+]]:[[L52:[0-9]+]].uw = vzxt(v[[L51]].uh) +; CHECK: v1:0 = vshuff(v[[H52]],v[[L52]],r[[R51]]) define <32 x i32> @test_05(<128 x i8> %v0) #0 { %x = zext <128 x i8> 
%v0 to <128 x i32> %p = shufflevector <128 x i32> %x, <128 x i32> undef, <32 x i32> diff --git a/test/CodeGen/Hexagon/autohvx/vext-64b.ll b/test/CodeGen/Hexagon/autohvx/vext-64b.ll index a3df0edc28e4..ff246aebde3c 100644 --- a/test/CodeGen/Hexagon/autohvx/vext-64b.ll +++ b/test/CodeGen/Hexagon/autohvx/vext-64b.ll @@ -1,36 +1,48 @@ ; RUN: llc -march=hexagon < %s | FileCheck %s ; CHECK-LABEL: test_00: -; CHECK: v1:0.h = vsxt(v0.b) +; CHECK-DAG: v[[H00:[0-9]+]]:[[L00:[0-9]+]].h = vsxt(v0.b) +; CHECK-DAG: r[[R00:[0-9]+]] = #-2 +; CHECK: v1:0 = vshuff(v[[H00]],v[[L00]],r[[R00]]) define <64 x i16> @test_00(<64 x i8> %v0) #0 { %p = sext <64 x i8> %v0 to <64 x i16> ret <64 x i16> %p } ; CHECK-LABEL: test_01: -; CHECK: v1:0.w = vsxt(v0.h) +; CHECK-DAG: v[[H10:[0-9]+]]:[[L10:[0-9]+]].w = vsxt(v0.h) +; CHECK-DAG: r[[R10:[0-9]+]] = #-4 +; CHECK: v1:0 = vshuff(v[[H10]],v[[L10]],r[[R10]]) define <32 x i32> @test_01(<32 x i16> %v0) #0 { %p = sext <32 x i16> %v0 to <32 x i32> ret <32 x i32> %p } ; CHECK-LABEL: test_02: -; CHECK: v1:0.uh = vzxt(v0.ub) +; CHECK-DAG: v[[H20:[0-9]+]]:[[L20:[0-9]+]].uh = vzxt(v0.ub) +; CHECK-DAG: r[[R20:[0-9]+]] = #-2 +; CHECK: v1:0 = vshuff(v[[H20]],v[[L20]],r[[R20]]) define <64 x i16> @test_02(<64 x i8> %v0) #0 { %p = zext <64 x i8> %v0 to <64 x i16> ret <64 x i16> %p } ; CHECK-LABEL: test_03: -; CHECK: v1:0.uw = vzxt(v0.uh) +; CHECK-DAG: v[[H30:[0-9]+]]:[[L30:[0-9]+]].uw = vzxt(v0.uh) +; CHECK-DAG: r[[R30:[0-9]+]] = #-4 +; CHECK: v1:0 = vshuff(v[[H30]],v[[L30]],r[[R30]]) define <32 x i32> @test_03(<32 x i16> %v0) #0 { %p = zext <32 x i16> %v0 to <32 x i32> ret <32 x i32> %p } ; CHECK-LABEL: test_04: -; CHECK: v[[H40:[0-9]+]]:[[L40:[0-9]+]].h = vsxt(v0.b) -; CHECK: v1:0.w = vsxt(v[[L40]].h) +; CHECK-DAG: v[[H40:[0-9]+]]:[[L40:[0-9]+]].h = vsxt(v0.b) +; CHECK-DAG: r[[R40:[0-9]+]] = #-2 +; CHECK-DAG: r[[R41:[0-9]+]] = #-4 +; CHECK: v[[H41:[0-9]+]]:[[L41:[0-9]+]] = vshuff(v[[H40]],v[[L40]],r[[R40]]) +; CHECK: v[[H42:[0-9]+]]:[[L42:[0-9]+]].w = vsxt(v[[L41]].h) +; CHECK: v1:0 = vshuff(v[[H42]],v[[L42]],r[[R41]]) define <16 x i32> @test_04(<64 x i8> %v0) #0 { %x = sext <64 x i8> %v0 to <64 x i32> %p = shufflevector <64 x i32> %x, <64 x i32> undef, <16 x i32> @@ -38,8 +50,12 @@ define <16 x i32> @test_04(<64 x i8> %v0) #0 { } ; CHECK-LABEL: test_05: -; CHECK: v[[H40:[0-9]+]]:[[L40:[0-9]+]].uh = vzxt(v0.ub) -; CHECK: v1:0.uw = vzxt(v[[L40]].uh) +; CHECK-DAG: v[[H50:[0-9]+]]:[[L50:[0-9]+]].uh = vzxt(v0.ub) +; CHECK-DAG: r[[R50:[0-9]+]] = #-2 +; CHECK-DAG: r[[R51:[0-9]+]] = #-4 +; CHECK: v[[H51:[0-9]+]]:[[L51:[0-9]+]] = vshuff(v[[H50]],v[[L50]],r[[R50]]) +; CHECK: v[[H52:[0-9]+]]:[[L52:[0-9]+]].uw = vzxt(v[[L51]].uh) +; CHECK: v1:0 = vshuff(v[[H52]],v[[L52]],r[[R51]]) define <16 x i32> @test_05(<64 x i8> %v0) #0 { %x = zext <64 x i8> %v0 to <64 x i32> %p = shufflevector <64 x i32> %x, <64 x i32> undef, <16 x i32> diff --git a/test/CodeGen/Mips/constraint-c-err.ll b/test/CodeGen/Mips/constraint-c-err.ll new file mode 100644 index 000000000000..4015ef480653 --- /dev/null +++ b/test/CodeGen/Mips/constraint-c-err.ll @@ -0,0 +1,17 @@ +; Check that invalid type for constraint `c` causes an error message. 
+; RUN: not llc -march=mips -target-abi o32 < %s 2>&1 | FileCheck %s + +define i32 @main() #0 { +entry: + %jmp = alloca float, align 4 + store float 0x4200000000000000, float* %jmp, align 4 + %0 = load float, float* %jmp, align 4 + call void asm sideeffect "jr $0", "c,~{$1}"(float %0) #1 + +; CHECK: error: couldn't allocate input reg for constraint 'c' + + ret i32 0 +} + +attributes #0 = { noinline nounwind } +attributes #1 = { nounwind } diff --git a/test/CodeGen/Mips/constraint-c.ll b/test/CodeGen/Mips/constraint-c.ll new file mode 100644 index 000000000000..5a5d7672e956 --- /dev/null +++ b/test/CodeGen/Mips/constraint-c.ll @@ -0,0 +1,18 @@ +; Check handling of the constraint `c`. +; RUN: llc -march=mips -target-abi o32 < %s | FileCheck %s + +define i32 @main() #0 { +entry: + %jmp = alloca i32, align 4 + store i32 0, i32* %jmp, align 4 + %0 = load i32, i32* %jmp, align 4 + call void asm sideeffect "jr $0", "c,~{$1}"(i32 %0) #1 + +; CHECK: addiu $25, $zero, 0 +; CHECK: jr $25 + + ret i32 0 +} + +attributes #0 = { noinline nounwind } +attributes #1 = { nounwind } diff --git a/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir b/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir new file mode 100644 index 000000000000..31aa1219a235 --- /dev/null +++ b/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir @@ -0,0 +1,1329 @@ +# RUN: llc -run-pass ppc-mi-peepholes -ppc-convert-rr-to-ri %s -o - | FileCheck %s +# RUN: llc -start-after ppc-mi-peepholes -ppc-late-peephole %s -o - | FileCheck %s --check-prefix=CHECK-LATE + +--- | + ; ModuleID = 'convert-rr-to-ri-instrs.ll' + source_filename = "convert-rr-to-ri-instrs.c" + target datalayout = "e-m:e-i64:64-n32:64" + target triple = "powerpc64le-unknown-linux-gnu" + + ; Function Attrs: norecurse nounwind readnone + define zeroext i32 @testRLWNM(i32 zeroext %a) local_unnamed_addr #0 { + entry: + %shl = shl i32 %a, 4 + %and = and i32 %shl, 4080 + ret i32 %and + } + + ; Function Attrs: norecurse nounwind readnone + define i64 @testRLWNM8(i64 %a) local_unnamed_addr #0 { + entry: + %shl = shl i64 %a, 4 + %and = and i64 %shl, 4080 + ret i64 %and + } + + ; Function Attrs: norecurse nounwind readnone + define zeroext i32 @testRLWNMo(i32 zeroext %a, i32 zeroext %b) local_unnamed_addr #0 { + entry: + %and = and i32 %a, 255 + %tobool = icmp eq i32 %and, 0 + %cond = select i1 %tobool, i32 %b, i32 %a + ret i32 %cond + } + + ; Function Attrs: norecurse nounwind readnone + define i64 @testRLWNM8o(i64 %a, i64 %b) local_unnamed_addr #0 { + entry: + %a.tr = trunc i64 %a to i32 + %0 = shl i32 %a.tr, 4 + %conv = and i32 %0, 4080 + %tobool = icmp eq i32 %conv, 0 + %conv1 = zext i32 %conv to i64 + %cond = select i1 %tobool, i64 %b, i64 %conv1 + ret i64 %cond + } + + ; Function Attrs: norecurse nounwind readnone + define zeroext i32 @testSLW(i32 zeroext %a, i32 zeroext %b) local_unnamed_addr #0 { + entry: + %shl = shl i32 %a, %b + ret i32 %shl + } + + ; Function Attrs: norecurse nounwind readnone + define zeroext i32 @testSLWo(i32 zeroext %a, i32 zeroext %b) local_unnamed_addr #0 { + entry: + %shl = shl i32 %a, %b + %tobool = icmp eq i32 %shl, 0 + %cond = select i1 %tobool, i32 %b, i32 %a + ret i32 %cond + } + + ; Function Attrs: norecurse nounwind readnone + define zeroext i32 @testSRW(i32 zeroext %a, i32 zeroext %b) local_unnamed_addr #0 { + entry: + %shr = lshr i32 %a, %b + ret i32 %shr + } + + ; Function Attrs: norecurse nounwind readnone + define zeroext i32 @testSRWo(i32 zeroext %a, i32 zeroext %b) local_unnamed_addr #0 { + entry: + %shr 
= lshr i32 %a, %b + %tobool = icmp eq i32 %shr, 0 + %cond = select i1 %tobool, i32 %b, i32 %a + ret i32 %cond + } + + ; Function Attrs: norecurse nounwind readnone + define signext i32 @testSRAW(i32 signext %a, i32 signext %b) local_unnamed_addr #0 { + entry: + %shr = ashr i32 %a, %b + ret i32 %shr + } + + ; Function Attrs: norecurse nounwind readnone + define signext i32 @testSRAWo(i32 signext %a, i32 signext %b) local_unnamed_addr #0 { + entry: + %shr = ashr i32 %a, %b + %tobool = icmp eq i32 %shr, 0 + %cond = select i1 %tobool, i32 %b, i32 %shr + ret i32 %cond + } + + ; Function Attrs: norecurse nounwind readnone + define i64 @testRLDCL(i64 %a, i64 %b) local_unnamed_addr #0 { + entry: + %and = and i64 %b, 63 + %shl = shl i64 %a, %and + %sub = sub nsw i64 64, %and + %shr = lshr i64 %a, %sub + %or = or i64 %shr, %shl + ret i64 %or + } + + ; Function Attrs: norecurse nounwind readnone + define i64 @testRLDCLo(i64 %a, i64 %b) local_unnamed_addr #0 { + entry: + %and = and i64 %b, 63 + %shl = shl i64 %a, %and + %sub = sub nsw i64 64, %and + %shr = lshr i64 %a, %sub + %or = or i64 %shr, %shl + %tobool = icmp eq i64 %or, 0 + %cond = select i1 %tobool, i64 %and, i64 %a + ret i64 %cond + } + + ; Function Attrs: norecurse nounwind readnone + define i64 @testRLDCR(i64 %a, i64 %b) local_unnamed_addr #0 { + entry: + %and = and i64 %b, 63 + %shl = shl i64 %a, %and + %sub = sub nsw i64 64, %and + %shr = lshr i64 %a, %sub + %or = or i64 %shr, %shl + ret i64 %or + } + + ; Function Attrs: norecurse nounwind readnone + define i64 @testRLDCRo(i64 %a, i64 %b) local_unnamed_addr #0 { + entry: + %and = and i64 %b, 63 + %shl = shl i64 %a, %and + %sub = sub nsw i64 64, %and + %shr = lshr i64 %a, %sub + %or = or i64 %shr, %shl + %tobool = icmp eq i64 %or, 0 + %cond = select i1 %tobool, i64 %and, i64 %a + ret i64 %cond + } + + define i64 @testSLD(i64 %a, i64 %b) local_unnamed_addr #0 { + entry: + %shl = shl i64 %a, %b + ret i64 %shl + } + + ; Function Attrs: norecurse nounwind readnone + define i64 @testSLDo(i64 %a, i64 %b) local_unnamed_addr #0 { + entry: + %shl = shl i64 %a, %b + %tobool = icmp eq i64 %shl, 0 + %cond = select i1 %tobool, i64 %b, i64 %a + ret i64 %cond + } + + ; Function Attrs: norecurse nounwind readnone + define i64 @testSRD(i64 %a, i64 %b) local_unnamed_addr #0 { + entry: + %shr = lshr i64 %a, %b + ret i64 %shr + } + + ; Function Attrs: norecurse nounwind readnone + define i64 @testSRDo(i64 %a, i64 %b) local_unnamed_addr #0 { + entry: + %shr = lshr i64 %a, %b + %tobool = icmp eq i64 %shr, 0 + %cond = select i1 %tobool, i64 %b, i64 %a + ret i64 %cond + } + + ; Function Attrs: norecurse nounwind readnone + define i64 @testSRAD(i64 %a, i64 %b) local_unnamed_addr #0 { + entry: + %shr = ashr i64 %a, %b + ret i64 %shr + } + + ; Function Attrs: norecurse nounwind readnone + define i64 @testSRADo(i64 %a, i64 %b) local_unnamed_addr #0 { + entry: + %shr = ashr i64 %a, %b + %tobool = icmp eq i64 %shr, 0 + %cond = select i1 %tobool, i64 %b, i64 %shr + ret i64 %cond + } + + attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + + 
!llvm.module.flags = !{!0, !1} + !llvm.ident = !{!2} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{i32 7, !"PIC Level", i32 2} + !2 = !{!"clang version 6.0.0 (trunk 316067)"} + !3 = !{!4, !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = !{!7, !7, i64 0} + !7 = !{!"short", !4, i64 0} + !8 = !{!9, !9, i64 0} + !9 = !{!"int", !4, i64 0} + !10 = !{!11, !11, i64 0} + !11 = !{!"long long", !4, i64 0} + !12 = !{!13, !13, i64 0} + !13 = !{!"double", !4, i64 0} + !14 = !{!15, !15, i64 0} + !15 = !{!"float", !4, i64 0} + +... +--- +name: testRLWNM +# CHECK-ALL: name: testRLWNM +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: gprc, preferred-register: '' } + - { id: 2, class: gprc, preferred-register: '' } + - { id: 3, class: g8rc, preferred-register: '' } + - { id: 4, class: gprc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3 + + %0 = COPY %x3 + %1 = COPY %0.sub_32 + %3 = IMPLICIT_DEF + %2 = LI 170 + %4 = RLWNM killed %1, %2, 20, 27 + ; CHECK: RLWINM killed %1, 10, 20, 27 + ; CHECK-LATE: rlwinm 3, 3, 10, 20, 27 + %x3 = EXTSW_32_64 %4 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... +--- +name: testRLWNM8 +# CHECK-ALL: name: testRLWNM8 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: g8rc, preferred-register: '' } + - { id: 2, class: g8rc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3 + + %0 = LI8 234 + %1 = COPY %x3 + %2 = RLWNM8 %1, %0, 20, 27 + ; CHECK: RLWINM8 %1, 10, 20, 27 + ; CHECK-LATE: rlwinm 3, 3, 10, 20, 27 + %x3 = COPY %2 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... 
+--- +name: testRLWNMo +# CHECK-ALL: name: testRLWNMo +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: g8rc, preferred-register: '' } + - { id: 2, class: gprc_and_gprc_nor0, preferred-register: '' } + - { id: 3, class: gprc_and_gprc_nor0, preferred-register: '' } + - { id: 4, class: gprc, preferred-register: '' } + - { id: 5, class: crrc, preferred-register: '' } + - { id: 6, class: gprc, preferred-register: '' } + - { id: 7, class: g8rc, preferred-register: '' } + - { id: 8, class: g8rc, preferred-register: '' } + - { id: 9, class: g8rc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } + - { reg: '%x4', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3, %x4 + + %1 = COPY %x4 + %0 = COPY %x3 + %2 = COPY %1.sub_32 + %3 = LI -22 + %4 = RLWNMo %2, %3, 24, 31, implicit-def %cr0 + ; CHECK: RLWINMo %2, 10, 24, 31, implicit-def %cr0 + ; CHECK-LATE: li 3, -22 + ; CHECK-LATE: rlwinm. 5, 4, 10, 24, 31 + %5 = COPY killed %cr0 + %6 = ISEL %2, %3, %5.sub_eq + %8 = IMPLICIT_DEF + %7 = INSERT_SUBREG %8, killed %6, 1 + %9 = RLDICL killed %7, 0, 32 + %x3 = COPY %9 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... +--- +name: testRLWNM8o +# CHECK-ALL: name: testRLWNM8o +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 2, class: g8rc, preferred-register: '' } + - { id: 3, class: g8rc, preferred-register: '' } + - { id: 4, class: g8rc, preferred-register: '' } + - { id: 5, class: g8rc, preferred-register: '' } + - { id: 6, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 7, class: crrc, preferred-register: '' } + - { id: 8, class: g8rc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } + - { reg: '%x4', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3, %x4 + + %1 = COPY %x4 + %0 = COPY %x3 + %2 = LI8 -18 + %3 = RLWNM8o %1, %2, 20, 27, implicit-def %cr0 + ; CHECK: RLWINM8o %1, 14, 20, 27, implicit-def %cr0 + ; CHECK-LATE: rlwinm. 3, 4, 14, 20, 27 + %7 = COPY killed %cr0 + %6 = RLDICL killed %3, 0, 32 + %8 = ISEL8 %1, %6, %7.sub_eq + %x3 = COPY %8 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... 
+--- +name: testSLW +# CHECK-ALL: name: testSLW +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: g8rc, preferred-register: '' } + - { id: 2, class: gprc, preferred-register: '' } + - { id: 3, class: g8rc, preferred-register: '' } + - { id: 4, class: g8rc, preferred-register: '' } + - { id: 5, class: gprc, preferred-register: '' } + - { id: 6, class: g8rc, preferred-register: '' } + - { id: 7, class: g8rc, preferred-register: '' } + - { id: 8, class: gprc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } + - { reg: '%x4', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3, %x4 + + %1 = COPY %x4 + %0 = COPY %x3 + %2 = COPY %1.sub_32 + %5 = LI 210 + %8 = SLW killed %2, killed %5 + ; CHECK: RLWINM killed %2, 18, 0, 13 + ; CHECK-LATE: slwi 3, 4, 18 + %x3 = EXTSW_32_64 %8 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... +--- +name: testSLWo +# CHECK-ALL: name: testSLWo +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: g8rc, preferred-register: '' } + - { id: 2, class: gprc_and_gprc_nor0, preferred-register: '' } + - { id: 3, class: gprc_and_gprc_nor0, preferred-register: '' } + - { id: 4, class: gprc, preferred-register: '' } + - { id: 5, class: crrc, preferred-register: '' } + - { id: 6, class: gprc, preferred-register: '' } + - { id: 7, class: g8rc, preferred-register: '' } + - { id: 8, class: g8rc, preferred-register: '' } + - { id: 9, class: g8rc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } + - { reg: '%x4', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3, %x4 + + %1 = COPY %x4 + %0 = COPY %x3 + %2 = LI 35 + %3 = COPY %0.sub_32 + %4 = SLWo %3, %2, implicit-def %cr0 + ; CHECK: ANDIo %3, 0, implicit-def %cr0 + ; CHECK-LATE: andi. 5, 3, 0 + %5 = COPY killed %cr0 + %6 = ISEL %2, %3, %5.sub_eq + %8 = IMPLICIT_DEF + %7 = INSERT_SUBREG %8, killed %6, 1 + %9 = RLDICL killed %7, 0, 32 + %x3 = COPY %9 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... 
+--- +name: testSRW +# CHECK-ALL: name: testSRW +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: g8rc, preferred-register: '' } + - { id: 2, class: gprc, preferred-register: '' } + - { id: 3, class: g8rc, preferred-register: '' } + - { id: 4, class: g8rc, preferred-register: '' } + - { id: 5, class: gprc, preferred-register: '' } + - { id: 6, class: g8rc, preferred-register: '' } + - { id: 7, class: g8rc, preferred-register: '' } + - { id: 8, class: gprc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } + - { reg: '%x4', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3, %x4 + + %1 = COPY %x4 + %0 = COPY %x3 + %2 = LI 48 + %5 = COPY %0.sub_32 + %8 = SRW killed %5, killed %2 + ; CHECK: LI 0 + ; CHECK-LATE: li 3, 0 + %x3 = EXTSW_32_64 %8 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... +--- +name: testSRWo +# CHECK-ALL: name: testSRWo +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: g8rc, preferred-register: '' } + - { id: 2, class: gprc_and_gprc_nor0, preferred-register: '' } + - { id: 3, class: gprc_and_gprc_nor0, preferred-register: '' } + - { id: 4, class: gprc, preferred-register: '' } + - { id: 5, class: crrc, preferred-register: '' } + - { id: 6, class: gprc, preferred-register: '' } + - { id: 7, class: g8rc, preferred-register: '' } + - { id: 8, class: g8rc, preferred-register: '' } + - { id: 9, class: g8rc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } + - { reg: '%x4', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3, %x4 + + %1 = COPY %x4 + %0 = COPY %x3 + %2 = LI -7 + %3 = COPY %0.sub_32 + %4 = SRWo %3, %2, implicit-def %cr0 + ; CHECK: ANDIo %3, 0, implicit-def %cr0 + ; CHECK-LATE: andi. 5, 3, 0 + %5 = COPY killed %cr0 + %6 = ISEL %2, %3, %5.sub_eq + %8 = IMPLICIT_DEF + %7 = INSERT_SUBREG %8, killed %6, 1 + %9 = RLDICL killed %7, 0, 32 + %x3 = COPY %9 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... 
+--- +name: testSRAW +# CHECK-ALL: name: testSRAW +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: g8rc, preferred-register: '' } + - { id: 2, class: gprc, preferred-register: '' } + - { id: 3, class: gprc, preferred-register: '' } + - { id: 4, class: gprc, preferred-register: '' } + - { id: 5, class: g8rc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } + - { reg: '%x4', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3, %x4 + + %1 = COPY %x4 + %0 = COPY %x3 + %2 = LI 48 + %3 = COPY %0.sub_32 + %4 = SRAW killed %3, killed %2, implicit-def dead %carry + ; CHECK: LI 48 + ; CHECK: SRAW killed %3, killed %2, implicit-def dead %carry + ; CHECK-LATE: sraw 3, 3, 4 + %5 = EXTSW_32_64 killed %4 + %x3 = COPY %5 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... +--- +name: testSRAWo +# CHECK-ALL: name: testSRAWo +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: g8rc, preferred-register: '' } + - { id: 2, class: gprc_and_gprc_nor0, preferred-register: '' } + - { id: 3, class: gprc, preferred-register: '' } + - { id: 4, class: gprc_and_gprc_nor0, preferred-register: '' } + - { id: 5, class: crrc, preferred-register: '' } + - { id: 6, class: gprc, preferred-register: '' } + - { id: 7, class: g8rc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } + - { reg: '%x4', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3, %x4 + + %1 = COPY %x4 + %0 = COPY %x3 + %2 = LI 80 + %3 = COPY %0.sub_32 + %4 = SRAWo killed %3, %2, implicit-def dead %carry, implicit-def %cr0 + ; CHECK: SRAWo killed %3, %2, implicit-def dead %carry, implicit-def %cr0 + ; CHECK-LATE: sraw. 3, 3, 4 + %5 = COPY killed %cr0 + %6 = ISEL %2, %4, %5.sub_eq + %7 = EXTSW_32_64 killed %6 + %x3 = COPY %7 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... 
+--- +name: testRLDCL +# CHECK-ALL: name: testRLDCL +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: g8rc, preferred-register: '' } + - { id: 2, class: gprc, preferred-register: '' } + - { id: 3, class: gprc, preferred-register: '' } + - { id: 4, class: g8rc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } + - { reg: '%x4', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3, %x4 + + %1 = COPY %x4 + %0 = COPY %x3 + %2 = COPY %1.sub_32 + %3 = LI 140 + %4 = RLDCL %0, killed %3, 0 + ; CHECK: RLDICL %0, 12, 0 + ; CHECK-LATE: rotldi 3, 3, 12 + %x3 = COPY %4 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... +--- +name: testRLDCLo +# CHECK-ALL: name: testRLDCLo +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 1, class: g8rc, preferred-register: '' } + - { id: 2, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 3, class: gprc, preferred-register: '' } + - { id: 4, class: g8rc, preferred-register: '' } + - { id: 5, class: crrc, preferred-register: '' } + - { id: 6, class: g8rc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } + - { reg: '%x4', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3, %x4 + + %1 = COPY %x4 + %0 = COPY %x3 + %2 = RLDICL %1, 0, 58 + %3 = LI -37 + %4 = RLDCLo %0, killed %3, 0, implicit-def %cr0 + ; CHECK: RLDICLo %0, 27, 0, implicit-def %cr0 + ; CHECK-LATE: rldicl. 5, 3, 27, 0 + %5 = COPY killed %cr0 + %6 = ISEL8 %2, %0, %5.sub_eq + %x3 = COPY %6 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... 
+--- +name: testRLDCR +# CHECK-ALL: name: testRLDCR +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: g8rc, preferred-register: '' } + - { id: 2, class: gprc, preferred-register: '' } + - { id: 3, class: gprc, preferred-register: '' } + - { id: 4, class: g8rc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } + - { reg: '%x4', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3, %x4 + + %1 = COPY %x4 + %0 = COPY %x3 + %2 = COPY %1.sub_32 + %3 = LI 300 + %4 = RLDCR %0, killed %3, 0 + ; CHECK: RLDICR %0, 44, 0 + ; CHECK-LATE: rldicr 3, 3, 44, 0 + %x3 = COPY %4 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... +--- +name: testRLDCRo +# CHECK-ALL: name: testRLDCRo +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 1, class: g8rc, preferred-register: '' } + - { id: 2, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 3, class: gprc, preferred-register: '' } + - { id: 4, class: g8rc, preferred-register: '' } + - { id: 5, class: crrc, preferred-register: '' } + - { id: 6, class: g8rc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } + - { reg: '%x4', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3, %x4 + + %1 = COPY %x4 + %0 = COPY %x3 + %2 = RLDICL %1, 0, 58 + %3 = LI -18 + %4 = RLDCRo %0, killed %3, 0, implicit-def %cr0 + ; CHECK: RLDICRo %0, 46, 0, implicit-def %cr0 + ; CHECK-LATE: rldicr. 5, 3, 46, 0 + %5 = COPY killed %cr0 + %6 = ISEL8 %2, %0, %5.sub_eq + %x3 = COPY %6 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... 
+--- +name: testSLD +# CHECK-ALL: name: testSLD +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: g8rc, preferred-register: '' } + - { id: 2, class: gprc, preferred-register: '' } + - { id: 3, class: g8rc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } + - { reg: '%x4', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3, %x4 + + %1 = COPY %x4 + %0 = COPY %x3 + %2 = LI -13 + %3 = SLD %0, killed %2 + ; CHECK: LI8 0 + ; CHECK-LATE: li 3, 0 + %x3 = COPY %3 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... +--- +name: testSLDo +# CHECK-ALL: name: testSLDo +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 1, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 2, class: gprc, preferred-register: '' } + - { id: 3, class: g8rc, preferred-register: '' } + - { id: 4, class: crrc, preferred-register: '' } + - { id: 5, class: g8rc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } + - { reg: '%x4', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3, %x4 + + %1 = COPY %x4 + %0 = COPY %x3 + %2 = LI 88 + %3 = SLDo %0, killed %2, implicit-def %cr0 + ; CHECK: ANDIo8 %0, 0, implicit-def %cr0 + ; CHECK-LATE: andi. 5, 3, 0 + %4 = COPY killed %cr0 + %5 = ISEL8 %1, %0, %4.sub_eq + %x3 = COPY %5 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... 
+--- +name: testSRD +# CHECK-ALL: name: testSRD +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: g8rc, preferred-register: '' } + - { id: 2, class: gprc, preferred-register: '' } + - { id: 3, class: g8rc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } + - { reg: '%x4', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3, %x4 + + %1 = COPY %x4 + %0 = COPY %x3 + %2 = LI 400 + %3 = SRD %0, killed %2 + ; CHECK: RLDICL %0, 48, 16 + ; CHECK-LATE: rldicl 3, 3, 48, 16 + %x3 = COPY %3 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... +--- +name: testSRDo +# CHECK-ALL: name: testSRDo +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 1, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 2, class: gprc, preferred-register: '' } + - { id: 3, class: g8rc, preferred-register: '' } + - { id: 4, class: crrc, preferred-register: '' } + - { id: 5, class: g8rc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } + - { reg: '%x4', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3, %x4 + + %1 = COPY %x4 + %0 = COPY %x3 + %2 = LI 64 + %3 = SRDo %0, killed %2, implicit-def %cr0 + ; CHECK: ANDIo8 %0, 0, implicit-def %cr0 + ; CHECK-LATE: andi. 5, 3, 0 + %4 = COPY killed %cr0 + %5 = ISEL8 %1, %0, %4.sub_eq + %x3 = COPY %5 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... 
+--- +name: testSRAD +# CHECK-ALL: name: testSRAD +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: g8rc, preferred-register: '' } + - { id: 2, class: gprc, preferred-register: '' } + - { id: 3, class: g8rc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } + - { reg: '%x4', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3, %x4 + + %1 = COPY %x4 + %0 = COPY %x3 + %2 = LI -44 + %3 = SRAD %0, killed %2, implicit-def dead %carry + ; CHECK: SRAD %0, killed %2, implicit-def dead %carry + ; CHECK-LATE: srad 3, 3, 4 + %x3 = COPY %3 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... +--- +name: testSRADo +# CHECK-ALL: name: testSRADo +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 2, class: gprc, preferred-register: '' } + - { id: 3, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 4, class: crrc, preferred-register: '' } + - { id: 5, class: g8rc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } + - { reg: '%x4', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3, %x4 + + %1 = COPY %x4 + %0 = COPY %x3 + %2 = LI 68 + %3 = SRADo %0, killed %2, implicit-def dead %carry, implicit-def %cr0 + ; CHECK: SRADo %0, killed %2, implicit-def dead %carry, implicit-def %cr0 + ; CHECK-LATE: srad. 3, 3, 5 + %4 = COPY killed %cr0 + %5 = ISEL8 %1, %3, %4.sub_eq + %x3 = COPY %5 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... 
+--- diff --git a/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir b/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir index c4783de4a18f..67733795ed5d 100644 --- a/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir +++ b/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir @@ -568,6 +568,22 @@ ret i32 %and } + ; Function Attrs: norecurse nounwind readnone + define zeroext i32 @testRLWINMFullReg(i32 zeroext %a) local_unnamed_addr #0 { + entry: + %shl = shl i32 %a, 4 + %and = and i32 %shl, 4080 + ret i32 %and + } + + ; Function Attrs: norecurse nounwind readnone + define zeroext i32 @testRLWINMFullRegOutOfRange(i32 zeroext %a) local_unnamed_addr #0 { + entry: + %shl = shl i32 %a, 4 + %and = and i32 %shl, 4080 + ret i32 %and + } + + ; Function Attrs: norecurse nounwind readnone define i64 @testRLWINM8(i64 %a) local_unnamed_addr #0 { entry: @@ -3938,6 +3954,110 @@ body: | %x3 = EXTSW_32_64 %4 BLR8 implicit %lr8, implicit %rm, implicit %x3 +... +--- +name: testRLWINMFullReg +# CHECK-ALL: name: testRLWINMFullReg +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: gprc, preferred-register: '' } + - { id: 2, class: gprc, preferred-register: '' } + - { id: 3, class: g8rc, preferred-register: '' } + - { id: 4, class: gprc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3 + + %0 = COPY %x3 + %1 = COPY %0.sub_32 + %3 = IMPLICIT_DEF + %2 = LI 2 + %4 = RLWINM killed %2, 31, 0, 31 + ; CHECK: LI 1 + ; CHECK-LATE: li 3, 1 + %x3 = EXTSW_32_64 %4 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + +... +--- +name: testRLWINMFullRegOutOfRange +# CHECK-ALL: name: testRLWINMFullRegOutOfRange +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: gprc, preferred-register: '' } + - { id: 2, class: gprc, preferred-register: '' } + - { id: 3, class: g8rc, preferred-register: '' } + - { id: 4, class: gprc, preferred-register: '' } +liveins: + - { reg: '%x3', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: %x3 + + %0 = COPY %x3 + %1 = COPY %0.sub_32 + %3 = IMPLICIT_DEF + %2 = LI 1 + %4 = RLWINM killed %2, 31, 0, 31 + ; CHECK: RLWINM killed %2, 31, 0, 31 + ; CHECK-LATE: rotlwi 3, 3, 31 + %x3 = EXTSW_32_64 %4 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + ...
--- name: testRLWINM8 diff --git a/test/CodeGen/PowerPC/duplicate-returns-for-tailcall.ll b/test/CodeGen/PowerPC/duplicate-returns-for-tailcall.ll index 520efd8106f4..9c0e77dafde6 100644 --- a/test/CodeGen/PowerPC/duplicate-returns-for-tailcall.ll +++ b/test/CodeGen/PowerPC/duplicate-returns-for-tailcall.ll @@ -42,10 +42,10 @@ if.end4: ; preds = %if.end if.then6: ; preds = %if.end4 %call7 = tail call fastcc signext i32 @call3(i32 signext %a, i32 signext %b, i32 signext %c) br label %return -; No duplication here because the calling convention mismatch means we won't tail-call +; Tail calling a fastcc function from a ccc function is supported. ; CHECK_LABEL: if.then13: -; CHECK: tail call fastcc signext i32 @call3 -; CHECK-NEXT: br +; CHECK: %[[T2:[a-zA-Z0-9]+]] = tail call fastcc signext i32 @call3 +; CHECK-NEXT: ret i32 %[[T2]] return: ; preds = %if.end4, %if.then6, %if.then2, %if.then %retval.0 = phi i32 [ %call, %if.then ], [ %call3, %if.then2 ], [ %call7, %if.then6 ], [ %c, %if.end4 ] diff --git a/test/CodeGen/PowerPC/ppc64-sibcall.ll b/test/CodeGen/PowerPC/ppc64-sibcall.ll index 3c08ecb5119f..fc0e71f878ca 100644 --- a/test/CodeGen/PowerPC/ppc64-sibcall.ll +++ b/test/CodeGen/PowerPC/ppc64-sibcall.ll @@ -41,6 +41,15 @@ define void @caller_64_64_copy([8 x i64] %a, [8 x i64] %b) #1 { ; CHECK-SCO: b callee_64_64_copy } +define internal fastcc void @callee_64_64_copy_fastcc([8 x i64] %a, [8 x i64] %b) #0 { ret void } +define void @caller_64_64_copy_ccc([8 x i64] %a, [8 x i64] %b) #1 { + tail call fastcc void @callee_64_64_copy_fastcc([8 x i64] %a, [8 x i64] %b) + ret void +; If caller and callee use different calling conventions, we cannot apply TCO. +; CHECK-SCO-LABEL: caller_64_64_copy_ccc: +; CHECK-SCO: bl callee_64_64_copy_fastcc +} + define void @caller_64_64_reorder_copy([8 x i64] %a, [8 x i64] %b) #1 { tail call void @callee_64_64_copy([8 x i64] %b, [8 x i64] %a) ret void diff --git a/test/CodeGen/PowerPC/pr35688.ll b/test/CodeGen/PowerPC/pr35688.ll new file mode 100644 index 000000000000..832cd43eb95c --- /dev/null +++ b/test/CodeGen/PowerPC/pr35688.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown < %s | \ +; RUN: FileCheck %s +; Function Attrs: nounwind +define void @ec_GFp_nistp256_points_mul() { +; CHECK-LABEL: ec_GFp_nistp256_points_mul: +; CHECK: ld 5, 0(3) +; CHECK: li 3, 127 +; CHECK: li 4, 0 +; CHECK: subfic 6, 5, 0 +; CHECK: subfze 6, 4 +; CHECK: sradi 7, 6, 63 +; CHECK: srad 6, 6, 3 +; CHECK: subfc 5, 5, 7 +; CHECK: subfe 5, 4, 6 +; CHECK: sradi 5, 5, 63 +entry: + br label %fe_cmovznz.exit.i534.i.15 + +fe_cmovznz.exit.i534.i.15: ; preds = %fe_cmovznz.exit.i534.i.15, %entry + %0 = load i64, i64* undef, align 8 + %1 = load i64, i64* undef, align 8 + %conv.i69.i.i = zext i64 %0 to i128 + %sub.i72.i.i = sub nsw i128 0, %conv.i69.i.i + %conv.i63.i.i = zext i64 %1 to i128 + %add.neg.i.i.i = ashr i128 %sub.i72.i.i, 127 + %sub.i65.i.i = sub nsw i128 %add.neg.i.i.i, %conv.i63.i.i + %sub.i65.lobit.i.i = ashr i128 %sub.i65.i.i, 127 + %conv1.i58.i.i = and i128 %sub.i65.lobit.i.i, 18446744073709551615 + %add3.i59.i.i = add nuw nsw i128 %conv1.i58.i.i, 0 + %conv4.i60.i.i = trunc i128 %add3.i59.i.i to i64 + store i64 %conv4.i60.i.i, i64* undef, align 16 + br label %fe_cmovznz.exit.i534.i.15 +} diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll index 297922809ea7..9069755ad131 100644 ---
a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -2364,16 +2364,16 @@ define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind { ; X32: # %bb.0: ; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovd %eax, %xmm0 -; X32-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X32-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_set1_epi16: ; X64: # %bb.0: ; X64-NEXT: vmovd %edi, %xmm0 -; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %res0 = insertelement <16 x i16> undef, i16 %a0, i32 0 diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll index da547397c6ce..9cd05a353fbf 100644 --- a/test/CodeGen/X86/avx-splat.ll +++ b/test/CodeGen/X86/avx-splat.ll @@ -15,8 +15,8 @@ entry: define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp { ; CHECK-LABEL: funcB: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: retq entry: diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll index 5a9f23007d86..e2dc40c6f0ef 100644 --- a/test/CodeGen/X86/avx-vbroadcast.ll +++ b/test/CodeGen/X86/avx-vbroadcast.ll @@ -874,39 +874,33 @@ define float @broadcast_lifetime() nounwind { ; X32-LABEL: broadcast_lifetime: ; X32: ## %bb.0: ; X32-NEXT: pushl %esi -; X32-NEXT: subl $56, %esp +; X32-NEXT: subl $40, %esp ; X32-NEXT: leal {{[0-9]+}}(%esp), %esi ; X32-NEXT: movl %esi, (%esp) ; X32-NEXT: calll _gfunc ; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill +; X32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## 4-byte Spill ; X32-NEXT: movl %esi, (%esp) ; X32-NEXT: calll _gfunc ; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: vpermilps $0, {{[0-9]+}}(%esp), %xmm1 ## 16-byte Folded Reload -; X32-NEXT: ## xmm1 = mem[0,0,0,0] -; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X32-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; X32-NEXT: vsubss {{[0-9]+}}(%esp), %xmm0, %xmm0 ## 4-byte Folded Reload ; X32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: flds {{[0-9]+}}(%esp) -; X32-NEXT: addl $56, %esp +; X32-NEXT: addl $40, %esp ; X32-NEXT: popl %esi ; X32-NEXT: retl ; ; X64-LABEL: broadcast_lifetime: ; X64: ## %bb.0: ; X64-NEXT: subq $40, %rsp -; X64-NEXT: movq %rsp, %rdi +; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi ; X64-NEXT: callq _gfunc ; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill -; X64-NEXT: movq %rsp, %rdi +; X64-NEXT: vmovss %xmm0, {{[0-9]+}}(%rsp) ## 4-byte Spill +; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi ; X64-NEXT: callq _gfunc ; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: vpermilps $0, {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Folded Reload -; X64-NEXT: ## xmm1 = mem[0,0,0,0] -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-NEXT: vsubss 
%xmm1, %xmm0, %xmm0 +; X64-NEXT: vsubss {{[0-9]+}}(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload ; X64-NEXT: addq $40, %rsp ; X64-NEXT: retq %1 = alloca <4 x float>, align 16 diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll index 821c65bef06a..e3cf2181387f 100644 --- a/test/CodeGen/X86/avx512-calling-conv.ll +++ b/test/CodeGen/X86/avx512-calling-conv.ll @@ -410,3 +410,27 @@ define i32 @test12(i32 %a1, i32 %a2, i32 %b1) { %res1 = select i1 %cond, i32 %res, i32 0 ret i32 %res1 } + +define <1 x i1> @test13(<1 x i1>* %foo) { +; KNL-LABEL: test13: +; KNL: ## %bb.0: +; KNL-NEXT: movzbl (%rdi), %eax +; KNL-NEXT: ## kill: def %al killed %al killed %eax +; KNL-NEXT: retq +; +; SKX-LABEL: test13: +; SKX: ## %bb.0: +; SKX-NEXT: kmovb (%rdi), %k0 +; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: ## kill: def %al killed %al killed %eax +; SKX-NEXT: retq +; +; KNL_X32-LABEL: test13: +; KNL_X32: ## %bb.0: +; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: movzbl (%eax), %eax +; KNL_X32-NEXT: ## kill: def %al killed %al killed %eax +; KNL_X32-NEXT: retl + %bar = load <1 x i1>, <1 x i1>* %foo + ret <1 x i1> %bar +} diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll index 18e9f306bc1b..a95e22a04856 100644 --- a/test/CodeGen/X86/avx512-cvt.ll +++ b/test/CodeGen/X86/avx512-cvt.ll @@ -2018,3 +2018,329 @@ define <2 x double> @ubto2f64(<2 x i32> %a) { %1 = uitofp <2 x i1> %mask to <2 x double> ret <2 x double> %1 } + +define <2 x i64> @test_2f64toub(<2 x double> %a, <2 x i64> %passthru) { +; NOVLDQ-LABEL: test_2f64toub: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vcvttsd2usi %xmm0, %rax +; NOVLDQ-NEXT: vmovq %rax, %xmm2 +; NOVLDQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; NOVLDQ-NEXT: vcvttsd2usi %xmm0, %rax +; NOVLDQ-NEXT: vmovq %rax, %xmm0 +; NOVLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; NOVLDQ-NEXT: vpsllq $63, %xmm0, %xmm0 +; NOVLDQ-NEXT: vpsraq $63, %zmm0, %zmm0 +; NOVLDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NOVLDQ-NEXT: vzeroupper +; NOVLDQ-NEXT: retq +; +; VL-LABEL: test_2f64toub: +; VL: # %bb.0: +; VL-NEXT: vcvttpd2udq %xmm0, %xmm0 +; VL-NEXT: vpslld $31, %xmm0, %xmm0 +; VL-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VL-NEXT: retq +; +; AVX512DQ-LABEL: test_2f64toub: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0 +; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 +; AVX512DQ-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsraq $63, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %mask = fptoui <2 x double> %a to <2 x i1> + %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer + ret <2 x i64> %select +} + +define <4 x i64> @test_4f64toub(<4 x double> %a, <4 x i64> %passthru) { +; NOVL-LABEL: test_4f64toub: +; NOVL: # %bb.0: +; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0 +; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0 +; NOVL-NEXT: vpslld $31, %xmm0, %xmm0 +; NOVL-NEXT: vpsrad $31, %xmm0, %xmm0 +; NOVL-NEXT: vpmovsxdq %xmm0, %ymm0 +; NOVL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; NOVL-NEXT: retq +; +; VL-LABEL: test_4f64toub: +; VL: # %bb.0: +; VL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VL-NEXT: vpslld $31, %xmm0, %xmm0 +; VL-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VL-NEXT: retq + %mask = fptoui <4 x double> %a to <4 x i1> + %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer + ret <4 x i64> %select +} + +define <8 x i64> 
@test_8f64toub(<8 x double> %a, <8 x i64> %passthru) { +; NOVL-LABEL: test_8f64toub: +; NOVL: # %bb.0: +; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; NOVL-NEXT: vpslld $31, %ymm0, %ymm0 +; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVL-NEXT: retq +; +; VL-LABEL: test_8f64toub: +; VL: # %bb.0: +; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VL-NEXT: vpslld $31, %ymm0, %ymm0 +; VL-NEXT: vptestmd %ymm0, %ymm0, %k1 +; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VL-NEXT: retq + %mask = fptoui <8 x double> %a to <8 x i1> + %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer + ret <8 x i64> %select +} + +define <2 x i64> @test_2f32toub(<2 x float> %a, <2 x i64> %passthru) { +; NOVLDQ-LABEL: test_2f32toub: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vcvttss2usi %xmm0, %rax +; NOVLDQ-NEXT: vmovq %rax, %xmm2 +; NOVLDQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; NOVLDQ-NEXT: vcvttss2usi %xmm0, %rax +; NOVLDQ-NEXT: vmovq %rax, %xmm0 +; NOVLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; NOVLDQ-NEXT: vpsllq $63, %xmm0, %xmm0 +; NOVLDQ-NEXT: vpsraq $63, %zmm0, %zmm0 +; NOVLDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NOVLDQ-NEXT: vzeroupper +; NOVLDQ-NEXT: retq +; +; VL-LABEL: test_2f32toub: +; VL: # %bb.0: +; VL-NEXT: vcvttps2dq %xmm0, %xmm0 +; VL-NEXT: vpslld $31, %xmm0, %xmm0 +; VL-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VL-NEXT: retq +; +; AVX512DQ-LABEL: test_2f32toub: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0 +; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512DQ-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsraq $63, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %mask = fptoui <2 x float> %a to <2 x i1> + %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer + ret <2 x i64> %select +} + +define <4 x i64> @test_4f32toub(<4 x float> %a, <4 x i64> %passthru) { +; NOVL-LABEL: test_4f32toub: +; NOVL: # %bb.0: +; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0 +; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0 +; NOVL-NEXT: vpslld $31, %xmm0, %xmm0 +; NOVL-NEXT: vpsrad $31, %xmm0, %xmm0 +; NOVL-NEXT: vpmovsxdq %xmm0, %ymm0 +; NOVL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; NOVL-NEXT: retq +; +; VL-LABEL: test_4f32toub: +; VL: # %bb.0: +; VL-NEXT: vcvttps2dq %xmm0, %xmm0 +; VL-NEXT: vpslld $31, %xmm0, %xmm0 +; VL-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VL-NEXT: retq + %mask = fptoui <4 x float> %a to <4 x i1> + %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer + ret <4 x i64> %select +} + +define <8 x i64> @test_8f32toub(<8 x float> %a, <8 x i64> %passthru) { +; NOVL-LABEL: test_8f32toub: +; NOVL: # %bb.0: +; NOVL-NEXT: vcvttps2dq %ymm0, %ymm0 +; NOVL-NEXT: vpslld $31, %ymm0, %ymm0 +; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVL-NEXT: retq +; +; VL-LABEL: test_8f32toub: +; VL: # %bb.0: +; VL-NEXT: vcvttps2dq %ymm0, %ymm0 +; VL-NEXT: vpslld $31, %ymm0, %ymm0 +; VL-NEXT: vptestmd %ymm0, %ymm0, %k1 +; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VL-NEXT: retq + %mask = fptoui <8 x float> %a to <8 x i1> + %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer + ret <8 x i64> %select +} + +define <16 x i32> @test_16f32toub(<16 x float> %a, <16 x i32> %passthru) { +; ALL-LABEL: test_16f32toub: +; ALL: # %bb.0: +; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 +; 
ALL-NEXT: vpslld $31, %zmm0, %zmm0 +; ALL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; ALL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; ALL-NEXT: retq + %mask = fptoui <16 x float> %a to <16 x i1> + %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer + ret <16 x i32> %select +} + +define <2 x i64> @test_2f64tosb(<2 x double> %a, <2 x i64> %passthru) { +; NOVLDQ-LABEL: test_2f64tosb: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vcvttsd2si %xmm0, %rax +; NOVLDQ-NEXT: vmovq %rax, %xmm2 +; NOVLDQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; NOVLDQ-NEXT: vcvttsd2si %xmm0, %rax +; NOVLDQ-NEXT: vmovq %rax, %xmm0 +; NOVLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; NOVLDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NOVLDQ-NEXT: retq +; +; VL-LABEL: test_2f64tosb: +; VL: # %bb.0: +; VL-NEXT: vcvttpd2dq %xmm0, %xmm0 +; VL-NEXT: vpslld $31, %xmm0, %xmm0 +; VL-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VL-NEXT: retq +; +; AVX512DQ-LABEL: test_2f64tosb: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0 +; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 +; AVX512DQ-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %mask = fptosi <2 x double> %a to <2 x i1> + %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer + ret <2 x i64> %select +} + +define <4 x i64> @test_4f64tosb(<4 x double> %a, <4 x i64> %passthru) { +; NOVL-LABEL: test_4f64tosb: +; NOVL: # %bb.0: +; NOVL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; NOVL-NEXT: vpmovsxdq %xmm0, %ymm0 +; NOVL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; NOVL-NEXT: retq +; +; VL-LABEL: test_4f64tosb: +; VL: # %bb.0: +; VL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VL-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VL-NEXT: retq + %mask = fptosi <4 x double> %a to <4 x i1> + %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer + ret <4 x i64> %select +} + +define <8 x i64> @test_8f64tosb(<8 x double> %a, <8 x i64> %passthru) { +; NOVL-LABEL: test_8f64tosb: +; NOVL: # %bb.0: +; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVL-NEXT: retq +; +; VL-LABEL: test_8f64tosb: +; VL: # %bb.0: +; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VL-NEXT: vptestmd %ymm0, %ymm0, %k1 +; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VL-NEXT: retq + %mask = fptosi <8 x double> %a to <8 x i1> + %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer + ret <8 x i64> %select +} + +define <2 x i64> @test_2f32tosb(<2 x float> %a, <2 x i64> %passthru) { +; NOVLDQ-LABEL: test_2f32tosb: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vcvttss2si %xmm0, %rax +; NOVLDQ-NEXT: vmovq %rax, %xmm2 +; NOVLDQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; NOVLDQ-NEXT: vcvttss2si %xmm0, %rax +; NOVLDQ-NEXT: vmovq %rax, %xmm0 +; NOVLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; NOVLDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NOVLDQ-NEXT: retq +; +; VL-LABEL: test_2f32tosb: +; VL: # %bb.0: +; VL-NEXT: vcvttps2dq %xmm0, %xmm0 +; VL-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VL-NEXT: retq +; +; AVX512DQ-LABEL: test_2f32tosb: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0 +; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512DQ-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %mask = fptosi <2 x float> %a to <2 x i1> + %select = select <2 x i1> %mask, <2 x 
i64> %passthru, <2 x i64> zeroinitializer + ret <2 x i64> %select +} + +define <4 x i64> @test_4f32tosb(<4 x float> %a, <4 x i64> %passthru) { +; NOVL-LABEL: test_4f32tosb: +; NOVL: # %bb.0: +; NOVL-NEXT: vcvttps2dq %xmm0, %xmm0 +; NOVL-NEXT: vpmovsxdq %xmm0, %ymm0 +; NOVL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; NOVL-NEXT: retq +; +; VL-LABEL: test_4f32tosb: +; VL: # %bb.0: +; VL-NEXT: vcvttps2dq %xmm0, %xmm0 +; VL-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VL-NEXT: retq + %mask = fptosi <4 x float> %a to <4 x i1> + %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer + ret <4 x i64> %select +} + +define <8 x i64> @test_8f32tosb(<8 x float> %a, <8 x i64> %passthru) { +; NOVL-LABEL: test_8f32tosb: +; NOVL: # %bb.0: +; NOVL-NEXT: vcvttps2dq %ymm0, %ymm0 +; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVL-NEXT: retq +; +; VL-LABEL: test_8f32tosb: +; VL: # %bb.0: +; VL-NEXT: vcvttps2dq %ymm0, %ymm0 +; VL-NEXT: vptestmd %ymm0, %ymm0, %k1 +; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VL-NEXT: retq + %mask = fptosi <8 x float> %a to <8 x i1> + %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer + ret <8 x i64> %select +} + +define <16 x i32> @test_16f32tosb(<16 x float> %a, <16 x i32> %passthru) { +; ALL-LABEL: test_16f32tosb: +; ALL: # %bb.0: +; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 +; ALL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; ALL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; ALL-NEXT: retq + %mask = fptosi <16 x float> %a to <16 x i1> + %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer + ret <16 x i32> %select +} diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll index 8c7941591217..a966235df216 100644 --- a/test/CodeGen/X86/avx512-ext.ll +++ b/test/CodeGen/X86/avx512-ext.ll @@ -345,9 +345,8 @@ define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0 ; KNL-NEXT: retq ; @@ -369,9 +368,8 @@ define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vpmovsxbd (%rdi), %ymm1 -; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; KNL-NEXT: vpmovsxbd (%rdi), %ymm0 +; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0 ; KNL-NEXT: retq ; @@ -702,9 +700,8 @@ define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0 ; KNL-NEXT: retq ; @@ -726,9 +723,8 @@ define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounw ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vpmovsxwd (%rdi), %ymm1 -; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; KNL-NEXT: vpmovsxwd (%rdi), %ymm0 +; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0 ; KNL-NEXT: retq ; @@ -760,9 +756,8 @@ define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind ; KNL-NEXT: vpmovsxwq %xmm1, %zmm1 ; KNL-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 -; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0 ; KNL-NEXT: retq ; diff --git a/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/test/CodeGen/X86/avx512-extract-subvector-load-store.ll index e1ed8ea98a1c..b49e2ceca0bf 100644 --- a/test/CodeGen/X86/avx512-extract-subvector-load-store.ll +++ b/test/CodeGen/X86/avx512-extract-subvector-load-store.ll @@ -6,7 +6,7 @@ define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x doub ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb (%rdi), %k0 -; AVX512-NEXT: kshiftrw $4, %k0, %k0 +; AVX512-NEXT: kshiftrb $4, %k0, %k0 ; AVX512-NEXT: vpmovm2q %k0, %xmm2 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512-NEXT: vpmovq2m %xmm2, %k1 @@ -22,7 +22,6 @@ define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x doub ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1} ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi) @@ -37,7 +36,7 @@ define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x doub ; AVX512-LABEL: load_v8i1_broadcast_7_v2i1: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb (%rdi), %k0 -; AVX512-NEXT: kshiftrw $6, %k0, %k0 +; AVX512-NEXT: kshiftrb $6, %k0, %k0 ; AVX512-NEXT: vpmovm2q %k0, %xmm2 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX512-NEXT: vpmovq2m %xmm2, %k1 @@ -53,7 +52,6 @@ define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x doub ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1} ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi) 
@@ -83,7 +81,6 @@ define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x do ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1} ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi) @@ -113,7 +110,6 @@ define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x flo ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1} ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi) @@ -143,7 +139,6 @@ define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x d ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1} ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi) @@ -173,7 +168,6 @@ define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x fl ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1} ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi) @@ -203,7 +197,6 @@ define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x d ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1} ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi) @@ -233,7 +226,6 @@ define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x fl ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1} ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi) @@ -264,7 +256,6 @@ define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2 -; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1} ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi) @@ -295,7 +286,6 @@ define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x d ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1} ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi) @@ -325,7 +315,6 @@ define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* 
%a0,<4 x float> %a1,<4 x fl ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1} ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi) @@ -358,7 +347,6 @@ define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7] ; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1} ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi) @@ -389,7 +377,6 @@ define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x d ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1} ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi) @@ -419,7 +406,6 @@ define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x fl ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1} ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi) @@ -450,7 +436,6 @@ define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2 -; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1} ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi) @@ -481,7 +466,6 @@ define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %zmm2 -; AVX512NOTDQ-NEXT: vpslld $31, %zmm2, %zmm2 ; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1} ; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi) @@ -512,7 +496,6 @@ define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x d ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1} ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi) @@ -542,7 +525,6 @@ define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x fl ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1} ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi) @@ -575,7 +557,6 @@ define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl ; 
AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7] ; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1} ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi) @@ -608,7 +589,6 @@ define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512NOTDQ-NEXT: vpermd %zmm2, %zmm3, %zmm2 -; AVX512NOTDQ-NEXT: vpslld $31, %zmm2, %zmm2 ; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1} ; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi) @@ -624,7 +604,7 @@ define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) { ; AVX512-LABEL: load_v2i1_broadcast_1_v1i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb (%rdi), %k0 -; AVX512-NEXT: kshiftrw $1, %k0, %k0 +; AVX512-NEXT: kshiftrb $1, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -645,7 +625,7 @@ define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) { ; AVX512-LABEL: load_v3i1_broadcast_1_v1i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb (%rdi), %k0 -; AVX512-NEXT: kshiftrw $1, %k0, %k0 +; AVX512-NEXT: kshiftrb $1, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -666,7 +646,7 @@ define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) { ; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb (%rdi), %k0 -; AVX512-NEXT: kshiftrw $2, %k0, %k0 +; AVX512-NEXT: kshiftrb $2, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -687,7 +667,7 @@ define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) { ; AVX512-LABEL: load_v4i1_broadcast_2_v1i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb (%rdi), %k0 -; AVX512-NEXT: kshiftrw $2, %k0, %k0 +; AVX512-NEXT: kshiftrb $2, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -708,7 +688,7 @@ define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) { ; AVX512-LABEL: load_v4i1_broadcast_3_v1i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb (%rdi), %k0 -; AVX512-NEXT: kshiftrw $3, %k0, %k0 +; AVX512-NEXT: kshiftrb $3, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -729,7 +709,7 @@ define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) { ; AVX512-LABEL: load_v8i1_broadcast_4_v1i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb (%rdi), %k0 -; AVX512-NEXT: kshiftrw $4, %k0, %k0 +; AVX512-NEXT: kshiftrb $4, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -750,7 +730,7 @@ define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) { ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb (%rdi), %k0 -; AVX512-NEXT: kshiftrw $4, %k0, %k0 +; AVX512-NEXT: kshiftrb $4, %k0, %k0 ; AVX512-NEXT: vpmovm2q %k0, %xmm0 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vpmovq2m %xmm0, %k0 @@ -765,7 +745,6 @@ define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) { ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, 
%k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -779,7 +758,7 @@ define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) { ; AVX512-LABEL: load_v8i1_broadcast_7_v1i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb (%rdi), %k0 -; AVX512-NEXT: kshiftrw $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -800,7 +779,7 @@ define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) { ; AVX512-LABEL: load_v8i1_broadcast_7_v2i1_store: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb (%rdi), %k0 -; AVX512-NEXT: kshiftrw $6, %k0, %k0 +; AVX512-NEXT: kshiftrb $6, %k0, %k0 ; AVX512-NEXT: vpmovm2q %k0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512-NEXT: vpmovq2m %xmm0, %k0 @@ -815,7 +794,6 @@ define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) { ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -863,7 +841,6 @@ define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) { ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -891,7 +868,6 @@ define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) { ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -939,7 +915,6 @@ define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) { ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -967,7 +942,6 @@ define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) { ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -1015,7 +989,6 @@ define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) { ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -1043,7 +1016,6 @@ define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) { ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: 
vpbroadcastd %xmm0, %xmm0 -; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -1072,7 +1044,6 @@ define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) { ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0 -; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -1121,7 +1092,6 @@ define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) { ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -1149,7 +1119,6 @@ define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) { ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -1180,7 +1149,6 @@ define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) { ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7] ; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -1229,7 +1197,6 @@ define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) { ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -1257,7 +1224,6 @@ define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) { ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -1286,7 +1252,6 @@ define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) { ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0 -; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -1315,7 +1280,6 @@ define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %zmm0 -; AVX512NOTDQ-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi) ; 
AVX512NOTDQ-NEXT: vzeroupper @@ -1363,7 +1327,6 @@ define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) { ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -1391,7 +1354,6 @@ define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) { ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -1422,7 +1384,6 @@ define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) { ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7] ; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -1453,7 +1414,6 @@ define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512NOTDQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512NOTDQ-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll index 7477e05f0c7f..9e11c799e179 100644 --- a/test/CodeGen/X86/avx512-insert-extract.ll +++ b/test/CodeGen/X86/avx512-insert-extract.ll @@ -805,7 +805,6 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> ; KNL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL-NEXT: vpslld $31, %zmm1, %zmm1 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 diff --git a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index 0601c011e290..333efb04913d 100644 --- a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -3,26 +3,28 @@ ; FIXME: All cases here should be fixed by PR34380 -define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) { -; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0: -; CHECK: # %bb.0: -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,12,13,12,13,8,9,14,15,10,11,12,13,14,15] -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) { +; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,6,4] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] +; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,4,5,6,4] ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4],xmm0[5,6,7] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> ret <8 x i16> %res } -define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0: -; CHECK: # %bb.0: -; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,12,13,12,13,8,9,14,15,10,11,12,13,14,15] -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { +; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,6,6,4] +; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] +; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3,4],xmm0[5,6,7] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 @@ -35,13 +37,14 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i ret <8 x i16> %res } -define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0: -; CHECK: # %bb.0: -; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,12,13,12,13,8,9,14,15,10,11,12,13,14,15] -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) { +; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,7,6,6,4] +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] +; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4],xmm0[5,6,7] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 @@ -55,14 +58,14 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x } define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1: -; CHECK: # %bb.0: -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; CHECK: # %bb.0: +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3 +; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13] +; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] +; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: 
vpcmpeqw %xmm3, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -74,14 +77,14 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1: -; CHECK: # %bb.0: -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 -; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; CHECK: # %bb.0: +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 +; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13] +; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] +; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll index 6bee0de181ab..f6cb093d521b 100644 --- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll +++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll @@ -136,7 +136,7 @@ define <4 x i1> @test9(<8 x i1> %a, <8 x i1> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 ; CHECK-NEXT: vpmovw2m %xmm0, %k0 -; CHECK-NEXT: kshiftrw $4, %k0, %k0 +; CHECK-NEXT: kshiftrb $4, %k0, %k0 ; CHECK-NEXT: vpmovm2d %k0, %xmm0 ; CHECK-NEXT: retq %res = shufflevector <8 x i1> %a, <8 x i1> %b, <4 x i32> @@ -148,7 +148,7 @@ define <2 x i1> @test10(<4 x i1> %a, <4 x i1> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 ; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0 -; CHECK-NEXT: kshiftrw $2, %k0, %k0 +; CHECK-NEXT: kshiftrb $2, %k0, %k0 ; CHECK-NEXT: vpmovm2q %k0, %xmm0 ; CHECK-NEXT: retq %res = shufflevector <4 x i1> %a, <4 x i1> %b, <2 x i32> diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll index 826a4538f3f1..6f0f873c2f70 100644 --- a/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/test/CodeGen/X86/avx512-vec-cmp.ll @@ -195,14 +195,12 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind { ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL-NEXT: vpslld $31, %zmm1, %zmm1 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; KNL-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, (%rsp) ; KNL-NEXT: movl (%rsp), %eax @@ -235,28 +233,24 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind { ; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 ; KNL-NEXT: vpmovdb %zmm1, %xmm1 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL-NEXT: vpslld $31, %zmm1, %zmm1 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpmovsxbd 
%xmm0, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, (%rsp) ; KNL-NEXT: vpcmpeqw %ymm7, %ymm3, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; KNL-NEXT: movl (%rsp), %ecx diff --git a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll index fdd6f7126457..8c13d4b842fc 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll @@ -871,23 +871,14 @@ define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) { define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) { ; X32-LABEL: test_mm_mask_broadcastd_epi32: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1} -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_broadcastd_epi32: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1} ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i4 @@ -903,23 +894,14 @@ define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64> define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) { ; X32-LABEL: test_mm_maskz_broadcastd_epi32: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_broadcastd_epi32: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ; X64-NEXT: retq %trn0 = trunc i8 %a0 to i4 @@ -1007,23 +989,14 @@ define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) { define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) { ; X32-LABEL: test_mm_mask_broadcastq_epi64: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $3, %al -; X32-NEXT: movb %al, {{[0-9]+}}(%esp) -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1} -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_broadcastq_epi64: ; X64: # %bb.0: -; X64-NEXT: andb $3, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1} ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i2 @@ -1036,23 +1009,14 @@ 
define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64> define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) { ; X32-LABEL: test_mm_maskz_broadcastq_epi64: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $3, %al -; X32-NEXT: movb %al, {{[0-9]+}}(%esp) -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_broadcastq_epi64: ; X64: # %bb.0: -; X64-NEXT: andb $3, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ; X64-NEXT: retq %trn0 = trunc i8 %a0 to i2 @@ -1079,23 +1043,14 @@ define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) { define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) { ; X32-LABEL: test_mm256_mask_broadcastq_epi64: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1} -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_mask_broadcastq_epi64: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1} ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i4 @@ -1108,23 +1063,14 @@ define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) { ; X32-LABEL: test_mm256_maskz_broadcastq_epi64: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_broadcastq_epi64: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ; X64-NEXT: retq %trn0 = trunc i8 %a0 to i4 @@ -1151,23 +1097,14 @@ define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) { define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) { ; X32-LABEL: test_mm_mask_broadcastsd_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $3, %al -; X32-NEXT: movb %al, {{[0-9]+}}(%esp) -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_broadcastsd_pd: ; X64: # %bb.0: -; X64-NEXT: andb $3, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0] ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i2 @@ -1180,23 +1117,14 @@ define <2 x double> 
@test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x define <2 x double> @test_mm_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) { ; X32-LABEL: test_mm_maskz_broadcastsd_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $3, %al -; X32-NEXT: movb %al, {{[0-9]+}}(%esp) -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_broadcastsd_pd: ; X64: # %bb.0: -; X64-NEXT: andb $3, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0] ; X64-NEXT: retq %trn0 = trunc i8 %a0 to i2 @@ -1223,23 +1151,14 @@ define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) { define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2 x double> %a2) { ; X32-LABEL: test_mm256_mask_broadcastsd_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1} -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_mask_broadcastsd_pd: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1} ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i4 @@ -1252,23 +1171,14 @@ define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2 define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) { ; X32-LABEL: test_mm256_maskz_broadcastsd_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_broadcastsd_pd: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ; X64-NEXT: retq %trn0 = trunc i8 %a0 to i4 @@ -1295,23 +1205,14 @@ define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) { define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) { ; X32-LABEL: test_mm_mask_broadcastss_ps: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vbroadcastss %xmm1, %xmm0 {%k1} -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_broadcastss_ps: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vbroadcastss %xmm1, %xmm0 {%k1} ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i4 @@ -1324,23 +1225,14 @@ define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> 
%a0, i8 %a1, <4 x fl define <4 x float> @test_mm_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) { ; X32-LABEL: test_mm_maskz_broadcastss_ps: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_broadcastss_ps: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ; X64-NEXT: retq %trn0 = trunc i8 %a0 to i4 @@ -1419,23 +1311,14 @@ define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) { define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) { ; X32-LABEL: test_mm_mask_movddup_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $3, %al -; X32-NEXT: movb %al, {{[0-9]+}}(%esp) -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_movddup_pd: ; X64: # %bb.0: -; X64-NEXT: andb $3, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0] ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i2 @@ -1448,23 +1331,14 @@ define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x doub define <2 x double> @test_mm_maskz_movddup_pd(i8 %a0, <2 x double> %a1) { ; X32-LABEL: test_mm_maskz_movddup_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $3, %al -; X32-NEXT: movb %al, {{[0-9]+}}(%esp) -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_movddup_pd: ; X64: # %bb.0: -; X64-NEXT: andb $3, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0] ; X64-NEXT: retq %trn1 = trunc i8 %a0 to i2 @@ -1491,23 +1365,14 @@ define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) { define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) { ; X32-LABEL: test_mm256_mask_movddup_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_mask_movddup_pd: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2] ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i4 @@ -1520,23 +1385,14 @@ define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x d 
define <4 x double> @test_mm256_maskz_movddup_pd(i8 %a0, <4 x double> %a1) { ; X32-LABEL: test_mm256_maskz_movddup_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_movddup_pd: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2] ; X64-NEXT: retq %trn1 = trunc i8 %a0 to i4 @@ -1563,23 +1419,14 @@ define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) { define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) { ; X32-LABEL: test_mm_mask_movehdup_ps: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_movehdup_ps: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3] ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i4 @@ -1592,23 +1439,14 @@ define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float define <4 x float> @test_mm_maskz_movehdup_ps(i8 %a0, <4 x float> %a1) { ; X32-LABEL: test_mm_maskz_movehdup_ps: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_movehdup_ps: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3] ; X64-NEXT: retq %trn0 = trunc i8 %a0 to i4 @@ -1687,23 +1525,14 @@ define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) { define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) { ; X32-LABEL: test_mm_mask_moveldup_ps: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_moveldup_ps: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2] ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i4 @@ -1716,23 +1545,14 @@ define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float define <4 x float> 
@test_mm_maskz_moveldup_ps(i8 %a0, <4 x float> %a1) { ; X32-LABEL: test_mm_maskz_moveldup_ps: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_moveldup_ps: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2] ; X64-NEXT: retq %trn0 = trunc i8 %a0 to i4 @@ -1811,23 +1631,14 @@ define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) { define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64> %a2) { ; X32-LABEL: test_mm256_mask_permutex_epi64: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_mask_permutex_epi64: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0] ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i4 @@ -1840,23 +1651,14 @@ define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64 define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 %a0, <4 x i64> %a1) { ; X32-LABEL: test_mm256_maskz_permutex_epi64: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_permutex_epi64: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0] ; X64-NEXT: retq %trn1 = trunc i8 %a0 to i4 @@ -1883,23 +1685,14 @@ define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) { define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) { ; X32-LABEL: test_mm256_mask_permutex_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_mask_permutex_pd: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0] ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i4 @@ -1912,23 +1705,14 @@ define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x define <4 x double> 
@test_mm256_maskz_permutex_pd(i8 %a0, <4 x double> %a1) { ; X32-LABEL: test_mm256_maskz_permutex_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_permutex_pd: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0] ; X64-NEXT: retq %trn1 = trunc i8 %a0 to i4 @@ -1955,23 +1739,14 @@ define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) { define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2, <2 x double> %a3) { ; X32-LABEL: test_mm_mask_shuffle_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $3, %al -; X32-NEXT: movb %al, {{[0-9]+}}(%esp) -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_shuffle_pd: ; X64: # %bb.0: -; X64-NEXT: andb $3, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1] ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i2 @@ -1984,23 +1759,14 @@ define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x doub define <2 x double> @test_mm_maskz_shuffle_pd(i8 %a0, <2 x double> %a1, <2 x double> %a2) { ; X32-LABEL: test_mm_maskz_shuffle_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $3, %al -; X32-NEXT: movb %al, {{[0-9]+}}(%esp) -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_shuffle_pd: ; X64: # %bb.0: -; X64-NEXT: andb $3, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] ; X64-NEXT: retq %trn1 = trunc i8 %a0 to i2 @@ -2027,23 +1793,14 @@ define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) { define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2, <4 x double> %a3) { ; X32-LABEL: test_mm256_mask_shuffle_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_mask_shuffle_pd: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2] ; X64-NEXT: retq 
%trn1 = trunc i8 %a1 to i4 @@ -2056,23 +1813,14 @@ define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x d define <4 x double> @test_mm256_maskz_shuffle_pd(i8 %a0, <4 x double> %a1, <4 x double> %a2) { ; X32-LABEL: test_mm256_maskz_shuffle_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_shuffle_pd: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2] ; X64-NEXT: retq %trn1 = trunc i8 %a0 to i4 @@ -2099,23 +1847,14 @@ define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) { define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2, <4 x float> %a3) { ; X32-LABEL: test_mm_mask_shuffle_ps: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_shuffle_ps: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0] ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i4 @@ -2128,23 +1867,14 @@ define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float> define <4 x float> @test_mm_maskz_shuffle_ps(i8 %a0, <4 x float> %a1, <4 x float> %a2) { ; X32-LABEL: test_mm_maskz_shuffle_ps: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_shuffle_ps: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0] ; X64-NEXT: retq %trn0 = trunc i8 %a0 to i4 diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll index 5f4b050b863d..ea1ff4e56b95 100644 --- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -22,7 +22,6 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -59,7 +58,6 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; 
NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -96,7 +94,6 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -104,7 +101,6 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -142,7 +138,6 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -150,7 +145,6 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -193,7 +187,6 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -235,7 +228,6 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -275,7 +267,6 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -285,7 +276,6 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -326,7 +316,6 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw 
%k0, %k0, %k0 @@ -336,7 +325,6 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -380,11 +368,9 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %ecx @@ -427,11 +413,9 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: vpcmpeqb (%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %ecx @@ -587,7 +571,6 @@ define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax @@ -614,7 +597,6 @@ define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax @@ -643,7 +625,6 @@ define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -674,7 +655,6 @@ define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -712,7 +692,6 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -776,7 +755,6 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq 
%zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -842,7 +820,6 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 @@ -910,7 +887,6 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 @@ -979,7 +955,6 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -1048,7 +1023,6 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -1119,7 +1093,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 @@ -1192,7 +1165,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 @@ -1271,7 +1243,6 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -1311,7 +1282,6 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -1349,7 +1319,6 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -1357,7 +1326,6 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: 
vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -1396,7 +1364,6 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -1404,7 +1371,6 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -1450,7 +1416,6 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -1495,7 +1460,6 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -1536,7 +1500,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -1546,7 +1509,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -1588,7 +1550,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -1598,7 +1559,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -1814,13 +1774,11 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__ ; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 ; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, 
%xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %ecx @@ -1949,14 +1907,12 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64> ; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 ; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %ecx @@ -3117,11 +3073,9 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -3158,11 +3112,9 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -3214,7 +3166,6 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -3272,7 +3223,6 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -3319,11 +3269,9 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -3377,7 +3325,6 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -3427,11 +3374,9 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -3474,11 +3419,9 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -3536,7 +3479,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -3600,7 +3542,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -3653,11 
+3594,9 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -3717,7 +3656,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -4802,7 +4740,6 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -4842,7 +4779,6 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64> ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -4885,7 +4821,6 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -4929,7 +4864,6 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -4973,7 +4907,6 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %_ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -5017,7 +4950,6 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -5064,7 +4996,6 @@ define zeroext i64 
@test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -5109,7 +5040,6 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64> ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -5157,7 +5087,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -5206,7 +5135,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -5255,7 +5183,6 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %_ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -5304,7 +5231,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -5333,8 +5259,7 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) ; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask: @@ -5361,8 +5286,7 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* % ; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem: @@ -5391,8 +5315,7 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64 ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask: @@ -5430,8 +5353,7 @@ define zeroext i4 
@test_masked_vpcmpeqq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem: @@ -5470,8 +5392,7 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) ; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b: @@ -5502,8 +5423,7 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b: @@ -6090,11 +6010,9 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -6131,11 +6049,9 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -6181,7 +6097,6 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -6233,7 +6148,6 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: 
kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -6280,11 +6194,9 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -6332,7 +6244,6 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -6382,11 +6293,9 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -6429,11 +6338,9 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -6485,7 +6392,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -6543,7 +6449,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: 
vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -6596,11 +6501,9 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -6654,7 +6557,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -7476,7 +7378,6 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -7519,7 +7420,6 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -7577,7 +7477,6 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -7637,7 +7536,6 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw 
%k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -7686,7 +7584,6 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -7746,7 +7643,6 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -7798,7 +7694,6 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -7847,7 +7742,6 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -7911,7 +7805,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -7977,7 +7870,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -8032,7 +7924,6 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -8098,7 +7989,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -9132,7 +9022,6 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -9169,7 +9058,6 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -9206,7 +9094,6 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -9214,7 +9101,6 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -9252,7 +9138,6 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -9260,7 +9145,6 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -9303,7 +9187,6 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -9345,7 
+9228,6 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -9385,7 +9267,6 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -9395,7 +9276,6 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -9436,7 +9316,6 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -9446,7 +9325,6 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -9490,11 +9368,9 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %ecx @@ -9537,11 +9413,9 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: vpcmpgtb (%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %ecx @@ -9697,7 +9571,6 @@ define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax @@ -9724,7 +9597,6 @@ define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq 
$63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax @@ -9753,7 +9625,6 @@ define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -9784,7 +9655,6 @@ define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem(i8 zeroext %__u, < ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -9822,7 +9692,6 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -9886,7 +9755,6 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -9952,7 +9820,6 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 @@ -10020,7 +9887,6 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 @@ -10089,7 +9955,6 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -10158,7 +10023,6 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -10229,7 +10093,6 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 @@ -10302,7 +10165,6 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtw 
(%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 @@ -10381,7 +10243,6 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -10421,7 +10282,6 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -10459,7 +10319,6 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -10467,7 +10326,6 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -10506,7 +10364,6 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -10514,7 +10371,6 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -10560,7 +10416,6 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -10605,7 +10460,6 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -10646,7 +10500,6 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -10656,7 +10509,6 @@ 
define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -10698,7 +10550,6 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -10708,7 +10559,6 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -10924,13 +10774,11 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 ; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %ecx @@ -11059,14 +10907,12 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 ; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %ecx @@ -12227,11 +12073,9 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -12268,11 +12112,9 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; 
NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -12324,7 +12166,6 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -12382,7 +12223,6 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -12429,11 +12269,9 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -12487,7 +12325,6 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -12537,11 +12374,9 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -12584,11 +12419,9 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, 
%zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -12646,7 +12479,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -12710,7 +12542,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -12763,11 +12594,9 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -12827,7 +12656,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -13912,7 +13740,6 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -13952,7 +13779,6 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -13995,7 +13821,6 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 
{%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -14039,7 +13864,6 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -14083,7 +13907,6 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -14127,7 +13950,6 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -14174,7 +13996,6 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -14219,7 +14040,6 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -14267,7 +14087,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -14316,7 +14135,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -14365,7 +14183,6 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -14414,7 +14231,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: 
vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -14443,8 +14259,7 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) ; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask: @@ -14471,8 +14286,7 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* ; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem: @@ -14501,8 +14315,7 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6 ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask: @@ -14540,8 +14353,7 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem: @@ -14580,8 +14392,7 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b ; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b: @@ -14612,8 +14423,7 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, < ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b: @@ -15200,11 +15010,9 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -15241,11 +15049,9 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -15291,7 +15097,6 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -15343,7 +15148,6 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -15390,11 +15194,9 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -15442,7 +15244,6 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -15492,11 +15293,9 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl 
{{[0-9]+}}(%rsp), %ecx @@ -15539,11 +15338,9 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -15595,7 +15392,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -15653,7 +15449,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -15706,11 +15501,9 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -15764,7 +15557,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -16586,7 +16378,6 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, 
{{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -16629,7 +16420,6 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -16687,7 +16477,6 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -16747,7 +16536,6 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -16796,7 +16584,6 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -16856,7 +16643,6 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -16908,7 +16694,6 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -16957,7 +16742,6 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -17021,7 +16805,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -17087,7 +16870,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -17142,7 +16924,6 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -17208,7 +16989,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -18331,7 +18111,6 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -18380,7 +18159,6 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -18522,7 +18300,6 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -18576,7 +18353,6 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -19644,7 +19420,6 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -19694,7 +19469,6 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -19843,7 +19617,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -19898,7 +19671,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -21451,7 +21223,6 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -21495,7 +21266,6 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -21551,7 +21321,6 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -21610,7 +21379,6 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -21659,7 +21427,6 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -21717,7 +21484,6 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -21769,7 +21535,6 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -21819,7 +21584,6 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -21881,7 +21645,6 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -21946,7 +21709,6 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -22001,7 +21763,6 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -22065,7 +21826,6 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -23150,7 +22910,6 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -23190,7 +22949,6 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -23233,7 +22991,6 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -23277,7 +23034,6 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -23321,7 +23077,6 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -23365,7 +23120,6 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -23412,7 +23166,6 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -23457,7 +23210,6 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -23505,7 +23257,6 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -23554,7 +23305,6 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -23603,7 +23353,6 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -23652,7 +23401,6 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -23681,8 +23429,7 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) ; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask: @@ -23711,8 +23458,7 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* ; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem: @@ -23744,8 +23490,7 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6 ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpleq %xmm0, 
%xmm1, %k0 {%k1} -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask: @@ -23783,8 +23528,7 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem: @@ -23824,8 +23568,7 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b ; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpnltq (%rdi){1to2}, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b: @@ -23858,8 +23601,7 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, < ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpnltq (%rsi){1to2}, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b: @@ -24464,7 +24206,6 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -24508,7 +24249,6 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -24558,7 +24298,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -24611,7 +24350,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) 
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -24660,7 +24398,6 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -24712,7 +24449,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -24764,7 +24500,6 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -24814,7 +24549,6 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -24870,7 +24604,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -24929,7 +24662,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -24984,7 +24716,6 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -25042,7 +24773,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -25894,7 +25624,6 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -25940,7 +25669,6 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -26000,7 +25728,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -26063,7 +25790,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -26114,7 +25840,6 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, 
%zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -26176,7 +25901,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -26230,7 +25954,6 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -26282,7 +26005,6 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -26348,7 +26070,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -26417,7 +26138,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -26474,7 +26194,6 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -26542,7 +26261,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -27579,7 +27297,6 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -27619,7 +27336,6 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64 ; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -27659,7 +27375,6 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -27667,7 +27382,6 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -27708,7 +27422,6 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -27716,7 +27429,6 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -27762,7 +27474,6 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -27807,7 +27518,6 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64 ; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -27850,7 +27560,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, 
%xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -27860,7 +27569,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -27904,7 +27612,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -27914,7 +27621,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -27961,11 +27667,9 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %ecx @@ -28011,11 +27715,9 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %ecx @@ -28180,7 +27882,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax @@ -28210,7 +27911,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax @@ -28242,7 +27942,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; 
NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -28276,7 +27975,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -28317,7 +28015,6 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -28384,7 +28081,6 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -28453,7 +28149,6 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 @@ -28524,7 +28219,6 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 @@ -28596,7 +28290,6 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -28668,7 +28361,6 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -28742,7 +28434,6 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 @@ -28818,7 +28509,6 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 @@ -28900,7 +28590,6 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_ 
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -28943,7 +28632,6 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -28984,7 +28672,6 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -28992,7 +28679,6 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -29034,7 +28720,6 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -29042,7 +28727,6 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -29091,7 +28775,6 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -29139,7 +28822,6 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -29183,7 +28865,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -29193,7 +28874,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, 
%zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -29238,7 +28918,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -29248,7 +28927,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -29466,7 +29144,6 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 ; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 ; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 ; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0 @@ -29475,7 +29152,6 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %ecx @@ -29607,7 +29283,6 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 ; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -29616,7 +29291,6 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %ecx @@ -30826,11 +30500,9 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -30870,11 +30542,9 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; 
NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -30929,7 +30599,6 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -30990,7 +30659,6 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -31040,11 +30708,9 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -31101,7 +30767,6 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -31154,11 +30819,9 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -31204,11 +30867,9 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, 
%zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -31269,7 +30930,6 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -31336,7 +30996,6 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -31392,11 +31051,9 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -31459,7 +31116,6 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -32544,7 +32200,6 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -32584,7 +32239,6 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -32627,7 +32281,6 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, 
%xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -32671,7 +32324,6 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -32715,7 +32367,6 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -32759,7 +32410,6 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -32806,7 +32456,6 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -32851,7 +32500,6 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -32899,7 +32547,6 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -32948,7 +32595,6 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -32997,7 +32643,6 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -33046,7 +32691,6 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: 
vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -33075,8 +32719,7 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) ; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask: @@ -33106,8 +32749,7 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* ; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem: @@ -33139,8 +32781,7 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6 ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask: @@ -33181,8 +32822,7 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem: @@ -33224,8 +32864,7 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b ; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b: @@ -33259,8 +32898,7 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, < ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b: @@ -33889,11 +33527,9 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -33933,11 +33569,9 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: 
vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -33986,7 +33620,6 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -34041,7 +33674,6 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -34091,11 +33723,9 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -34146,7 +33776,6 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -34199,11 +33828,9 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -34249,11 +33876,9 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -34308,7 +33933,6 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -34369,7 +33993,6 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -34425,11 +34048,9 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -34486,7 +34107,6 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -35347,7 +34967,6 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -35393,7 +35012,6 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: vpshufb 
{{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -35454,7 +35072,6 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -35517,7 +35134,6 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -35569,7 +35185,6 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -35632,7 +35247,6 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -35687,7 +35301,6 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -35739,7 +35352,6 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, 
%zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -35806,7 +35418,6 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -35875,7 +35486,6 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -35933,7 +35543,6 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -36002,7 +35611,6 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -37177,8 +36785,7 @@ entry: define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %al killed %al killed %eax @@ -37186,9 +36793,7 @@ define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask(i4 zeroext %__u, <2 x i6 ; ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0 @@ -37238,8 +36843,7 @@ entry: define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 
; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %al killed %al killed %eax @@ -37247,9 +36851,7 @@ define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem(i4 zeroext %__u, <2 ; ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -37300,8 +36902,7 @@ entry: define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %al killed %al killed %eax @@ -37309,9 +36910,7 @@ define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, < ; ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2 ; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0 @@ -37523,8 +37122,7 @@ entry: define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %ax killed %ax killed %eax @@ -37532,9 +37130,7 @@ define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask(i4 zeroext %__u, <2 x ; ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0 @@ -37584,8 +37180,7 @@ entry: define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %ax killed %ax killed %eax @@ -37593,9 +37188,7 @@ define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem(i4 zeroext %__u, < ; ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -37646,8 +37239,7 @@ entry: 
define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %ax killed %ax killed %eax @@ -37655,9 +37247,7 @@ define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b(i4 zeroext %__u, ; ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2 ; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0 @@ -37729,11 +37319,9 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -37770,11 +37358,9 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -37813,11 +37399,9 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, float* ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -37839,8 +37423,7 @@ entry: define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: retq @@ -37853,24 +37436,21 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask(i4 zeroext %__u, <2 x ; 
NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: vzeroupper @@ -37889,8 +37469,7 @@ entry: define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: retq @@ -37903,24 +37482,21 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem(i4 zeroext %__u, < ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: vzeroupper @@ -37940,8 +37516,7 @@ entry: define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: retq @@ -37954,10 +37529,8 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, ; NoVLX-NEXT: movq %rsp, %rbp ; 
NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2 ; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0 @@ -37965,14 +37538,13 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: vzeroupper @@ -38015,11 +37587,9 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -38062,11 +37632,9 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -38111,11 +37679,9 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float* ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -38140,8 +37706,7 @@ entry: define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: 
test_masked_vcmpoeqps_v4i1_v64i1_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1} ; VLX-NEXT: kmovq %k0, %rax ; VLX-NEXT: retq @@ -38154,10 +37719,8 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $96, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0 @@ -38167,16 +37730,15 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; NoVLX-NEXT: shlq $32, %rcx -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp @@ -38196,8 +37758,7 @@ entry: define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1} ; VLX-NEXT: kmovq %k0, %rax ; VLX-NEXT: retq @@ -38210,10 +37771,8 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, < ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $96, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -38223,16 +37782,15 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; NoVLX-NEXT: shlq $32, %rcx -; NoVLX-NEXT: movl 
{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp @@ -38253,8 +37811,7 @@ entry: define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1} ; VLX-NEXT: kmovq %k0, %rax ; VLX-NEXT: retq @@ -38267,10 +37824,8 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $96, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2 ; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0 @@ -38281,16 +37836,15 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; NoVLX-NEXT: shlq $32, %rcx -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp @@ -39366,7 +38920,6 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -39406,7 +38959,6 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -39447,7 +38999,6 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float* ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -39491,7 +39042,6 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; 
NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -39535,7 +39085,6 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -39580,7 +39129,6 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -39669,7 +39217,6 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -39714,7 +39261,6 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -39760,7 +39306,6 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float* ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -39809,7 +39354,6 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -39858,7 +39402,6 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -39908,7 +39451,6 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -39991,8 +39533,7 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) ; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: 
test_vcmpoeqpd_v2i1_v4i1_mask: @@ -40019,8 +39560,7 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* ; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem: @@ -40048,8 +39588,7 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, double* % ; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b: @@ -40078,18 +39617,14 @@ entry: define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0 @@ -40115,18 +39650,14 @@ entry: define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 @@ -40153,18 +39684,14 @@ entry: define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = 
mem[0,0] ; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0 @@ -40310,8 +39837,7 @@ entry: define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %al killed %al killed %eax @@ -40319,9 +39845,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask(i2 zeroext %__u, <2 x i6 ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0 @@ -40357,8 +39881,7 @@ entry: define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %al killed %al killed %eax @@ -40366,9 +39889,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem(i2 zeroext %__u, <2 ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 @@ -40405,8 +39926,7 @@ entry: define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %al killed %al killed %eax @@ -40414,9 +39934,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b(i2 zeroext %__u, < ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0 @@ -40572,8 +40090,7 @@ entry: define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %ax killed %ax killed %eax @@ -40581,9 +40098,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask(i2 zeroext %__u, <2 x ; ; NoVLX-LABEL: 
test_masked_vcmpoeqpd_v2i1_v16i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0 @@ -40619,8 +40134,7 @@ entry: define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %ax killed %ax killed %eax @@ -40628,9 +40142,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem(i2 zeroext %__u, < ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 @@ -40667,8 +40179,7 @@ entry: define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %ax killed %ax killed %eax @@ -40676,9 +40187,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b(i2 zeroext %__u, ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0 @@ -40736,11 +40245,9 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -40777,11 +40284,9 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, 
%zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -40820,11 +40325,9 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double* ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -40846,8 +40349,7 @@ entry: define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: retq @@ -40860,24 +40362,21 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask(i2 zeroext %__u, <2 x ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: vzeroupper @@ -40896,8 +40395,7 @@ entry: define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: retq @@ -40910,24 +40408,21 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem(i2 zeroext %__u, < ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd (%rsi), 
%xmm0, %xmm0 ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: vzeroupper @@ -40947,8 +40442,7 @@ entry: define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: retq @@ -40961,10 +40455,8 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b(i2 zeroext %__u, ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0 @@ -40972,14 +40464,13 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b(i2 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: vzeroupper @@ -41022,11 +40513,9 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -41069,11 +40558,9 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -41118,11 +40605,9 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double* ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -41147,8 +40632,7 @@ entry: define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1} ; VLX-NEXT: kmovq %k0, %rax ; VLX-NEXT: retq @@ -41161,10 +40645,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $96, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0 @@ -41174,16 +40656,15 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; NoVLX-NEXT: shlq $32, %rcx -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp @@ -41203,8 +40684,7 @@ entry: define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; 
VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1} ; VLX-NEXT: kmovq %k0, %rax ; VLX-NEXT: retq @@ -41217,10 +40697,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, < ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $96, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 @@ -41230,16 +40708,15 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; NoVLX-NEXT: shlq $32, %rcx -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp @@ -41260,8 +40737,7 @@ entry: define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1} ; VLX-NEXT: kmovq %k0, %rax ; VLX-NEXT: retq @@ -41274,10 +40750,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u, ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $96, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0 @@ -41288,16 +40762,15 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; NoVLX-NEXT: shlq $32, %rcx -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp @@ -41487,8 
+40960,7 @@ entry: define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %al killed %al killed %eax @@ -41497,9 +40969,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask(i4 zeroext %__u, <4 x i6 ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 @@ -41550,8 +41020,7 @@ entry: define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %al killed %al killed %eax @@ -41560,9 +41029,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem(i4 zeroext %__u, <4 ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 @@ -41614,8 +41081,7 @@ entry: define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %al killed %al killed %eax @@ -41624,9 +41090,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, < ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2 ; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0 @@ -41848,8 +41312,7 @@ entry: define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %ax killed %ax killed %eax @@ -41858,9 +41321,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask(i4 zeroext %__u, <4 x ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb 
%dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 @@ -41911,8 +41372,7 @@ entry: define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %ax killed %ax killed %eax @@ -41921,9 +41381,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem(i4 zeroext %__u, < ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 @@ -41975,8 +41433,7 @@ entry: define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %ax killed %ax killed %eax @@ -41985,9 +41442,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b(i4 zeroext %__u, ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2 ; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0 @@ -42062,7 +41517,6 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -42105,7 +41559,6 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -42150,7 +41603,6 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double* ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -42176,8 +41628,7 @@ entry: define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: vzeroupper @@ -42191,10 +41642,8 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 @@ -42202,14 +41651,13 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: vzeroupper @@ -42228,8 +41676,7 @@ entry: define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: vzeroupper @@ -42243,10 +41690,8 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, < ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 @@ -42254,14 +41699,13 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: vzeroupper @@ -42281,8 +41725,7 @@ entry: define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: vzeroupper @@ -42296,10 +41739,8 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2 ; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0 @@ -42308,14 +41749,13 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: vzeroupper @@ -42360,7 +41800,6 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -42409,7 +41848,6 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: 
vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -42460,7 +41898,6 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double* ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -42489,8 +41926,7 @@ entry: define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1} ; VLX-NEXT: kmovq %k0, %rax ; VLX-NEXT: vzeroupper @@ -42504,10 +41940,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $96, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 @@ -42518,16 +41952,15 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; NoVLX-NEXT: shlq $32, %rcx -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp @@ -42547,8 +41980,7 @@ entry: define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1} ; VLX-NEXT: kmovq %k0, %rax ; VLX-NEXT: vzeroupper @@ -42562,10 +41994,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, < ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $96, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0 ; 
NoVLX-NEXT: vpmovqd %zmm0, %ymm0 @@ -42576,16 +42006,15 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; NoVLX-NEXT: shlq $32, %rcx -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp @@ -42606,8 +42035,7 @@ entry: define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1} ; VLX-NEXT: kmovq %k0, %rax ; VLX-NEXT: vzeroupper @@ -42621,10 +42049,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $96, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2 ; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0 @@ -42636,16 +42062,15 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; NoVLX-NEXT: shlq $32, %rcx -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp @@ -43830,7 +43255,6 @@ define i32 @test_cmpm_rnd_zero(<16 x float> %a, <16 x float> %b) { ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax diff --git a/test/CodeGen/X86/bitcast-and-setcc-128.ll b/test/CodeGen/X86/bitcast-and-setcc-128.ll index 78c44e4dca3b..45af265a95b0 100644 --- a/test/CodeGen/X86/bitcast-and-setcc-128.ll +++ b/test/CodeGen/X86/bitcast-and-setcc-128.ll @@ -31,11 +31,9 @@ define i8 @v8i16(<8 
x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512F-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm0 ; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %al killed %al killed %eax @@ -80,8 +78,7 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { ; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i32: @@ -89,8 +86,7 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { ; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x0 = icmp sgt <4 x i32> %a, %b %x1 = icmp sgt <4 x i32> %c, %d @@ -123,8 +119,7 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) ; AVX512F-NEXT: vcmpltps %xmm0, %xmm1, %k1 ; AVX512F-NEXT: vcmpltps %xmm2, %xmm3, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4f32: @@ -132,8 +127,7 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) ; AVX512BW-NEXT: vcmpltps %xmm0, %xmm1, %k1 ; AVX512BW-NEXT: vcmpltps %xmm2, %xmm3, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x0 = fcmp ogt <4 x float> %a, %b %x1 = fcmp ogt <4 x float> %c, %d @@ -165,11 +159,9 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax @@ -318,8 +310,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) { ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i8: @@ -335,8 +326,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) { ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def 
%al killed %al killed %eax ; AVX512BW-NEXT: retq %x0 = icmp sgt <2 x i8> %a, %b %x1 = icmp sgt <2 x i8> %c, %d @@ -473,8 +463,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) { ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i16: @@ -490,8 +479,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) { ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x0 = icmp sgt <2 x i16> %a, %b %x1 = icmp sgt <2 x i16> %c, %d @@ -612,8 +600,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i32: @@ -629,8 +616,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x0 = icmp sgt <2 x i32> %a, %b %x1 = icmp sgt <2 x i32> %c, %d @@ -682,8 +668,7 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i64: @@ -691,8 +676,7 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x0 = icmp sgt <2 x i64> %a, %b %x1 = icmp sgt <2 x i64> %c, %d @@ -725,8 +709,7 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> ; AVX512F-NEXT: vcmpltpd %xmm0, %xmm1, %k1 ; AVX512F-NEXT: vcmpltpd %xmm2, %xmm3, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2f64: @@ -734,8 +717,7 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> ; AVX512BW-NEXT: vcmpltpd %xmm0, %xmm1, %k1 ; AVX512BW-NEXT: vcmpltpd %xmm2, %xmm3, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x0 = fcmp ogt 
<2 x double> %a, %b %x1 = fcmp ogt <2 x double> %c, %d @@ -792,8 +774,7 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { ; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i8: @@ -809,8 +790,7 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { ; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x0 = icmp sgt <4 x i8> %a, %b %x1 = icmp sgt <4 x i8> %c, %d @@ -867,8 +847,7 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) { ; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i16: @@ -884,8 +863,7 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) { ; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x0 = icmp sgt <4 x i16> %a, %b %x1 = icmp sgt <4 x i16> %c, %d @@ -944,10 +922,8 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { ; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512F-NEXT: vpmovsxwd %xmm2, %ymm0 -; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %al killed %al killed %eax diff --git a/test/CodeGen/X86/bitcast-and-setcc-256.ll b/test/CodeGen/X86/bitcast-and-setcc-256.ll index fdce65516e32..62480bb0bd25 100644 --- a/test/CodeGen/X86/bitcast-and-setcc-256.ll +++ b/test/CodeGen/X86/bitcast-and-setcc-256.ll @@ -94,8 +94,7 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { ; AVX512F-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 ; AVX512F-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -104,8 +103,7 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { ; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 ; AVX512BW-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %x0 = icmp sgt <4 x i64> %a, %b @@ -148,8 +146,7 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> ; AVX512F-NEXT: vcmpltpd %ymm0, %ymm1, %k1 ; 
AVX512F-NEXT: vcmpltpd %ymm2, %ymm3, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -158,8 +155,7 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> ; AVX512BW-NEXT: vcmpltpd %ymm0, %ymm1, %k1 ; AVX512BW-NEXT: vcmpltpd %ymm2, %ymm3, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %x0 = fcmp ogt <4 x double> %a, %b @@ -219,11 +215,9 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll index 6ef2be99dee5..04cbded7667c 100644 --- a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll +++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll @@ -44,10 +44,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) { ; ; AVX512-LABEL: ext_i2_2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: andb $3, %dil -; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: kmovd %edi, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: retq @@ -86,10 +83,7 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) { ; ; AVX512-LABEL: ext_i4_4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: andb $15, %dil -; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: kmovd %edi, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: retq @@ -102,8 +96,8 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; SSE2-SSSE3-LABEL: ext_i8_8i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0 @@ -112,8 +106,8 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; AVX1-LABEL: ext_i8_8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 @@ -239,10 +233,7 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) { ; ; AVX512-LABEL: ext_i4_4i64: ; AVX512: # %bb.0: -; AVX512-NEXT: andb 
$15, %dil -; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: kmovd %edi, %k1 ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} ; AVX512-NEXT: retq @@ -305,8 +296,8 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) { ; SSE2-SSSE3-LABEL: ext_i16_16i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 @@ -319,8 +310,8 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) { ; AVX1-LABEL: ext_i16_16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -565,8 +556,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; SSE2-SSSE3-LABEL: ext_i32_32i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm2 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 @@ -574,8 +565,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [256,512,1024,2048,4096,8192,16384,32768] ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm5, %xmm1 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm4, %xmm2 @@ -586,8 +577,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX1-LABEL: ext_i32_32i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm1 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 @@ -599,8 +590,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 
$1, %ymm1, %xmm2 diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll index 9e77cd11449e..54ba1881f115 100644 --- a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -49,9 +49,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) { ; AVX512F-LABEL: ext_i2_2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: andb $3, %dil -; AVX512F-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} ; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0 ; AVX512F-NEXT: vzeroupper @@ -59,10 +57,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) { ; ; AVX512VLBW-LABEL: ext_i2_2i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: andb $3, %dil -; AVX512VLBW-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512VLBW-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: kmovd %edi, %k1 ; AVX512VLBW-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} ; AVX512VLBW-NEXT: retq %1 = bitcast i2 %a0 to <2 x i1> @@ -104,9 +99,7 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) { ; AVX512F-LABEL: ext_i4_4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: andb $15, %dil -; AVX512F-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} ; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0 ; AVX512F-NEXT: vzeroupper @@ -114,10 +107,7 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) { ; ; AVX512VLBW-LABEL: ext_i4_4i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: andb $15, %dil -; AVX512VLBW-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512VLBW-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: kmovd %edi, %k1 ; AVX512VLBW-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} ; AVX512VLBW-NEXT: retq %1 = bitcast i4 %a0 to <4 x i1> @@ -129,8 +119,8 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; SSE2-SSSE3-LABEL: ext_i8_8i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0 @@ -140,8 +130,8 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; AVX1-LABEL: ext_i8_8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 @@ -300,19 +290,14 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) { ; AVX512F-LABEL: ext_i4_4i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: andb $15, %dil -; AVX512F-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} ; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0 ; AVX512F-NEXT: retq ; ; 
AVX512VLBW-LABEL: ext_i4_4i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: andb $15, %dil -; AVX512VLBW-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512VLBW-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: kmovd %edi, %k1 ; AVX512VLBW-NEXT: vpbroadcastq {{.*}}(%rip), %ymm0 {%k1} {z} ; AVX512VLBW-NEXT: retq %1 = bitcast i4 %a0 to <4 x i1> @@ -385,8 +370,8 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) { ; SSE2-SSSE3-LABEL: ext_i16_16i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 @@ -401,8 +386,8 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) { ; AVX1-LABEL: ext_i16_16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -723,8 +708,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; SSE2-SSSE3-LABEL: ext_i32_32i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm2 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 @@ -734,8 +719,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: psrlw $15, %xmm1 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm4, %xmm2 @@ -748,8 +733,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX1-LABEL: ext_i32_32i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm1 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 @@ -763,8 +748,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vandps 
%ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm2 diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll index 45a48fae146d..8af95dfd5b80 100644 --- a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll +++ b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll @@ -43,9 +43,7 @@ define <2 x i1> @bitcast_i2_2i1(i2 zeroext %a0) { ; ; AVX512-LABEL: bitcast_i2_2i1: ; AVX512: # %bb.0: -; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: kmovd %edi, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: retq @@ -86,9 +84,7 @@ define <4 x i1> @bitcast_i4_4i1(i4 zeroext %a0) { ; ; AVX512-LABEL: bitcast_i4_4i1: ; AVX512: # %bb.0: -; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: kmovd %edi, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: retq @@ -100,8 +96,8 @@ define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) { ; SSE2-SSSE3-LABEL: bitcast_i8_8i1: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0 @@ -111,8 +107,8 @@ define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) { ; AVX1-LABEL: bitcast_i8_8i1: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll index 8fdacb7b79d6..a96c1a30e67a 100644 --- a/test/CodeGen/X86/bitcast-setcc-128.ll +++ b/test/CodeGen/X86/bitcast-setcc-128.ll @@ -27,7 +27,6 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %al killed %al killed %eax @@ -64,16 +63,14 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x = icmp sgt <4 x i32> %a, %b %res = bitcast <4 x i1> %x to i4 @@ -99,16 +96,14 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcmpltps %xmm0, %xmm1, %k0 ; 
AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4f32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vcmpltps %xmm0, %xmm1, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x = fcmp ogt <4 x float> %a, %b %res = bitcast <4 x i1> %x to i4 @@ -134,7 +129,6 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax @@ -226,8 +220,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) { ; AVX512F-NEXT: vpsraq $56, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i8: @@ -238,8 +231,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) { ; AVX512BW-NEXT: vpsraq $56, %xmm0, %xmm0 ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x = icmp sgt <2 x i8> %a, %b %res = bitcast <2 x i1> %x to i2 @@ -320,8 +312,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) { ; AVX512F-NEXT: vpsraq $48, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i16: @@ -332,8 +323,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) { ; AVX512BW-NEXT: vpsraq $48, %xmm0, %xmm0 ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x = icmp sgt <2 x i16> %a, %b %res = bitcast <2 x i1> %x to i2 @@ -406,8 +396,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { ; AVX512F-NEXT: vpsraq $32, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i32: @@ -418,8 +407,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { ; AVX512BW-NEXT: vpsraq $32, %xmm0, %xmm0 ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x = icmp sgt <2 x i32> %a, %b %res = bitcast <2 x i1> %x to i2 @@ -455,16 +443,14 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; 
AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x = icmp sgt <2 x i64> %a, %b %res = bitcast <2 x i1> %x to i2 @@ -490,16 +476,14 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcmpltpd %xmm0, %xmm1, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2f64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vcmpltpd %xmm0, %xmm1, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x = fcmp ogt <2 x double> %a, %b %res = bitcast <2 x i1> %x to i2 @@ -537,8 +521,7 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) { ; AVX512F-NEXT: vpsrad $24, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i8: @@ -549,8 +532,7 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) { ; AVX512BW-NEXT: vpsrad $24, %xmm0, %xmm0 ; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x = icmp sgt <4 x i8> %a, %b %res = bitcast <4 x i1> %x to i4 @@ -588,8 +570,7 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) { ; AVX512F-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i16: @@ -600,8 +581,7 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) { ; AVX512BW-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x = icmp sgt <4 x i16> %a, %b %res = bitcast <4 x i1> %x to i4 @@ -641,7 +621,6 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) { ; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %al killed %al killed %eax diff --git a/test/CodeGen/X86/bitcast-setcc-256.ll b/test/CodeGen/X86/bitcast-setcc-256.ll index 48e28c9d26ca..0398f31f12d4 100644 --- a/test/CodeGen/X86/bitcast-setcc-256.ll +++ b/test/CodeGen/X86/bitcast-setcc-256.ll @@ -42,7 +42,6 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, 
%k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax @@ -194,11 +193,9 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, (%rsp) ; AVX512F-NEXT: movl (%rsp), %eax @@ -271,8 +268,7 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -280,8 +276,7 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %x = icmp sgt <4 x i64> %a, %b @@ -311,8 +306,7 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcmpltpd %ymm0, %ymm1, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -320,8 +314,7 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b) { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vcmpltpd %ymm0, %ymm1, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %x = fcmp ogt <4 x double> %a, %b diff --git a/test/CodeGen/X86/bitcast-setcc-512.ll b/test/CodeGen/X86/bitcast-setcc-512.ll index 9914f0b93434..f752068acdf0 100644 --- a/test/CodeGen/X86/bitcast-setcc-512.ll +++ b/test/CodeGen/X86/bitcast-setcc-512.ll @@ -62,14 +62,12 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, (%rsp) ; AVX512F-NEXT: movl (%rsp), %eax @@ -870,21 +868,17 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 ; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, (%rsp) ; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, 
%xmm1 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: movl (%rsp), %ecx diff --git a/test/CodeGen/X86/broadcastm-lowering.ll b/test/CodeGen/X86/broadcastm-lowering.ll index 8548d8b7677d..428eaa19497b 100644 --- a/test/CodeGen/X86/broadcastm-lowering.ll +++ b/test/CodeGen/X86/broadcastm-lowering.ll @@ -8,7 +8,6 @@ define <2 x i64> @test_mm_epi64(<8 x i16> %a, <8 x i16> %b) { ; AVX512CD: # %bb.0: # %entry ; AVX512CD-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512CD-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512CD-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512CD-NEXT: kmovw %k0, %eax ; AVX512CD-NEXT: vpxor %xmm0, %xmm0, %xmm0 @@ -45,7 +44,6 @@ define <4 x i32> @test_mm_epi32(<16 x i8> %a, <16 x i8> %b) { ; AVX512CD: # %bb.0: # %entry ; AVX512CD-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512CD-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512CD-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512CD-NEXT: kmovw %k0, %eax ; AVX512CD-NEXT: vpxor %xmm0, %xmm0, %xmm0 @@ -179,7 +177,6 @@ define <8 x i32> @test_mm256_epi32(<16 x i16> %a, <16 x i16> %b) { ; AVX512CD: # %bb.0: # %entry ; AVX512CD-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512CD-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512CD-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512CD-NEXT: kmovw %k0, %eax ; AVX512CD-NEXT: vpxor %xmm0, %xmm0, %xmm0 diff --git a/test/CodeGen/X86/build-vector-128.ll b/test/CodeGen/X86/build-vector-128.ll index da92fe6c3fda..6c0c2d30c312 100644 --- a/test/CodeGen/X86/build-vector-128.ll +++ b/test/CodeGen/X86/build-vector-128.ll @@ -409,3 +409,101 @@ define <16 x i8> @test_buildvector_v16i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, %ins15 = insertelement <16 x i8> %ins14, i8 %a15, i32 15 ret <16 x i8> %ins15 } + +; PR30780 + +define <4 x i32> @test_buildvector_v4i32_splat_sext_i8(i8 %in) { +; SSE-32-LABEL: test_buildvector_v4i32_splat_sext_i8: +; SSE-32: # %bb.0: +; SSE-32-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: movd %eax, %xmm0 +; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: test_buildvector_v4i32_splat_sext_i8: +; SSE-64: # %bb.0: +; SSE-64-NEXT: movsbl %dil, %eax +; SSE-64-NEXT: movd %eax, %xmm0 +; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-64-NEXT: retq +; +; AVX1-32-LABEL: test_buildvector_v4i32_splat_sext_i8: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; AVX1-32-NEXT: vmovd %eax, %xmm0 +; AVX1-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-32-NEXT: retl +; +; AVX1-64-LABEL: test_buildvector_v4i32_splat_sext_i8: +; AVX1-64: # %bb.0: +; AVX1-64-NEXT: movsbl %dil, %eax +; AVX1-64-NEXT: vmovd %eax, %xmm0 +; AVX1-64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-64-NEXT: retq +; +; AVX2-32-LABEL: test_buildvector_v4i32_splat_sext_i8: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; AVX2-32-NEXT: vmovd %eax, %xmm0 +; AVX2-32-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-32-NEXT: retl +; +; AVX2-64-LABEL: test_buildvector_v4i32_splat_sext_i8: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: movsbl %dil, %eax +; AVX2-64-NEXT: vmovd %eax, %xmm0 +; AVX2-64-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-64-NEXT: retq + %ext = sext i8 %in to i32 + 
%insert = insertelement <4 x i32> undef, i32 %ext, i32 0 + %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer + ret <4 x i32> %splat +} + +define <4 x i32> @test_buildvector_v4i32_splat_zext_i8(i8 %in) { +; SSE-32-LABEL: test_buildvector_v4i32_splat_zext_i8: +; SSE-32: # %bb.0: +; SSE-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: movd %eax, %xmm0 +; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: test_buildvector_v4i32_splat_zext_i8: +; SSE-64: # %bb.0: +; SSE-64-NEXT: movzbl %dil, %eax +; SSE-64-NEXT: movd %eax, %xmm0 +; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-64-NEXT: retq +; +; AVX1-32-LABEL: test_buildvector_v4i32_splat_zext_i8: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; AVX1-32-NEXT: vmovd %eax, %xmm0 +; AVX1-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-32-NEXT: retl +; +; AVX1-64-LABEL: test_buildvector_v4i32_splat_zext_i8: +; AVX1-64: # %bb.0: +; AVX1-64-NEXT: movzbl %dil, %eax +; AVX1-64-NEXT: vmovd %eax, %xmm0 +; AVX1-64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-64-NEXT: retq +; +; AVX2-32-LABEL: test_buildvector_v4i32_splat_zext_i8: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; AVX2-32-NEXT: vmovd %eax, %xmm0 +; AVX2-32-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-32-NEXT: retl +; +; AVX2-64-LABEL: test_buildvector_v4i32_splat_zext_i8: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: movzbl %dil, %eax +; AVX2-64-NEXT: vmovd %eax, %xmm0 +; AVX2-64-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-64-NEXT: retq + %ext = zext i8 %in to i32 + %insert = insertelement <4 x i32> undef, i32 %ext, i32 0 + %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer + ret <4 x i32> %splat +} diff --git a/test/CodeGen/X86/build-vector-256.ll b/test/CodeGen/X86/build-vector-256.ll index f2f17710033d..d2d7a194c701 100644 --- a/test/CodeGen/X86/build-vector-256.ll +++ b/test/CodeGen/X86/build-vector-256.ll @@ -411,3 +411,77 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, %ins31 = insertelement <32 x i8> %ins30, i8 %a31, i32 31 ret <32 x i8> %ins31 } + +; PR30780 + +define <8 x i32> @test_buildvector_v8i32_splat_sext_i8(i8 %in) { +; AVX1-32-LABEL: test_buildvector_v8i32_splat_sext_i8: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; AVX1-32-NEXT: vmovd %eax, %xmm0 +; AVX1-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-32-NEXT: retl +; +; AVX1-64-LABEL: test_buildvector_v8i32_splat_sext_i8: +; AVX1-64: # %bb.0: +; AVX1-64-NEXT: movsbl %dil, %eax +; AVX1-64-NEXT: vmovd %eax, %xmm0 +; AVX1-64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-64-NEXT: retq +; +; AVX2-32-LABEL: test_buildvector_v8i32_splat_sext_i8: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; AVX2-32-NEXT: vmovd %eax, %xmm0 +; AVX2-32-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-32-NEXT: retl +; +; AVX2-64-LABEL: test_buildvector_v8i32_splat_sext_i8: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: movsbl %dil, %eax +; AVX2-64-NEXT: vmovd %eax, %xmm0 +; AVX2-64-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-64-NEXT: retq + %ext = sext i8 %in to i32 + %insert = insertelement <8 x i32> undef, i32 %ext, i32 0 + %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer + ret <8 x i32> %splat +} + +define <8 x i32> @test_buildvector_v8i32_splat_zext_i8(i8 %in) { +; 
AVX1-32-LABEL: test_buildvector_v8i32_splat_zext_i8: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; AVX1-32-NEXT: vmovd %eax, %xmm0 +; AVX1-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-32-NEXT: retl +; +; AVX1-64-LABEL: test_buildvector_v8i32_splat_zext_i8: +; AVX1-64: # %bb.0: +; AVX1-64-NEXT: movzbl %dil, %eax +; AVX1-64-NEXT: vmovd %eax, %xmm0 +; AVX1-64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-64-NEXT: retq +; +; AVX2-32-LABEL: test_buildvector_v8i32_splat_zext_i8: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; AVX2-32-NEXT: vmovd %eax, %xmm0 +; AVX2-32-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-32-NEXT: retl +; +; AVX2-64-LABEL: test_buildvector_v8i32_splat_zext_i8: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: movzbl %dil, %eax +; AVX2-64-NEXT: vmovd %eax, %xmm0 +; AVX2-64-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-64-NEXT: retq + %ext = zext i8 %in to i32 + %insert = insertelement <8 x i32> undef, i32 %ext, i32 0 + %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer + ret <8 x i32> %splat +} diff --git a/test/CodeGen/X86/cast-vsel.ll b/test/CodeGen/X86/cast-vsel.ll index ee63ec653918..ff41083835f4 100644 --- a/test/CodeGen/X86/cast-vsel.ll +++ b/test/CodeGen/X86/cast-vsel.ll @@ -409,11 +409,11 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind { ; SSE2-LABEL: example24: ; SSE2: # %bb.0: # %vector.ph ; SSE2-NEXT: movd %edi, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movd %esi, %xmm1 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: movq $-4096, %rax # imm = 0xF000 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB6_1: # %vector.body @@ -441,11 +441,11 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind { ; SSE41-LABEL: example24: ; SSE41: # %bb.0: # %vector.ph ; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE41-NEXT: movd %esi, %xmm0 -; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; SSE41-NEXT: movq $-4096, %rax # imm = 0xF000 ; SSE41-NEXT: .p2align 4, 0x90 ; SSE41-NEXT: .LBB6_1: # %vector.body @@ -470,11 +470,11 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind { ; AVX1-LABEL: example24: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vmovd %esi, %xmm1 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; AVX1-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] diff --git a/test/CodeGen/X86/cvtv2f32.ll b/test/CodeGen/X86/cvtv2f32.ll index 556c858759fc..afb9aa9411ed 100644 --- a/test/CodeGen/X86/cvtv2f32.ll +++ b/test/CodeGen/X86/cvtv2f32.ll @@ -5,8 +5,8 @@ ; uitofp <2 x i32> codegen from buildvector or legalization is different but gives the same results ; across the full 0 - 0xFFFFFFFF u32 range. -define <2 x float> @uitofp_2i32_buildvector(i32 %x, i32 %y, <2 x float> %v) { -; X32-LABEL: uitofp_2i32_buildvector: +define <2 x float> @uitofp_2i32_cvt_buildvector(i32 %x, i32 %y, <2 x float> %v) { +; X32-LABEL: uitofp_2i32_cvt_buildvector: ; X32: # %bb.0: ; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X32-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200] @@ -18,7 +18,7 @@ define <2 x float> @uitofp_2i32_buildvector(i32 %x, i32 %y, <2 x float> %v) { ; X32-NEXT: mulps %xmm1, %xmm0 ; X32-NEXT: retl ; -; X64-LABEL: uitofp_2i32_buildvector: +; X64-LABEL: uitofp_2i32_cvt_buildvector: ; X64: # %bb.0: ; X64-NEXT: movd %edi, %xmm1 ; X64-NEXT: pinsrd $1, %esi, %xmm1 @@ -38,6 +38,37 @@ define <2 x float> @uitofp_2i32_buildvector(i32 %x, i32 %y, <2 x float> %v) { ret <2 x float> %t5 } +define <2 x float> @uitofp_2i32_buildvector_cvt(i32 %x, i32 %y, <2 x float> %v) { +; X32-LABEL: uitofp_2i32_buildvector_cvt: +; X32: # %bb.0: +; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; X32-NEXT: movapd {{.*#+}} xmm1 = [4.503600e+15,4.503600e+15] +; X32-NEXT: orpd %xmm1, %xmm2 +; X32-NEXT: subpd %xmm1, %xmm2 +; X32-NEXT: cvtpd2ps %xmm2, %xmm1 +; X32-NEXT: mulps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: uitofp_2i32_buildvector_cvt: +; X64: # %bb.0: +; X64-NEXT: movd %esi, %xmm1 +; X64-NEXT: movd %edi, %xmm2 +; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; X64-NEXT: movdqa {{.*#+}} xmm1 = [4.503600e+15,4.503600e+15] +; X64-NEXT: por %xmm1, %xmm2 +; X64-NEXT: subpd %xmm1, %xmm2 +; X64-NEXT: cvtpd2ps %xmm2, %xmm1 +; X64-NEXT: mulps %xmm1, %xmm0 +; X64-NEXT: retq + %t1 = insertelement <2 x i32> undef, i32 %x, i32 0 + %t2 = insertelement <2 x i32> %t1, i32 %y, i32 1 + %t3 = uitofp <2 x i32> %t2 to <2 x float> + %t4 = fmul <2 x float> %v, %t3 + ret <2 x float> %t4 +} + define <2 x float> @uitofp_2i32_legalized(<2 x i32> %in, <2 x float> %v) { ; X32-LABEL: uitofp_2i32_legalized: ; X32: # %bb.0: diff --git a/test/CodeGen/X86/fixup-bw-inst.mir b/test/CodeGen/X86/fixup-bw-inst.mir index cea483e1b9bc..e5a5e16108fb 100644 --- a/test/CodeGen/X86/fixup-bw-inst.mir +++ b/test/CodeGen/X86/fixup-bw-inst.mir @@ -26,6 +26,12 @@ ret i16 %i.0 } + define i16 @test4() { + entry: + %t1 = zext i1 undef to i16 + %t2 = or i16 undef, %t1 + ret i16 %t2 + } ... --- # CHECK-LABEL: name: test1 @@ -149,3 +155,47 @@ body: | RETQ %ax ... 
+--- +# CHECK-LABEL: name: test4 +name: test4 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: +liveins: + - { reg: '%r9d' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +# This code copies r10b into r9b and then uses r9w. We would like to promote +# the copy to a 32-bit copy, but because r9w is used this is not acceptable. +body: | + bb.0.entry: + successors: + liveins: %r9d + + %r9b = MOV8rr undef %r10b, implicit-def %r9d, implicit killed %r9d, implicit-def %eflags + ; CHECK-NOT: MOV32rr + %ax = OR16rr undef %ax, %r9w, implicit-def %eflags + RETQ %ax +... diff --git a/test/CodeGen/X86/memset-nonzero.ll b/test/CodeGen/X86/memset-nonzero.ll index 1c97e8c768cc..cc434bf18ab3 100644 --- a/test/CodeGen/X86/memset-nonzero.ll +++ b/test/CodeGen/X86/memset-nonzero.ll @@ -206,8 +206,8 @@ define void @memset_16_nonconst_bytes(i8* %x, i8 %c) { ; SSE2FAST: # %bb.0: ; SSE2FAST-NEXT: movd %esi, %xmm0 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2FAST-NEXT: movdqu %xmm0, (%rdi) ; SSE2FAST-NEXT: retq ; @@ -245,8 +245,8 @@ define void @memset_32_nonconst_bytes(i8* %x, i8 %c) { ; SSE2FAST: # %bb.0: ; SSE2FAST-NEXT: movd %esi, %xmm0 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi) ; SSE2FAST-NEXT: movdqu %xmm0, (%rdi) ; SSE2FAST-NEXT: retq @@ -292,8 +292,8 @@ define void @memset_64_nonconst_bytes(i8* %x, i8 %c) { ; SSE2FAST: # %bb.0: ; SSE2FAST-NEXT: movd %esi, %xmm0 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi) ; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi) ; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi) @@ -351,8 +351,8 @@ define void @memset_128_nonconst_bytes(i8* %x, i8 %c) { ; SSE2FAST: # %bb.0: ; SSE2FAST-NEXT: movd %esi, %xmm0 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2FAST-NEXT: movdqu %xmm0, 112(%rdi) ; SSE2FAST-NEXT: movdqu %xmm0, 96(%rdi) ; SSE2FAST-NEXT: movdqu %xmm0, 80(%rdi) @@ -400,8 +400,8 @@ define void @memset_256_nonconst_bytes(i8* %x, i8 %c) { ; SSE2FAST: # %bb.0: ; SSE2FAST-NEXT: movd %esi, %xmm0 ; 
SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2FAST-NEXT: movdqu %xmm0, 240(%rdi) ; SSE2FAST-NEXT: movdqu %xmm0, 224(%rdi) ; SSE2FAST-NEXT: movdqu %xmm0, 208(%rdi) diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll index df97973aecbd..50f44419e823 100644 --- a/test/CodeGen/X86/oddshuffles.ll +++ b/test/CodeGen/X86/oddshuffles.ll @@ -695,7 +695,7 @@ define void @pr29025(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) nounw ; SSE2-NEXT: packuswb %xmm2, %xmm2 ; SSE2-NEXT: packuswb %xmm2, %xmm2 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,1,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,3] ; SSE2-NEXT: pandn %xmm2, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: movq %xmm1, (%rdi) diff --git a/test/CodeGen/X86/pr33349.ll b/test/CodeGen/X86/pr33349.ll index b1428ba6667c..8f9c861d9ecf 100644 --- a/test/CodeGen/X86/pr33349.ll +++ b/test/CodeGen/X86/pr33349.ll @@ -40,7 +40,7 @@ target triple = "x86_64-unknown-linux-gnu" ; SKX: # %bb.0: # %bb ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 ; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 -; SKX-NEXT: kshiftrw $2, %k0, %k1 +; SKX-NEXT: kshiftrb $2, %k0, %k1 ; SKX-NEXT: kshiftrw $1, %k1, %k2 ; SKX-NEXT: kmovd %k2, %eax ; SKX-NEXT: testb $1, %al diff --git a/test/CodeGen/X86/pr35765.ll b/test/CodeGen/X86/pr35765.ll new file mode 100644 index 000000000000..4d097459e33a --- /dev/null +++ b/test/CodeGen/X86/pr35765.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-unknown-linux-gnu %s -o - | FileCheck %s + +@ll = local_unnamed_addr global i64 0, align 8 +@x = local_unnamed_addr global i64 2651237805702985558, align 8 +@s1 = local_unnamed_addr global { i8, i8 } { i8 123, i8 5 }, align 2 +@s2 = local_unnamed_addr global { i8, i8 } { i8 -122, i8 3 }, align 2 + +define void @PR35765() { +; CHECK-LABEL: PR35765: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzwl {{.*}}(%rip), %ecx +; CHECK-NEXT: addl $-1398, %ecx # imm = 0xFA8A +; CHECK-NEXT: movl $4, %eax +; CHECK-NEXT: # kill: def %cl killed %cl killed %ecx +; CHECK-NEXT: shll %cl, %eax +; CHECK-NEXT: movzwl {{.*}}(%rip), %ecx +; CHECK-NEXT: movzwl {{.*}}(%rip), %edx +; CHECK-NEXT: notl %edx +; CHECK-NEXT: orl $63488, %edx # imm = 0xF800 +; CHECK-NEXT: movzwl %dx, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: xorl %eax, %edx +; CHECK-NEXT: movslq %edx, %rax +; CHECK-NEXT: movq %rax, {{.*}}(%rip) +; CHECK-NEXT: retq +entry: + %bf.load.i = load i16, i16* bitcast ({ i8, i8 }* @s1 to i16*), align 2 + %bf.clear.i = and i16 %bf.load.i, 2047 + %conv.i = zext i16 %bf.clear.i to i32 + %sub.i = add nsw i32 %conv.i, -1398 + %shl.i = shl i32 4, %sub.i + %0 = load i64, i64* @x, align 8 + %bf.load1.i = load i16, i16* bitcast ({ i8, i8 }* @s2 to i16*), align 2 + %bf.clear2.i = and i16 %bf.load1.i, 2047 + %1 = xor i16 %bf.clear2.i, -1 + %neg.i = zext i16 %1 to i64 + %or.i = or i64 %0, %neg.i + %conv5.i = trunc i64 %or.i to i32 + %conv6.i = and i32 %conv5.i, 65535 + %xor.i = xor i32 %conv6.i, %shl.i + %conv7.i = sext i32 %xor.i to i64 + store i64 %conv7.i, i64* @ll, align 8 + ret void +} diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll index 8642bc596f39..490c232a161c 100644 --- 
a/test/CodeGen/X86/psubus.ll +++ b/test/CodeGen/X86/psubus.ll @@ -54,16 +54,16 @@ define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind { ; SSE-LABEL: test3: ; SSE: # %bb.0: # %vector.ph ; SSE-NEXT: movd %edi, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: psubusw %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: test3: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vmovd %edi, %xmm1 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -137,8 +137,8 @@ define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind { ; SSE2: # %bb.0: # %vector.ph ; SSE2-NEXT: movd %edi, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: psubusb %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -267,8 +267,8 @@ define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind { ; SSE-LABEL: test9: ; SSE: # %bb.0: # %vector.ph ; SSE-NEXT: movd %edi, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE-NEXT: psubusw %xmm2, %xmm0 ; SSE-NEXT: psubusw %xmm2, %xmm1 ; SSE-NEXT: retq @@ -277,8 +277,8 @@ define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind { ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovd %edi, %xmm2 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vpsubw %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 @@ -392,8 +392,8 @@ define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind { ; SSE2: # %bb.0: # %vector.ph ; SSE2-NEXT: movd %edi, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE2-NEXT: psubusb %xmm2, %xmm0 ; SSE2-NEXT: psubusb %xmm2, %xmm1 ; SSE2-NEXT: retq diff --git a/test/CodeGen/X86/setcc-wide-types.ll b/test/CodeGen/X86/setcc-wide-types.ll index 410378ffbad2..1163307a0c34 100644 --- a/test/CodeGen/X86/setcc-wide-types.ll +++ b/test/CodeGen/X86/setcc-wide-types.ll @@ -1,6 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=ANY --check-prefix=SSE2 +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512BW ; Equality checks of 128/256-bit values can use PMOVMSK or PTEST to avoid scalarization. @@ -14,14 +17,14 @@ define i32 @ne_i128(<2 x i64> %x, <2 x i64> %y) { ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; -; AVX2-LABEL: ne_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovmskb %xmm0, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; AVX2-NEXT: setne %al -; AVX2-NEXT: retq +; AVXANY-LABEL: ne_i128: +; AVXANY: # %bb.0: +; AVXANY-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVXANY-NEXT: vpmovmskb %xmm0, %ecx +; AVXANY-NEXT: xorl %eax, %eax +; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVXANY-NEXT: setne %al +; AVXANY-NEXT: retq %bcx = bitcast <2 x i64> %x to i128 %bcy = bitcast <2 x i64> %y to i128 %cmp = icmp ne i128 %bcx, %bcy @@ -39,14 +42,14 @@ define i32 @eq_i128(<2 x i64> %x, <2 x i64> %y) { ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; -; AVX2-LABEL: eq_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovmskb %xmm0, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; AVX2-NEXT: sete %al -; AVX2-NEXT: retq +; AVXANY-LABEL: eq_i128: +; AVXANY: # %bb.0: +; AVXANY-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVXANY-NEXT: vpmovmskb %xmm0, %ecx +; AVXANY-NEXT: xorl %eax, %eax +; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVXANY-NEXT: sete %al +; AVXANY-NEXT: retq %bcx = bitcast <2 x i64> %x to i128 %bcy = bitcast <2 x i64> %y to i128 %cmp = icmp eq i128 %bcx, %bcy @@ -80,15 +83,39 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; -; AVX2-LABEL: ne_i256: -; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: cmpl $-1, %ecx -; AVX2-NEXT: setne %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX1-LABEL: ne_i256: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovq %xmm2, %rcx +; AVX1-NEXT: vpextrq $1, %xmm0, %rdx +; AVX1-NEXT: vpextrq $1, %xmm2, %r8 +; AVX1-NEXT: vmovq %xmm1, %rdi +; AVX1-NEXT: xorq %rax, %rdi +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rsi +; AVX1-NEXT: xorq %rcx, %rsi +; AVX1-NEXT: orq %rdi, %rsi +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: xorq %rdx, %rax +; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: xorq %r8, %rcx +; AVX1-NEXT: orq %rax, %rcx +; AVX1-NEXT: xorl %eax, %eax +; AVX1-NEXT: orq %rsi, %rcx +; AVX1-NEXT: setne %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX256-LABEL: ne_i256: +; AVX256: # %bb.0: +; AVX256-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vpmovmskb %ymm0, %ecx +; AVX256-NEXT: xorl %eax, %eax +; AVX256-NEXT: cmpl $-1, %ecx +; AVX256-NEXT: setne %al +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %bcx = bitcast <4 x i64> %x to i256 %bcy = bitcast <4 x i64> %y to i256 %cmp = icmp ne 
i256 %bcx, %bcy @@ -122,15 +149,39 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; -; AVX2-LABEL: eq_i256: -; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: cmpl $-1, %ecx -; AVX2-NEXT: sete %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX1-LABEL: eq_i256: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovq %xmm2, %rcx +; AVX1-NEXT: vpextrq $1, %xmm0, %rdx +; AVX1-NEXT: vpextrq $1, %xmm2, %r8 +; AVX1-NEXT: vmovq %xmm1, %rdi +; AVX1-NEXT: xorq %rax, %rdi +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rsi +; AVX1-NEXT: xorq %rcx, %rsi +; AVX1-NEXT: orq %rdi, %rsi +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: xorq %rdx, %rax +; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: xorq %r8, %rcx +; AVX1-NEXT: orq %rax, %rcx +; AVX1-NEXT: xorl %eax, %eax +; AVX1-NEXT: orq %rsi, %rcx +; AVX1-NEXT: sete %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX256-LABEL: eq_i256: +; AVX256: # %bb.0: +; AVX256-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vpmovmskb %ymm0, %ecx +; AVX256-NEXT: xorl %eax, %eax +; AVX256-NEXT: cmpl $-1, %ecx +; AVX256-NEXT: sete %al +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %bcx = bitcast <4 x i64> %x to i256 %bcy = bitcast <4 x i64> %y to i256 %cmp = icmp eq i256 %bcx, %bcy @@ -138,43 +189,37 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) { ret i32 %zext } -; This test models the expansion of 'memcmp(a, b, 32) != 0' +; This test models the expansion of 'memcmp(a, b, 32) != 0' ; if we allowed 2 pairs of 16-byte loads per block. define i32 @ne_i128_pair(i128* %a, i128* %b) { ; SSE2-LABEL: ne_i128_pair: ; SSE2: # %bb.0: -; SSE2-NEXT: movq (%rdi), %rax -; SSE2-NEXT: movq 8(%rdi), %rcx -; SSE2-NEXT: xorq (%rsi), %rax -; SSE2-NEXT: xorq 8(%rsi), %rcx -; SSE2-NEXT: movq 24(%rdi), %rdx -; SSE2-NEXT: movq 16(%rdi), %rdi -; SSE2-NEXT: xorq 16(%rsi), %rdi -; SSE2-NEXT: orq %rax, %rdi -; SSE2-NEXT: xorq 24(%rsi), %rdx -; SSE2-NEXT: orq %rcx, %rdx +; SSE2-NEXT: movdqu (%rdi), %xmm0 +; SSE2-NEXT: movdqu 16(%rdi), %xmm1 +; SSE2-NEXT: movdqu (%rsi), %xmm2 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; SSE2-NEXT: movdqu 16(%rsi), %xmm0 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %ecx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rdi, %rdx +; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; -; AVX2-LABEL: ne_i128_pair: -; AVX2: # %bb.0: -; AVX2-NEXT: movq (%rdi), %rax -; AVX2-NEXT: movq 8(%rdi), %rcx -; AVX2-NEXT: xorq (%rsi), %rax -; AVX2-NEXT: xorq 8(%rsi), %rcx -; AVX2-NEXT: movq 24(%rdi), %rdx -; AVX2-NEXT: movq 16(%rdi), %rdi -; AVX2-NEXT: xorq 16(%rsi), %rdi -; AVX2-NEXT: orq %rax, %rdi -; AVX2-NEXT: xorq 24(%rsi), %rdx -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rdi, %rdx -; AVX2-NEXT: setne %al -; AVX2-NEXT: retq +; AVXANY-LABEL: ne_i128_pair: +; AVXANY: # %bb.0: +; AVXANY-NEXT: vmovdqu (%rdi), %xmm0 +; AVXANY-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVXANY-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1 +; AVXANY-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; AVXANY-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVXANY-NEXT: vpmovmskb %xmm0, %ecx +; AVXANY-NEXT: xorl %eax, %eax +; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVXANY-NEXT: setne %al +; AVXANY-NEXT: retq %a0 = load i128, i128* %a %b0 = load i128, i128* %b %xor1 = xor i128 %a0, %b0 @@ 
-189,43 +234,37 @@ define i32 @ne_i128_pair(i128* %a, i128* %b) { ret i32 %z } -; This test models the expansion of 'memcmp(a, b, 32) == 0' +; This test models the expansion of 'memcmp(a, b, 32) == 0' ; if we allowed 2 pairs of 16-byte loads per block. define i32 @eq_i128_pair(i128* %a, i128* %b) { ; SSE2-LABEL: eq_i128_pair: ; SSE2: # %bb.0: -; SSE2-NEXT: movq (%rdi), %rax -; SSE2-NEXT: movq 8(%rdi), %rcx -; SSE2-NEXT: xorq (%rsi), %rax -; SSE2-NEXT: xorq 8(%rsi), %rcx -; SSE2-NEXT: movq 24(%rdi), %rdx -; SSE2-NEXT: movq 16(%rdi), %rdi -; SSE2-NEXT: xorq 16(%rsi), %rdi -; SSE2-NEXT: orq %rax, %rdi -; SSE2-NEXT: xorq 24(%rsi), %rdx -; SSE2-NEXT: orq %rcx, %rdx +; SSE2-NEXT: movdqu (%rdi), %xmm0 +; SSE2-NEXT: movdqu 16(%rdi), %xmm1 +; SSE2-NEXT: movdqu (%rsi), %xmm2 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; SSE2-NEXT: movdqu 16(%rsi), %xmm0 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %ecx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rdi, %rdx +; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; -; AVX2-LABEL: eq_i128_pair: -; AVX2: # %bb.0: -; AVX2-NEXT: movq (%rdi), %rax -; AVX2-NEXT: movq 8(%rdi), %rcx -; AVX2-NEXT: xorq (%rsi), %rax -; AVX2-NEXT: xorq 8(%rsi), %rcx -; AVX2-NEXT: movq 24(%rdi), %rdx -; AVX2-NEXT: movq 16(%rdi), %rdi -; AVX2-NEXT: xorq 16(%rsi), %rdi -; AVX2-NEXT: orq %rax, %rdi -; AVX2-NEXT: xorq 24(%rsi), %rdx -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rdi, %rdx -; AVX2-NEXT: sete %al -; AVX2-NEXT: retq +; AVXANY-LABEL: eq_i128_pair: +; AVXANY: # %bb.0: +; AVXANY-NEXT: vmovdqu (%rdi), %xmm0 +; AVXANY-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVXANY-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1 +; AVXANY-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; AVXANY-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVXANY-NEXT: vpmovmskb %xmm0, %ecx +; AVXANY-NEXT: xorl %eax, %eax +; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVXANY-NEXT: sete %al +; AVXANY-NEXT: retq %a0 = load i128, i128* %a %b0 = load i128, i128* %b %xor1 = xor i128 %a0, %b0 @@ -240,7 +279,7 @@ define i32 @eq_i128_pair(i128* %a, i128* %b) { ret i32 %z } -; This test models the expansion of 'memcmp(a, b, 64) != 0' +; This test models the expansion of 'memcmp(a, b, 64) != 0' ; if we allowed 2 pairs of 32-byte loads per block. 
define i32 @ne_i256_pair(i256* %a, i256* %b) { @@ -273,34 +312,48 @@ define i32 @ne_i256_pair(i256* %a, i256* %b) { ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; -; AVX2-LABEL: ne_i256_pair: -; AVX2: # %bb.0: -; AVX2-NEXT: movq 16(%rdi), %r9 -; AVX2-NEXT: movq 24(%rdi), %r11 -; AVX2-NEXT: movq (%rdi), %r8 -; AVX2-NEXT: movq 8(%rdi), %r10 -; AVX2-NEXT: xorq 8(%rsi), %r10 -; AVX2-NEXT: xorq 24(%rsi), %r11 -; AVX2-NEXT: xorq (%rsi), %r8 -; AVX2-NEXT: xorq 16(%rsi), %r9 -; AVX2-NEXT: movq 48(%rdi), %rdx -; AVX2-NEXT: movq 32(%rdi), %rax -; AVX2-NEXT: movq 56(%rdi), %rcx -; AVX2-NEXT: movq 40(%rdi), %rdi -; AVX2-NEXT: xorq 40(%rsi), %rdi -; AVX2-NEXT: xorq 56(%rsi), %rcx -; AVX2-NEXT: orq %r11, %rcx -; AVX2-NEXT: orq %rdi, %rcx -; AVX2-NEXT: orq %r10, %rcx -; AVX2-NEXT: xorq 32(%rsi), %rax -; AVX2-NEXT: xorq 48(%rsi), %rdx -; AVX2-NEXT: orq %r9, %rdx -; AVX2-NEXT: orq %rax, %rdx -; AVX2-NEXT: orq %r8, %rdx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: setne %al -; AVX2-NEXT: retq +; AVX1-LABEL: ne_i256_pair: +; AVX1: # %bb.0: +; AVX1-NEXT: movq 16(%rdi), %r9 +; AVX1-NEXT: movq 24(%rdi), %r11 +; AVX1-NEXT: movq (%rdi), %r8 +; AVX1-NEXT: movq 8(%rdi), %r10 +; AVX1-NEXT: xorq 8(%rsi), %r10 +; AVX1-NEXT: xorq 24(%rsi), %r11 +; AVX1-NEXT: xorq (%rsi), %r8 +; AVX1-NEXT: xorq 16(%rsi), %r9 +; AVX1-NEXT: movq 48(%rdi), %rdx +; AVX1-NEXT: movq 32(%rdi), %rax +; AVX1-NEXT: movq 56(%rdi), %rcx +; AVX1-NEXT: movq 40(%rdi), %rdi +; AVX1-NEXT: xorq 40(%rsi), %rdi +; AVX1-NEXT: xorq 56(%rsi), %rcx +; AVX1-NEXT: orq %r11, %rcx +; AVX1-NEXT: orq %rdi, %rcx +; AVX1-NEXT: orq %r10, %rcx +; AVX1-NEXT: xorq 32(%rsi), %rax +; AVX1-NEXT: xorq 48(%rsi), %rdx +; AVX1-NEXT: orq %r9, %rdx +; AVX1-NEXT: orq %rax, %rdx +; AVX1-NEXT: orq %r8, %rdx +; AVX1-NEXT: xorl %eax, %eax +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: setne %al +; AVX1-NEXT: retq +; +; AVX256-LABEL: ne_i256_pair: +; AVX256: # %bb.0: +; AVX256-NEXT: vmovdqu (%rdi), %ymm0 +; AVX256-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX256-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1 +; AVX256-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 +; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vpmovmskb %ymm0, %ecx +; AVX256-NEXT: xorl %eax, %eax +; AVX256-NEXT: cmpl $-1, %ecx +; AVX256-NEXT: setne %al +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %a0 = load i256, i256* %a %b0 = load i256, i256* %b %xor1 = xor i256 %a0, %b0 @@ -315,7 +368,7 @@ define i32 @ne_i256_pair(i256* %a, i256* %b) { ret i32 %z } -; This test models the expansion of 'memcmp(a, b, 64) == 0' +; This test models the expansion of 'memcmp(a, b, 64) == 0' ; if we allowed 2 pairs of 32-byte loads per block. 
define i32 @eq_i256_pair(i256* %a, i256* %b) { @@ -348,34 +401,48 @@ define i32 @eq_i256_pair(i256* %a, i256* %b) { ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; -; AVX2-LABEL: eq_i256_pair: -; AVX2: # %bb.0: -; AVX2-NEXT: movq 16(%rdi), %r9 -; AVX2-NEXT: movq 24(%rdi), %r11 -; AVX2-NEXT: movq (%rdi), %r8 -; AVX2-NEXT: movq 8(%rdi), %r10 -; AVX2-NEXT: xorq 8(%rsi), %r10 -; AVX2-NEXT: xorq 24(%rsi), %r11 -; AVX2-NEXT: xorq (%rsi), %r8 -; AVX2-NEXT: xorq 16(%rsi), %r9 -; AVX2-NEXT: movq 48(%rdi), %rdx -; AVX2-NEXT: movq 32(%rdi), %rax -; AVX2-NEXT: movq 56(%rdi), %rcx -; AVX2-NEXT: movq 40(%rdi), %rdi -; AVX2-NEXT: xorq 40(%rsi), %rdi -; AVX2-NEXT: xorq 56(%rsi), %rcx -; AVX2-NEXT: orq %r11, %rcx -; AVX2-NEXT: orq %rdi, %rcx -; AVX2-NEXT: orq %r10, %rcx -; AVX2-NEXT: xorq 32(%rsi), %rax -; AVX2-NEXT: xorq 48(%rsi), %rdx -; AVX2-NEXT: orq %r9, %rdx -; AVX2-NEXT: orq %rax, %rdx -; AVX2-NEXT: orq %r8, %rdx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: sete %al -; AVX2-NEXT: retq +; AVX1-LABEL: eq_i256_pair: +; AVX1: # %bb.0: +; AVX1-NEXT: movq 16(%rdi), %r9 +; AVX1-NEXT: movq 24(%rdi), %r11 +; AVX1-NEXT: movq (%rdi), %r8 +; AVX1-NEXT: movq 8(%rdi), %r10 +; AVX1-NEXT: xorq 8(%rsi), %r10 +; AVX1-NEXT: xorq 24(%rsi), %r11 +; AVX1-NEXT: xorq (%rsi), %r8 +; AVX1-NEXT: xorq 16(%rsi), %r9 +; AVX1-NEXT: movq 48(%rdi), %rdx +; AVX1-NEXT: movq 32(%rdi), %rax +; AVX1-NEXT: movq 56(%rdi), %rcx +; AVX1-NEXT: movq 40(%rdi), %rdi +; AVX1-NEXT: xorq 40(%rsi), %rdi +; AVX1-NEXT: xorq 56(%rsi), %rcx +; AVX1-NEXT: orq %r11, %rcx +; AVX1-NEXT: orq %rdi, %rcx +; AVX1-NEXT: orq %r10, %rcx +; AVX1-NEXT: xorq 32(%rsi), %rax +; AVX1-NEXT: xorq 48(%rsi), %rdx +; AVX1-NEXT: orq %r9, %rdx +; AVX1-NEXT: orq %rax, %rdx +; AVX1-NEXT: orq %r8, %rdx +; AVX1-NEXT: xorl %eax, %eax +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: sete %al +; AVX1-NEXT: retq +; +; AVX256-LABEL: eq_i256_pair: +; AVX256: # %bb.0: +; AVX256-NEXT: vmovdqu (%rdi), %ymm0 +; AVX256-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX256-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1 +; AVX256-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 +; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vpmovmskb %ymm0, %ecx +; AVX256-NEXT: xorl %eax, %eax +; AVX256-NEXT: cmpl $-1, %ecx +; AVX256-NEXT: sete %al +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %a0 = load i256, i256* %a %b0 = load i256, i256* %b %xor1 = xor i256 %a0, %b0 diff --git a/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll index a65c1d312aa4..f7f9dff9beb0 100644 --- a/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll +++ b/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll @@ -2,17 +2,6 @@ ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) { -; SSE-LABEL: test_x86_sse_storeu_ps: -; SSE: ## %bb.0: -; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE-NEXT: movups %xmm0, (%eax) -; SSE-NEXT: retl -; -; KNL-LABEL: test_x86_sse_storeu_ps: -; KNL: ## %bb.0: -; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL-NEXT: vmovups %xmm0, (%eax) -; KNL-NEXT: retl ; CHECK-LABEL: test_x86_sse_storeu_ps: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -25,20 +14,6 @@ declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind define <4 x float> @test_x86_sse_add_ss(<4 x float> %a0, <4 x float> %a1) { -; SSE-LABEL: test_x86_sse_add_ss: -; SSE: ## %bb.0: -; SSE-NEXT: addss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x58,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_add_ss: -; 
AVX2: ## %bb.0: -; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_add_ss: -; SKX: ## %bb.0: -; SKX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x58,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] ; CHECK-LABEL: test_x86_sse_add_ss: ; CHECK: ## %bb.0: ; CHECK-NEXT: addss %xmm1, %xmm0 @@ -50,20 +25,6 @@ declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind read define <4 x float> @test_x86_sse_sub_ss(<4 x float> %a0, <4 x float> %a1) { -; SSE-LABEL: test_x86_sse_sub_ss: -; SSE: ## %bb.0: -; SSE-NEXT: subss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5c,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_sub_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5c,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_sub_ss: -; SKX: ## %bb.0: -; SKX-NEXT: vsubss %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x5c,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] ; CHECK-LABEL: test_x86_sse_sub_ss: ; CHECK: ## %bb.0: ; CHECK-NEXT: subss %xmm1, %xmm0 @@ -75,20 +36,6 @@ declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind read define <4 x float> @test_x86_sse_mul_ss(<4 x float> %a0, <4 x float> %a1) { -; SSE-LABEL: test_x86_sse_mul_ss: -; SSE: ## %bb.0: -; SSE-NEXT: mulss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x59,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_mul_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: vmulss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x59,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_mul_ss: -; SKX: ## %bb.0: -; SKX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x59,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] ; CHECK-LABEL: test_x86_sse_mul_ss: ; CHECK: ## %bb.0: ; CHECK-NEXT: mulss %xmm1, %xmm0 @@ -100,20 +47,6 @@ declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind read define <4 x float> @test_x86_sse_div_ss(<4 x float> %a0, <4 x float> %a1) { -; SSE-LABEL: test_x86_sse_div_ss: -; SSE: ## %bb.0: -; SSE-NEXT: divss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5e,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_div_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: vdivss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5e,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_div_ss: -; SKX: ## %bb.0: -; SKX-NEXT: vdivss %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x5e,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] ; CHECK-LABEL: test_x86_sse_div_ss: ; CHECK: ## %bb.0: ; CHECK-NEXT: divss %xmm1, %xmm0 @@ -123,4 +56,3 @@ define <4 x float> @test_x86_sse_div_ss(<4 x float> %a0, <4 x float> %a1) { } declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone - diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index a75a0597325d..1acf1ad43f6d 100644 --- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -2364,8 +2364,8 @@ define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind { ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movd %eax, %xmm0 ; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; 
X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_set1_epi8: @@ -2373,8 +2373,8 @@ define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind { ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-NEXT: retq %res0 = insertelement <16 x i8> undef, i8 %a0, i32 0 %res1 = insertelement <16 x i8> %res0, i8 %a0, i32 1 @@ -2401,15 +2401,15 @@ define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind { ; X32: # %bb.0: ; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movd %eax, %xmm0 -; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_set1_epi16: ; X64: # %bb.0: ; X64-NEXT: movd %edi, %xmm0 -; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-NEXT: retq %res0 = insertelement <8 x i16> undef, i16 %a0, i32 0 %res1 = insertelement <8 x i16> %res0, i16 %a0, i32 1 diff --git a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll index 3571e2968bf8..3dd3be6853f0 100644 --- a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll +++ b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll @@ -184,20 +184,6 @@ define <8 x i16> @min_epi16(<8 x i16> %a0, <8 x i16> %a1) { declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone define <2 x double> @test_x86_sse2_add_sd(<2 x double> %a0, <2 x double> %a1) { -; SSE-LABEL: test_x86_sse2_add_sd: -; SSE: ## %bb.0: -; SSE-NEXT: addsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x58,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse2_add_sd: -; AVX2: ## %bb.0: -; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x58,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse2_add_sd: -; SKX: ## %bb.0: -; SKX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x58,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] ; CHECK-LABEL: test_x86_sse2_add_sd: ; CHECK: ## %bb.0: ; CHECK-NEXT: addsd %xmm1, %xmm0 @@ -209,20 +195,6 @@ declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind define <2 x double> @test_x86_sse2_sub_sd(<2 x double> %a0, <2 x double> %a1) { -; SSE-LABEL: test_x86_sse2_sub_sd: -; SSE: ## %bb.0: -; SSE-NEXT: subsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x5c,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse2_sub_sd: -; AVX2: ## %bb.0: -; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5c,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse2_sub_sd: -; SKX: ## %bb.0: -; SKX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x5c,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] ; CHECK-LABEL: test_x86_sse2_sub_sd: ; CHECK: ## %bb.0: ; CHECK-NEXT: subsd %xmm1, %xmm0 @@ -234,20 +206,6 @@ declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind define <2 x double> @test_x86_sse2_mul_sd(<2 x double> %a0, <2 x double> %a1) { -; 
SSE-LABEL: test_x86_sse2_mul_sd: -; SSE: ## %bb.0: -; SSE-NEXT: mulsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x59,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse2_mul_sd: -; AVX2: ## %bb.0: -; AVX2-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x59,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse2_mul_sd: -; SKX: ## %bb.0: -; SKX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x59,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] ; CHECK-LABEL: test_x86_sse2_mul_sd: ; CHECK: ## %bb.0: ; CHECK-NEXT: mulsd %xmm1, %xmm0 @@ -259,20 +217,6 @@ declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind define <2 x double> @test_x86_sse2_div_sd(<2 x double> %a0, <2 x double> %a1) { -; SSE-LABEL: test_x86_sse2_div_sd: -; SSE: ## %bb.0: -; SSE-NEXT: divsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x5e,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse2_div_sd: -; AVX2: ## %bb.0: -; AVX2-NEXT: vdivsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5e,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse2_div_sd: -; SKX: ## %bb.0: -; SKX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x5e,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] ; CHECK-LABEL: test_x86_sse2_div_sd: ; CHECK: ## %bb.0: ; CHECK-NEXT: divsd %xmm1, %xmm0 diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll index bdfc96ba97d5..51f228b414ec 100644 --- a/test/CodeGen/X86/vec_fp_to_int.ll +++ b/test/CodeGen/X86/vec_fp_to_int.ll @@ -919,12 +919,10 @@ define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) { ; ; AVX512DQ-LABEL: fptosi_2f32_to_2i64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vcvttss2si %xmm0, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm1 -; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512DQ-NEXT: vcvttss2si %xmm0, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0 +; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptosi_2f32_to_2i64: @@ -1448,12 +1446,10 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) { ; ; AVX512DQ-LABEL: fptoui_2f32_to_2i64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vcvttss2usi %xmm0, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm1 -; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512DQ-NEXT: vcvttss2usi %xmm0, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0 +; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_2f32_to_2i64: diff --git a/test/CodeGen/X86/vec_set-H.ll b/test/CodeGen/X86/vec_set-H.ll index 03324f02a4fe..d96c8bbc6171 100644 --- a/test/CodeGen/X86/vec_set-H.ll +++ b/test/CodeGen/X86/vec_set-H.ll @@ -5,8 +5,8 @@ define <2 x i64> @doload64(i16 signext %x) nounwind { ; CHECK-LABEL: doload64: ; CHECK: # %bb.0: ; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; CHECK-NEXT: retl %tmp36 = 
insertelement <8 x i16> undef, i16 %x, i32 0 %tmp37 = insertelement <8 x i16> %tmp36, i16 %x, i32 1 diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll index 2178eb70cdec..2cbf306c8ba0 100644 --- a/test/CodeGen/X86/vector-compare-results.ll +++ b/test/CodeGen/X86/vector-compare-results.ll @@ -5459,38 +5459,30 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind { ; AVX512F-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3 ; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-NEXT: vpmovsxbd %xmm4, %zmm4 -; AVX512F-NEXT: vpslld $31, %zmm4, %zmm4 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k0 ; AVX512F-NEXT: kmovw %k0, 14(%rdi) ; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3 ; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 ; AVX512F-NEXT: kmovw %k0, 12(%rdi) ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3 ; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 ; AVX512F-NEXT: kmovw %k0, 10(%rdi) ; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 ; AVX512F-NEXT: kmovw %k0, 8(%rdi) ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 ; AVX512F-NEXT: kmovw %k0, 6(%rdi) ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kmovw %k0, 4(%rdi) ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kmovw %k0, 2(%rdi) ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, (%rdi) ; AVX512F-NEXT: movq %rdi, %rax @@ -5505,38 +5497,30 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind { ; AVX512DQ-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3 ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512DQ-NEXT: vpmovsxbd %xmm4, %zmm4 -; AVX512DQ-NEXT: vpslld $31, %zmm4, %zmm4 ; AVX512DQ-NEXT: vptestmd %zmm4, %zmm4, %k0 ; AVX512DQ-NEXT: kmovw %k0, 14(%rdi) ; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3 ; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0 ; AVX512DQ-NEXT: kmovw %k0, 12(%rdi) ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3 ; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0 ; AVX512DQ-NEXT: kmovw %k0, 10(%rdi) ; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2 ; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0 ; AVX512DQ-NEXT: kmovw %k0, 8(%rdi) ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2 ; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0 ; AVX512DQ-NEXT: kmovw %k0, 6(%rdi) ; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1 ; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512DQ-NEXT: kmovw %k0, 4(%rdi) ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1 ; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512DQ-NEXT: kmovw %k0, 2(%rdi) ; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 ; 
AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512DQ-NEXT: kmovw %k0, (%rdi) ; AVX512DQ-NEXT: movq %rdi, %rax diff --git a/test/CodeGen/X86/vector-pcmp.ll b/test/CodeGen/X86/vector-pcmp.ll index 782c72e2a4d4..b2c0a4d096c2 100644 --- a/test/CodeGen/X86/vector-pcmp.ll +++ b/test/CodeGen/X86/vector-pcmp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE42 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 ; Lower common integer comparisons such as 'isPositive' efficiently: ; https://llvm.org/bugs/show_bug.cgi?id=26701 @@ -84,50 +84,13 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %x) { } define <1 x i128> @test_strange_type(<1 x i128> %x) { -; SSE2-LABEL: test_strange_type: -; SSE2: # %bb.0: -; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: movq %rsi, %xmm0 -; SSE2-NEXT: notq %rsi -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: movq %rsi, %rdx -; SSE2-NEXT: retq -; -; SSE42-LABEL: test_strange_type: -; SSE42: # %bb.0: -; SSE42-NEXT: sarq $63, %rsi -; SSE42-NEXT: movq %rsi, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE42-NEXT: pxor %xmm0, %xmm1 -; SSE42-NEXT: movq %xmm1, %rax -; SSE42-NEXT: pextrq $1, %xmm1, %rdx -; SSE42-NEXT: retq -; -; AVX1-LABEL: test_strange_type: -; AVX1: # %bb.0: -; AVX1-NEXT: sarq $63, %rsi -; AVX1-NEXT: vmovq %rsi, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vpextrq $1, %xmm0, %rdx -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_strange_type: -; AVX2: # %bb.0: -; AVX2-NEXT: sarq $63, %rsi -; AVX2-NEXT: vmovq %rsi, %xmm0 -; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vpextrq $1, %xmm0, %rdx -; AVX2-NEXT: retq +; CHECK-LABEL: test_strange_type: +; CHECK: # %bb.0: +; CHECK-NEXT: sarq $63, %rsi +; CHECK-NEXT: notq %rsi +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: movq %rsi, %rdx +; CHECK-NEXT: retq %sign = ashr <1 x i128> %x, %not = xor <1 x i128> %sign, ret <1 x i128> %not diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll index ca670f40ab3f..8aa8682b6e44 100644 --- a/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -796,8 +796,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x 
i8> %a, <16 x i8> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v16i8: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; SSE2-NEXT: psllw $5, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] @@ -1011,8 +1011,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X32-SSE-LABEL: splatvar_shift_v16i8: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] ; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; X32-SSE-NEXT: psllw $5, %xmm3 ; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll index 890cedf97c9d..1e5dbea6bc61 100644 --- a/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -663,8 +663,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v16i8: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] ; SSE2-NEXT: psllw $5, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm3 @@ -816,8 +816,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X32-SSE-LABEL: splatvar_shift_v16i8: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] ; X32-SSE-NEXT: psllw $5, %xmm2 ; X32-SSE-NEXT: pxor %xmm1, %xmm1 ; X32-SSE-NEXT: pxor %xmm3, %xmm3 diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll index 9481e46c0c52..724fd3454eec 100644 --- a/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/test/CodeGen/X86/vector-shift-shl-128.ll @@ -612,8 +612,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v16i8: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = 
xmm1[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] ; SSE2-NEXT: psllw $5, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm3 @@ -758,8 +758,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X32-SSE-LABEL: splatvar_shift_v16i8: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] ; X32-SSE-NEXT: psllw $5, %xmm2 ; X32-SSE-NEXT: pxor %xmm1, %xmm1 ; X32-SSE-NEXT: pxor %xmm3, %xmm3 diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index 2f5a2b116115..2ff7ef4328f5 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -12,8 +12,8 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00( ; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: @@ -206,14 +206,14 @@ define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07( define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) { ; SSE-LABEL: shuffle_v16i8_0101010101010101: ; SSE: # %bb.0: -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; ; AVX1-LABEL: shuffle_v16i8_0101010101010101: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: shuffle_v16i8_0101010101010101: @@ -257,8 +257,8 @@ define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07( ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -372,14 +372,12 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20( ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[3,2,1,0,4,5,6,7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4] +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,7,6,5,4] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: @@ -1181,21 +1179,20 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00( ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[1,3,2,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,0,0,65535] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm4, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: psrlq $16, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,1,3] +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; SSE2-NEXT: pandn %xmm2, %xmm5 +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: psrlq $16, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,1,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,4] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] @@ -1203,7 +1200,7 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00( ; SSE2-NEXT: packuswb %xmm5, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,1,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 
= xmm1[3,1,1,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 @@ -1489,8 +1486,8 @@ define <16 x i8> @insert_dup_mem_v16i8_i32(i32* %ptr) { ; SSE2: # %bb.0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_mem_v16i8_i32: @@ -1531,8 +1528,8 @@ define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) { ; SSE2-NEXT: movsbl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8: @@ -1576,8 +1573,8 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(i32* %ptr) { ; SSE2: # %bb.0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32: @@ -1614,8 +1611,8 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(i32* %ptr) { ; SSE2: # %bb.0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32: @@ -1653,8 +1650,8 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) { ; SSE2-NEXT: movsbl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: @@ -1706,8 +1703,8 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) { ; SSE2-NEXT: movsbl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: @@ -1761,15 +1758,14 @@ define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b) ; SSE2-NEXT: shll $8, %ecx ; SSE2-NEXT: orl %eax, %ecx ; SSE2-NEXT: movzwl %cx, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,3] +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,4,4] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,7] ; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1813,13 +1809,13 @@ define <16 x i8> @PR31301(i8* nocapture readonly %x, i8* nocapture readonly %y) ; SSE2-NEXT: movzbl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movzbl (%rsi), %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index 072d71fae570..fc22040578b1 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -63,14 +63,14 @@ define <8 x i16> @shuffle_v8i16_456789AB(<8 x i16> %a, <8 x i16> %b) { define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) { ; SSE-LABEL: shuffle_v8i16_00000000: ; SSE: # %bb.0: -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; ; AVX1-LABEL: shuffle_v8i16_00000000: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: shuffle_v8i16_00000000: @@ -1123,33 +1123,44 @@ define <8 x i16> @shuffle_v8i16_c4d5e6f7(<8 x i16> %a, <8 x i16> %b) { } define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) { -; SSE-LABEL: shuffle_v8i16_0213cedf: -; SSE: # %bb.0: -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_0213cedf: +; SSE2: # %bb.0: +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,1,3,4,5,6,7] +; 
SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,5,7] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_0213cedf: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,1,3,4,5,6,7] +; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,5,7] +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_0213cedf: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: retq ; ; AVX1-LABEL: shuffle_v8i16_0213cedf: ; AVX1: # %bb.0: +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i16_0213cedf: ; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8i16_0213cedf: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,12,13,10,11,14,15] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-FAST-NEXT: retq @@ -1157,14 +1168,14 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) { ; AVX512VL-SLOW-LABEL: shuffle_v8i16_0213cedf: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7] ; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] ; AVX512VL-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i16_0213cedf: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,10,11,12,13,14,15] +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,12,13,10,11,14,15] ; AVX512VL-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; AVX512VL-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-FAST-NEXT: retq @@ -2111,79 +2122,115 @@ define <8 x i16> @shuffle_v8i16_0z1z2z3z(<8 x i16> %a) { } define <8 x i16> @shuffle_v8i16_01100110(<8 x i16> %a) { -; SSE2-LABEL: shuffle_v8i16_01100110: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] -; SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_01100110: +; SSE: # %bb.0: +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: retq ; -; SSSE3-LABEL: shuffle_v8i16_01100110: -; SSSE3: # 
%bb.0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1] -; SSSE3-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_01100110: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: retq ; -; SSE41-LABEL: shuffle_v8i16_01100110: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1] -; SSE41-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8i16_01100110: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-SLOW-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_01100110: -; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1] -; AVX-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v8i16_01100110: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_01100110: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_01100110: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_01u0u110(<8 x i16> %a) { -; SSE2-LABEL: shuffle_v8i16_01u0u110: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] -; SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_01u0u110: +; SSE: # %bb.0: +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: retq ; -; SSSE3-LABEL: shuffle_v8i16_01u0u110: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,0,1,0,1,2,3,2,3,0,1] -; SSSE3-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_01u0u110: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: retq ; -; SSE41-LABEL: shuffle_v8i16_01u0u110: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,0,1,0,1,2,3,2,3,0,1] -; SSE41-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8i16_01u0u110: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-SLOW-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_01u0u110: -; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,0,1,0,1,2,3,2,3,0,1] -; AVX-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v8i16_01u0u110: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_01u0u110: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_01u0u110: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> ret <8 x i16> %shuffle } define <8 
x i16> @shuffle_v8i16_467uu675(<8 x i16> %a) { -; SSE2-LABEL: shuffle_v8i16_467uu675: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5] -; SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_467uu675: +; SSE: # %bb.0: +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: retq ; -; SSSE3-LABEL: shuffle_v8i16_467uu675: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11] -; SSSE3-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_467uu675: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: retq ; -; SSE41-LABEL: shuffle_v8i16_467uu675: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11] -; SSE41-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8i16_467uu675: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_467uu675: -; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11] -; AVX-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v8i16_467uu675: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,12,13,14,15,10,11,8,9,12,13,14,15,10,11] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_467uu675: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_467uu675: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,12,13,14,15,10,11,8,9,12,13,14,15,10,11] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> ret <8 x i16> %shuffle } @@ -2471,15 +2518,15 @@ define <8 x i16> @insert_dup_mem_v8i16_i32(i32* %ptr) { ; SSE-LABEL: insert_dup_mem_v8i16_i32: ; SSE: # %bb.0: ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_mem_v8i16_i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_i32: @@ -2498,8 +2545,8 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(i16* %ptr) { ; SSE2: # %bb.0: ; SSE2-NEXT: movswl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_mem_v8i16_sext_i16: @@ -2547,15 +2594,15 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_i32(i32* %ptr) { ; SSE-LABEL: insert_dup_elt1_mem_v8i16_i32: ; SSE: # %bb.0: ; SSE-NEXT: movd 
{{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_elt1_mem_v8i16_i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v8i16_i32: @@ -2574,8 +2621,8 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(i32* %ptr) { ; SSE2: # %bb.0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_i32: @@ -2612,8 +2659,8 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(i16* %ptr) { ; SSE2: # %bb.0: ; SSE2-NEXT: movswl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt1_mem_v8i16_sext_i16: @@ -2665,8 +2712,8 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_sext_i16(i16* %ptr) { ; SSE2-NEXT: movswl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll index cbd1b83a4eb2..aafc9fc7dcd2 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -8,8 +8,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -24,17 +24,38 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm0[0,0,1,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX512VL-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -42,17 +63,38 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm0[0,0,0,2,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX512VL-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -60,17 +102,38 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX512VL-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -78,8 +141,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] ; AVX1-NEXT: 
vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -96,8 +159,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -114,8 +177,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -132,8 +195,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -152,9 +215,10 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0 ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -162,8 +226,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0 ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, 
%ymm1, %ymm0 ; AVX2-SLOW-NEXT: retq @@ -191,9 +255,10 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0 ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,6,7,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -220,8 +285,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -254,8 +319,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -288,8 +353,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -314,8 +379,8 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -340,8 +405,8 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -366,8 +431,8 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -390,18 +455,18 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: @@ -411,8 +476,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0 ; ; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: @@ -426,18 +491,18 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0 define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,7,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,7,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: 
vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: @@ -447,8 +512,8 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_1 ; ; AVX512VL-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: @@ -606,15 +671,36 @@ define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_1 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX512VL-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -622,15 +708,36 @@ define <16 x i16> 
@shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX512VL-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -638,15 +745,36 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: retq +; +; 
AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX512VL-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -913,8 +1041,8 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_2 ; AVX2-SLOW-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX2-SLOW-NEXT: retq ; @@ -965,16 +1093,14 @@ define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_1 define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1014,20 +1140,12 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-SLOW-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5,6,7,6,7,4,5,2,3,0,1,16,17,18,19,20,21,22,23,22,23,20,21,18,19,16,17] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: retq +; AVX2-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: ; AVX512VL: # %bb.0: @@ -1042,17 +1160,35 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,1,0,4,5,6,7,8,8,9,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,1,0,4,5,6,7,8,8,9,8,12,13,14,15] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1060,17 +1196,35 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = 
[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,2,4,5,6,7,8,8,8,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,2,4,5,6,7,8,8,8,10,12,13,14,15] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1078,17 +1232,35 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,3,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,0,4,5,6,7,8,8,11,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08: +; AVX2-FAST: 
# %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,0,4,5,6,7,8,8,11,8,12,13,14,15] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1256,9 +1428,11 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_2 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -1273,9 +1447,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,0] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -1290,9 +1466,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,3,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,0] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,6,7,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -1411,8 +1589,8 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_1 define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08: ; AVX1: # 
%bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -1464,8 +1642,8 @@ define <16 x i16> @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_0 define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,3,4,5,6,7] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] @@ -2036,9 +2214,10 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0 ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,1,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -2046,8 +2225,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0 ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: retq @@ -3261,15 +3440,25 @@ define <16 x i16> @shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,2,3,0,1,8,9,10,11,6,7,4,5,16,17,18,19,18,19,16,17,24,25,26,27,22,23,20,21] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,4,5,6,7,6,7,4,5,4,5,6,7,18,19,16,17,20,21,22,23,22,23,20,21,20,21,22,23] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: 
shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,0,1,2,3,0,1,8,9,10,11,6,7,4,5,18,19,16,17,18,19,16,17,24,25,26,27,22,23,20,21] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,12,13,14,15,18,19,16,17,22,23,20,21,22,23,20,21,28,29,30,31] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26: ; AVX512VL: # %bb.0: @@ -3344,17 +3533,16 @@ define <16 x i16> @shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_1 define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,6,5,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,4,5,14,15,0,1,4,5,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: @@ -4035,15 +4223,15 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19(<16 x i16> define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; 
AVX2-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; @@ -4055,8 +4243,8 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, ; ; AVX512VL-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX512VL-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512VL-SLOW-NEXT: retq ; @@ -4073,8 +4261,8 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a, ; AVX1-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -4099,14 +4287,14 @@ define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16> define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: @@ -4116,8 +4304,8 @@ define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, ; ; AVX512VL-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: @@ -4132,15 +4320,15 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, ; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vextracti128 $1, 
%ymm0, %xmm0 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: @@ -4152,8 +4340,8 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, ; AVX512VL-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: @@ -4236,17 +4424,35 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_2 define <16 x i16> @shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13(<16 x i16> %a) { ; AVX1-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,7,5] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11,24,25,28,29,30,31,30,31,24,25,28,29,30,31,26,27] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,7,5,8,9,10,11,12,14,15,13] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,12,13,14,15,10,11,8,9,12,13,14,15,10,11,24,25,28,29,30,31,26,27,24,25,28,29,30,31,26,27] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,7,5,8,9,10,11,12,14,15,13] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,12,13,14,15,10,11,8,9,12,13,14,15,10,11,24,25,28,29,30,31,26,27,24,25,28,29,30,31,26,27] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> ret <16 x i16> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll index 51ef3a18438f..948cfd10076d 100644 --- 
a/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -1299,8 +1299,8 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_ ; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1317,13 +1317,13 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_ ; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: ; AVX1: # %bb.0: ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; @@ -1331,8 +1331,8 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: retq @@ -1348,8 +1348,8 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_ ; ; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: ; AVX512VLBW-SLOW: # %bb.0: -; AVX512VLBW-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX512VLBW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512VLBW-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] +; AVX512VLBW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX512VLBW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-SLOW-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA ; AVX512VLBW-SLOW-NEXT: kmovd %eax, %k1 @@ -1896,7 +1896,7 @@ define <32 x i8> 
@shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_ ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -2540,6 +2540,34 @@ define <32 x i8> @shuffle_v32i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_ ret <32 x i8> %shuffle } +; PR33740 +define <32 x i8> @shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7,24,25,16,17,26,27,18,19,28,29,20,21,30,31,22,23] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> + ret <32 x i8> %shuffle +} + define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10: ; AVX1: # %bb.0: @@ -2577,7 +2605,7 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,14,14,15,15] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -2586,7 +2614,7 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_ ; AVX2OR512VL-LABEL: 
shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX2OR512VL: # %bb.0: ; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,14,14,15,15] ; AVX2OR512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll index f3433ce834cd..b41fcbe79b0c 100644 --- a/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/test/CodeGen/X86/vector-shuffle-v1.ll @@ -16,7 +16,6 @@ define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) { ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z} ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX512VL-NEXT: vpsllq $63, %xmm1, %xmm1 ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VL-NEXT: retq @@ -51,7 +50,6 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; AVX512VL-NEXT: movq $-1, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm2 ; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX512VL-NEXT: vpsllq $63, %xmm1, %xmm1 ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VL-NEXT: retq @@ -85,7 +83,6 @@ define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) { ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} {z} ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0] -; AVX512VL-NEXT: vpslld $31, %xmm1, %xmm1 ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1 ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512VL-NEXT: retq @@ -110,7 +107,6 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> % ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] ; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 @@ -125,7 +121,6 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> % ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0] ; AVX512VL-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX512VL-NEXT: vpslld $31, %ymm1, %ymm1 ; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 @@ -157,8 +152,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vpslld $31, %zmm2, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -172,8 +166,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1 ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512VL-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; 
AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 -; AVX512VL-NEXT: vpslld $31, %zmm2, %zmm0 -; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper @@ -374,8 +367,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: vzeroupper @@ -389,8 +381,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0] ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vpslld $31, %ymm2, %ymm0 -; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512VL-NEXT: vptestmd %ymm2, %ymm2, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax ; AVX512VL-NEXT: # kill: def %al killed %al killed %eax ; AVX512VL-NEXT: vzeroupper @@ -422,8 +413,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) { ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: vzeroupper @@ -437,7 +427,6 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) { ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5,6,7] -; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax ; AVX512VL-NEXT: # kill: def %al killed %al killed %eax @@ -470,8 +459,7 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1] ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,18446744073709551615,18446744073709551615,0,0] ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: vzeroupper @@ -484,7 +472,6 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7] -; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax ; AVX512VL-NEXT: # kill: def %al killed %al killed %eax @@ -518,8 +505,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7] ; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: vzeroupper @@ 
-534,7 +520,6 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7] -; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax ; AVX512VL-NEXT: # kill: def %al killed %al killed %eax @@ -566,7 +551,6 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) { ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax @@ -578,7 +562,6 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) { ; AVX512VL-NEXT: kmovw %edi, %k1 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0 -; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax ; AVX512VL-NEXT: # kill: def %ax killed %ax killed %eax diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll index fd4c30fb327b..56395309897f 100644 --- a/test/CodeGen/X86/vector-trunc.ll +++ b/test/CodeGen/X86/vector-trunc.ll @@ -1852,6 +1852,83 @@ entry: ret <16 x i8> %1 } +define <8 x i16> @PR32160(<8 x i32> %x) { +; SSE2-LABEL: PR32160: +; SSE2: # %bb.0: +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: PR32160: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: PR32160: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE41-NEXT: retq +; +; AVX1-LABEL: PR32160: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR32160: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: PR32160: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: PR32160: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] +; AVX512VL-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: PR32160: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vpbroadcastd %xmm0, 
%xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: PR32160: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] +; AVX512BWVL-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %shuf = trunc <8 x i32> %x to <8 x i16> + %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef, <8 x i32> + ret <8 x i16> %trunc +} + define void @PR34773(i16* %a0, i8* %a1) { ; SSE-LABEL: PR34773: ; SSE: # %bb.0: diff --git a/test/CodeGen/X86/widened-broadcast.ll b/test/CodeGen/X86/widened-broadcast.ll index 96e97c70dbf4..bd38d901cc8b 100644 --- a/test/CodeGen/X86/widened-broadcast.ll +++ b/test/CodeGen/X86/widened-broadcast.ll @@ -304,14 +304,14 @@ entry: define <16 x i8> @load_splat_16i8_16i8_0101010101010101(<16 x i8>* %ptr) nounwind uwtable readnone ssp { ; SSE-LABEL: load_splat_16i8_16i8_0101010101010101: ; SSE: # %bb.0: # %entry -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; ; AVX1-LABEL: load_splat_16i8_16i8_0101010101010101: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_splat_16i8_16i8_0101010101010101: @@ -384,15 +384,15 @@ entry: define <32 x i8> @load_splat_32i8_16i8_01010101010101010101010101010101(<16 x i8>* %ptr) nounwind uwtable readnone ssp { ; SSE-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101: ; SSE: # %bb.0: # %entry -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -459,15 +459,15 @@ entry: define <32 x i8> @load_splat_32i8_32i8_01010101010101010101010101010101(<32 x i8>* %ptr) nounwind uwtable readnone ssp { ; SSE-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101: ; SSE: # %bb.0: # %entry -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll index 792bbbed52e1..e02258a788a1 100644 --- 
a/test/CodeGen/X86/x86-interleaved-access.ll +++ b/test/CodeGen/X86/x86-interleaved-access.ll @@ -645,9 +645,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) { ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0 -; AVX512-NEXT: vpsllw $7, %xmm3, %xmm1 -; AVX512-NEXT: vpmovb2m %zmm1, %k0 -; AVX512-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512-NEXT: vpmovb2m %zmm3, %k0 ; AVX512-NEXT: vpmovb2m %zmm0, %k1 ; AVX512-NEXT: kxnorw %k1, %k0, %k0 ; AVX512-NEXT: vpmovm2b %k0, %zmm0 @@ -958,9 +956,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) { ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: vpsllw $7, %ymm8, %ymm1 -; AVX512-NEXT: vpmovb2m %zmm1, %k0 -; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512-NEXT: vpmovb2m %zmm8, %k0 ; AVX512-NEXT: vpmovb2m %zmm0, %k1 ; AVX512-NEXT: kxnord %k1, %k0, %k0 ; AVX512-NEXT: vpmovm2b %k0, %zmm0 diff --git a/test/DebugInfo/AArch64/asan-stack-vars.ll b/test/DebugInfo/AArch64/asan-stack-vars.ll index e3725d5f4393..37f9f65c1e13 100644 --- a/test/DebugInfo/AArch64/asan-stack-vars.ll +++ b/test/DebugInfo/AArch64/asan-stack-vars.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -filetype=obj -o - %s | llvm-dwarfdump -v - | FileCheck %s +; RUN: llc -O0 -fast-isel -filetype=obj -o - %s | llvm-dwarfdump -v - | FileCheck %s ; ; Derived from (clang -O0 -g -fsanitize=address -fobjc-arc) ; @protocol NSObject diff --git a/test/DebugInfo/AArch64/frameindices.ll b/test/DebugInfo/AArch64/frameindices.ll index 5b5ab3944069..159813f491dd 100644 --- a/test/DebugInfo/AArch64/frameindices.ll +++ b/test/DebugInfo/AArch64/frameindices.ll @@ -1,4 +1,4 @@ -; RUN: llc -disable-fp-elim -O0 -filetype=obj < %s | llvm-dwarfdump -v - | FileCheck %s +; RUN: llc -disable-fp-elim -O0 -fast-isel -filetype=obj < %s | llvm-dwarfdump -v - | FileCheck %s ; Test that a variable with multiple entries in the MMI table makes it into the ; debug info. 
; diff --git a/test/DebugInfo/AArch64/line-header.ll b/test/DebugInfo/AArch64/line-header.ll index 04d82edb2e3c..1d9156debf1c 100644 --- a/test/DebugInfo/AArch64/line-header.ll +++ b/test/DebugInfo/AArch64/line-header.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=aarch64-none-linux -O0 -filetype=obj - < %S/../Inputs/line.ll | llvm-dwarfdump -v - | FileCheck %s -; RUN: llc -mtriple=aarch64_be-none-linux -O0 -filetype=obj - < %S/../Inputs/line.ll | llvm-dwarfdump -v - | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux -O0 -fast-isel -filetype=obj - < %S/../Inputs/line.ll | llvm-dwarfdump -v - | FileCheck %s +; RUN: llc -mtriple=aarch64_be-none-linux -O0 -fast-isel -filetype=obj - < %S/../Inputs/line.ll | llvm-dwarfdump -v - | FileCheck %s ; check line table length is correctly calculated for both big and little endian CHECK-LABEL: .debug_line contents: diff --git a/test/DebugInfo/AArch64/prologue_end.ll b/test/DebugInfo/AArch64/prologue_end.ll index c053122ca6b5..5e6e59dedafe 100644 --- a/test/DebugInfo/AArch64/prologue_end.ll +++ b/test/DebugInfo/AArch64/prologue_end.ll @@ -1,4 +1,4 @@ -; RUN: llc -disable-fp-elim -O0 %s -mtriple aarch64-apple-darwin -o - | FileCheck %s +; RUN: llc -disable-fp-elim -O0 -fast-isel %s -mtriple aarch64-apple-darwin -o - | FileCheck %s ; int func(void); ; void prologue_end_test() { diff --git a/test/MC/AMDGPU/flat-gfx9.s b/test/MC/AMDGPU/flat-gfx9.s index 8d706d49ce44..42ef4eb52a7c 100644 --- a/test/MC/AMDGPU/flat-gfx9.s +++ b/test/MC/AMDGPU/flat-gfx9.s @@ -35,6 +35,54 @@ flat_atomic_add v[3:4], v5 inst_offset:8 slc // GFX9: flat_atomic_add v[3:4], v5 offset:8 slc ; encoding: [0x08,0x00,0x0a,0xdd,0x03,0x05,0x00,0x00] // VIERR: :1: error: invalid operand for instruction +flat_atomic_cmpswap v[1:2], v[3:4] offset:4095 +// GFX9: flat_atomic_cmpswap v[1:2], v[3:4] offset:4095 ; encoding: [0xff,0x0f,0x04,0xdd,0x01,0x03,0x00,0x00] +// VIERR: :1: error: invalid operand for instruction + +flat_atomic_cmpswap v[1:2], v[3:4] offset:4095 slc +// GFX9: flat_atomic_cmpswap v[1:2], v[3:4] offset:4095 slc ; encoding: [0xff,0x0f,0x06,0xdd,0x01,0x03,0x00,0x00] +// VIERR: :1: error: invalid operand for instruction + +flat_atomic_cmpswap v[1:2], v[3:4] +// GFX9: flat_atomic_cmpswap v[1:2], v[3:4] ; encoding: [0x00,0x00,0x04,0xdd,0x01,0x03,0x00,0x00] +// VI: flat_atomic_cmpswap v[1:2], v[3:4] ; encoding: [0x00,0x00,0x04,0xdd,0x01,0x03,0x00,0x00] + +flat_atomic_cmpswap v[1:2], v[3:4] slc +// GFX9: flat_atomic_cmpswap v[1:2], v[3:4] slc ; encoding: [0x00,0x00,0x06,0xdd,0x01,0x03,0x00,0x00] +// VI: flat_atomic_cmpswap v[1:2], v[3:4] slc ; encoding: [0x00,0x00,0x06,0xdd,0x01,0x03,0x00,0x00] + +flat_atomic_cmpswap v[1:2], v[3:4] offset:4095 glc +// GCNERR: error: invalid operand for instruction + +flat_atomic_cmpswap v[1:2], v[3:4] glc +// GCNERR: error: invalid operand for instruction + +flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:4095 glc +// GFX9: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:4095 glc ; encoding: [0xff,0x0f,0x05,0xdd,0x01,0x03,0x00,0x00] +// VIERR: :1: error: invalid operand for instruction + +flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:4095 glc slc +// GFX9: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:4095 glc slc ; encoding: [0xff,0x0f,0x07,0xdd,0x01,0x03,0x00,0x00] +// VIERR: :1: error: invalid operand for instruction + +flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +// GFX9: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; encoding: [0x00,0x00,0x05,0xdd,0x01,0x03,0x00,0x00] +// VI: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; encoding: 
[0x00,0x00,0x05,0xdd,0x01,0x03,0x00,0x00] + +flat_atomic_cmpswap v0, v[1:2], v[3:4] glc slc +// GFX9: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc slc ; encoding: [0x00,0x00,0x07,0xdd,0x01,0x03,0x00,0x00] +// VI: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc slc ; encoding: [0x00,0x00,0x07,0xdd,0x01,0x03,0x00,0x00] + +flat_atomic_cmpswap v0, v[1:2], v[3:4] +// GFX9: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; encoding: [0x00,0x00,0x05,0xdd,0x01,0x03,0x00,0x00] +// VI: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; encoding: [0x00,0x00,0x05,0xdd,0x01,0x03,0x00,0x00] + +flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:4095 +// GCNERR: error: too few operands for instruction + +flat_atomic_cmpswap v0, v[1:2], v[3:4] slc +// GCNERR: error: invalid operand for instruction + flat_atomic_swap v[3:4], v5 offset:16 // GFX9: flat_atomic_swap v[3:4], v5 offset:16 ; encoding: [0x10,0x00,0x00,0xdd,0x03,0x05,0x00,0x00] // VIERR: :1: error: invalid operand for instruction diff --git a/test/MC/Disassembler/AMDGPU/flat_gfx9.txt b/test/MC/Disassembler/AMDGPU/flat_gfx9.txt index cfe5bfe1ef7c..30be9984bf27 100644 --- a/test/MC/Disassembler/AMDGPU/flat_gfx9.txt +++ b/test/MC/Disassembler/AMDGPU/flat_gfx9.txt @@ -9,6 +9,18 @@ # CHECK: flat_atomic_add v0, v[0:1], v0 offset:4095 glc ; encoding: [0xff,0x0f,0x09,0xdd,0x00,0x00,0x00,0x00] 0xff,0x0f,0x09,0xdd,0x00,0x00,0x00,0x00 +# CHECK: flat_atomic_add v0, v[0:1], v0 offset:4095 glc slc ; encoding: [0xff,0x0f,0x0b,0xdd,0x00,0x00,0x00,0x00] +0xff,0x0f,0x0b,0xdd,0x00,0x00,0x00,0x00 + +# CHECK: flat_atomic_add v0, v[0:1], v0 glc ; encoding: [0x00,0x00,0x09,0xdd,0x00,0x00,0x00,0x00] +0x00,0x00,0x09,0xdd,0x00,0x00,0x00,0x00 + +# CHECK: flat_atomic_add v0, v[0:1], v0 glc slc ; encoding: [0x00,0x00,0x0b,0xdd,0x00,0x00,0x00,0x00] +0x00,0x00,0x0b,0xdd,0x00,0x00,0x00,0x00 + +# CHECK: flat_atomic_add v[0:1], v0 slc ; encoding: [0x00,0x00,0x0a,0xdd,0x00,0x00,0x00,0x00] +0x00,0x00,0x0a,0xdd,0x00,0x00,0x00,0x00 + # CHECK: flat_atomic_add v[0:1], v0 offset:4095 slc ; encoding: [0xff,0x0f,0x0a,0xdd,0x00,0x00,0x00,0x00] 0xff,0x0f,0x0a,0xdd,0x00,0x00,0x00,0x00 diff --git a/test/MC/ELF/metadata-declaration-errors.s b/test/MC/ELF/metadata-declaration-errors.s new file mode 100644 index 000000000000..ed51a5f54f85 --- /dev/null +++ b/test/MC/ELF/metadata-declaration-errors.s @@ -0,0 +1,10 @@ +// RUN: not llvm-mc -triple x86_64-pc-linux-gnu %s \ +// RUN: -filetype=obj -o %t.o 2>&1 | FileCheck %s + +// Check we do not silently ignore invalid metadata symbol (123). 
+// CHECK: error: invalid metadata symbol + +.section .foo,"a" +.quad 0 + +.section bar,"ao",@progbits,123 diff --git a/test/MC/X86/intel-syntax-error.s b/test/MC/X86/intel-syntax-error.s index 23ee78135566..ab875b4e982a 100644 --- a/test/MC/X86/intel-syntax-error.s +++ b/test/MC/X86/intel-syntax-error.s @@ -34,3 +34,13 @@ lea RDX, [4 * RAX + 27 * RBX + _pat] lea RDX, [[arr] //CHECK: error: unexpected bracket encountered lea RDX, [arr[] + +.intel_syntax + +// CHECK: error: invalid operand for instruction +punpcklbw mm0, qword ptr [rsp] +// CHECK: error: invalid operand for instruction +punpcklwd mm0, word ptr [rsp] +// CHECK: error: invalid operand for instruction +punpckldq mm0, qword ptr [rsp] + diff --git a/test/MC/X86/intel-syntax.s b/test/MC/X86/intel-syntax.s index 5e118674f92f..0421add11d05 100644 --- a/test/MC/X86/intel-syntax.s +++ b/test/MC/X86/intel-syntax.s @@ -867,3 +867,11 @@ movsd qword ptr [rax], xmm0 xlat byte ptr [eax] // CHECK: xlatb // CHECK-STDERR: memory operand is only for determining the size, (R|E)BX will be used for the location + +// CHECK: punpcklbw +punpcklbw mm0, dword ptr [rsp] +// CHECK: punpcklwd +punpcklwd mm0, dword ptr [rsp] +// CHECK: punpckldq +punpckldq mm0, dword ptr [rsp] + diff --git a/test/MC/X86/x86_64-asm-match.s b/test/MC/X86/x86_64-asm-match.s index 3208e4f4e0f0..43888d662c66 100644 --- a/test/MC/X86/x86_64-asm-match.s +++ b/test/MC/X86/x86_64-asm-match.s @@ -39,7 +39,7 @@ // CHECK: Matching formal operand class MCK_VR64 against actual operand at index 2 (): Opcode result: multiple operand mismatches, ignoring this opcode // CHECK:Trying to match opcode MMX_PUNPCKLBWirm // CHECK: Matching formal operand class MCK_VR64 against actual operand at index 1 (): match success using generic matcher -// CHECK: Matching formal operand class MCK_Mem64 against actual operand at index 2 (): match success using generic matcher +// CHECK: Matching formal operand class MCK_Mem32 against actual operand at index 2 (): match success using generic matcher // CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 3: actual operand index out of range Opcode result: complete match, selecting this opcode @@ -49,4 +49,4 @@ pinsrw $3, %ecx, %xmm5 crc32l %gs:0xdeadbeef(%rbx,%rcx,8),%ecx .intel_syntax -punpcklbw mm0, qword ptr [rsp] +punpcklbw mm0, dword ptr [rsp] diff --git a/test/Other/loop-pm-invalidation.ll b/test/Other/loop-pm-invalidation.ll index 9a4f74e1d005..a6f5302462d1 100644 --- a/test/Other/loop-pm-invalidation.ll +++ b/test/Other/loop-pm-invalidation.ll @@ -19,23 +19,39 @@ define void @no_loops() { ; CHECK-LOOP-INV-LABEL: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on no_loops +; CHECK-LOOP-INV-NEXT: Starting {{.*}}Function pass manager run +; CHECK-LOOP-INV-NEXT: Running pass: LoopSimplifyPass ; CHECK-LOOP-INV-NEXT: Running analysis: LoopAnalysis ; CHECK-LOOP-INV-NEXT: Running analysis: DominatorTreeAnalysis ; CHECK-LOOP-INV-NEXT: Running analysis: AssumptionAnalysis +; CHECK-LOOP-INV-NEXT: Running pass: LCSSAPass +; CHECK-LOOP-INV-NEXT: Finished {{.*}}Function pass manager run ; CHECK-LOOP-INV-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}LoopAnalysis ; CHECK-LOOP-INV-NEXT: Invalidating all non-preserved analyses ; CHECK-LOOP-INV-NEXT: Invalidating analysis: LoopAnalysis ; CHECK-LOOP-INV-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on no_loops +; CHECK-LOOP-INV-NEXT: Starting {{.*}}Function pass manager run +; CHECK-LOOP-INV-NEXT: Running pass: LoopSimplifyPass ; CHECK-LOOP-INV-NEXT: Running analysis: 
LoopAnalysis +; CHECK-LOOP-INV-NEXT: Running pass: LCSSAPass +; CHECK-LOOP-INV-NEXT: Finished {{.*}}Function pass manager run ; CHECK-LOOP-INV-NEXT: Finished {{.*}}Function pass manager run. ; ; CHECK-SCEV-INV-LABEL: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on no_loops +; CHECK-SCEV-INV-NEXT: Starting {{.*}}Function pass manager run +; CHECK-SCEV-INV-NEXT: Running pass: LoopSimplifyPass ; CHECK-SCEV-INV-NEXT: Running analysis: LoopAnalysis ; CHECK-SCEV-INV-NEXT: Running analysis: DominatorTreeAnalysis ; CHECK-SCEV-INV-NEXT: Running analysis: AssumptionAnalysis +; CHECK-SCEV-INV-NEXT: Running pass: LCSSAPass +; CHECK-SCEV-INV-NEXT: Finished {{.*}}Function pass manager run ; CHECK-SCEV-INV-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ScalarEvolutionAnalysis ; CHECK-SCEV-INV-NEXT: Invalidating all non-preserved analyses ; CHECK-SCEV-INV-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on no_loops +; CHECK-SCEV-INV-NEXT: Starting {{.*}}Function pass manager run +; CHECK-SCEV-INV-NEXT: Running pass: LoopSimplifyPass +; CHECK-SCEV-INV-NEXT: Running pass: LCSSAPass +; CHECK-SCEV-INV-NEXT: Finished {{.*}}Function pass manager run ; CHECK-SCEV-INV-NEXT: Finished {{.*}}Function pass manager run. entry: @@ -44,9 +60,13 @@ entry: define void @one_loop(i1* %ptr) { ; CHECK-LOOP-INV-LABEL: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on one_loop +; CHECK-LOOP-INV-NEXT: Starting {{.*}}Function pass manager run +; CHECK-LOOP-INV-NEXT: Running pass: LoopSimplifyPass ; CHECK-LOOP-INV-NEXT: Running analysis: LoopAnalysis ; CHECK-LOOP-INV-NEXT: Running analysis: DominatorTreeAnalysis ; CHECK-LOOP-INV-NEXT: Running analysis: AssumptionAnalysis +; CHECK-LOOP-INV-NEXT: Running pass: LCSSAPass +; CHECK-LOOP-INV-NEXT: Finished {{.*}}Function pass manager run ; CHECK-LOOP-INV-NEXT: Running analysis: AAManager ; CHECK-LOOP-INV-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-LOOP-INV-NEXT: Running analysis: ScalarEvolutionAnalysis @@ -62,7 +82,11 @@ define void @one_loop(i1* %ptr) { ; CHECK-LOOP-INV-NEXT: Invalidating analysis: ScalarEvolutionAnalysis ; CHECK-LOOP-INV-NEXT: Invalidating analysis: InnerAnalysisManagerProxy<{{.*}}Loop ; CHECK-LOOP-INV-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on one_loop +; CHECK-LOOP-INV-NEXT: Starting {{.*}}Function pass manager run +; CHECK-LOOP-INV-NEXT: Running pass: LoopSimplifyPass ; CHECK-LOOP-INV-NEXT: Running analysis: LoopAnalysis +; CHECK-LOOP-INV-NEXT: Running pass: LCSSAPass +; CHECK-LOOP-INV-NEXT: Finished {{.*}}Function pass manager run ; CHECK-LOOP-INV-NEXT: Running analysis: ScalarEvolutionAnalysis ; CHECK-LOOP-INV-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop ; CHECK-LOOP-INV-NEXT: Starting {{.*}}Loop pass manager run. @@ -71,9 +95,13 @@ define void @one_loop(i1* %ptr) { ; CHECK-LOOP-INV-NEXT: Finished {{.*}}Function pass manager run. 
; ; CHECK-SCEV-INV-LABEL: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on one_loop +; CHECK-SCEV-INV-NEXT: Starting {{.*}}Function pass manager run +; CHECK-SCEV-INV-NEXT: Running pass: LoopSimplifyPass ; CHECK-SCEV-INV-NEXT: Running analysis: LoopAnalysis ; CHECK-SCEV-INV-NEXT: Running analysis: DominatorTreeAnalysis ; CHECK-SCEV-INV-NEXT: Running analysis: AssumptionAnalysis +; CHECK-SCEV-INV-NEXT: Running pass: LCSSAPass +; CHECK-SCEV-INV-NEXT: Finished {{.*}}Function pass manager run ; CHECK-SCEV-INV-NEXT: Running analysis: AAManager ; CHECK-SCEV-INV-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-SCEV-INV-NEXT: Running analysis: ScalarEvolutionAnalysis @@ -88,6 +116,10 @@ define void @one_loop(i1* %ptr) { ; CHECK-SCEV-INV-NEXT: Invalidating analysis: ScalarEvolutionAnalysis ; CHECK-SCEV-INV-NEXT: Invalidating analysis: InnerAnalysisManagerProxy<{{.*}}Loop ; CHECK-SCEV-INV-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on one_loop +; CHECK-SCEV-INV-NEXT: Starting {{.*}}Function pass manager run +; CHECK-SCEV-INV-NEXT: Running pass: LoopSimplifyPass +; CHECK-SCEV-INV-NEXT: Running pass: LCSSAPass +; CHECK-SCEV-INV-NEXT: Finished {{.*}}Function pass manager run ; CHECK-SCEV-INV-NEXT: Running analysis: ScalarEvolutionAnalysis ; CHECK-SCEV-INV-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop ; CHECK-SCEV-INV-NEXT: Starting {{.*}}Loop pass manager run. @@ -108,9 +140,13 @@ exit: define void @nested_loops(i1* %ptr) { ; CHECK-LOOP-INV-LABEL: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on nested_loops +; CHECK-LOOP-INV-NEXT: Starting {{.*}}Function pass manager run +; CHECK-LOOP-INV-NEXT: Running pass: LoopSimplifyPass ; CHECK-LOOP-INV-NEXT: Running analysis: LoopAnalysis ; CHECK-LOOP-INV-NEXT: Running analysis: DominatorTreeAnalysis ; CHECK-LOOP-INV-NEXT: Running analysis: AssumptionAnalysis +; CHECK-LOOP-INV-NEXT: Running pass: LCSSAPass +; CHECK-LOOP-INV-NEXT: Finished {{.*}}Function pass manager run ; CHECK-LOOP-INV-NEXT: Running analysis: AAManager ; CHECK-LOOP-INV-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-LOOP-INV-NEXT: Running analysis: ScalarEvolutionAnalysis @@ -130,7 +166,11 @@ define void @nested_loops(i1* %ptr) { ; CHECK-LOOP-INV-NEXT: Invalidating analysis: ScalarEvolutionAnalysis ; CHECK-LOOP-INV-NEXT: Invalidating analysis: InnerAnalysisManagerProxy<{{.*}}Loop ; CHECK-LOOP-INV-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on nested_loops +; CHECK-LOOP-INV-NEXT: Starting {{.*}}Function pass manager run +; CHECK-LOOP-INV-NEXT: Running pass: LoopSimplifyPass ; CHECK-LOOP-INV-NEXT: Running analysis: LoopAnalysis +; CHECK-LOOP-INV-NEXT: Running pass: LCSSAPass +; CHECK-LOOP-INV-NEXT: Finished {{.*}}Function pass manager run ; CHECK-LOOP-INV-NEXT: Running analysis: ScalarEvolutionAnalysis ; CHECK-LOOP-INV-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop ; CHECK-LOOP-INV-NEXT: Starting {{.*}}Loop pass manager run. @@ -142,9 +182,13 @@ define void @nested_loops(i1* %ptr) { ; CHECK-LOOP-INV-NEXT: Finished {{.*}}Function pass manager run. 
; ; CHECK-SCEV-INV-LABEL: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on nested_loops +; CHECK-SCEV-INV-NEXT: Starting {{.*}}Function pass manager run +; CHECK-SCEV-INV-NEXT: Running pass: LoopSimplifyPass ; CHECK-SCEV-INV-NEXT: Running analysis: LoopAnalysis ; CHECK-SCEV-INV-NEXT: Running analysis: DominatorTreeAnalysis ; CHECK-SCEV-INV-NEXT: Running analysis: AssumptionAnalysis +; CHECK-SCEV-INV-NEXT: Running pass: LCSSAPass +; CHECK-SCEV-INV-NEXT: Finished {{.*}}Function pass manager run ; CHECK-SCEV-INV-NEXT: Running analysis: AAManager ; CHECK-SCEV-INV-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-SCEV-INV-NEXT: Running analysis: ScalarEvolutionAnalysis @@ -163,6 +207,10 @@ define void @nested_loops(i1* %ptr) { ; CHECK-SCEV-INV-NEXT: Invalidating analysis: ScalarEvolutionAnalysis ; CHECK-SCEV-INV-NEXT: Invalidating analysis: InnerAnalysisManagerProxy<{{.*}}Loop ; CHECK-SCEV-INV-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on nested_loops +; CHECK-SCEV-INV-NEXT: Starting {{.*}}Function pass manager run +; CHECK-SCEV-INV-NEXT: Running pass: LoopSimplifyPass +; CHECK-SCEV-INV-NEXT: Running pass: LCSSAPass +; CHECK-SCEV-INV-NEXT: Finished {{.*}}Function pass manager run ; CHECK-SCEV-INV-NEXT: Running analysis: ScalarEvolutionAnalysis ; CHECK-SCEV-INV-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop ; CHECK-SCEV-INV-NEXT: Starting {{.*}}Loop pass manager run. @@ -193,9 +241,13 @@ exit: define void @dead_loop() { ; CHECK-LOOP-INV-LABEL: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on dead_loop +; CHECK-LOOP-INV-NEXT: Starting {{.*}}Function pass manager run +; CHECK-LOOP-INV-NEXT: Running pass: LoopSimplifyPass ; CHECK-LOOP-INV-NEXT: Running analysis: LoopAnalysis ; CHECK-LOOP-INV-NEXT: Running analysis: DominatorTreeAnalysis ; CHECK-LOOP-INV-NEXT: Running analysis: AssumptionAnalysis +; CHECK-LOOP-INV-NEXT: Running pass: LCSSAPass +; CHECK-LOOP-INV-NEXT: Finished {{.*}}Function pass manager run ; CHECK-LOOP-INV-NEXT: Running analysis: AAManager ; CHECK-LOOP-INV-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-LOOP-INV-NEXT: Running analysis: ScalarEvolutionAnalysis @@ -211,7 +263,11 @@ define void @dead_loop() { ; CHECK-LOOP-INV-NEXT: Invalidating analysis: ScalarEvolutionAnalysis ; CHECK-LOOP-INV-NEXT: Invalidating analysis: InnerAnalysisManagerProxy<{{.*}}Loop ; CHECK-LOOP-INV-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on dead_loop +; CHECK-LOOP-INV-NEXT: Starting {{.*}}Function pass manager run +; CHECK-LOOP-INV-NEXT: Running pass: LoopSimplifyPass ; CHECK-LOOP-INV-NEXT: Running analysis: LoopAnalysis +; CHECK-LOOP-INV-NEXT: Running pass: LCSSAPass +; CHECK-LOOP-INV-NEXT: Finished {{.*}}Function pass manager run ; CHECK-LOOP-INV-NEXT: Running analysis: ScalarEvolutionAnalysis ; CHECK-LOOP-INV-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop ; CHECK-LOOP-INV-NEXT: Starting {{.*}}Loop pass manager run. @@ -220,9 +276,13 @@ define void @dead_loop() { ; CHECK-LOOP-INV-NEXT: Finished {{.*}}Function pass manager run. 
; ; CHECK-SCEV-INV-LABEL: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on dead_loop +; CHECK-SCEV-INV-NEXT: Starting {{.*}}Function pass manager run +; CHECK-SCEV-INV-NEXT: Running pass: LoopSimplifyPass ; CHECK-SCEV-INV-NEXT: Running analysis: LoopAnalysis ; CHECK-SCEV-INV-NEXT: Running analysis: DominatorTreeAnalysis ; CHECK-SCEV-INV-NEXT: Running analysis: AssumptionAnalysis +; CHECK-SCEV-INV-NEXT: Running pass: LCSSAPass +; CHECK-SCEV-INV-NEXT: Finished {{.*}}Function pass manager run ; CHECK-SCEV-INV-NEXT: Running analysis: AAManager ; CHECK-SCEV-INV-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-SCEV-INV-NEXT: Running analysis: ScalarEvolutionAnalysis @@ -237,6 +297,10 @@ define void @dead_loop() { ; CHECK-SCEV-INV-NEXT: Invalidating analysis: ScalarEvolutionAnalysis ; CHECK-SCEV-INV-NEXT: Invalidating analysis: InnerAnalysisManagerProxy<{{.*}}Loop ; CHECK-SCEV-INV-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on dead_loop +; CHECK-SCEV-INV-NEXT: Starting {{.*}}Function pass manager run +; CHECK-SCEV-INV-NEXT: Running pass: LoopSimplifyPass +; CHECK-SCEV-INV-NEXT: Running pass: LCSSAPass +; CHECK-SCEV-INV-NEXT: Finished {{.*}}Function pass manager run ; CHECK-SCEV-INV-NEXT: Running analysis: ScalarEvolutionAnalysis ; CHECK-SCEV-INV-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop ; CHECK-SCEV-INV-NEXT: Starting {{.*}}Loop pass manager run. @@ -245,9 +309,13 @@ define void @dead_loop() { ; CHECK-SCEV-INV-NEXT: Finished {{.*}}Function pass manager run. ; ; CHECK-SCEV-INV-AFTER-DELETE-LABEL: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on dead_loop +; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Starting {{.*}}Function pass manager run +; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running pass: LoopSimplifyPass ; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running analysis: LoopAnalysis ; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running analysis: DominatorTreeAnalysis ; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running analysis: AssumptionAnalysis +; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running pass: LCSSAPass +; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Finished {{.*}}Function pass manager run ; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running analysis: AAManager ; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running analysis: ScalarEvolutionAnalysis @@ -264,6 +332,10 @@ define void @dead_loop() { ; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Invalidating analysis: ScalarEvolutionAnalysis ; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Invalidating analysis: InnerAnalysisManagerProxy<{{.*}}Loop ; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on dead_loop +; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Starting {{.*}}Function pass manager run +; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running pass: LoopSimplifyPass +; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running pass: LCSSAPass +; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Finished {{.*}}Function pass manager run ; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Finished {{.*}}Function pass manager run. 
entry: diff --git a/test/Other/new-pass-manager.ll b/test/Other/new-pass-manager.ll index 0826ecd3152b..320fd04ed9d4 100644 --- a/test/Other/new-pass-manager.ll +++ b/test/Other/new-pass-manager.ll @@ -450,10 +450,14 @@ ; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}> ; CHECK-REPEAT-LOOP-PASS-NEXT: Starting llvm::Function pass manager run ; CHECK-REPEAT-LOOP-PASS-NEXT: Running pass: FunctionToLoopPassAdaptor +; CHECK-REPEAT-LOOP-PASS-NEXT: Starting llvm::Function pass manager run +; CHECK-REPEAT-LOOP-PASS-NEXT: Running pass: LoopSimplify ; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: LoopAnalysis ; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: DominatorTreeAnalysis ; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: AssumptionAnalysis ; CHECK-REPEAT-LOOP-PASS-NEXT: Invalidating all non-preserved analyses +; CHECK-REPEAT-LOOP-PASS-NEXT: Running pass: LCSSAPass +; CHECK-REPEAT-LOOP-PASS-NEXT: Finished llvm::Function pass manager run ; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: AAManager ; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: ScalarEvolutionAnalysis diff --git a/test/Other/new-pm-defaults.ll b/test/Other/new-pm-defaults.ll index 1964a8c2f7cb..96b925f9d698 100644 --- a/test/Other/new-pm-defaults.ll +++ b/test/Other/new-pm-defaults.ll @@ -125,7 +125,11 @@ ; CHECK-O-NEXT: Running pass: ReassociatePass ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopStandardAnalysisResults{{.*}}> +; CHECK-O-NEXT: Starting llvm::Function pass manager run. +; CHECK-O-NEXT: Running pass: LoopSimplifyPass ; CHECK-O-NEXT: Running analysis: LoopAnalysis +; CHECK-O-NEXT: Running pass: LCSSAPass +; CHECK-O-NEXT: Finished llvm::Function pass manager run. ; CHECK-O-NEXT: Running analysis: ScalarEvolutionAnalysis ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy ; CHECK-O-NEXT: Starting Loop pass manager run. @@ -137,6 +141,10 @@ ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopStandardAnalysisResults{{.*}}> +; CHECK-O-NEXT: Starting llvm::Function pass manager run. +; CHECK-O-NEXT: Running pass: LoopSimplifyPass +; CHECK-O-NEXT: Running pass: LCSSAPass +; CHECK-O-NEXT: Finished llvm::Function pass manager run. ; CHECK-O-NEXT: Starting Loop pass manager run. ; CHECK-O-NEXT: Running pass: IndVarSimplifyPass ; CHECK-O-NEXT: Running pass: LoopIdiomRecognizePass @@ -168,6 +176,10 @@ ; CHECK-O-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O-NEXT: Running pass: DSEPass ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass{{.*}}> +; CHECK-O-NEXT: Starting llvm::Function pass manager run. +; CHECK-O-NEXT: Running pass: LoopSimplifyPass +; CHECK-O-NEXT: Running pass: LCSSAPass +; CHECK-O-NEXT: Finished llvm::Function pass manager run. ; CHECK-EP-SCALAR-LATE-NEXT: Running pass: NoOpFunctionPass ; CHECK-O-NEXT: Running pass: ADCEPass ; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis @@ -190,6 +202,10 @@ ; CHECK-O-NEXT: Running pass: Float2IntPass ; CHECK-EP-VECTORIZER-START-NEXT: Running pass: NoOpFunctionPass ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopRotatePass +; CHECK-O-NEXT: Starting llvm::Function pass manager run. 
+; CHECK-O-NEXT: Running pass: LoopSimplifyPass +; CHECK-O-NEXT: Running pass: LCSSAPass +; CHECK-O-NEXT: Finished llvm::Function pass manager run. ; CHECK-O-NEXT: Running pass: LoopDistributePass ; CHECK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis @@ -205,6 +221,10 @@ ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass +; CHECK-O-NEXT: Starting llvm::Function pass manager run. +; CHECK-O-NEXT: Running pass: LoopSimplifyPass +; CHECK-O-NEXT: Running pass: LCSSAPass +; CHECK-O-NEXT: Finished llvm::Function pass manager run. ; CHECK-O-NEXT: Running pass: AlignmentFromAssumptionsPass ; CHECK-O-NEXT: Running pass: LoopSinkPass ; CHECK-O-NEXT: Running pass: InstSimplifierPass diff --git a/test/Other/new-pm-thinlto-defaults.ll b/test/Other/new-pm-thinlto-defaults.ll index 12fd0d7ac476..077d48bb6ae0 100644 --- a/test/Other/new-pm-thinlto-defaults.ll +++ b/test/Other/new-pm-thinlto-defaults.ll @@ -119,7 +119,11 @@ ; CHECK-O-NEXT: Running pass: ReassociatePass ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopStandardAnalysisResults{{.*}}> +; CHECK-O-NEXT: Starting llvm::Function pass manager run +; CHECK-O-NEXT: Running pass: LoopSimplifyPass ; CHECK-O-NEXT: Running analysis: LoopAnalysis +; CHECK-O-NEXT: Running pass: LCSSAPass +; CHECK-O-NEXT: Finished llvm::Function pass manager run ; CHECK-O-NEXT: Running analysis: ScalarEvolutionAnalysis ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy ; CHECK-O-NEXT: Starting Loop pass manager run. @@ -131,6 +135,10 @@ ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopStandardAnalysisResults{{.*}}> +; CHECK-O-NEXT: Starting llvm::Function pass manager run +; CHECK-O-NEXT: Running pass: LoopSimplifyPass +; CHECK-O-NEXT: Running pass: LCSSAPass +; CHECK-O-NEXT: Finished llvm::Function pass manager run ; CHECK-O-NEXT: Starting Loop pass manager run. ; CHECK-O-NEXT: Running pass: IndVarSimplifyPass ; CHECK-O-NEXT: Running pass: LoopIdiomRecognizePass @@ -159,6 +167,10 @@ ; CHECK-O-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O-NEXT: Running pass: DSEPass ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass{{.*}}> +; CHECK-O-NEXT: Starting llvm::Function pass manager run +; CHECK-O-NEXT: Running pass: LoopSimplifyPass +; CHECK-O-NEXT: Running pass: LCSSAPass +; CHECK-O-NEXT: Finished llvm::Function pass manager run ; CHECK-O-NEXT: Running pass: ADCEPass ; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis ; CHECK-O-NEXT: Running pass: SimplifyCFGPass @@ -178,6 +190,10 @@ ; CHECK-POSTLINK-O-NEXT: Starting llvm::Function pass manager run. 
; CHECK-POSTLINK-O-NEXT: Running pass: Float2IntPass ; CHECK-POSTLINK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopRotatePass +; CHECK-POSTLINK-O-NEXT: Starting llvm::Function pass manager run +; CHECK-POSTLINK-O-NEXT: Running pass: LoopSimplifyPass +; CHECK-POSTLINK-O-NEXT: Running pass: LCSSAPass +; CHECK-POSTLINK-O-NEXT: Finished llvm::Function pass manager run ; CHECK-POSTLINK-O-NEXT: Running pass: LoopDistributePass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis @@ -193,6 +209,10 @@ ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis ; CHECK-POSTLINK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass +; CHECK-POSTLINK-O-NEXT: Starting llvm::Function pass manager run +; CHECK-POSTLINK-O-NEXT: Running pass: LoopSimplifyPass +; CHECK-POSTLINK-O-NEXT: Running pass: LCSSAPass +; CHECK-POSTLINK-O-NEXT: Finished llvm::Function pass manager run ; CHECK-POSTLINK-O-NEXT: Running pass: AlignmentFromAssumptionsPass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopSinkPass ; CHECK-POSTLINK-O-NEXT: Running pass: InstSimplifierPass diff --git a/test/Transforms/InstCombine/bswap-fold.ll b/test/Transforms/InstCombine/bswap-fold.ll index 260e2330996e..8fdecb628b85 100644 --- a/test/Transforms/InstCombine/bswap-fold.ll +++ b/test/Transforms/InstCombine/bswap-fold.ll @@ -13,16 +13,6 @@ define i32 @test4(i32 %a) nounwind { ret i32 %tmp4 } -; A -define i32 @test5(i32 %a) nounwind { -; CHECK-LABEL: @test5( -; CHECK-NEXT: ret i32 %a -; - %tmp2 = tail call i32 @llvm.bswap.i32( i32 %a ) - %tmp4 = tail call i32 @llvm.bswap.i32( i32 %tmp2 ) - ret i32 %tmp4 -} - ; a >> 24 define i32 @test6(i32 %a) nounwind { ; CHECK-LABEL: @test6( diff --git a/test/Transforms/InstCombine/call.ll b/test/Transforms/InstCombine/call.ll index 5307dcb6df72..c494bfb62c79 100644 --- a/test/Transforms/InstCombine/call.ll +++ b/test/Transforms/InstCombine/call.ll @@ -287,3 +287,14 @@ entry: ; CHECK-LABEL: @test17( ; CHECK: call i32 @pr28655(i32 0) ; CHECK: ret i32 0 + +define void @non_vararg(i8*, i32) { + ret void +} + +define void @test_cast_to_vararg(i8* %this) { +; CHECK-LABEL: test_cast_to_vararg +; CHECK: call void @non_vararg(i8* %this, i32 42) + call void (i8*, ...) 
bitcast (void (i8*, i32)* @non_vararg to void (i8*, ...)*)(i8* %this, i32 42) + ret void +} diff --git a/test/Transforms/InstCombine/extractelement.ll b/test/Transforms/InstCombine/extractelement.ll index 66fbd25947dc..f4043335c4e4 100644 --- a/test/Transforms/InstCombine/extractelement.ll +++ b/test/Transforms/InstCombine/extractelement.ll @@ -3,9 +3,17 @@ define i32 @extractelement_out_of_range(<2 x i32> %x) { ; CHECK-LABEL: @extractelement_out_of_range( -; CHECK-NEXT: [[E1:%.*]] = extractelement <2 x i32> [[X:%.*]], i8 16 -; CHECK-NEXT: ret i32 [[E1]] +; CHECK-NEXT: ret i32 undef ; %E1 = extractelement <2 x i32> %x, i8 16 ret i32 %E1 } + +define i32 @extractelement_type_out_of_range(<2 x i32> %x) { +; CHECK-LABEL: @extractelement_type_out_of_range( +; CHECK-NEXT: [[E1:%.*]] = extractelement <2 x i32> [[X:%.*]], i128 0 +; CHECK-NEXT: ret i32 [[E1]] +; + %E1 = extractelement <2 x i32> %x, i128 0 + ret i32 %E1 +} diff --git a/test/Transforms/InstCombine/fmul-sqrt.ll b/test/Transforms/InstCombine/fmul-sqrt.ll new file mode 100644 index 000000000000..0031a61059ab --- /dev/null +++ b/test/Transforms/InstCombine/fmul-sqrt.ll @@ -0,0 +1,67 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -instcombine < %s | FileCheck %s + +declare double @llvm.sqrt.f64(double) nounwind readnone speculatable +declare void @use(double) + +; sqrt(a) * sqrt(b) no math flags +define double @sqrt_a_sqrt_b(double %a, double %b) { +; CHECK-LABEL: @sqrt_a_sqrt_b( +; CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.sqrt.f64(double [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.sqrt.f64(double [[B:%.*]]) +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret double [[MUL]] +; + %1 = call double @llvm.sqrt.f64(double %a) + %2 = call double @llvm.sqrt.f64(double %b) + %mul = fmul double %1, %2 + ret double %mul +} + +; sqrt(a) * sqrt(b) fast-math, multiple uses +define double @sqrt_a_sqrt_b_multiple_uses(double %a, double %b) { +; CHECK-LABEL: @sqrt_a_sqrt_b_multiple_uses( +; CHECK-NEXT: [[TMP1:%.*]] = call fast double @llvm.sqrt.f64(double [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.sqrt.f64(double [[B:%.*]]) +; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[TMP1]], [[TMP2]] +; CHECK-NEXT: call void @use(double [[TMP2]]) +; CHECK-NEXT: ret double [[MUL]] +; + %1 = call fast double @llvm.sqrt.f64(double %a) + %2 = call fast double @llvm.sqrt.f64(double %b) + %mul = fmul fast double %1, %2 + call void @use(double %2) + ret double %mul +} + +; sqrt(a) * sqrt(b) => sqrt(a*b) with fast-math +define double @sqrt_a_sqrt_b_fast(double %a, double %b) { +; CHECK-LABEL: @sqrt_a_sqrt_b_fast( +; CHECK-NEXT: [[TMP1:%.*]] = fmul fast double [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.sqrt.f64(double [[TMP1]]) +; CHECK-NEXT: ret double [[TMP2]] +; + %1 = call fast double @llvm.sqrt.f64(double %a) + %2 = call fast double @llvm.sqrt.f64(double %b) + %mul = fmul fast double %1, %2 + ret double %mul +} + +; sqrt(a) * sqrt(b) * sqrt(c) * sqrt(d) => sqrt(a*b*c+d) with fast-math +define double @sqrt_a_sqrt_b_sqrt_c_sqrt_d_fast(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: @sqrt_a_sqrt_b_sqrt_c_sqrt_d_fast( +; CHECK-NEXT: [[TMP1:%.*]] = fmul fast double [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast double [[TMP1]], [[C:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast double [[TMP2]], [[D:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = call fast double @llvm.sqrt.f64(double [[TMP3]]) +; CHECK-NEXT: ret double 
[[TMP4]] +; + %1 = call fast double @llvm.sqrt.f64(double %a) + %2 = call fast double @llvm.sqrt.f64(double %b) + %mul = fmul fast double %1, %2 + %3 = call fast double @llvm.sqrt.f64(double %c) + %mul1 = fmul fast double %mul, %3 + %4 = call fast double @llvm.sqrt.f64(double %d) + %mul2 = fmul fast double %mul1, %4 + ret double %mul2 +} diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll index e0698f8b3b77..73f1cd920164 100644 --- a/test/Transforms/InstCombine/intrinsics.ll +++ b/test/Transforms/InstCombine/intrinsics.ll @@ -262,20 +262,12 @@ define void @powi(double %V, double *%P) { %A = tail call double @llvm.powi.f64(double %V, i32 -1) nounwind store volatile double %A, double* %P - %B = tail call double @llvm.powi.f64(double %V, i32 0) nounwind - store volatile double %B, double* %P - - %C = tail call double @llvm.powi.f64(double %V, i32 1) nounwind - store volatile double %C, double* %P - %D = tail call double @llvm.powi.f64(double %V, i32 2) nounwind store volatile double %D, double* %P ret void ; CHECK-LABEL: @powi( ; CHECK: %A = fdiv double 1.0{{.*}}, %V ; CHECK: store volatile double %A, -; CHECK: store volatile double 1.0 -; CHECK: store volatile double %V ; CHECK: %D = fmul double %V, %V ; CHECK: store volatile double %D } diff --git a/test/Transforms/InstCombine/udiv-simplify.ll b/test/Transforms/InstCombine/udiv-simplify.ll index 6f43fee92a6a..1794e26d389d 100644 --- a/test/Transforms/InstCombine/udiv-simplify.ll +++ b/test/Transforms/InstCombine/udiv-simplify.ll @@ -25,10 +25,10 @@ define i64 @test2(i32 %x) nounwind { define i64 @test1_PR2274(i32 %x, i32 %g) nounwind { ; CHECK-LABEL: @test1_PR2274( -; CHECK-NEXT: [[Y:%.*]] = lshr i32 %x, 30 -; CHECK-NEXT: [[R:%.*]] = udiv i32 [[Y]], %g -; CHECK-NEXT: [[Z1:%.*]] = zext i32 [[R]] to i64 -; CHECK-NEXT: ret i64 [[Z1]] +; CHECK-NEXT: [[Y:%.*]] = lshr i32 [[X:%.*]], 30 +; CHECK-NEXT: [[R:%.*]] = udiv i32 [[Y]], [[G:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[R]] to i64 +; CHECK-NEXT: ret i64 [[TMP1]] ; %y = lshr i32 %x, 30 %r = udiv i32 %y, %g @@ -37,10 +37,10 @@ define i64 @test1_PR2274(i32 %x, i32 %g) nounwind { } define i64 @test2_PR2274(i32 %x, i32 %v) nounwind { ; CHECK-LABEL: @test2_PR2274( -; CHECK-NEXT: [[Y:%.*]] = lshr i32 %x, 31 -; CHECK-NEXT: [[R:%.*]] = udiv i32 [[Y]], %v -; CHECK-NEXT: [[Z1:%.*]] = zext i32 [[R]] to i64 -; CHECK-NEXT: ret i64 [[Z1]] +; CHECK-NEXT: [[Y:%.*]] = lshr i32 [[X:%.*]], 31 +; CHECK-NEXT: [[R:%.*]] = udiv i32 [[Y]], [[V:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[R]] to i64 +; CHECK-NEXT: ret i64 [[TMP1]] ; %y = lshr i32 %x, 31 %r = udiv i32 %y, %v @@ -54,7 +54,7 @@ define i64 @test2_PR2274(i32 %x, i32 %v) nounwind { define i32 @PR30366(i1 %a) { ; CHECK-LABEL: @PR30366( -; CHECK-NEXT: [[Z:%.*]] = zext i1 %a to i32 +; CHECK-NEXT: [[Z:%.*]] = zext i1 [[A:%.*]] to i32 ; CHECK-NEXT: [[D:%.*]] = lshr i32 [[Z]], zext (i16 ptrtoint ([1 x i16]* @b to i16) to i32) ; CHECK-NEXT: ret i32 [[D]] ; @@ -62,3 +62,24 @@ define i32 @PR30366(i1 %a) { %d = udiv i32 %z, zext (i16 shl (i16 1, i16 ptrtoint ([1 x i16]* @b to i16)) to i32) ret i32 %d } + +; OSS-Fuzz #4857 +; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=4857 +define i177 @ossfuzz_4857(i177 %X, i177 %Y) { +; CHECK-LABEL: @ossfuzz_4857( +; CHECK-NEXT: store i1 false, i1* undef, align 1 +; CHECK-NEXT: ret i177 0 +; + %B5 = udiv i177 %Y, -1 + %B4 = add i177 %B5, -1 + %B2 = add i177 %B4, -1 + %B6 = mul i177 %B5, %B2 + %B3 = add i177 %B2, %B2 + %B9 = xor i177 %B4, %B3 + %B13 = ashr i177 %Y, %B2 + 
%B22 = add i177 %B9, %B13 + %B1 = udiv i177 %B5, %B6 + %C9 = icmp ult i177 %Y, %B22 + store i1 %C9, i1* undef + ret i177 %B1 +} diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll index 9d59efbad738..318df6cf76c1 100644 --- a/test/Transforms/InstCombine/vec_demanded_elts.ll +++ b/test/Transforms/InstCombine/vec_demanded_elts.ll @@ -191,11 +191,11 @@ define <4 x i32> @inselt_shuf_no_demand_multiuse(i32 %a0, i32 %a1, <4 x i32> %b) define <4 x float> @inselt_shuf_no_demand_bogus_insert_index_in_chain(float %a1, float %a2, float %a3, i32 %variable_index) { ; CHECK-LABEL: @inselt_shuf_no_demand_bogus_insert_index_in_chain( -; CHECK-NEXT: [[OUT1:%.*]] = insertelement <4 x float> undef, float %a1, i32 1 -; CHECK-NEXT: ret <4 x float> [[OUT1]] +; CHECK-NEXT: [[OUT12:%.*]] = insertelement <4 x float> undef, float [[A2:%.*]], i32 [[VARIABLE_INDEX:%.*]] +; CHECK-NEXT: ret <4 x float> [[OUT12]] ; %out1 = insertelement <4 x float> undef, float %a1, i32 1 - %out12 = insertelement <4 x float> %out1, float %a2, i32 undef ; something unexpected + %out12 = insertelement <4 x float> %out1, float %a2, i32 %variable_index ; something unexpected %out123 = insertelement <4 x float> %out12, float %a3, i32 3 %shuffle = shufflevector <4 x float> %out123, <4 x float> undef, <4 x i32> ret <4 x float> %shuffle diff --git a/test/Transforms/InstCombine/vector_insertelt_shuffle.ll b/test/Transforms/InstCombine/vector_insertelt_shuffle.ll index 41c6370e48e9..e5da60863193 100644 --- a/test/Transforms/InstCombine/vector_insertelt_shuffle.ll +++ b/test/Transforms/InstCombine/vector_insertelt_shuffle.ll @@ -66,9 +66,7 @@ define <4 x float> @bazzz(<4 x float> %x) { define <4 x float> @bazzzz(<4 x float> %x) { ; CHECK-LABEL: @bazzzz( -; CHECK-NEXT: [[INS1:%.*]] = insertelement <4 x float> %x, float 1.000000e+00, i32 undef -; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x float> %x, float 2.000000e+00, i32 2 -; CHECK-NEXT: ret <4 x float> [[INS2]] +; CHECK-NEXT: ret <4 x float> ; %ins1 = insertelement<4 x float> %x, float 1.0, i32 undef %ins2 = insertelement<4 x float> %ins1, float 2.0, i32 2 diff --git a/test/Transforms/InstCombine/bitreverse-fold.ll b/test/Transforms/InstSimplify/bitreverse-fold.ll similarity index 98% rename from test/Transforms/InstCombine/bitreverse-fold.ll rename to test/Transforms/InstSimplify/bitreverse-fold.ll index b798ad33b3f0..eab4b07a16e9 100644 --- a/test/Transforms/InstCombine/bitreverse-fold.ll +++ b/test/Transforms/InstSimplify/bitreverse-fold.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -instcombine -S | FileCheck %s +; RUN: opt < %s -instsimplify -S | FileCheck %s define i32 @identity_bitreverse_i32(i32 %p) { ; CHECK-LABEL: @identity_bitreverse_i32( diff --git a/test/Transforms/InstSimplify/exp-intrinsic.ll b/test/Transforms/InstSimplify/exp-intrinsic.ll new file mode 100644 index 000000000000..0fbd7e1a5e9c --- /dev/null +++ b/test/Transforms/InstSimplify/exp-intrinsic.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instsimplify -S | FileCheck %s + +declare double @llvm.exp.f64(double) +declare double @llvm.log.f64(double) + +define double @exp_log(double %a) { +; CHECK-LABEL: @exp_log( +; CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.log.f64(double [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.exp.f64(double [[TMP1]]) +; CHECK-NEXT: ret double [[TMP2]] +; + %1 = call double @llvm.log.f64(double %a) + %2 = call double @llvm.exp.f64(double %1) + ret double %2 +} + 
+define double @exp_log_fast(double %a) { +; CHECK-LABEL: @exp_log_fast( +; CHECK-NEXT: ret double [[A:%.*]] +; + %1 = call fast double @llvm.log.f64(double %a) + %2 = call fast double @llvm.exp.f64(double %1) + ret double %2 +} + +define double @exp_fast_log_strict(double %a) { +; CHECK-LABEL: @exp_fast_log_strict( +; CHECK-NEXT: ret double [[A:%.*]] +; + %1 = call double @llvm.log.f64(double %a) + %2 = call fast double @llvm.exp.f64(double %1) + ret double %2 +} + +define double @exp_strict_log_fast(double %a) { +; CHECK-LABEL: @exp_strict_log_fast( +; CHECK-NEXT: [[TMP1:%.*]] = call fast double @llvm.log.f64(double [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.exp.f64(double [[TMP1]]) +; CHECK-NEXT: ret double [[TMP2]] +; + %1 = call fast double @llvm.log.f64(double %a) + %2 = call double @llvm.exp.f64(double %1) + ret double %2 +} + +define double @exp_log_exp_log(double %a) { +; CHECK-LABEL: @exp_log_exp_log( +; CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.log.f64(double [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.exp.f64(double [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call double @llvm.log.f64(double [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.exp.f64(double [[TMP3]]) +; CHECK-NEXT: ret double [[TMP4]] +; + %1 = call double @llvm.log.f64(double %a) + %2 = call double @llvm.exp.f64(double %1) + %3 = call double @llvm.log.f64(double %2) + %4 = call double @llvm.exp.f64(double %3) + ret double %4 +} + +define double @exp_log_exp_log_fast(double %a) { +; CHECK-LABEL: @exp_log_exp_log_fast( +; CHECK-NEXT: ret double [[A:%.*]] +; + %1 = call fast double @llvm.log.f64(double %a) + %2 = call fast double @llvm.exp.f64(double %1) + %3 = call fast double @llvm.log.f64(double %2) + %4 = call fast double @llvm.exp.f64(double %3) + ret double %4 +} diff --git a/test/Transforms/InstSimplify/exp2-intrinsic.ll b/test/Transforms/InstSimplify/exp2-intrinsic.ll new file mode 100644 index 000000000000..6b93b14b7118 --- /dev/null +++ b/test/Transforms/InstSimplify/exp2-intrinsic.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instsimplify -S | FileCheck %s + +declare double @llvm.exp2.f64(double) +declare double @llvm.log2.f64(double) + +define double @exp2_log2(double %a) { +; CHECK-LABEL: @exp2_log2( +; CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.log2.f64(double [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.exp2.f64(double [[TMP1]]) +; CHECK-NEXT: ret double [[TMP2]] +; + %1 = call double @llvm.log2.f64(double %a) + %2 = call double @llvm.exp2.f64(double %1) + ret double %2 +} + +define double @exp2_log2_fast(double %a) { +; CHECK-LABEL: @exp2_log2_fast( +; CHECK-NEXT: ret double [[A:%.*]] +; + %1 = call fast double @llvm.log2.f64(double %a) + %2 = call fast double @llvm.exp2.f64(double %1) + ret double %2 +} + +define double @exp2_fast_log2_strict(double %a) { +; CHECK-LABEL: @exp2_fast_log2_strict( +; CHECK-NEXT: ret double [[A:%.*]] +; + %1 = call double @llvm.log2.f64(double %a) + %2 = call fast double @llvm.exp2.f64(double %1) + ret double %2 +} + +define double @exp2_strict_log2_fast(double %a) { +; CHECK-LABEL: @exp2_strict_log2_fast( +; CHECK-NEXT: [[TMP1:%.*]] = call fast double @llvm.log2.f64(double [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.exp2.f64(double [[TMP1]]) +; CHECK-NEXT: ret double [[TMP2]] +; + %1 = call fast double @llvm.log2.f64(double %a) + %2 = call double @llvm.exp2.f64(double %1) + ret double %2 +} + +define double @exp2_log2_exp2_log2(double 
%a) { +; CHECK-LABEL: @exp2_log2_exp2_log2( +; CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.log2.f64(double [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.exp2.f64(double [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call double @llvm.log2.f64(double [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.exp2.f64(double [[TMP3]]) +; CHECK-NEXT: ret double [[TMP4]] +; + %1 = call double @llvm.log2.f64(double %a) + %2 = call double @llvm.exp2.f64(double %1) + %3 = call double @llvm.log2.f64(double %2) + %4 = call double @llvm.exp2.f64(double %3) + ret double %4 +} + +define double @exp2_log2_exp2_log2_fast(double %a) { +; CHECK-LABEL: @exp2_log2_exp2_log2_fast( +; CHECK-NEXT: ret double [[A:%.*]] +; + %1 = call fast double @llvm.log2.f64(double %a) + %2 = call fast double @llvm.exp2.f64(double %1) + %3 = call fast double @llvm.log2.f64(double %2) + %4 = call fast double @llvm.exp2.f64(double %3) + ret double %4 +} diff --git a/test/Transforms/InstSimplify/extract-element.ll b/test/Transforms/InstSimplify/extract-element.ll index 8ee75a603cd1..051478913127 100644 --- a/test/Transforms/InstSimplify/extract-element.ll +++ b/test/Transforms/InstSimplify/extract-element.ll @@ -5,9 +5,43 @@ define i129 @vec_extract_negidx(<3 x i129> %a) { ; CHECK-LABEL: @vec_extract_negidx( -; CHECK-NEXT: [[E1:%.*]] = extractelement <3 x i129> [[A:%.*]], i129 -1 -; CHECK-NEXT: ret i129 [[E1]] +; CHECK-NEXT: ret i129 undef ; %E1 = extractelement <3 x i129> %a, i129 -1 ret i129 %E1 } + +define i129 @vec_extract_out_of_bounds(<3 x i129> %a) { +; CHECK-LABEL: @vec_extract_out_of_bounds( +; CHECK-NEXT: ret i129 undef +; + %E1 = extractelement <3 x i129> %a, i129 3 + ret i129 %E1 +} + +define i129 @vec_extract_out_of_bounds2(<3 x i129> %a) { +; CHECK-LABEL: @vec_extract_out_of_bounds2( +; CHECK-NEXT: ret i129 undef +; + %E1 = extractelement <3 x i129> %a, i129 999999999999999 + ret i129 %E1 +} + + +define i129 @vec_extract_undef_index(<3 x i129> %a) { +; CHECK-LABEL: @vec_extract_undef_index( +; CHECK-NEXT: ret i129 undef +; + %E1 = extractelement <3 x i129> %a, i129 undef + ret i129 %E1 +} + + +define i129 @vec_extract_in_bounds(<3 x i129> %a) { +; CHECK-LABEL: @vec_extract_in_bounds( +; CHECK-NEXT: %E1 = extractelement <3 x i129> %a, i129 2 +; CHECK-NEXT: ret i129 %E1 +; + %E1 = extractelement <3 x i129> %a, i129 2 + ret i129 %E1 +} diff --git a/test/Transforms/InstSimplify/fold-intrinsics.ll b/test/Transforms/InstSimplify/fold-intrinsics.ll new file mode 100644 index 000000000000..e484704e8a7a --- /dev/null +++ b/test/Transforms/InstSimplify/fold-intrinsics.ll @@ -0,0 +1,27 @@ +; RUN: opt < %s -instsimplify -S | FileCheck %s + +declare double @llvm.powi.f64(double, i32) nounwind readonly +declare i32 @llvm.bswap.i32(i32) + +; A +define i32 @test_bswap(i32 %a) nounwind { +; CHECK-LABEL: @test_bswap( +; CHECK-NEXT: ret i32 %a +; + %tmp2 = tail call i32 @llvm.bswap.i32( i32 %a ) + %tmp4 = tail call i32 @llvm.bswap.i32( i32 %tmp2 ) + ret i32 %tmp4 +} + +define void @powi(double %V, double *%P) { + %B = tail call double @llvm.powi.f64(double %V, i32 0) nounwind + store volatile double %B, double* %P + + %C = tail call double @llvm.powi.f64(double %V, i32 1) nounwind + store volatile double %C, double* %P + + ret void +; CHECK-LABEL: @powi( +; CHECK: store volatile double 1.0 +; CHECK: store volatile double %V +} diff --git a/test/Transforms/InstSimplify/insertelement.ll b/test/Transforms/InstSimplify/insertelement.ll index 3acd921cbad8..3524f2145acb 100644 --- a/test/Transforms/InstSimplify/insertelement.ll +++ 
b/test/Transforms/InstSimplify/insertelement.ll @@ -23,3 +23,9 @@ define <4 x i32> @test4(<4 x i32> %A) { ; CHECK: ret <4 x i32> undef ret <4 x i32> %I } + +define <4 x i32> @test5(<4 x i32> %A) { + %I = insertelement <4 x i32> %A, i32 5, i64 undef + ; CHECK: ret <4 x i32> undef + ret <4 x i32> %I +} diff --git a/test/Transforms/InstSimplify/log-intrinsic.ll b/test/Transforms/InstSimplify/log-intrinsic.ll new file mode 100644 index 000000000000..5d9820e20baa --- /dev/null +++ b/test/Transforms/InstSimplify/log-intrinsic.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instsimplify -S | FileCheck %s + +declare double @llvm.log.f64(double) +declare double @llvm.exp.f64(double) + +define double @log_exp(double %a) { +; CHECK-LABEL: @log_exp( +; CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.exp.f64(double [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.log.f64(double [[TMP1]]) +; CHECK-NEXT: ret double [[TMP2]] +; + %1 = call double @llvm.exp.f64(double %a) + %2 = call double @llvm.log.f64(double %1) + ret double %2 +} + +define double @log_exp_fast(double %a) { +; CHECK-LABEL: @log_exp_fast( +; CHECK-NEXT: ret double [[A:%.*]] +; + %1 = call fast double @llvm.exp.f64(double %a) + %2 = call fast double @llvm.log.f64(double %1) + ret double %2 +} + +define double @log_fast_exp_strict(double %a) { +; CHECK-LABEL: @log_fast_exp_strict( +; CHECK-NEXT: ret double [[A:%.*]] +; + %1 = call double @llvm.exp.f64(double %a) + %2 = call fast double @llvm.log.f64(double %1) + ret double %2 +} + +define double @log_strict_exp_fast(double %a) { +; CHECK-LABEL: @log_strict_exp_fast( +; CHECK-NEXT: [[TMP1:%.*]] = call fast double @llvm.exp.f64(double [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.log.f64(double [[TMP1]]) +; CHECK-NEXT: ret double [[TMP2]] +; + %1 = call fast double @llvm.exp.f64(double %a) + %2 = call double @llvm.log.f64(double %1) + ret double %2 +} + +define double @log_exp_log_exp(double %a) { +; CHECK-LABEL: @log_exp_log_exp( +; CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.exp.f64(double [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.log.f64(double [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call double @llvm.exp.f64(double [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.log.f64(double [[TMP3]]) +; CHECK-NEXT: ret double [[TMP4]] +; + %1 = call double @llvm.exp.f64(double %a) + %2 = call double @llvm.log.f64(double %1) + %3 = call double @llvm.exp.f64(double %2) + %4 = call double @llvm.log.f64(double %3) + ret double %4 +} + +define double @log_exp_log_exp_fast(double %a) { +; CHECK-LABEL: @log_exp_log_exp_fast( +; CHECK-NEXT: ret double [[A:%.*]] +; + %1 = call fast double @llvm.exp.f64(double %a) + %2 = call fast double @llvm.log.f64(double %1) + %3 = call fast double @llvm.exp.f64(double %2) + %4 = call fast double @llvm.log.f64(double %3) + ret double %4 +} diff --git a/test/Transforms/InstSimplify/log2-intrinsic.ll b/test/Transforms/InstSimplify/log2-intrinsic.ll new file mode 100644 index 000000000000..dab0cdf97972 --- /dev/null +++ b/test/Transforms/InstSimplify/log2-intrinsic.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instsimplify -S | FileCheck %s + +declare double @llvm.log2.f64(double) +declare double @llvm.exp2.f64(double) + +define double @log2_exp2(double %a) { +; CHECK-LABEL: @log2_exp2( +; CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.exp2.f64(double [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call 
double @llvm.log2.f64(double [[TMP1]]) +; CHECK-NEXT: ret double [[TMP2]] +; + %1 = call double @llvm.exp2.f64(double %a) + %2 = call double @llvm.log2.f64(double %1) + ret double %2 +} + +define double @log2_exp2_fast(double %a) { +; CHECK-LABEL: @log2_exp2_fast( +; CHECK-NEXT: ret double [[A:%.*]] +; + %1 = call fast double @llvm.exp2.f64(double %a) + %2 = call fast double @llvm.log2.f64(double %1) + ret double %2 +} + +define double @log2_fast_exp2_strict(double %a) { +; CHECK-LABEL: @log2_fast_exp2_strict( +; CHECK-NEXT: ret double [[A:%.*]] +; + %1 = call double @llvm.exp2.f64(double %a) + %2 = call fast double @llvm.log2.f64(double %1) + ret double %2 +} + +define double @log2_strict_exp2_fast(double %a) { +; CHECK-LABEL: @log2_strict_exp2_fast( +; CHECK-NEXT: [[TMP1:%.*]] = call fast double @llvm.exp2.f64(double [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.log2.f64(double [[TMP1]]) +; CHECK-NEXT: ret double [[TMP2]] +; + %1 = call fast double @llvm.exp2.f64(double %a) + %2 = call double @llvm.log2.f64(double %1) + ret double %2 +} + +define double @log2_exp2_log2_exp2(double %a) { +; CHECK-LABEL: @log2_exp2_log2_exp2( +; CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.exp2.f64(double [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.log2.f64(double [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = call double @llvm.exp2.f64(double [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.log2.f64(double [[TMP3]]) +; CHECK-NEXT: ret double [[TMP4]] +; + %1 = call double @llvm.exp2.f64(double %a) + %2 = call double @llvm.log2.f64(double %1) + %3 = call double @llvm.exp2.f64(double %2) + %4 = call double @llvm.log2.f64(double %3) + ret double %4 +} + +define double @log2_exp2_log2_exp2_fast(double %a) { +; CHECK-LABEL: @log2_exp2_log2_exp2_fast( +; CHECK-NEXT: ret double [[A:%.*]] +; + %1 = call fast double @llvm.exp2.f64(double %a) + %2 = call fast double @llvm.log2.f64(double %1) + %3 = call fast double @llvm.exp2.f64(double %2) + %4 = call fast double @llvm.log2.f64(double %3) + ret double %4 +} diff --git a/test/Transforms/LoopRotate/pr35210.ll b/test/Transforms/LoopRotate/pr35210.ll index 492922038b25..356c7db243b1 100644 --- a/test/Transforms/LoopRotate/pr35210.ll +++ b/test/Transforms/LoopRotate/pr35210.ll @@ -9,8 +9,12 @@ ; CHECK-NEXT: Running analysis: DominatorTreeAnalysis on f ; CHECK-NEXT: Running analysis: PostDominatorTreeAnalysis on f ; CHECK-NEXT: Running pass: FunctionToLoopPassAdaptor{{.*}} on f +; CHECK-NEXT: Starting llvm::Function pass manager run. +; CHECK-NEXT: Running pass: LoopSimplifyPass on f ; CHECK-NEXT: Running analysis: LoopAnalysis on f ; CHECK-NEXT: Running analysis: AssumptionAnalysis on f +; CHECK-NEXT: Running pass: LCSSAPass on f +; CHECK-NEXT: Finished llvm::Function pass manager run. ; CHECK-NEXT: Running analysis: AAManager on f ; CHECK-NEXT: Running analysis: TargetLibraryAnalysis on f ; CHECK-NEXT: Running analysis: ScalarEvolutionAnalysis on f diff --git a/test/Transforms/LoopSimplify/unreachable-loop-pred.ll b/test/Transforms/LoopSimplify/unreachable-loop-pred.ll index 76b7bb21e468..1e92ee4ee3bc 100644 --- a/test/Transforms/LoopSimplify/unreachable-loop-pred.ll +++ b/test/Transforms/LoopSimplify/unreachable-loop-pred.ll @@ -18,3 +18,52 @@ while.body115: ; preds = %9, %if.end192, %if. foo: br label %while.body115 } + +; When loopsimplify generates dedicated exit block for blocks that are landing +; pads (i.e. innerLoopExit in this test), we should not get confused with the +; unreachable pred (unreachableB) to innerLoopExit. 
+define align 8 void @baz(i32 %trip) personality i32* ()* @wobble { +entry: + br label %outerHeader + +outerHeader: + invoke void @foo() + to label %innerPreheader unwind label %innerLoopExit + +innerPreheader: + br label %innerH + +innerH: + %tmp50 = invoke i8 * undef() + to label %innerLatch unwind label %innerLoopExit + +innerLatch: + %cmp = icmp slt i32 %trip, 42 + br i1 %cmp, label %innerH, label %retblock + +unreachableB: ; No predecessors! + %tmp62 = invoke i8 * undef() + to label %retblock unwind label %innerLoopExit + +; undedicated exit block (preds from inner and outer loop) +; Also has unreachableB as pred. +innerLoopExit: + %tmp65 = landingpad { i8*, i32 } + cleanup + invoke void @foo() + to label %outerHeader unwind label %unwindblock + +unwindblock: + %tmp67 = landingpad { i8*, i32 } + cleanup + ret void + +retblock: + ret void +} + +; Function Attrs: nounwind +declare i32* @wobble() + +; Function Attrs: uwtable +declare void @foo() diff --git a/test/Transforms/SimplifyCFG/pr35774.ll b/test/Transforms/SimplifyCFG/pr35774.ll new file mode 100644 index 000000000000..149252085d51 --- /dev/null +++ b/test/Transforms/SimplifyCFG/pr35774.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -simplifycfg -S %s | FileCheck %s + +%foo = type { i32 (%foo)*, i32 } + +declare i32 @putchar(i32) + +define i32 @intercept(%foo %f) { +; CHECK-LABEL: @intercept( +; CHECK-NEXT: [[FN:%.*]] = extractvalue [[FOO:%.*]] %f, 0 +; CHECK-NEXT: [[X:%.*]] = extractvalue [[FOO]] %f, 1 +; CHECK-NEXT: [[X0:%.*]] = icmp eq i32 [[X]], 0 +; CHECK-NEXT: br i1 [[X0]], label [[ZERO:%.*]], label [[NONZERO:%.*]] +; CHECK: Zero: +; CHECK-NEXT: [[R0:%.*]] = musttail call i32 [[FN]](%foo [[F:%.*]]) +; CHECK-NEXT: ret i32 [[R0]] +; CHECK: Nonzero: +; CHECK-NEXT: [[R1:%.*]] = tail call i32 [[FN]](%foo [[F]]) +; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @putchar(i32 [[R1]]) +; CHECK-NEXT: ret i32 [[R1]] +; + %fn = extractvalue %foo %f, 0 + %x = extractvalue %foo %f, 1 + %x0 = icmp eq i32 %x, 0 + br i1 %x0, label %Zero, label %Nonzero + +Zero: + %r0 = musttail call i32 %fn(%foo %f) + ret i32 %r0 + +Nonzero: + %r1 = tail call i32 %fn(%foo %f) + %1 = tail call i32 @putchar(i32 %r1) + ret i32 %r1 +} diff --git a/test/tools/llvm-objcopy/symbol-copy.test b/test/tools/llvm-objcopy/symbol-copy.test index 83e2c0ea70a6..3e346701fb93 100644 --- a/test/tools/llvm-objcopy/symbol-copy.test +++ b/test/tools/llvm-objcopy/symbol-copy.test @@ -28,11 +28,22 @@ Symbols: Section: .text Value: 0x1000 Size: 4 + - Name: bam + Type: STT_FUNC + Section: .text + Value: 0x1001 + Size: 4 + Visibility: STV_HIDDEN - Name: foo Type: STT_FUNC Section: .text - Section: .text Value: 0x1004 + - Name: faz + Type: STT_OBJECT + Section: .data + Value: 0x2002 + Size: 2 + Visibility: STV_INTERNAL - Name: bar Type: STT_OBJECT Section: .data @@ -64,6 +75,17 @@ Symbols: #CHECK-NEXT: Section: .text #CHECK-NEXT: } #CHECK-NEXT: Symbol { +#CHECK-NEXT: Name: bam +#CHECK-NEXT: Value: 0x1001 +#CHECK-NEXT: Size: 4 +#CHECK-NEXT: Binding: Global +#CHECK-NEXT: Type: Function +#CHECK-NEXT: Other [ +#CHECK-NEXT: STV_HIDDEN +#CHECK-NEXT: ] +#CHECK-NEXT: Section: .text +#CHECK-NEXT: } +#CHECK-NEXT: Symbol { #CHECK-NEXT: Name: foo #CHECK-NEXT: Value: 0x1004 #CHECK-NEXT: Size: 0 @@ -73,6 +95,17 @@ Symbols: #CHECK-NEXT: Section: .text #CHECK-NEXT: } #CHECK-NEXT: Symbol { +#CHECK-NEXT: Name: faz +#CHECK-NEXT: Value: 0x2002 +#CHECK-NEXT: Size: 2 +#CHECK-NEXT: Binding: Global +#CHECK-NEXT: Type: Object +#CHECK-NEXT: Other [ 
+#CHECK-NEXT: STV_INTERNAL +#CHECK-NEXT: ] +#CHECK-NEXT: Section: .data +#CHECK-NEXT: } +#CHECK-NEXT: Symbol { #CHECK-NEXT: Name: bar #CHECK-NEXT: Value: 0x2000 #CHECK-NEXT: Size: 4 diff --git a/tools/dsymutil/DwarfLinker.cpp b/tools/dsymutil/DwarfLinker.cpp index 50ffc69dfaa0..0f5713b6b4fd 100644 --- a/tools/dsymutil/DwarfLinker.cpp +++ b/tools/dsymutil/DwarfLinker.cpp @@ -672,8 +672,12 @@ bool DwarfStreamer::init(Triple TheTriple) { MC.reset(new MCContext(MAI.get(), MRI.get(), MOFI.get())); MOFI->InitMCObjectFileInfo(TheTriple, /*PIC*/ false, *MC); + MSTI.reset(TheTarget->createMCSubtargetInfo(TripleName, "", "")); + if (!MSTI) + return error("no subtarget info for target " + TripleName, Context); + MCTargetOptions Options; - MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, "", Options); + MAB = TheTarget->createMCAsmBackend(*MSTI, *MRI, Options); if (!MAB) return error("no asm backend for target " + TripleName, Context); @@ -681,10 +685,6 @@ bool DwarfStreamer::init(Triple TheTriple) { if (!MII) return error("no instr info info for target " + TripleName, Context); - MSTI.reset(TheTarget->createMCSubtargetInfo(TripleName, "", "")); - if (!MSTI) - return error("no subtarget info for target " + TripleName, Context); - MCE = TheTarget->createMCCodeEmitter(*MII, *MRI, *MC); if (!MCE) return error("no code emitter for target " + TripleName, Context); diff --git a/tools/llvm-dwp/llvm-dwp.cpp b/tools/llvm-dwp/llvm-dwp.cpp index dbbe61bf3b06..f577635473ec 100644 --- a/tools/llvm-dwp/llvm-dwp.cpp +++ b/tools/llvm-dwp/llvm-dwp.cpp @@ -673,8 +673,13 @@ int main(int argc, char **argv) { MCContext MC(MAI.get(), MRI.get(), &MOFI); MOFI.InitMCObjectFileInfo(TheTriple, /*PIC*/ false, MC); + std::unique_ptr<MCSubtargetInfo> MSTI( + TheTarget->createMCSubtargetInfo(TripleName, "", "")); + if (!MSTI) + return error("no subtarget info for target " + TripleName, Context); + MCTargetOptions Options; - auto MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, "", Options); + auto MAB = TheTarget->createMCAsmBackend(*MSTI, *MRI, Options); if (!MAB) return error("no asm backend for target " + TripleName, Context); @@ -682,11 +687,6 @@ int main(int argc, char **argv) { if (!MII) return error("no instr info info for target " + TripleName, Context); - std::unique_ptr<MCSubtargetInfo> MSTI( - TheTarget->createMCSubtargetInfo(TripleName, "", "")); - if (!MSTI) - return error("no subtarget info for target " + TripleName, Context); - MCCodeEmitter *MCE = TheTarget->createMCCodeEmitter(*MII, *MRI, MC); if (!MCE) return error("no code emitter for target " + TripleName, Context); diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp index e925346eb2d1..3987be2bd688 100644 --- a/tools/llvm-mc/llvm-mc.cpp +++ b/tools/llvm-mc/llvm-mc.cpp @@ -567,7 +567,7 @@ int main(int argc, char **argv) { MCAsmBackend *MAB = nullptr; if (ShowEncoding) { CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx); - MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, MCPU, MCOptions); + MAB = TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions); } auto FOut = llvm::make_unique<formatted_raw_ostream>(*OS); Str.reset(TheTarget->createAsmStreamer( @@ -588,8 +588,7 @@ } MCCodeEmitter *CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx); - MCAsmBackend *MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, MCPU, - MCOptions); + MCAsmBackend *MAB = TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions); Str.reset(TheTarget->createMCObjectStreamer( TheTriple, Ctx, std::unique_ptr<MCAsmBackend>(MAB), *OS, std::unique_ptr<MCCodeEmitter>(CE), *STI, MCOptions.MCRelaxAll, diff --git
a/tools/llvm-objcopy/Object.cpp b/tools/llvm-objcopy/Object.cpp index d5dfcac40e4e..9e82448187ea 100644 --- a/tools/llvm-objcopy/Object.cpp +++ b/tools/llvm-objcopy/Object.cpp @@ -141,7 +141,8 @@ uint16_t Symbol::getShndx() const { void SymbolTableSection::addSymbol(StringRef Name, uint8_t Bind, uint8_t Type, SectionBase *DefinedIn, uint64_t Value, - uint16_t Shndx, uint64_t Sz) { + uint8_t Visibility, uint16_t Shndx, + uint64_t Sz) { Symbol Sym; Sym.Name = Name; Sym.Binding = Bind; @@ -154,6 +155,7 @@ void SymbolTableSection::addSymbol(StringRef Name, uint8_t Bind, uint8_t Type, Sym.ShndxType = SYMBOL_SIMPLE_INDEX; } Sym.Value = Value; + Sym.Visibility = Visibility; Sym.Size = Sz; Sym.Index = Symbols.size(); Symbols.emplace_back(llvm::make_unique<Symbol>(Sym)); @@ -221,6 +223,7 @@ void SymbolTableSectionImpl<ELFT>::writeSection(FileOutputBuffer &Out) const { Sym->st_name = Symbol->NameIndex; Sym->st_value = Symbol->Value; Sym->st_size = Symbol->Size; + Sym->st_other = Symbol->Visibility; Sym->setBinding(Symbol->Binding); Sym->setType(Symbol->Type); Sym->st_shndx = Symbol->getShndx(); @@ -425,7 +428,7 @@ void Object<ELFT>::initSymbolTable(const object::ELFFile<ELFT> &ElfFile, } SymTab->addSymbol(Name, Sym.getBinding(), Sym.getType(), DefSection, - Sym.getValue(), Sym.st_shndx, Sym.st_size); + Sym.getValue(), Sym.st_other, Sym.st_shndx, Sym.st_size); } } diff --git a/tools/llvm-objcopy/Object.h b/tools/llvm-objcopy/Object.h index b04b0c1a6415..639f0f29ceba 100644 --- a/tools/llvm-objcopy/Object.h +++ b/tools/llvm-objcopy/Object.h @@ -193,6 +193,7 @@ struct Symbol { uint64_t Size; uint8_t Type; uint64_t Value; + uint8_t Visibility; uint16_t getShndx() const; }; @@ -207,8 +208,8 @@ class SymbolTableSection : public SectionBase { public: void setStrTab(StringTableSection *StrTab) { SymbolNames = StrTab; } void addSymbol(StringRef Name, uint8_t Bind, uint8_t Type, - SectionBase *DefinedIn, uint64_t Value, uint16_t Shndx, - uint64_t Sz); + SectionBase *DefinedIn, uint64_t Value, uint8_t Visibility, + uint16_t Shndx, uint64_t Sz); void addSymbolNames(); const SectionBase *getStrTab() const { return SymbolNames; } const Symbol *getSymbolByIndex(uint32_t Index) const; diff --git a/unittests/DebugInfo/DWARF/DwarfGenerator.cpp b/unittests/DebugInfo/DWARF/DwarfGenerator.cpp index 3aa52a0d5b8f..092591aad985 100644 --- a/unittests/DebugInfo/DWARF/DwarfGenerator.cpp +++ b/unittests/DebugInfo/DWARF/DwarfGenerator.cpp @@ -152,8 +152,13 @@ llvm::Error dwarfgen::Generator::init(Triple TheTriple, uint16_t V) { MC.reset(new MCContext(MAI.get(), MRI.get(), MOFI.get())); MOFI->InitMCObjectFileInfo(TheTriple, /*PIC*/ false, *MC); + MSTI.reset(TheTarget->createMCSubtargetInfo(TripleName, "", "")); + if (!MSTI) + return make_error<StringError>("no subtarget info for target " + TripleName, + inconvertibleErrorCode()); + MCTargetOptions MCOptions = InitMCTargetOptionsFromFlags(); - MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, "", MCOptions); + MAB = TheTarget->createMCAsmBackend(*MSTI, *MRI, MCOptions); if (!MAB) return make_error<StringError>("no asm backend for target " + TripleName, inconvertibleErrorCode()); @@ -164,11 +169,6 @@ llvm::Error dwarfgen::Generator::init(Triple TheTriple, uint16_t V) { TripleName, inconvertibleErrorCode()); - MSTI.reset(TheTarget->createMCSubtargetInfo(TripleName, "", "")); - if (!MSTI) - return make_error<StringError>("no subtarget info for target " + TripleName, - inconvertibleErrorCode()); - MCE = TheTarget->createMCCodeEmitter(*MII, *MRI, *MC); if (!MCE) return make_error<StringError>("no code emitter for target " + TripleName, diff --git
a/unittests/IR/BasicBlockTest.cpp b/unittests/IR/BasicBlockTest.cpp index f1777e35b82c..08a41ff36938 100644 --- a/unittests/IR/BasicBlockTest.cpp +++ b/unittests/IR/BasicBlockTest.cpp @@ -33,6 +33,12 @@ TEST(BasicBlockTest, PhiRange) { std::unique_ptr BB2(BasicBlock::Create(Context)); BranchInst::Create(BB.get(), BB2.get()); + // Make sure this doesn't crash if there are no phis. + for (auto &PN : BB->phis()) { + (void)PN; + EXPECT_TRUE(false) << "empty block should have no phis"; + } + // Make it a cycle. auto *BI = BranchInst::Create(BB.get(), BB.get()); diff --git a/unittests/Support/CommandLineTest.cpp b/unittests/Support/CommandLineTest.cpp index 1fb0213b4d18..36ff4e247e9f 100644 --- a/unittests/Support/CommandLineTest.cpp +++ b/unittests/Support/CommandLineTest.cpp @@ -207,6 +207,85 @@ TEST(CommandLineTest, TokenizeWindowsCommandLine) { array_lengthof(Output)); } +TEST(CommandLineTest, TokenizeConfigFile1) { + const char *Input = "\\"; + const char *const Output[] = { "\\" }; + testCommandLineTokenizer(cl::tokenizeConfigFile, Input, Output, + array_lengthof(Output)); +} + +TEST(CommandLineTest, TokenizeConfigFile2) { + const char *Input = "\\abc"; + const char *const Output[] = { "abc" }; + testCommandLineTokenizer(cl::tokenizeConfigFile, Input, Output, + array_lengthof(Output)); +} + +TEST(CommandLineTest, TokenizeConfigFile3) { + const char *Input = "abc\\"; + const char *const Output[] = { "abc\\" }; + testCommandLineTokenizer(cl::tokenizeConfigFile, Input, Output, + array_lengthof(Output)); +} + +TEST(CommandLineTest, TokenizeConfigFile4) { + const char *Input = "abc\\\n123"; + const char *const Output[] = { "abc123" }; + testCommandLineTokenizer(cl::tokenizeConfigFile, Input, Output, + array_lengthof(Output)); +} + +TEST(CommandLineTest, TokenizeConfigFile5) { + const char *Input = "abc\\\r\n123"; + const char *const Output[] = { "abc123" }; + testCommandLineTokenizer(cl::tokenizeConfigFile, Input, Output, + array_lengthof(Output)); +} + +TEST(CommandLineTest, TokenizeConfigFile6) { + const char *Input = "abc\\\n"; + const char *const Output[] = { "abc" }; + testCommandLineTokenizer(cl::tokenizeConfigFile, Input, Output, + array_lengthof(Output)); +} + +TEST(CommandLineTest, TokenizeConfigFile7) { + const char *Input = "abc\\\r\n"; + const char *const Output[] = { "abc" }; + testCommandLineTokenizer(cl::tokenizeConfigFile, Input, Output, + array_lengthof(Output)); +} + +TEST(CommandLineTest, TokenizeConfigFile8) { + SmallVector Actual; + BumpPtrAllocator A; + StringSaver Saver(A); + cl::tokenizeConfigFile("\\\n", Saver, Actual, /*MarkEOLs=*/false); + EXPECT_TRUE(Actual.empty()); +} + +TEST(CommandLineTest, TokenizeConfigFile9) { + SmallVector Actual; + BumpPtrAllocator A; + StringSaver Saver(A); + cl::tokenizeConfigFile("\\\r\n", Saver, Actual, /*MarkEOLs=*/false); + EXPECT_TRUE(Actual.empty()); +} + +TEST(CommandLineTest, TokenizeConfigFile10) { + const char *Input = "\\\nabc"; + const char *const Output[] = { "abc" }; + testCommandLineTokenizer(cl::tokenizeConfigFile, Input, Output, + array_lengthof(Output)); +} + +TEST(CommandLineTest, TokenizeConfigFile11) { + const char *Input = "\\\r\nabc"; + const char *const Output[] = { "abc" }; + testCommandLineTokenizer(cl::tokenizeConfigFile, Input, Output, + array_lengthof(Output)); +} + TEST(CommandLineTest, AliasesWithArguments) { static const size_t ARGC = 3; const char *const Inputs[][ARGC] = { @@ -648,4 +727,58 @@ TEST(CommandLineTest, SetDefautValue) { EXPECT_TRUE(Opt3 == 3); } +TEST(CommandLineTest, ReadConfigFile) { + 
llvm::SmallVector Argv; + + llvm::SmallString<128> TestDir; + std::error_code EC = + llvm::sys::fs::createUniqueDirectory("unittest", TestDir); + EXPECT_TRUE(!EC); + + llvm::SmallString<128> TestCfg; + llvm::sys::path::append(TestCfg, TestDir, "foo"); + std::ofstream ConfigFile(TestCfg.c_str()); + EXPECT_TRUE(ConfigFile.is_open()); + ConfigFile << "# Comment\n" + "-option_1\n" + "@subconfig\n" + "-option_3=abcd\n" + "-option_4=\\\n" + "cdef\n"; + ConfigFile.close(); + + llvm::SmallString<128> TestCfg2; + llvm::sys::path::append(TestCfg2, TestDir, "subconfig"); + std::ofstream ConfigFile2(TestCfg2.c_str()); + EXPECT_TRUE(ConfigFile2.is_open()); + ConfigFile2 << "-option_2\n" + "\n" + " # comment\n"; + ConfigFile2.close(); + + // Make sure the current directory is not the directory where config files + // resides. In this case the code that expands response files will not find + // 'subconfig' unless it resolves nested inclusions relative to the including + // file. + llvm::SmallString<128> CurrDir; + EC = llvm::sys::fs::current_path(CurrDir); + EXPECT_TRUE(!EC); + EXPECT_TRUE(StringRef(CurrDir) != StringRef(TestDir)); + + llvm::BumpPtrAllocator A; + llvm::StringSaver Saver(A); + bool Result = llvm::cl::readConfigFile(TestCfg, Saver, Argv); + + EXPECT_TRUE(Result); + EXPECT_EQ(Argv.size(), 4U); + EXPECT_STREQ(Argv[0], "-option_1"); + EXPECT_STREQ(Argv[1], "-option_2"); + EXPECT_STREQ(Argv[2], "-option_3=abcd"); + EXPECT_STREQ(Argv[3], "-option_4=cdef"); + + llvm::sys::fs::remove(TestCfg2); + llvm::sys::fs::remove(TestCfg); + llvm::sys::fs::remove(TestDir); +} + } // anonymous namespace diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp index 64cf23314497..493066ec234b 100644 --- a/utils/TableGen/CodeGenDAGPatterns.cpp +++ b/utils/TableGen/CodeGenDAGPatterns.cpp @@ -2656,6 +2656,10 @@ TreePatternNode *TreePattern::ParseTreePattern(Init *TheInit, StringRef OpName){ for (unsigned i = 0, e = Dag->getNumArgs(); i != e; ++i) Children.push_back(ParseTreePattern(Dag->getArg(i), Dag->getArgNameStr(i))); + // Get the actual number of results before Operator is converted to an intrinsic + // node (which is hard-coded to have either zero or one result). + unsigned NumResults = GetNumNodeResults(Operator, CDP); + // If the operator is an intrinsic, then this is just syntactic sugar for for // (intrinsic_* , ..children..). Pick the right intrinsic node, and // convert the intrinsic name to a number. @@ -2698,7 +2702,6 @@ TreePatternNode *TreePattern::ParseTreePattern(Init *TheInit, StringRef OpName){ } } - unsigned NumResults = GetNumNodeResults(Operator, CDP); TreePatternNode *Result = new TreePatternNode(Operator, Children, NumResults); Result->setName(OpName);